From 312f7da2824c82800ee78d6190f12854456957af Mon Sep 17 00:00:00 2001
From: Albert Lee <albertcc@tw.ibm.com>
Date: Tue, 27 Sep 2005 17:38:03 +0800
Subject: [PATCH] libata: interrupt driven pio for libata-core

- add PIO_ST_FIRST for the state before sending ATAPI CDB or sending
"ATA PIO data out" first data block.
- add ATA_TFLAG_POLLING and ATA_DFLAG_CDB_INTR flags
- remove the ATA_FLAG_NOINTR flag since the interrupt handler is now
aware of the states
- modify ata_pio_sector() and atapi_pio_bytes() to work in the interrupt
context
- modify the ata_host_intr() to handle PIO interrupts
- modify ata_qc_issue_prot() to initialize states
- atapi_packet_task() changed to handle "ATA PIO data out" first data block
- support the pre-ATA4 ATAPI device which raise interrupt when ready to
receive CDB

Signed-off-by: Albert Lee <albertcc@tw.ibm.com>
Signed-off-by: Jeff Garzik <jgarzik@pobox.com>
---
 include/linux/ata.h    | 3 +++
 include/linux/libata.h | 4 ++--
 2 files changed, 5 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ata.h b/include/linux/ata.h
index a5b74efab067..6fec2f6f2d59 100644
--- a/include/linux/ata.h
+++ b/include/linux/ata.h
@@ -181,6 +181,7 @@ enum {
 	ATA_TFLAG_ISADDR	= (1 << 1), /* enable r/w to nsect/lba regs */
 	ATA_TFLAG_DEVICE	= (1 << 2), /* enable r/w to device reg */
 	ATA_TFLAG_WRITE		= (1 << 3), /* data dir: host->dev==1 (write) */
+	ATA_TFLAG_POLLING	= (1 << 4), /* set nIEN to 1 and use polling */
 };
 
 enum ata_tf_protocols {
@@ -250,6 +251,8 @@ struct ata_taskfile {
 	  ((u64) (id)[(n) + 1] << 16) |	\
 	  ((u64) (id)[(n) + 0]) )
 
+#define ata_id_cdb_intr(id)	(((id)[0] & 0x60) == 0x20)
+
 static inline int atapi_cdb_len(u16 *dev_id)
 {
 	u16 tmp = dev_id[0] & 0x3;
diff --git a/include/linux/libata.h b/include/linux/libata.h
index bb2d916bce44..9ac2b69df3c1 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -97,6 +97,7 @@ enum {
 	ATA_DFLAG_LBA48		= (1 << 0), /* device supports LBA48 */
 	ATA_DFLAG_PIO		= (1 << 1), /* device currently in PIO mode */
 	ATA_DFLAG_LOCK_SECTORS	= (1 << 2), /* don't adjust max_sectors */
+	ATA_DFLAG_CDB_INTR	= (1 << 3), /* device asserts INTRQ when ready for CDB */
 
 	ATA_DEV_UNKNOWN		= 0,	/* unknown device */
 	ATA_DEV_ATA		= 1,	/* ATA device */
@@ -115,8 +116,6 @@ enum {
 	ATA_FLAG_MMIO		= (1 << 6), /* use MMIO, not PIO */
 	ATA_FLAG_SATA_RESET	= (1 << 7), /* use COMRESET */
 	ATA_FLAG_PIO_DMA	= (1 << 8), /* PIO cmds via DMA */
-	ATA_FLAG_NOINTR		= (1 << 9), /* FIXME: Remove this once
-					     * proper HSM is in place. */
 
 	ATA_QCFLAG_ACTIVE	= (1 << 1), /* cmd not yet ack'd to scsi lyer */
 	ATA_QCFLAG_SG		= (1 << 3), /* have s/g table? */
@@ -165,6 +164,7 @@ enum hsm_task_states {
 	HSM_ST_LAST,
 	HSM_ST_LAST_POLL,
 	HSM_ST_ERR,
+	HSM_ST_FIRST,
 };
 
 /* forward declarations */
-- 
cgit v1.2.3


From e50362eccd8809a224cda5f71714a088ba37b2ab Mon Sep 17 00:00:00 2001
From: Albert Lee <albertcc@tw.ibm.com>
Date: Tue, 27 Sep 2005 17:39:50 +0800
Subject: [PATCH] libata: interrupt driven pio for LLD

libata.h:
libata-core:
  Add ATA_FLAG_PIO_POLLING flag for LLDs that expect interrupt for
command completion only.

sata_nv.c:
sata_vsc.c:
  irq handler is wrapper around ata_host_intr(), can handle PIO interrupts.

sata_promise.c:
sata_sx4.c:
sata_qstor.c:
sata_mv.c:
  Private irq handler.
  Polling mode ATA_FLAG_PIO_POLLING used for compatibility.

Signed-off-by: Albert Lee <albertcc@tw.ibm.com>
Signed-off-by: Jeff Garzik <jgarzik@pobox.com>
---
 drivers/scsi/libata-core.c  | 19 +++++++++++++++++++
 drivers/scsi/sata_mv.c      |  7 +++++--
 drivers/scsi/sata_nv.c      |  4 ++--
 drivers/scsi/sata_promise.c | 13 ++++++++-----
 drivers/scsi/sata_qstor.c   | 11 +++++------
 drivers/scsi/sata_sx4.c     |  7 ++++---
 drivers/scsi/sata_vsc.c     |  6 +++---
 include/linux/libata.h      |  2 ++
 8 files changed, 48 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c
index cc2d1308826e..f8a590e59f10 100644
--- a/drivers/scsi/libata-core.c
+++ b/drivers/scsi/libata-core.c
@@ -3356,6 +3356,25 @@ int ata_qc_issue_prot(struct ata_queued_cmd *qc)
 {
 	struct ata_port *ap = qc->ap;
 
+	/* Use polling pio if the LLD doesn't handle
+	 * interrupt driven pio and atapi CDB interrupt.
+	 */
+	if (ap->flags & ATA_FLAG_PIO_POLLING) {
+		switch (qc->tf.protocol) {
+		case ATA_PROT_PIO:
+		case ATA_PROT_ATAPI:
+		case ATA_PROT_ATAPI_NODATA:
+			qc->tf.flags |= ATA_TFLAG_POLLING;
+			break;
+		case ATA_PROT_ATAPI_DMA:
+			if (qc->dev->flags & ATA_DFLAG_CDB_INTR)
+				BUG();
+			break;
+		default:
+			break;
+		}
+	}
+
 	/* select the device */
 	ata_dev_select(ap, qc->dev->devno, 1, 0);
 
diff --git a/drivers/scsi/sata_mv.c b/drivers/scsi/sata_mv.c
index ea76fe44585e..b8f1f6963179 100644
--- a/drivers/scsi/sata_mv.c
+++ b/drivers/scsi/sata_mv.c
@@ -241,7 +241,8 @@ static struct ata_port_info mv_port_info[] = {
 	{  /* chip_504x */
 		.sht		= &mv_sht,
 		.host_flags	= (ATA_FLAG_SATA | ATA_FLAG_NO_LEGACY |
-				   ATA_FLAG_SATA_RESET | ATA_FLAG_MMIO),
+				   ATA_FLAG_SATA_RESET | ATA_FLAG_MMIO |
+				   ATA_FLAG_PIO_POLLING),
 		.pio_mask	= 0x1f,	/* pio4-0 */
 		.udma_mask	= 0,	/* 0x7f (udma6-0 disabled for now) */
 		.port_ops	= &mv_ops,
@@ -250,7 +251,7 @@ static struct ata_port_info mv_port_info[] = {
 		.sht		= &mv_sht,
 		.host_flags	= (ATA_FLAG_SATA | ATA_FLAG_NO_LEGACY |
 				   ATA_FLAG_SATA_RESET | ATA_FLAG_MMIO | 
-				   MV_FLAG_DUAL_HC),
+				   ATA_FLAG_PIO_POLLING | MV_FLAG_DUAL_HC),
 		.pio_mask	= 0x1f,	/* pio4-0 */
 		.udma_mask	= 0,	/* 0x7f (udma6-0 disabled for now) */
 		.port_ops	= &mv_ops,
@@ -259,6 +260,7 @@ static struct ata_port_info mv_port_info[] = {
 		.sht		= &mv_sht,
 		.host_flags	= (ATA_FLAG_SATA | ATA_FLAG_NO_LEGACY |
 				   ATA_FLAG_SATA_RESET | ATA_FLAG_MMIO | 
+				   ATA_FLAG_PIO_POLLING |
 				   MV_FLAG_IRQ_COALESCE | MV_FLAG_BDMA),
 		.pio_mask	= 0x1f,	/* pio4-0 */
 		.udma_mask	= 0,	/* 0x7f (udma6-0 disabled for now) */
@@ -268,6 +270,7 @@ static struct ata_port_info mv_port_info[] = {
 		.sht		= &mv_sht,
 		.host_flags	= (ATA_FLAG_SATA | ATA_FLAG_NO_LEGACY |
 				   ATA_FLAG_SATA_RESET | ATA_FLAG_MMIO |
+				   ATA_FLAG_PIO_POLLING |
 				   MV_FLAG_IRQ_COALESCE | MV_FLAG_DUAL_HC |
 				   MV_FLAG_BDMA),
 		.pio_mask	= 0x1f,	/* pio4-0 */
diff --git a/drivers/scsi/sata_nv.c b/drivers/scsi/sata_nv.c
index c05653c7779d..8b7e871ea0bf 100644
--- a/drivers/scsi/sata_nv.c
+++ b/drivers/scsi/sata_nv.c
@@ -304,11 +304,11 @@ static irqreturn_t nv_interrupt (int irq, void *dev_instance,
 
 		ap = host_set->ports[i];
 		if (ap &&
-		    !(ap->flags & (ATA_FLAG_PORT_DISABLED | ATA_FLAG_NOINTR))) {
+		    !(ap->flags & ATA_FLAG_PORT_DISABLED)) {
 			struct ata_queued_cmd *qc;
 
 			qc = ata_qc_from_tag(ap, ap->active_tag);
-			if (qc && (!(qc->tf.ctl & ATA_NIEN)))
+			if (qc && (!(qc->tf.flags & ATA_TFLAG_POLLING)))
 				handled += ata_host_intr(ap, qc);
 		}
 
diff --git a/drivers/scsi/sata_promise.c b/drivers/scsi/sata_promise.c
index def7e0d9dacb..f67deb0a55c9 100644
--- a/drivers/scsi/sata_promise.c
+++ b/drivers/scsi/sata_promise.c
@@ -162,7 +162,8 @@ static struct ata_port_info pdc_port_info[] = {
 	{
 		.sht		= &pdc_ata_sht,
 		.host_flags	= ATA_FLAG_SATA | ATA_FLAG_NO_LEGACY |
-				  ATA_FLAG_SRST | ATA_FLAG_MMIO,
+				  ATA_FLAG_SRST | ATA_FLAG_MMIO |
+				  ATA_FLAG_PIO_POLLING,
 		.pio_mask	= 0x1f, /* pio0-4 */
 		.mwdma_mask	= 0x07, /* mwdma0-2 */
 		.udma_mask	= 0x7f, /* udma0-6 ; FIXME */
@@ -173,7 +174,8 @@ static struct ata_port_info pdc_port_info[] = {
 	{
 		.sht		= &pdc_ata_sht,
 		.host_flags	= ATA_FLAG_SATA | ATA_FLAG_NO_LEGACY |
-				  ATA_FLAG_SRST | ATA_FLAG_MMIO,
+				  ATA_FLAG_SRST | ATA_FLAG_MMIO |
+				  ATA_FLAG_PIO_POLLING,
 		.pio_mask	= 0x1f, /* pio0-4 */
 		.mwdma_mask	= 0x07, /* mwdma0-2 */
 		.udma_mask	= 0x7f, /* udma0-6 ; FIXME */
@@ -184,7 +186,8 @@ static struct ata_port_info pdc_port_info[] = {
 	{
 		.sht		= &pdc_ata_sht,
 		.host_flags	= ATA_FLAG_NO_LEGACY | ATA_FLAG_SRST |
-				  ATA_FLAG_MMIO | ATA_FLAG_SLAVE_POSS,
+				  ATA_FLAG_MMIO | ATA_FLAG_SLAVE_POSS |
+				  ATA_FLAG_PIO_POLLING,
 		.pio_mask	= 0x1f, /* pio0-4 */
 		.mwdma_mask	= 0x07, /* mwdma0-2 */
 		.udma_mask	= 0x7f, /* udma0-6 ; FIXME */
@@ -493,11 +496,11 @@ static irqreturn_t pdc_interrupt (int irq, void *dev_instance, struct pt_regs *r
 		ap = host_set->ports[i];
 		tmp = mask & (1 << (i + 1));
 		if (tmp && ap &&
-		    !(ap->flags & (ATA_FLAG_PORT_DISABLED | ATA_FLAG_NOINTR))) {
+		    !(ap->flags & ATA_FLAG_PORT_DISABLED)) {
 			struct ata_queued_cmd *qc;
 
 			qc = ata_qc_from_tag(ap, ap->active_tag);
-			if (qc && (!(qc->tf.ctl & ATA_NIEN)))
+			if (qc && (!(qc->tf.flags & ATA_TFLAG_POLLING)))
 				handled += pdc_host_intr(ap, qc);
 		}
 	}
diff --git a/drivers/scsi/sata_qstor.c b/drivers/scsi/sata_qstor.c
index ffcdeb68641c..a604afafaecc 100644
--- a/drivers/scsi/sata_qstor.c
+++ b/drivers/scsi/sata_qstor.c
@@ -175,7 +175,7 @@ static struct ata_port_info qs_port_info[] = {
 		.host_flags	= ATA_FLAG_SATA | ATA_FLAG_NO_LEGACY |
 				  ATA_FLAG_SATA_RESET |
 				  //FIXME ATA_FLAG_SRST |
-				  ATA_FLAG_MMIO,
+				  ATA_FLAG_MMIO | ATA_FLAG_PIO_POLLING,
 		.pio_mask	= 0x10, /* pio4 */
 		.udma_mask	= 0x7f, /* udma0-6 */
 		.port_ops	= &qs_ata_ops,
@@ -389,14 +389,13 @@ static inline unsigned int qs_intr_pkt(struct ata_host_set *host_set)
 			DPRINTK("SFF=%08x%08x: sCHAN=%u sHST=%d sDST=%02x\n",
 					sff1, sff0, port_no, sHST, sDST);
 			handled = 1;
-			if (ap && !(ap->flags &
-				    (ATA_FLAG_PORT_DISABLED|ATA_FLAG_NOINTR))) {
+			if (ap && !(ap->flags & ATA_FLAG_PORT_DISABLED)) {
 				struct ata_queued_cmd *qc;
 				struct qs_port_priv *pp = ap->private_data;
 				if (!pp || pp->state != qs_state_pkt)
 					continue;
 				qc = ata_qc_from_tag(ap, ap->active_tag);
-				if (qc && (!(qc->tf.ctl & ATA_NIEN))) {
+				if (qc && (!(qc->tf.flags & ATA_TFLAG_POLLING))) {
 					switch (sHST) {
 					case 0: /* sucessful CPB */
 					case 3: /* device error */
@@ -422,13 +421,13 @@ static inline unsigned int qs_intr_mmio(struct ata_host_set *host_set)
 		struct ata_port *ap;
 		ap = host_set->ports[port_no];
 		if (ap &&
-		    !(ap->flags & (ATA_FLAG_PORT_DISABLED | ATA_FLAG_NOINTR))) {
+		    !(ap->flags & ATA_FLAG_PORT_DISABLED)) {
 			struct ata_queued_cmd *qc;
 			struct qs_port_priv *pp = ap->private_data;
 			if (!pp || pp->state != qs_state_mmio)
 				continue;
 			qc = ata_qc_from_tag(ap, ap->active_tag);
-			if (qc && (!(qc->tf.ctl & ATA_NIEN))) {
+			if (qc && (!(qc->tf.flags & ATA_TFLAG_POLLING))) {
 
 				/* check main status, clearing INTRQ */
 				u8 status = ata_chk_status(ap);
diff --git a/drivers/scsi/sata_sx4.c b/drivers/scsi/sata_sx4.c
index 540a85191172..a9f9f7685a59 100644
--- a/drivers/scsi/sata_sx4.c
+++ b/drivers/scsi/sata_sx4.c
@@ -219,7 +219,8 @@ static struct ata_port_info pdc_port_info[] = {
 	{
 		.sht		= &pdc_sata_sht,
 		.host_flags	= ATA_FLAG_SATA | ATA_FLAG_NO_LEGACY |
-				  ATA_FLAG_SRST | ATA_FLAG_MMIO,
+				  ATA_FLAG_SRST | ATA_FLAG_MMIO |
+				  ATA_FLAG_PIO_POLLING,
 		.pio_mask	= 0x1f, /* pio0-4 */
 		.mwdma_mask	= 0x07, /* mwdma0-2 */
 		.udma_mask	= 0x7f, /* udma0-6 ; FIXME */
@@ -832,11 +833,11 @@ static irqreturn_t pdc20621_interrupt (int irq, void *dev_instance, struct pt_re
 		tmp = mask & (1 << i);
 		VPRINTK("seq %u, port_no %u, ap %p, tmp %x\n", i, port_no, ap, tmp);
 		if (tmp && ap &&
-		    !(ap->flags & (ATA_FLAG_PORT_DISABLED | ATA_FLAG_NOINTR))) {
+		    !(ap->flags & ATA_FLAG_PORT_DISABLED)) {
 			struct ata_queued_cmd *qc;
 
 			qc = ata_qc_from_tag(ap, ap->active_tag);
-			if (qc && (!(qc->tf.ctl & ATA_NIEN)))
+			if (qc && (!(qc->tf.flags & ATA_TFLAG_POLLING)))
 				handled += pdc20621_host_intr(ap, qc, (i > 4),
 							      mmio_base);
 		}
diff --git a/drivers/scsi/sata_vsc.c b/drivers/scsi/sata_vsc.c
index cf94e0158a8d..92378d768c86 100644
--- a/drivers/scsi/sata_vsc.c
+++ b/drivers/scsi/sata_vsc.c
@@ -193,12 +193,12 @@ static irqreturn_t vsc_sata_interrupt (int irq, void *dev_instance,
 			struct ata_port *ap;
 
 			ap = host_set->ports[i];
-			if (ap && !(ap->flags &
-				    (ATA_FLAG_PORT_DISABLED|ATA_FLAG_NOINTR))) {
+			if (ap &&
+			    !(ap->flags & ATA_FLAG_PORT_DISABLED)) {
 				struct ata_queued_cmd *qc;
 
 				qc = ata_qc_from_tag(ap, ap->active_tag);
-				if (qc && (!(qc->tf.ctl & ATA_NIEN)))
+				if (qc && (!(qc->tf.flags & ATA_TFLAG_POLLING)))
 					handled += ata_host_intr(ap, qc);
 			}
 		}
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 9ac2b69df3c1..ea8ab29aa92e 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -116,6 +116,8 @@ enum {
 	ATA_FLAG_MMIO		= (1 << 6), /* use MMIO, not PIO */
 	ATA_FLAG_SATA_RESET	= (1 << 7), /* use COMRESET */
 	ATA_FLAG_PIO_DMA	= (1 << 8), /* PIO cmds via DMA */
+	ATA_FLAG_PIO_POLLING	= (1 << 9), /* use polling PIO if LLD
+					     * doesn't handle PIO interrupts */
 
 	ATA_QCFLAG_ACTIVE	= (1 << 1), /* cmd not yet ack'd to scsi lyer */
 	ATA_QCFLAG_SG		= (1 << 3), /* have s/g table? */
-- 
cgit v1.2.3


From c56b14d2a3e32695e13cd49b417da889da744d1c Mon Sep 17 00:00:00 2001
From: Albert Lee <albertcc@tw.ibm.com>
Date: Fri, 30 Sep 2005 19:07:39 +0800
Subject: [PATCH] libata irq-pio: add comments and cleanup

Signed-off-by: Albert Lee <albertcc@tw.ibm.com>
Signed-off-by: Jeff Garzik <jgarzik@pobox.com>
---
 drivers/scsi/libata-core.c | 12 ------------
 include/linux/libata.h     | 19 ++++++++++---------
 2 files changed, 10 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c
index f8a590e59f10..617836a313f0 100644
--- a/drivers/scsi/libata-core.c
+++ b/drivers/scsi/libata-core.c
@@ -3430,18 +3430,6 @@ int ata_qc_issue_prot(struct ata_queued_cmd *qc)
 		break;
 
 	case ATA_PROT_ATAPI:
-		if (qc->tf.flags & ATA_TFLAG_POLLING)
-			ata_qc_set_polling(qc);
-
-		ata_tf_to_host_nolock(ap, &qc->tf);
-		ap->hsm_task_state = HSM_ST_FIRST;
-
-		/* send cdb by polling if no cdb interrupt */
-		if ((!(qc->dev->flags & ATA_DFLAG_CDB_INTR)) ||
-		    (qc->tf.flags & ATA_TFLAG_POLLING))
-			queue_work(ata_wq, &ap->packet_task);
-		break;
-
 	case ATA_PROT_ATAPI_NODATA:
 		if (qc->tf.flags & ATA_TFLAG_POLLING)
 			ata_qc_set_polling(qc);
diff --git a/include/linux/libata.h b/include/linux/libata.h
index ea8ab29aa92e..1fcd0ef9e1c9 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -158,15 +158,16 @@ enum {
 };
 
 enum hsm_task_states {
-	HSM_ST_UNKNOWN,
-	HSM_ST_IDLE,
-	HSM_ST_POLL,
-	HSM_ST_TMOUT,
-	HSM_ST,
-	HSM_ST_LAST,
-	HSM_ST_LAST_POLL,
-	HSM_ST_ERR,
-	HSM_ST_FIRST,
+	HSM_ST_UNKNOWN,		/* state unknown */
+	HSM_ST_IDLE,		/* no command on going */
+	HSM_ST_POLL,		/* same as HSM_ST, waits longer */
+	HSM_ST_TMOUT,		/* timeout */
+	HSM_ST,			/* (waiting the device to) transfer data */
+	HSM_ST_LAST,		/* (waiting the device to) complete command */
+	HSM_ST_LAST_POLL,	/* same as HSM_ST_LAST, waits longer */
+	HSM_ST_ERR,		/* error */
+	HSM_ST_FIRST,		/* (waiting the device to)
+				   write CDB or first data block */
 };
 
 /* forward declarations */
-- 
cgit v1.2.3


From f9997be974be40e884e9e8157ded2f2f9aed454c Mon Sep 17 00:00:00 2001
From: Albert Lee <albertcc@tw.ibm.com>
Date: Fri, 30 Sep 2005 19:09:31 +0800
Subject: [PATCH] libata irq-pio: rename atapi_packet_task() and comments

Signed-off-by: Albert Lee <albertcc@tw.ibm.com>
Signed-off-by: Jeff Garzik <jgarzik@pobox.com>
---
 drivers/scsi/libata-core.c | 25 +++++++++++++------------
 include/linux/libata.h     |  6 +++---
 2 files changed, 16 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c
index 617836a313f0..a63758da4892 100644
--- a/drivers/scsi/libata-core.c
+++ b/drivers/scsi/libata-core.c
@@ -3416,7 +3416,7 @@ int ata_qc_issue_prot(struct ata_queued_cmd *qc)
 			if (qc->tf.flags & ATA_TFLAG_WRITE) {
 				/* PIO data out protocol */
 				ap->hsm_task_state = HSM_ST_FIRST;
-				queue_work(ata_wq, &ap->packet_task);
+				queue_work(ata_wq, &ap->dataout_task);
 
 				/* send first data block by polling */
 			} else {
@@ -3440,7 +3440,7 @@ int ata_qc_issue_prot(struct ata_queued_cmd *qc)
 		/* send cdb by polling if no cdb interrupt */
 		if ((!(qc->dev->flags & ATA_DFLAG_CDB_INTR)) ||
 		    (qc->tf.flags & ATA_TFLAG_POLLING))
-			queue_work(ata_wq, &ap->packet_task);
+			queue_work(ata_wq, &ap->dataout_task);
 		break;
 
 	case ATA_PROT_ATAPI_DMA:
@@ -3452,7 +3452,7 @@ int ata_qc_issue_prot(struct ata_queued_cmd *qc)
 
 		/* send cdb by polling if no cdb interrupt */
 		if (!(qc->dev->flags & ATA_DFLAG_CDB_INTR))
-			queue_work(ata_wq, &ap->packet_task);
+			queue_work(ata_wq, &ap->dataout_task);
 		break;
 
 	default:
@@ -3952,20 +3952,21 @@ irqreturn_t ata_interrupt (int irq, void *dev_instance, struct pt_regs *regs)
 }
 
 /**
- *	atapi_packet_task - Write CDB bytes to hardware
- *	@_data: Port to which ATAPI device is attached.
+ *	ata_dataout_task - Write first data block to hardware
+ *	@_data: Port to which ATA/ATAPI device is attached.
  *
  *	When device has indicated its readiness to accept
- *	a CDB, this function is called.  Send the CDB.
- *	If DMA is to be performed, exit immediately.
- *	Otherwise, we are in polling mode, so poll
- *	status under operation succeeds or fails.
+ *	the data, this function sends out the CDB or 
+ *	the first data block by PIO.
+ *	After this, 
+ *	  - If polling, ata_pio_task() handles the rest.
+ *	  - Otherwise, interrupt handler takes over.
  *
  *	LOCKING:
  *	Kernel thread context (may sleep)
  */
 
-static void atapi_packet_task(void *_data)
+static void ata_dataout_task(void *_data)
 {
 	struct ata_port *ap = _data;
 	struct ata_queued_cmd *qc;
@@ -3978,7 +3979,7 @@ static void atapi_packet_task(void *_data)
 
 	/* sleep-wait for BSY to clear */
 	DPRINTK("busy wait\n");
-	if (ata_busy_sleep(ap, ATA_TMOUT_CDB_QUICK, ATA_TMOUT_CDB))
+	if (ata_busy_sleep(ap, ATA_TMOUT_DATAOUT_QUICK, ATA_TMOUT_DATAOUT))
 		goto err_out;
 
 	/* make sure DRQ is set */
@@ -4141,7 +4142,7 @@ static void ata_host_init(struct ata_port *ap, struct Scsi_Host *host,
 	ap->active_tag = ATA_TAG_POISON;
 	ap->last_ctl = 0xFF;
 
-	INIT_WORK(&ap->packet_task, atapi_packet_task, ap);
+	INIT_WORK(&ap->dataout_task, ata_dataout_task, ap);
 	INIT_WORK(&ap->pio_task, ata_pio_task, ap);
 
 	for (i = 0; i < ATA_MAX_DEVICES; i++)
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 1fcd0ef9e1c9..7e6feb97406e 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -129,8 +129,8 @@ enum {
 	ATA_TMOUT_PIO		= 30 * HZ,
 	ATA_TMOUT_BOOT		= 30 * HZ,	/* hueristic */
 	ATA_TMOUT_BOOT_QUICK	= 7 * HZ,	/* hueristic */
-	ATA_TMOUT_CDB		= 30 * HZ,
-	ATA_TMOUT_CDB_QUICK	= 5 * HZ,
+	ATA_TMOUT_DATAOUT	= 30 * HZ,
+	ATA_TMOUT_DATAOUT_QUICK	= 5 * HZ,
 
 	/* ATA bus states */
 	BUS_UNKNOWN		= 0,
@@ -319,7 +319,7 @@ struct ata_port {
 	struct ata_host_stats	stats;
 	struct ata_host_set	*host_set;
 
-	struct work_struct	packet_task;
+	struct work_struct	dataout_task;
 
 	struct work_struct	pio_task;
 	unsigned int		hsm_task_state;
-- 
cgit v1.2.3


From e27486db89ef04d5df1727c52362fa3d50cff241 Mon Sep 17 00:00:00 2001
From: Albert Lee <albertcc@tw.ibm.com>
Date: Tue, 1 Nov 2005 19:24:49 +0800
Subject: [PATCH] libata irq-pio: merge the ata_dataout_task workqueue with
 ata_pio_task workqueue

   - remove ap->dataout_task from struct ata_port
   - let ata_pio_task() handle the HSM_ST_FIRST state.
   - rename ata_dataout_task() to ata_pio_first_block()
   - replace the ata_dataout_task workqueue with ata_pio_task workqueue

Signed-off-by: Albert Lee <albertcc@tw.ibm.com>

========
Signed-off-by: Jeff Garzik <jgarzik@pobox.com>
---
 drivers/scsi/libata-core.c | 20 +++++++++++---------
 include/linux/libata.h     |  2 --
 2 files changed, 11 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c
index 15736e3313f0..96b8bbaa7631 100644
--- a/drivers/scsi/libata-core.c
+++ b/drivers/scsi/libata-core.c
@@ -3056,8 +3056,8 @@ static void atapi_send_cdb(struct ata_port *ap, struct ata_queued_cmd *qc)
 }
 
 /**
- *	ata_dataout_task - Write first data block to hardware
- *	@_data: Port to which ATA/ATAPI device is attached.
+ *	ata_pio_first_block - Write first data block to hardware
+ *	@ap: Port to which ATA/ATAPI device is attached.
  *
  *	When device has indicated its readiness to accept
  *	the data, this function sends out the CDB or 
@@ -3070,9 +3070,8 @@ static void atapi_send_cdb(struct ata_port *ap, struct ata_queued_cmd *qc)
  *	Kernel thread context (may sleep)
  */
 
-static void ata_dataout_task(void *_data)
+static void ata_pio_first_block(struct ata_port *ap)
 {
-	struct ata_port *ap = _data;
 	struct ata_queued_cmd *qc;
 	u8 status;
 	unsigned long flags;
@@ -3346,6 +3345,10 @@ fsm_start:
 	qc_completed = 0;
 
 	switch (ap->hsm_task_state) {
+	case HSM_ST_FIRST:
+		ata_pio_first_block(ap);
+		return;
+
 	case HSM_ST:
 		ata_pio_block(ap);
 		break;
@@ -3796,10 +3799,10 @@ int ata_qc_issue_prot(struct ata_queued_cmd *qc)
 		if (qc->tf.flags & ATA_TFLAG_WRITE) {
 			/* PIO data out protocol */
 			ap->hsm_task_state = HSM_ST_FIRST;
-			queue_work(ata_wq, &ap->dataout_task);
+			queue_work(ata_wq, &ap->pio_task);
 
 			/* always send first data block using
-			 * the ata_dataout_task() codepath.
+			 * the ata_pio_task() codepath.
 			 */
 		} else {
 			/* PIO data in protocol */
@@ -3826,7 +3829,7 @@ int ata_qc_issue_prot(struct ata_queued_cmd *qc)
 		/* send cdb by polling if no cdb interrupt */
 		if ((!(qc->dev->flags & ATA_DFLAG_CDB_INTR)) ||
 		    (qc->tf.flags & ATA_TFLAG_POLLING))
-			queue_work(ata_wq, &ap->dataout_task);
+			queue_work(ata_wq, &ap->pio_task);
 		break;
 
 	case ATA_PROT_ATAPI_DMA:
@@ -3838,7 +3841,7 @@ int ata_qc_issue_prot(struct ata_queued_cmd *qc)
 
 		/* send cdb by polling if no cdb interrupt */
 		if (!(qc->dev->flags & ATA_DFLAG_CDB_INTR))
-			queue_work(ata_wq, &ap->dataout_task);
+			queue_work(ata_wq, &ap->pio_task);
 		break;
 
 	default:
@@ -4426,7 +4429,6 @@ static void ata_host_init(struct ata_port *ap, struct Scsi_Host *host,
 	ap->active_tag = ATA_TAG_POISON;
 	ap->last_ctl = 0xFF;
 
-	INIT_WORK(&ap->dataout_task, ata_dataout_task, ap);
 	INIT_WORK(&ap->pio_task, ata_pio_task, ap);
 
 	for (i = 0; i < ATA_MAX_DEVICES; i++)
diff --git a/include/linux/libata.h b/include/linux/libata.h
index ad0451dfee15..70ae140dbf23 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -346,8 +346,6 @@ struct ata_port {
 	struct ata_host_stats	stats;
 	struct ata_host_set	*host_set;
 
-	struct work_struct	dataout_task;
-
 	struct work_struct	pio_task;
 	unsigned int		hsm_task_state;
 	unsigned long		pio_task_timeout;
-- 
cgit v1.2.3


From 07f6f7d074e68d56d82e7cc5c65096033ac8dc56 Mon Sep 17 00:00:00 2001
From: Albert Lee <albertcc@tw.ibm.com>
Date: Tue, 1 Nov 2005 19:33:20 +0800
Subject: [PATCH] libata irq-pio: add read/write multiple support

   - add is_multi_taskfile() to ata.h
   - initialize ata_device->multi_count with device identify data
   - use ata_pio_sectors() to support r/w multiple commands

Signed-off-by: Albert Lee <albertcc@tw.ibm.com>

========
Signed-off-by: Jeff Garzik <jgarzik@pobox.com>
---
 drivers/scsi/libata-core.c | 44 ++++++++++++++++++++++++++++++++++++++------
 include/linux/ata.h        |  8 ++++++++
 2 files changed, 46 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c
index 2e0e6cca327c..59a4a26bc13f 100644
--- a/drivers/scsi/libata-core.c
+++ b/drivers/scsi/libata-core.c
@@ -1261,6 +1261,12 @@ retry:
 
 		}
 
+		if (dev->id[59] & 0x100) {
+			dev->multi_count = dev->id[59] & 0xff;
+			DPRINTK("ata%u: dev %u multi count %u\n",
+				ap->id, device, dev->multi_count);
+		}
+
 		ap->host->max_cmd_len = 16;
 	}
 
@@ -2804,7 +2810,7 @@ static int ata_pio_complete (struct ata_port *ap)
 	 * we enter, BSY will be cleared in a chk-status or two.  If not,
 	 * the drive is probably seeking or something.  Snooze for a couple
 	 * msecs, then chk-status again.  If still busy, fall back to
-	 * HSM_ST_POLL state.
+	 * HSM_ST_LAST_POLL state.
 	 */
 	drv_stat = ata_busy_wait(ap, ATA_BUSY | ATA_DRQ, 10);
 	if (drv_stat & (ATA_BUSY | ATA_DRQ)) {
@@ -3020,6 +3026,32 @@ static void ata_pio_sector(struct ata_queued_cmd *qc)
 	}
 }
 
+/**
+ *	ata_pio_sectors - Transfer one or many 512-byte sectors.
+ *	@qc: Command on going
+ *
+ *	Transfer one or many ATA_SECT_SIZE of data from/to the 
+ *	ATA device for the DRQ request.
+ *
+ *	LOCKING:
+ *	Inherited from caller.
+ */
+
+static void ata_pio_sectors(struct ata_queued_cmd *qc)
+{
+	if (is_multi_taskfile(&qc->tf)) {
+		/* READ/WRITE MULTIPLE */
+		unsigned int nsect;
+
+		assert(qc->dev->multi_count);
+
+		nsect = min(qc->nsect - qc->cursect, qc->dev->multi_count);
+		while (nsect--)
+			ata_pio_sector(qc);
+	} else
+		ata_pio_sector(qc);
+}
+
 /**
  *	atapi_send_cdb - Write CDB bytes to hardware
  *	@ap: Port to which ATAPI device is attached.
@@ -3118,11 +3150,11 @@ static int ata_pio_first_block(struct ata_port *ap)
 		 * send first data block.
 		 */
 
-		/* ata_pio_sector() might change the state to HSM_ST_LAST.
-		 * so, the state is changed here before ata_pio_sector().
+		/* ata_pio_sectors() might change the state to HSM_ST_LAST.
+		 * so, the state is changed here before ata_pio_sectors().
 		 */
 		ap->hsm_task_state = HSM_ST;
-		ata_pio_sector(qc);
+		ata_pio_sectors(qc);
 		ata_altstatus(ap); /* flush */
 	} else
 		/* send CDB */
@@ -3327,7 +3359,7 @@ static void ata_pio_block(struct ata_port *ap)
 			return;
 		}
 
-		ata_pio_sector(qc);
+		ata_pio_sectors(qc);
 	}
 
 	ata_altstatus(ap); /* flush */
@@ -4213,7 +4245,7 @@ fsm_start:
 				goto fsm_start;
 			}
 
-			ata_pio_sector(qc);
+			ata_pio_sectors(qc);
 
 			if (ap->hsm_task_state == HSM_ST_LAST &&
 			    (!(qc->tf.flags & ATA_TFLAG_WRITE))) {
diff --git a/include/linux/ata.h b/include/linux/ata.h
index d54da3306d2c..f512104a1a3f 100644
--- a/include/linux/ata.h
+++ b/include/linux/ata.h
@@ -293,6 +293,14 @@ static inline int is_atapi_taskfile(const struct ata_taskfile *tf)
 	       (tf->protocol == ATA_PROT_ATAPI_DMA);
 }
 
+static inline int is_multi_taskfile(struct ata_taskfile *tf)
+{
+	return (tf->command == ATA_CMD_READ_MULTI) ||
+	       (tf->command == ATA_CMD_WRITE_MULTI) ||
+	       (tf->command == ATA_CMD_READ_MULTI_EXT) ||
+	       (tf->command == ATA_CMD_WRITE_MULTI_EXT);
+}
+
 static inline int ata_ok(u8 status)
 {
 	return ((status & (ATA_BUSY | ATA_DRDY | ATA_DF | ATA_DRQ | ATA_ERR))
-- 
cgit v1.2.3


From 3b2d99429e3386b6e2ac949fc72486509c8bbe36 Mon Sep 17 00:00:00 2001
From: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Date: Wed, 14 Dec 2005 15:05:00 -0500
Subject: P-state software coordination for ACPI core

http://bugzilla.kernel.org/show_bug.cgi?id=5737

Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 drivers/acpi/processor_perflib.c | 228 +++++++++++++++++++++++++++++++++++++++
 include/acpi/processor.h         |  27 ++++-
 include/linux/cpufreq.h          |   4 +
 3 files changed, 258 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/acpi/processor_perflib.c b/drivers/acpi/processor_perflib.c
index abbdb37a7f5f..ffc5280334c8 100644
--- a/drivers/acpi/processor_perflib.c
+++ b/drivers/acpi/processor_perflib.c
@@ -553,6 +553,234 @@ static void acpi_cpufreq_remove_file(struct acpi_processor *pr)
 }
 #endif				/* CONFIG_X86_ACPI_CPUFREQ_PROC_INTF */
 
+static int acpi_processor_get_psd(struct acpi_processor	*pr)
+{
+	int result = 0;
+	acpi_status status = AE_OK;
+	struct acpi_buffer buffer = {ACPI_ALLOCATE_BUFFER, NULL};
+	struct acpi_buffer format = {sizeof("NNNNN"), "NNNNN"};
+	struct acpi_buffer state = {0, NULL};
+	union acpi_object  *psd = NULL;
+	struct acpi_psd_package *pdomain;
+
+	ACPI_FUNCTION_TRACE("acpi_processor_get_psd");
+
+	status = acpi_evaluate_object(pr->handle, "_PSD", NULL, &buffer);
+	if (ACPI_FAILURE(status)) {
+		return_VALUE(-ENODEV);
+	}
+
+	psd = (union acpi_object *) buffer.pointer;
+	if (!psd || (psd->type != ACPI_TYPE_PACKAGE)) {
+		ACPI_DEBUG_PRINT((ACPI_DB_ERROR, "Invalid _PSD data\n"));
+		result = -EFAULT;
+		goto end;
+	}
+
+	if (psd->package.count != 1) {
+		ACPI_DEBUG_PRINT((ACPI_DB_ERROR, "Invalid _PSD data\n"));
+		result = -EFAULT;
+		goto end;
+	}
+
+	pdomain = &(pr->performance->domain_info);
+
+	state.length = sizeof(struct acpi_psd_package);
+	state.pointer = pdomain;
+
+	status = acpi_extract_package(&(psd->package.elements[0]),
+		&format, &state);
+	if (ACPI_FAILURE(status)) {
+		ACPI_DEBUG_PRINT((ACPI_DB_ERROR, "Invalid _PSD data\n"));
+		result = -EFAULT;
+		goto end;
+	}
+
+	if (pdomain->num_entries != ACPI_PSD_REV0_ENTRIES) {
+		ACPI_DEBUG_PRINT((ACPI_DB_ERROR, "Unknown _PSD:num_entries\n"));
+		result = -EFAULT;
+		goto end;
+	}
+
+	if (pdomain->revision != ACPI_PSD_REV0_REVISION) {
+		ACPI_DEBUG_PRINT((ACPI_DB_ERROR, "Unknown _PSD:revision\n"));
+		result = -EFAULT;
+		goto end;
+	}
+
+end:
+	acpi_os_free(buffer.pointer);
+	return_VALUE(result);
+}
+
+int acpi_processor_preregister_performance(
+		struct acpi_processor_performance **performance)
+{
+	int count, count_target;
+	int retval = 0;
+	unsigned int i, j;
+	cpumask_t covered_cpus;
+	struct acpi_processor *pr;
+	struct acpi_psd_package *pdomain;
+	struct acpi_processor *match_pr;
+	struct acpi_psd_package *match_pdomain;
+
+	ACPI_FUNCTION_TRACE("acpi_processor_preregister_performance");
+
+	down(&performance_sem);
+
+	retval = 0;
+
+	/* Call _PSD for all CPUs */
+	for_each_cpu(i) {
+		pr = processors[i];
+		if (!pr) {
+			/* Look only at processors in ACPI namespace */
+			continue;
+		}
+
+		if (pr->performance) {
+			retval = -EBUSY;
+			continue;
+		}
+
+		if (!performance || !performance[i]) {
+			retval = -EINVAL;
+			continue;
+		}
+
+		pr->performance = performance[i];
+		cpu_set(i, pr->performance->shared_cpu_map);
+		if (acpi_processor_get_psd(pr)) {
+			retval = -EINVAL;
+			continue;
+		}
+	}
+	if (retval)
+		goto err_ret;
+
+	/*
+	 * Now that we have _PSD data from all CPUs, lets setup P-state 
+	 * domain info.
+	 */
+	for_each_cpu(i) {
+		pr = processors[i];
+		if (!pr)
+			continue;
+
+		/* Basic validity check for domain info */
+		pdomain = &(pr->performance->domain_info);
+		if ((pdomain->revision != ACPI_PSD_REV0_REVISION) ||
+		    (pdomain->num_entries != ACPI_PSD_REV0_ENTRIES)) {
+			retval = -EINVAL;
+			goto err_ret;
+		}
+		if (pdomain->coord_type != DOMAIN_COORD_TYPE_SW_ALL &&
+		    pdomain->coord_type != DOMAIN_COORD_TYPE_SW_ANY &&
+		    pdomain->coord_type != DOMAIN_COORD_TYPE_HW_ALL) {
+			retval = -EINVAL;
+			goto err_ret;
+		}
+	}
+
+	cpus_clear(covered_cpus);
+	for_each_cpu(i) {
+		pr = processors[i];
+		if (!pr)
+			continue;
+
+		if (cpu_isset(i, covered_cpus))
+			continue;
+
+		pdomain = &(pr->performance->domain_info);
+		cpu_set(i, pr->performance->shared_cpu_map);
+		cpu_set(i, covered_cpus);
+		if (pdomain->num_processors <= 1)
+			continue;
+
+		/* Validate the Domain info */
+		count_target = pdomain->num_processors;
+		count = 1;
+		if (pdomain->coord_type == DOMAIN_COORD_TYPE_SW_ALL ||
+		    pdomain->coord_type == DOMAIN_COORD_TYPE_HW_ALL) {
+			pr->performance->shared_type = CPUFREQ_SHARED_TYPE_ALL;
+		} else if (pdomain->coord_type == DOMAIN_COORD_TYPE_SW_ANY) {
+			pr->performance->shared_type = CPUFREQ_SHARED_TYPE_ANY;
+		}
+
+		for_each_cpu(j) {
+			if (i == j)
+				continue;
+
+			match_pr = processors[j];
+			if (!match_pr)
+				continue;
+
+			match_pdomain = &(match_pr->performance->domain_info);
+			if (match_pdomain->domain != pdomain->domain)
+				continue;
+
+			/* Here i and j are in the same domain */
+
+			if (match_pdomain->num_processors != count_target) {
+				retval = -EINVAL;
+				goto err_ret;
+			}
+
+			if (pdomain->coord_type != match_pdomain->coord_type) {
+				retval = -EINVAL;
+				goto err_ret;
+			}
+
+			cpu_set(j, covered_cpus);
+			cpu_set(j, pr->performance->shared_cpu_map);
+			count++;
+		}
+
+		for_each_cpu(j) {
+			if (i == j)
+				continue;
+
+			match_pr = processors[j];
+			if (!match_pr)
+				continue;
+
+			match_pdomain = &(match_pr->performance->domain_info);
+			if (match_pdomain->domain != pdomain->domain)
+				continue;
+
+			match_pr->performance->shared_type = 
+					pr->performance->shared_type;
+			match_pr->performance->shared_cpu_map =
+				pr->performance->shared_cpu_map;
+		}
+	}
+
+err_ret:
+	if (retval) {
+		ACPI_DEBUG_PRINT((ACPI_DB_ERROR, "Error while parsing _PSD domain information. Assuming no coordination\n"));
+	}
+
+	for_each_cpu(i) {
+		pr = processors[i];
+		if (!pr || !pr->performance)
+			continue;
+
+		/* Assume no coordination on any error parsing domain info */
+		if (retval) {
+			cpus_clear(pr->performance->shared_cpu_map);
+			cpu_set(i, pr->performance->shared_cpu_map);
+			pr->performance->shared_type = CPUFREQ_SHARED_TYPE_ALL;
+		}
+		pr->performance = NULL; /* Will be set for real in register */
+	}
+
+	up(&performance_sem);
+	return_VALUE(retval);
+}
+EXPORT_SYMBOL(acpi_processor_preregister_performance);
+
+
 int
 acpi_processor_register_performance(struct acpi_processor_performance
 				    *performance, unsigned int cpu)
diff --git a/include/acpi/processor.h b/include/acpi/processor.h
index badf0277b1be..0c46d1b3dda2 100644
--- a/include/acpi/processor.h
+++ b/include/acpi/processor.h
@@ -3,6 +3,7 @@
 
 #include <linux/kernel.h>
 #include <linux/config.h>
+#include <linux/cpu.h>
 
 #include <asm/acpi.h>
 
@@ -18,6 +19,17 @@
 
 #define ACPI_PDC_REVISION_ID		0x1
 
+#define ACPI_PSD_REV0_REVISION		0 /* Support for _PSD as in ACPI 3.0 */
+#define ACPI_PSD_REV0_ENTRIES		5
+
+/*
+ * Types of coordination defined in ACPI 3.0. Same macros can be used across
+ * P, C and T states
+ */
+#define DOMAIN_COORD_TYPE_SW_ALL	0xfc
+#define DOMAIN_COORD_TYPE_SW_ANY	0xfd
+#define DOMAIN_COORD_TYPE_HW_ALL	0xfe
+
 /* Power Management */
 
 struct acpi_processor_cx;
@@ -66,6 +78,14 @@ struct acpi_processor_power {
 
 /* Performance Management */
 
+struct acpi_psd_package {
+	acpi_integer num_entries;
+	acpi_integer revision;
+	acpi_integer domain;
+	acpi_integer coord_type;
+	acpi_integer num_processors;
+} __attribute__ ((packed));
+
 struct acpi_pct_register {
 	u8 descriptor;
 	u16 length;
@@ -92,7 +112,9 @@ struct acpi_processor_performance {
 	struct acpi_pct_register status_register;
 	unsigned int state_count;
 	struct acpi_processor_px *states;
-
+	struct acpi_psd_package domain_info;
+	cpumask_t shared_cpu_map;
+	unsigned int shared_type;
 };
 
 /* Throttling Control */
@@ -161,6 +183,9 @@ struct acpi_processor_errata {
 	} piix4;
 };
 
+extern int acpi_processor_preregister_performance(
+		struct acpi_processor_performance **performance);
+
 extern int acpi_processor_register_performance(struct acpi_processor_performance
 					       *performance, unsigned int cpu);
 extern void acpi_processor_unregister_performance(struct
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index 17866d7e2b71..f7d988366941 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -73,6 +73,8 @@ struct cpufreq_real_policy {
 
 struct cpufreq_policy {
 	cpumask_t		cpus;	/* affected CPUs */
+	unsigned int		shared_type; /* ANY or ALL affected CPUs
+						should set cpufreq */
 	unsigned int		cpu;    /* cpu nr of registered CPU */
 	struct cpufreq_cpuinfo	cpuinfo;/* see above */
 
@@ -99,6 +101,8 @@ struct cpufreq_policy {
 #define CPUFREQ_INCOMPATIBLE	(1)
 #define CPUFREQ_NOTIFY		(2)
 
+#define CPUFREQ_SHARED_TYPE_ALL	(0) /* All dependent CPUs should set freq */
+#define CPUFREQ_SHARED_TYPE_ANY	(1) /* Freq can be set from any dependent CPU */
 
 /******************** cpufreq transition notifiers *******************/
 
-- 
cgit v1.2.3


From c2956a3b0d1c17b38da369811a6ce93eb7a01a04 Mon Sep 17 00:00:00 2001
From: Albert Lee <albertcc@tw.ibm.com>
Date: Fri, 3 Mar 2006 10:34:05 +0800
Subject: [PATCH] libata-dev: recognize WRITE_MULTI_FUA_EXT for r/w multiple

Recognize ATA_CMD_WRITE_MULTI_FUA_EXT as r/w multiple commands.

Signed-off-by: Albert Lee <albertcc@tw.ibm.com>
Signed-off-by: Jeff Garzik <jeff@garzik.org>
---
 include/linux/ata.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/ata.h b/include/linux/ata.h
index 469952366ed4..e7b0c21f6cd4 100644
--- a/include/linux/ata.h
+++ b/include/linux/ata.h
@@ -314,7 +314,8 @@ static inline int is_multi_taskfile(struct ata_taskfile *tf)
 	return (tf->command == ATA_CMD_READ_MULTI) ||
 	       (tf->command == ATA_CMD_WRITE_MULTI) ||
 	       (tf->command == ATA_CMD_READ_MULTI_EXT) ||
-	       (tf->command == ATA_CMD_WRITE_MULTI_EXT);
+	       (tf->command == ATA_CMD_WRITE_MULTI_EXT) ||
+	       (tf->command == ATA_CMD_WRITE_MULTI_FUA_EXT);
 }
 
 static inline int ata_ok(u8 status)
-- 
cgit v1.2.3


From 200d5a7684cc49ef4be40e832daf3f217e70dfbb Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Wed, 15 Feb 2006 16:24:49 +0900
Subject: [PATCH] libata: increase LBA48 max sectors to 65535

max_hw_sectors/max_sectors separation patch made into the tree,
increase max_sectors to its hardware limit.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Signed-off-by: Jeff Garzik <jgarzik@pobox.com>
---
 drivers/scsi/libata-scsi.c | 2 +-
 include/linux/libata.h     | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/scsi/libata-scsi.c b/drivers/scsi/libata-scsi.c
index dd954a23b357..b53ef494a206 100644
--- a/drivers/scsi/libata-scsi.c
+++ b/drivers/scsi/libata-scsi.c
@@ -678,7 +678,7 @@ static void ata_scsi_dev_config(struct scsi_device *sdev,
 	 */
 	max_sectors = ATA_MAX_SECTORS;
 	if (dev->flags & ATA_DFLAG_LBA48)
-		max_sectors = 2048;
+		max_sectors = ATA_MAX_SECTORS_LBA48;
 	if (dev->max_sectors)
 		max_sectors = dev->max_sectors;
 
diff --git a/include/linux/libata.h b/include/linux/libata.h
index d81cecdda4f3..4dff3cf9d389 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -110,6 +110,7 @@ enum {
 	ATA_DEF_QUEUE		= 1,
 	ATA_MAX_QUEUE		= 1,
 	ATA_MAX_SECTORS		= 200,	/* FIXME */
+	ATA_MAX_SECTORS_LBA48	= 65535,
 	ATA_MAX_BUS		= 2,
 	ATA_DEF_BUSY_WAIT	= 10000,
 	ATA_SHORT_PAUSE		= (HZ >> 6) + 1,
-- 
cgit v1.2.3


From 27cdadef6dfe0d0614653919a110fc75ab1650ce Mon Sep 17 00:00:00 2001
From: Albert Lee <albertcc@tw.ibm.com>
Date: Sat, 25 Mar 2006 17:53:57 +0800
Subject: [PATCH] libata-dev: Cleanup unused enums/functions

Cleanup the following unused functions:
- ata_pio_poll()
- ata_pio_complete()
- ata_pio_first_block()
- ata_pio_block()
- ata_pio_error()
ap->pio_task_timeout and other enums.

Signed-off-by: Albert Lee <albertcc@tw.ibm.com>
Signed-off-by: Jeff Garzik <jeff@garzik.org>
---
 drivers/scsi/libata-core.c | 279 ---------------------------------------------
 include/linux/libata.h     |   9 --
 2 files changed, 288 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c
index eeeeda0481a2..ef0d0dd90e17 100644
--- a/drivers/scsi/libata-core.c
+++ b/drivers/scsi/libata-core.c
@@ -67,7 +67,6 @@ static void ata_set_mode(struct ata_port *ap);
 static unsigned int ata_dev_set_xfermode(struct ata_port *ap,
 					 struct ata_device *dev);
 static void ata_dev_xfermask(struct ata_port *ap, struct ata_device *dev);
-static void ata_pio_error(struct ata_port *ap);
 
 static unsigned int ata_unique_id = 1;
 static struct workqueue_struct *ata_wq;
@@ -3131,114 +3130,6 @@ void ata_poll_qc_complete(struct ata_queued_cmd *qc)
 	spin_unlock_irqrestore(&ap->host_set->lock, flags);
 }
 
-/**
- *	ata_pio_poll - poll using PIO, depending on current state
- *	@ap: the target ata_port
- *
- *	LOCKING:
- *	None.  (executing in kernel thread context)
- *
- *	RETURNS:
- *	timeout value to use
- */
-
-static unsigned long ata_pio_poll(struct ata_port *ap)
-{
-	struct ata_queued_cmd *qc;
-	u8 status;
-	unsigned int poll_state = HSM_ST_UNKNOWN;
-	unsigned int reg_state = HSM_ST_UNKNOWN;
-
-	qc = ata_qc_from_tag(ap, ap->active_tag);
-	WARN_ON(qc == NULL);
-
-	switch (ap->hsm_task_state) {
-	case HSM_ST:
-	case HSM_ST_POLL:
-		poll_state = HSM_ST_POLL;
-		reg_state = HSM_ST;
-		break;
-	case HSM_ST_LAST:
-	case HSM_ST_LAST_POLL:
-		poll_state = HSM_ST_LAST_POLL;
-		reg_state = HSM_ST_LAST;
-		break;
-	default:
-		BUG();
-		break;
-	}
-
-	status = ata_chk_status(ap);
-	if (status & ATA_BUSY) {
-		if (time_after(jiffies, ap->pio_task_timeout)) {
-			qc->err_mask |= AC_ERR_TIMEOUT;
-			ap->hsm_task_state = HSM_ST_TMOUT;
-			return 0;
-		}
-		ap->hsm_task_state = poll_state;
-		return ATA_SHORT_PAUSE;
-	}
-
-	ap->hsm_task_state = reg_state;
-	return 0;
-}
-
-/**
- *	ata_pio_complete - check if drive is busy or idle
- *	@ap: the target ata_port
- *
- *	LOCKING:
- *	None.  (executing in kernel thread context)
- *
- *	RETURNS:
- *	Zero if qc completed.
- *	Non-zero if has next.
- */
-
-static int ata_pio_complete (struct ata_port *ap)
-{
-	struct ata_queued_cmd *qc;
-	u8 drv_stat;
-
-	/*
-	 * This is purely heuristic.  This is a fast path.  Sometimes when
-	 * we enter, BSY will be cleared in a chk-status or two.  If not,
-	 * the drive is probably seeking or something.  Snooze for a couple
-	 * msecs, then chk-status again.  If still busy, fall back to
-	 * HSM_ST_LAST_POLL state.
-	 */
-	drv_stat = ata_busy_wait(ap, ATA_BUSY, 10);
-	if (drv_stat & ATA_BUSY) {
-		msleep(2);
-		drv_stat = ata_busy_wait(ap, ATA_BUSY, 10);
-		if (drv_stat & ATA_BUSY) {
-			ap->hsm_task_state = HSM_ST_LAST_POLL;
-			ap->pio_task_timeout = jiffies + ATA_TMOUT_PIO;
-			return 1;
-		}
-	}
-
-	qc = ata_qc_from_tag(ap, ap->active_tag);
-	WARN_ON(qc == NULL);
-
-	drv_stat = ata_wait_idle(ap);
-	if (!ata_ok(drv_stat)) {
-		qc->err_mask |= __ac_err_mask(drv_stat);
-		ap->hsm_task_state = HSM_ST_ERR;
-		return 1;
-	}
-
-	ap->hsm_task_state = HSM_ST_IDLE;
-
-	WARN_ON(qc->err_mask);
-	ata_poll_qc_complete(qc);
-
-	/* another command may start at this point */
-
-	return 0;
-}
-
-
 /**
  *	swap_buf_le16 - swap halves of 16-bit words in place
  *	@buf:  Buffer to swap
@@ -3496,91 +3387,6 @@ static void atapi_send_cdb(struct ata_port *ap, struct ata_queued_cmd *qc)
 	}
 }
 
-/**
- *	ata_pio_first_block - Write first data block to hardware
- *	@ap: Port to which ATA/ATAPI device is attached.
- *
- *	When device has indicated its readiness to accept
- *	the data, this function sends out the CDB or 
- *	the first data block by PIO.
- *	After this, 
- *	  - If polling, ata_pio_task() handles the rest.
- *	  - Otherwise, interrupt handler takes over.
- *
- *	LOCKING:
- *	Kernel thread context (may sleep)
- *
- *	RETURNS:
- *	Zero if irq handler takes over
- *	Non-zero if has next (polling).
- */
-
-static int ata_pio_first_block(struct ata_port *ap)
-{
-	struct ata_queued_cmd *qc;
-	u8 status;
-	unsigned long flags;
-	int has_next;
-
-	qc = ata_qc_from_tag(ap, ap->active_tag);
-	WARN_ON(qc == NULL);
-	WARN_ON((qc->flags & ATA_QCFLAG_ACTIVE) == 0);
-
-	/* if polling, we will stay in the work queue after sending the data.
-	 * otherwise, interrupt handler takes over after sending the data.
-	 */
-	has_next = (qc->tf.flags & ATA_TFLAG_POLLING);
-
-	/* sleep-wait for BSY to clear */
-	DPRINTK("busy wait\n");
-	if (ata_busy_sleep(ap, ATA_TMOUT_DATAOUT_QUICK, ATA_TMOUT_DATAOUT)) {
-		qc->err_mask |= AC_ERR_TIMEOUT;
-		ap->hsm_task_state = HSM_ST_TMOUT;
-		goto err_out;
-	}
-
-	/* make sure DRQ is set */
-	status = ata_chk_status(ap);
-	if ((status & (ATA_BUSY | ATA_DRQ)) != ATA_DRQ) {
-		/* device status error */
-		qc->err_mask |= AC_ERR_HSM;
-		ap->hsm_task_state = HSM_ST_ERR;
-		goto err_out;
-	}
-
-	/* Send the CDB (atapi) or the first data block (ata pio out).
-	 * During the state transition, interrupt handler shouldn't
-	 * be invoked before the data transfer is complete and
-	 * hsm_task_state is changed. Hence, the following locking.
-	 */
-	spin_lock_irqsave(&ap->host_set->lock, flags);
-
-	if (qc->tf.protocol == ATA_PROT_PIO) {
-		/* PIO data out protocol.
-		 * send first data block.
-		 */
-
-		/* ata_pio_sectors() might change the state to HSM_ST_LAST.
-		 * so, the state is changed here before ata_pio_sectors().
-		 */
-		ap->hsm_task_state = HSM_ST;
-		ata_pio_sectors(qc);
-		ata_altstatus(ap); /* flush */
-	} else
-		/* send CDB */
-		atapi_send_cdb(ap, qc);
-
-	spin_unlock_irqrestore(&ap->host_set->lock, flags);
-
-	/* if polling, ata_pio_task() handles the rest.
-	 * otherwise, interrupt handler takes over from here.
-	 */
-	return has_next;
-
-err_out:
-	return 1; /* has next */
-}
-
 /**
  *	__atapi_pio_bytes - Transfer data from/to the ATAPI device.
  *	@qc: Command on going
@@ -3720,91 +3526,6 @@ err_out:
 	ap->hsm_task_state = HSM_ST_ERR;
 }
 
-/**
- *	ata_pio_block - start PIO on a block
- *	@ap: the target ata_port
- *
- *	LOCKING:
- *	None.  (executing in kernel thread context)
- */
-
-static void ata_pio_block(struct ata_port *ap)
-{
-	struct ata_queued_cmd *qc;
-	u8 status;
-
-	/*
-	 * This is purely heuristic.  This is a fast path.
-	 * Sometimes when we enter, BSY will be cleared in
-	 * a chk-status or two.  If not, the drive is probably seeking
-	 * or something.  Snooze for a couple msecs, then
-	 * chk-status again.  If still busy, fall back to
-	 * HSM_ST_POLL state.
-	 */
-	status = ata_busy_wait(ap, ATA_BUSY, 5);
-	if (status & ATA_BUSY) {
-		msleep(2);
-		status = ata_busy_wait(ap, ATA_BUSY, 10);
-		if (status & ATA_BUSY) {
-			ap->hsm_task_state = HSM_ST_POLL;
-			ap->pio_task_timeout = jiffies + ATA_TMOUT_PIO;
-			return;
-		}
-	}
-
-	qc = ata_qc_from_tag(ap, ap->active_tag);
-	WARN_ON(qc == NULL);
-
-	/* check error */
-	if (status & (ATA_ERR | ATA_DF)) {
-		qc->err_mask |= AC_ERR_DEV;
-		ap->hsm_task_state = HSM_ST_ERR;
-		return;
-	}
-
-	/* transfer data if any */
-	if (is_atapi_taskfile(&qc->tf)) {
-		/* DRQ=0 means no more data to transfer */
-		if ((status & ATA_DRQ) == 0) {
-			ap->hsm_task_state = HSM_ST_LAST;
-			return;
-		}
-
-		atapi_pio_bytes(qc);
-	} else {
-		/* handle BSY=0, DRQ=0 as error */
-		if ((status & ATA_DRQ) == 0) {
-			qc->err_mask |= AC_ERR_HSM;
-			ap->hsm_task_state = HSM_ST_ERR;
-			return;
-		}
-
-		ata_pio_sectors(qc);
-	}
-
-	ata_altstatus(ap); /* flush */
-}
-
-static void ata_pio_error(struct ata_port *ap)
-{
-	struct ata_queued_cmd *qc;
-
-	qc = ata_qc_from_tag(ap, ap->active_tag);
-	WARN_ON(qc == NULL);
-
-	if (qc->tf.command != ATA_CMD_PACKET)
-		printk(KERN_WARNING "ata%u: PIO error\n", ap->id);
-
-	/* make sure qc->err_mask is available to
-	 * know what's wrong and recover
-	 */
-	WARN_ON(qc->err_mask == 0);
-
-	ap->hsm_task_state = HSM_ST_IDLE;
-
-	ata_poll_qc_complete(qc);
-}
-
 /**
  *	ata_hsm_move - move the HSM to the next state.
  *	@ap: the target ata_port
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 70ca99bbc6c7..0eb71c1773a1 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -162,13 +162,8 @@ enum {
 	ATA_QCFLAG_EH_SCHEDULED = (1 << 5), /* EH scheduled */
 
 	/* various lengths of time */
-	ATA_TMOUT_PIO		= 30 * HZ,
 	ATA_TMOUT_BOOT		= 30 * HZ,	/* heuristic */
 	ATA_TMOUT_BOOT_QUICK	= 7 * HZ,	/* heuristic */
-	ATA_TMOUT_DATAOUT	= 30 * HZ,
-	ATA_TMOUT_DATAOUT_QUICK	= 5 * HZ,
-	ATA_TMOUT_CDB		= 30 * HZ,
-	ATA_TMOUT_CDB_QUICK	= 5 * HZ,
 	ATA_TMOUT_INTERNAL	= 30 * HZ,
 	ATA_TMOUT_INTERNAL_QUICK = 5 * HZ,
 
@@ -216,11 +211,8 @@ enum {
 enum hsm_task_states {
 	HSM_ST_UNKNOWN,		/* state unknown */
 	HSM_ST_IDLE,		/* no command on going */
-	HSM_ST_POLL,		/* same as HSM_ST, waits longer */
-	HSM_ST_TMOUT,		/* timeout */
 	HSM_ST,			/* (waiting the device to) transfer data */
 	HSM_ST_LAST,		/* (waiting the device to) complete command */
-	HSM_ST_LAST_POLL,	/* same as HSM_ST_LAST, waits longer */
 	HSM_ST_ERR,		/* error */
 	HSM_ST_FIRST,		/* (waiting the device to)
 				   write CDB or first data block */
@@ -409,7 +401,6 @@ struct ata_port {
 	struct work_struct	port_task;
 
 	unsigned int		hsm_task_state;
-	unsigned long		pio_task_timeout;
 
 	u32			msg_enable;
 	struct list_head	eh_done_q;
-- 
cgit v1.2.3


From e1211e3fa7fd05ff0d4f597fd37e40de8acc6784 Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Sat, 1 Apr 2006 01:38:18 +0900
Subject: [PATCH] libata: implement ata_dev_enabled and disabled()

This patch renames ata_dev_present() to ata_dev_enabled() and adds
ata_dev_disabled().  This is to discern the state where a device is
present but disabled from not-present state.  This disctinction is
necessary when configuring transfer mode because device selection
timing must not be violated even if a device fails to configure.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Signed-off-by: Jeff Garzik <jeff@garzik.org>
---
 drivers/scsi/libata-core.c | 24 ++++++++++++------------
 drivers/scsi/libata-scsi.c |  4 ++--
 drivers/scsi/sata_mv.c     |  2 +-
 drivers/scsi/sata_sil.c    |  2 +-
 include/linux/libata.h     | 16 +++++++++++++---
 5 files changed, 29 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c
index 559abe4ea4e9..c10c550da38b 100644
--- a/drivers/scsi/libata-core.c
+++ b/drivers/scsi/libata-core.c
@@ -411,7 +411,7 @@ static const char *sata_spd_string(unsigned int spd)
 
 static void ata_dev_disable(struct ata_port *ap, struct ata_device *dev)
 {
-	if (ata_dev_present(dev)) {
+	if (ata_dev_enabled(dev)) {
 		printk(KERN_WARNING "ata%u: dev %u disabled\n",
 		       ap->id, dev->devno);
 		dev->class++;
@@ -1222,7 +1222,7 @@ static int ata_dev_configure(struct ata_port *ap, struct ata_device *dev,
 	unsigned int xfer_mask;
 	int i, rc;
 
-	if (!ata_dev_present(dev)) {
+	if (!ata_dev_enabled(dev)) {
 		DPRINTK("ENTER/EXIT (host %u, dev %u) -- nodev\n",
 			ap->id, dev->devno);
 		return 0;
@@ -1401,7 +1401,7 @@ static int ata_bus_probe(struct ata_port *ap)
 
 		dev->class = classes[i];
 
-		if (!ata_dev_present(dev))
+		if (!ata_dev_enabled(dev))
 			continue;
 
 		WARN_ON(dev->id != NULL);
@@ -1565,7 +1565,7 @@ void sata_phy_reset(struct ata_port *ap)
 struct ata_device *ata_dev_pair(struct ata_port *ap, struct ata_device *adev)
 {
 	struct ata_device *pair = &ap->device[1 - adev->devno];
-	if (!ata_dev_present(pair))
+	if (!ata_dev_enabled(pair))
 		return NULL;
 	return pair;
 }
@@ -1778,7 +1778,7 @@ static int ata_host_set_pio(struct ata_port *ap)
 	for (i = 0; i < ATA_MAX_DEVICES; i++) {
 		struct ata_device *dev = &ap->device[i];
 
-		if (!ata_dev_present(dev))
+		if (!ata_dev_enabled(dev))
 			continue;
 
 		if (!dev->pio_mode) {
@@ -1802,7 +1802,7 @@ static void ata_host_set_dma(struct ata_port *ap)
 	for (i = 0; i < ATA_MAX_DEVICES; i++) {
 		struct ata_device *dev = &ap->device[i];
 
-		if (!ata_dev_present(dev) || !dev->dma_mode)
+		if (!ata_dev_enabled(dev) || !dev->dma_mode)
 			continue;
 
 		dev->xfer_mode = dev->dma_mode;
@@ -1830,7 +1830,7 @@ static void ata_set_mode(struct ata_port *ap)
 		struct ata_device *dev = &ap->device[i];
 		unsigned int pio_mask, dma_mask;
 
-		if (!ata_dev_present(dev))
+		if (!ata_dev_enabled(dev))
 			continue;
 
 		ata_dev_xfermask(ap, dev);
@@ -1858,7 +1858,7 @@ static void ata_set_mode(struct ata_port *ap)
 	for (i = 0; i < ATA_MAX_DEVICES; i++) {
 		struct ata_device *dev = &ap->device[i];
 
-		if (!ata_dev_present(dev))
+		if (!ata_dev_enabled(dev))
 			continue;
 
 		rc = ata_dev_set_mode(ap, dev);
@@ -2550,7 +2550,7 @@ int ata_dev_revalidate(struct ata_port *ap, struct ata_device *dev,
 	u16 *id;
 	int rc;
 
-	if (!ata_dev_present(dev))
+	if (!ata_dev_enabled(dev))
 		return -ENODEV;
 
 	class = dev->class;
@@ -2679,7 +2679,7 @@ static void ata_dev_xfermask(struct ata_port *ap, struct ata_device *dev)
 	/* FIXME: Use port-wide xfermask for now */
 	for (i = 0; i < ATA_MAX_DEVICES; i++) {
 		struct ata_device *d = &ap->device[i];
-		if (!ata_dev_present(d))
+		if (!ata_dev_enabled(d))
 			continue;
 		xfer_mask &= ata_pack_xfermask(d->pio_mask, d->mwdma_mask,
 					       d->udma_mask);
@@ -4299,7 +4299,7 @@ int ata_device_resume(struct ata_port *ap, struct ata_device *dev)
 		ap->flags &= ~ATA_FLAG_SUSPENDED;
 		ata_set_mode(ap);
 	}
-	if (!ata_dev_present(dev))
+	if (!ata_dev_enabled(dev))
 		return 0;
 	if (dev->class == ATA_DEV_ATA)
 		ata_start_drive(ap, dev);
@@ -4317,7 +4317,7 @@ int ata_device_resume(struct ata_port *ap, struct ata_device *dev)
  */
 int ata_device_suspend(struct ata_port *ap, struct ata_device *dev, pm_message_t state)
 {
-	if (!ata_dev_present(dev))
+	if (!ata_dev_enabled(dev))
 		return 0;
 	if (dev->class == ATA_DEV_ATA)
 		ata_flush_cache(ap, dev);
diff --git a/drivers/scsi/libata-scsi.c b/drivers/scsi/libata-scsi.c
index 53f5b0d9161c..c1a4b29a9ae1 100644
--- a/drivers/scsi/libata-scsi.c
+++ b/drivers/scsi/libata-scsi.c
@@ -2349,7 +2349,7 @@ ata_scsi_find_dev(struct ata_port *ap, const struct scsi_device *scsidev)
 		     (scsidev->lun != 0)))
 		return NULL;
 
-	if (unlikely(!ata_dev_present(dev)))
+	if (unlikely(!ata_dev_enabled(dev)))
 		return NULL;
 
 	if (!atapi_enabled || (ap->flags & ATA_FLAG_NO_ATAPI)) {
@@ -2743,7 +2743,7 @@ void ata_scsi_scan_host(struct ata_port *ap)
 	for (i = 0; i < ATA_MAX_DEVICES; i++) {
 		dev = &ap->device[i];
 
-		if (ata_dev_present(dev))
+		if (ata_dev_enabled(dev))
 			scsi_scan_target(&ap->host->shost_gendev, 0, i, 0, 0);
 	}
 }
diff --git a/drivers/scsi/sata_mv.c b/drivers/scsi/sata_mv.c
index fa901fd65085..0f7d334aadcc 100644
--- a/drivers/scsi/sata_mv.c
+++ b/drivers/scsi/sata_mv.c
@@ -1991,7 +1991,7 @@ comreset_retry:
 	tf.nsect = readb((void __iomem *) ap->ioaddr.nsect_addr);
 
 	dev->class = ata_dev_classify(&tf);
-	if (!ata_dev_present(dev)) {
+	if (!ata_dev_enabled(dev)) {
 		VPRINTK("Port disabled post-sig: No device present.\n");
 		ata_port_disable(ap);
 	}
diff --git a/drivers/scsi/sata_sil.c b/drivers/scsi/sata_sil.c
index 18c296c56899..d6c7086a5379 100644
--- a/drivers/scsi/sata_sil.c
+++ b/drivers/scsi/sata_sil.c
@@ -264,7 +264,7 @@ static void sil_post_set_mode (struct ata_port *ap)
 
 	for (i = 0; i < 2; i++) {
 		dev = &ap->device[i];
-		if (!ata_dev_present(dev))
+		if (!ata_dev_enabled(dev))
 			dev_mode[i] = 0;	/* PIO0/1/2 */
 		else if (dev->flags & ATA_DFLAG_PIO)
 			dev_mode[i] = 1;	/* PIO3/4 */
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 0d61357604d5..c6883ba8cba9 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -672,14 +672,24 @@ static inline unsigned int ata_tag_valid(unsigned int tag)
 	return (tag < ATA_MAX_QUEUE) ? 1 : 0;
 }
 
-static inline unsigned int ata_class_present(unsigned int class)
+static inline unsigned int ata_class_enabled(unsigned int class)
 {
 	return class == ATA_DEV_ATA || class == ATA_DEV_ATAPI;
 }
 
-static inline unsigned int ata_dev_present(const struct ata_device *dev)
+static inline unsigned int ata_class_disabled(unsigned int class)
 {
-	return ata_class_present(dev->class);
+	return class == ATA_DEV_ATA_UNSUP || class == ATA_DEV_ATAPI_UNSUP;
+}
+
+static inline unsigned int ata_dev_enabled(const struct ata_device *dev)
+{
+	return ata_class_enabled(dev->class);
+}
+
+static inline unsigned int ata_dev_disabled(const struct ata_device *dev)
+{
+	return ata_class_disabled(dev->class);
 }
 
 static inline u8 ata_chk_status(struct ata_port *ap)
-- 
cgit v1.2.3


From 002c8054fa8d0f1afce2b0c728be32d338b9293a Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Sun, 2 Apr 2006 17:54:46 +0900
Subject: [PATCH] libata: implement ata_dev_absent()

For the time being we cannot use ata_dev_present() as it was renamed
to ata_dev_enabled() but we still need presence test.  Implement
negation of the test.  Conveniently, the negated result is needed in
more places.  This is suggested by Jeff Garzik.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Signed-off-by: Jeff Garzik <jeff@garzik.org>
---
 include/linux/libata.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/libata.h b/include/linux/libata.h
index c6883ba8cba9..0f8e3720edd9 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -682,6 +682,11 @@ static inline unsigned int ata_class_disabled(unsigned int class)
 	return class == ATA_DEV_ATA_UNSUP || class == ATA_DEV_ATAPI_UNSUP;
 }
 
+static inline unsigned int ata_class_absent(unsigned int class)
+{
+	return !ata_class_enabled(class) && !ata_class_disabled(class);
+}
+
 static inline unsigned int ata_dev_enabled(const struct ata_device *dev)
 {
 	return ata_class_enabled(dev->class);
@@ -692,6 +697,11 @@ static inline unsigned int ata_dev_disabled(const struct ata_device *dev)
 	return ata_class_disabled(dev->class);
 }
 
+static inline unsigned int ata_dev_absent(const struct ata_device *dev)
+{
+	return ata_class_absent(dev->class);
+}
+
 static inline u8 ata_chk_status(struct ata_port *ap)
 {
 	return ap->ops->check_status(ap);
-- 
cgit v1.2.3


From 1c3fae4d7eb121933341443c37d3bbee43c0fb68 Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Sun, 2 Apr 2006 20:53:28 +0900
Subject: [PATCH] libata: implement ap->sata_spd_limit and helpers

ap->sata_spd_limit contrains SATA PHY speed of the port.  It is
initialized to the configured value prior to probing thus preserving
BIOS configured value.  hardreset is responsible for applying SPD
limit and sata_std_hardreset() is updated to do that.  SATA SPD limit
will be used to enhance failure handling during probing and later by
EH.

This patch also normalizes some comments around affected code.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Signed-off-by: Jeff Garzik <jeff@garzik.org>
---
 drivers/scsi/libata-core.c | 143 +++++++++++++++++++++++++++++++++++++++++++--
 include/linux/libata.h     |   1 +
 2 files changed, 140 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c
index 3acf56200d87..63488673765f 100644
--- a/drivers/scsi/libata-core.c
+++ b/drivers/scsi/libata-core.c
@@ -65,6 +65,7 @@ static unsigned int ata_dev_init_params(struct ata_port *ap,
 					struct ata_device *dev,
 					u16 heads,
 					u16 sectors);
+static int ata_down_sata_spd_limit(struct ata_port *ap);
 static int ata_set_mode(struct ata_port *ap, struct ata_device **r_failed_dev);
 static unsigned int ata_dev_set_xfermode(struct ata_port *ap,
 					 struct ata_device *dev);
@@ -1596,6 +1597,120 @@ void ata_port_disable(struct ata_port *ap)
 	ap->flags |= ATA_FLAG_PORT_DISABLED;
 }
 
+/**
+ *	ata_down_sata_spd_limit - adjust SATA spd limit downward
+ *	@ap: Port to adjust SATA spd limit for
+ *
+ *	Adjust SATA spd limit of @ap downward.  Note that this
+ *	function only adjusts the limit.  The change must be applied
+ *	using ata_set_sata_spd().
+ *
+ *	LOCKING:
+ *	Inherited from caller.
+ *
+ *	RETURNS:
+ *	0 on success, negative errno on failure
+ */
+static int ata_down_sata_spd_limit(struct ata_port *ap)
+{
+	u32 spd, mask;
+	int highbit;
+
+	if (ap->cbl != ATA_CBL_SATA || !ap->ops->scr_read)
+		return -EOPNOTSUPP;
+
+	mask = ap->sata_spd_limit;
+	if (mask <= 1)
+		return -EINVAL;
+	highbit = fls(mask) - 1;
+	mask &= ~(1 << highbit);
+
+	spd = (scr_read(ap, SCR_STATUS) >> 4) & 0xf;
+	if (spd <= 1)
+		return -EINVAL;
+	spd--;
+	mask &= (1 << spd) - 1;
+	if (!mask)
+		return -EINVAL;
+
+	ap->sata_spd_limit = mask;
+
+	printk(KERN_WARNING "ata%u: limiting SATA link speed to %s\n",
+	       ap->id, sata_spd_string(fls(mask)));
+
+	return 0;
+}
+
+static int __ata_set_sata_spd_needed(struct ata_port *ap, u32 *scontrol)
+{
+	u32 spd, limit;
+
+	if (ap->sata_spd_limit == UINT_MAX)
+		limit = 0;
+	else
+		limit = fls(ap->sata_spd_limit);
+
+	spd = (*scontrol >> 4) & 0xf;
+	*scontrol = (*scontrol & ~0xf0) | ((limit & 0xf) << 4);
+
+	return spd != limit;
+}
+
+/**
+ *	ata_set_sata_spd_needed - is SATA spd configuration needed
+ *	@ap: Port in question
+ *
+ *	Test whether the spd limit in SControl matches
+ *	@ap->sata_spd_limit.  This function is used to determine
+ *	whether hardreset is necessary to apply SATA spd
+ *	configuration.
+ *
+ *	LOCKING:
+ *	Inherited from caller.
+ *
+ *	RETURNS:
+ *	1 if SATA spd configuration is needed, 0 otherwise.
+ */
+static int ata_set_sata_spd_needed(struct ata_port *ap)
+{
+	u32 scontrol;
+
+	if (ap->cbl != ATA_CBL_SATA || !ap->ops->scr_read)
+		return 0;
+
+	scontrol = scr_read(ap, SCR_CONTROL);
+
+	return __ata_set_sata_spd_needed(ap, &scontrol);
+}
+
+/**
+ *	ata_set_sata_spd - set SATA spd according to spd limit
+ *	@ap: Port to set SATA spd for
+ *
+ *	Set SATA spd of @ap according to sata_spd_limit.
+ *
+ *	LOCKING:
+ *	Inherited from caller.
+ *
+ *	RETURNS:
+ *	0 if spd doesn't need to be changed, 1 if spd has been
+ *	changed.  -EOPNOTSUPP if SCR registers are inaccessible.
+ */
+static int ata_set_sata_spd(struct ata_port *ap)
+{
+	u32 scontrol;
+
+	if (ap->cbl != ATA_CBL_SATA || !ap->ops->scr_read)
+		return -EOPNOTSUPP;
+
+	scontrol = scr_read(ap, SCR_CONTROL);
+	if (!__ata_set_sata_spd_needed(ap, &scontrol))
+		return 0;
+
+	scr_write(ap, SCR_CONTROL, scontrol);
+	return 1;
+}
+
 /*
  * This mode timing computation functionality is ported over from
  * drivers/ide/ide-timing.h and was originally written by Vojtech Pavlik
@@ -2165,7 +2280,14 @@ static int sata_phy_resume(struct ata_port *ap)
 void ata_std_probeinit(struct ata_port *ap)
 {
 	if ((ap->flags & ATA_FLAG_SATA) && ap->ops->scr_read) {
+		u32 spd;
+
 		sata_phy_resume(ap);
+
+		spd = (scr_read(ap, SCR_CONTROL) & 0xf0) >> 4;
+		if (spd)
+			ap->sata_spd_limit &= (1 << spd) - 1;
+
 		if (sata_dev_present(ap))
 			ata_busy_sleep(ap, ATA_TMOUT_BOOT_QUICK, ATA_TMOUT_BOOT);
 	}
@@ -2253,18 +2375,30 @@ int sata_std_hardreset(struct ata_port *ap, int verbose, unsigned int *class)
 
 	DPRINTK("ENTER\n");
 
-	/* Issue phy wake/reset */
+	if (ata_set_sata_spd_needed(ap)) {
+		/* SATA spec says nothing about how to reconfigure
+		 * spd.  To be on the safe side, turn off phy during
+		 * reconfiguration.  This works for at least ICH7 AHCI
+		 * and Sil3124.
+		 */
+		scontrol = scr_read(ap, SCR_CONTROL);
+		scontrol = (scontrol & 0x0f0) | 0x302;
+		scr_write_flush(ap, SCR_CONTROL, scontrol);
+
+		ata_set_sata_spd(ap);
+	}
+
+	/* issue phy wake/reset */
 	scontrol = scr_read(ap, SCR_CONTROL);
 	scontrol = (scontrol & 0x0f0) | 0x301;
 	scr_write_flush(ap, SCR_CONTROL, scontrol);
 
-	/*
-	 * Couldn't find anything in SATA I/II specs, but AHCI-1.1
+	/* Couldn't find anything in SATA I/II specs, but AHCI-1.1
 	 * 10.4.2 says at least 1 ms.
 	 */
 	msleep(1);
 
-	/* Bring phy back */
+	/* bring phy back */
 	sata_phy_resume(ap);
 
 	/* TODO: phy layer with polling, timeouts, etc. */
@@ -4454,6 +4588,7 @@ static void ata_host_init(struct ata_port *ap, struct Scsi_Host *host,
 	ap->flags |= ent->host_flags;
 	ap->ops = ent->port_ops;
 	ap->cbl = ATA_CBL_NONE;
+	ap->sata_spd_limit = UINT_MAX;
 	ap->active_tag = ATA_TAG_POISON;
 	ap->last_ctl = 0xFF;
 
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 0f8e3720edd9..a5207e66ca52 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -397,6 +397,7 @@ struct ata_port {
 	unsigned int		mwdma_mask;
 	unsigned int		udma_mask;
 	unsigned int		cbl;	/* cable type; ATA_CBL_xxx */
+	unsigned int		sata_spd_limit;	/* SATA PHY speed limit */
 
 	struct ata_device	device[ATA_MAX_DEVICES];
 
-- 
cgit v1.2.3


From 14d2bac1877ed4e2cc940d1680db1a4f29225811 Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Sun, 2 Apr 2006 17:54:46 +0900
Subject: [PATCH] libata: improve ata_bus_probe()

Improve ata_bus_probe() such that configuration failures are handled
better.  Each device is given ATA_PROBE_MAX_TRIES chances, but any
non-transient error (revalidation failure with -ENODEV, configuration
failure with -EINVAL...) disables the device directly.  Any IO error
results in SATA PHY speed down and ata_set_mode() failure lowers
transfer mode.  The last try always puts a device into PIO-0.

After each failure, the whole port is reset to make sure that the
controller and all the devices are in a known and stable state.  The
reset also applies SATA SPD configuration if necessary.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Signed-off-by: Jeff Garzik <jeff@garzik.org>
---
 drivers/scsi/libata-core.c | 65 ++++++++++++++++++++++++++++++++++------------
 include/linux/libata.h     |  3 +++
 2 files changed, 52 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c
index 68fa64d24721..33b5bff58cc3 100644
--- a/drivers/scsi/libata-core.c
+++ b/drivers/scsi/libata-core.c
@@ -1370,11 +1370,18 @@ err_out_nosup:
 static int ata_bus_probe(struct ata_port *ap)
 {
 	unsigned int classes[ATA_MAX_DEVICES];
-	int i, rc, found = 0;
+	int tries[ATA_MAX_DEVICES];
+	int i, rc, down_xfermask;
 	struct ata_device *dev;
 
 	ata_port_probe(ap);
 
+	for (i = 0; i < ATA_MAX_DEVICES; i++)
+		tries[i] = ATA_PROBE_MAX_TRIES;
+
+ retry:
+	down_xfermask = 0;
+
 	/* reset and determine device classes */
 	for (i = 0; i < ATA_MAX_DEVICES; i++)
 		classes[i] = ATA_DEV_UNKNOWN;
@@ -1404,21 +1411,23 @@ static int ata_bus_probe(struct ata_port *ap)
 		dev = &ap->device[i];
 		dev->class = classes[i];
 
-		if (!ata_dev_enabled(dev))
-			continue;
-
-		WARN_ON(dev->id != NULL);
-		if (ata_dev_read_id(ap, dev, &dev->class, 1, &dev->id)) {
-			dev->class = ATA_DEV_NONE;
-			continue;
+		if (!tries[i]) {
+			ata_down_xfermask_limit(ap, dev, 1);
+			ata_dev_disable(ap, dev);
 		}
 
-		if (ata_dev_configure(ap, dev, 1)) {
-			ata_dev_disable(ap, dev);
+		if (!ata_dev_enabled(dev))
 			continue;
-		}
 
-		found = 1;
+		kfree(dev->id);
+		dev->id = NULL;
+		rc = ata_dev_read_id(ap, dev, &dev->class, 1, &dev->id);
+		if (rc)
+			goto fail;
+
+		rc = ata_dev_configure(ap, dev, 1);
+		if (rc)
+			goto fail;
 	}
 
 	/* configure transfer mode */
@@ -1427,12 +1436,18 @@ static int ata_bus_probe(struct ata_port *ap)
 		 * return error code and failing device on failure as
 		 * ata_set_mode() does.
 		 */
-		if (found)
-			ap->ops->set_mode(ap);
+		for (i = 0; i < ATA_MAX_DEVICES; i++)
+			if (ata_dev_enabled(&ap->device[i])) {
+				ap->ops->set_mode(ap);
+				break;
+			}
 		rc = 0;
 	} else {
-		while (ata_set_mode(ap, &dev))
-			ata_dev_disable(ap, dev);
+		rc = ata_set_mode(ap, &dev);
+		if (rc) {
+			down_xfermask = 1;
+			goto fail;
+		}
 	}
 
 	for (i = 0; i < ATA_MAX_DEVICES; i++)
@@ -1443,6 +1458,24 @@ static int ata_bus_probe(struct ata_port *ap)
 	ata_port_disable(ap);
 	ap->ops->port_disable(ap);
 	return -ENODEV;
+
+ fail:
+	switch (rc) {
+	case -EINVAL:
+	case -ENODEV:
+		tries[dev->devno] = 0;
+		break;
+	case -EIO:
+		ata_down_sata_spd_limit(ap);
+		/* fall through */
+	default:
+		tries[dev->devno]--;
+		if (down_xfermask &&
+		    ata_down_xfermask_limit(ap, dev, tries[dev->devno] == 1))
+			tries[dev->devno] = 0;
+	}
+
+	goto retry;
 }
 
 /**
diff --git a/include/linux/libata.h b/include/linux/libata.h
index a5207e66ca52..a4a1e6304e78 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -211,6 +211,9 @@ enum {
 	/* Masks for port functions */
 	ATA_PORT_PRIMARY	= (1 << 0),
 	ATA_PORT_SECONDARY	= (1 << 1),
+
+	/* how hard are we gonna try to probe/recover devices */
+	ATA_PROBE_MAX_TRIES	= 3,
 };
 
 enum hsm_task_states {
-- 
cgit v1.2.3


From c43c555c3a6db7f0b55fd9b66d7ecff16e827d4e Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Sun, 2 Apr 2006 18:51:52 +0900
Subject: [PATCH] libata: ATA_FLAG_IN_EH is not used, kill it

Kill unused flag ATA_FLAG_IN_EH.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Signed-off-by: Jeff Garzik <jeff@garzik.org>
---
 drivers/scsi/libata-scsi.c | 17 ++++-------------
 include/linux/libata.h     |  1 -
 2 files changed, 4 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/scsi/libata-scsi.c b/drivers/scsi/libata-scsi.c
index c1a4b29a9ae1..bcbf71e9895b 100644
--- a/drivers/scsi/libata-scsi.c
+++ b/drivers/scsi/libata-scsi.c
@@ -779,20 +779,15 @@ enum scsi_eh_timer_return ata_scsi_timed_out(struct scsi_cmnd *cmd)
 
 int ata_scsi_error(struct Scsi_Host *host)
 {
-	struct ata_port *ap;
-	unsigned long flags;
+	struct ata_port *ap = (struct ata_port *)&host->hostdata[0];
 
 	DPRINTK("ENTER\n");
 
-	ap = (struct ata_port *) &host->hostdata[0];
+	/* synchronize with IRQ handler and port task */
+	spin_unlock_wait(&ap->host_set->lock);
+	ata_port_flush_task(ap);
 
-	spin_lock_irqsave(&ap->host_set->lock, flags);
-	WARN_ON(ap->flags & ATA_FLAG_IN_EH);
-	ap->flags |= ATA_FLAG_IN_EH;
 	WARN_ON(ata_qc_from_tag(ap, ap->active_tag) == NULL);
-	spin_unlock_irqrestore(&ap->host_set->lock, flags);
-
-	ata_port_flush_task(ap);
 
 	ap->ops->eng_timeout(ap);
 
@@ -800,10 +795,6 @@ int ata_scsi_error(struct Scsi_Host *host)
 
 	scsi_eh_flush_done_q(&ap->eh_done_q);
 
-	spin_lock_irqsave(&ap->host_set->lock, flags);
-	ap->flags &= ~ATA_FLAG_IN_EH;
-	spin_unlock_irqrestore(&ap->host_set->lock, flags);
-
 	DPRINTK("EXIT\n");
 	return 0;
 }
diff --git a/include/linux/libata.h b/include/linux/libata.h
index a4a1e6304e78..e20b0bfbd5f2 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -152,7 +152,6 @@ enum {
 	ATA_FLAG_IRQ_MASK	= (1 << 14), /* Mask IRQ in PIO xfers */
 
 	ATA_FLAG_FLUSH_PORT_TASK = (1 << 15), /* Flush port task */
-	ATA_FLAG_IN_EH		= (1 << 16), /* EH in progress */
 
 	ATA_QCFLAG_ACTIVE	= (1 << 1), /* cmd not yet ack'd to scsi lyer */
 	ATA_QCFLAG_SG		= (1 << 3), /* have s/g table? */
-- 
cgit v1.2.3


From 949b38af40a0b88b7267908b1554a45b97b5b737 Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Sun, 2 Apr 2006 18:51:52 +0900
Subject: [PATCH] libata: clean up constants

* Reorder ATA_DFLAG_* such that feature flags determined by
  ata_dev_configure() are on lower bits.  Reserve lower eight bits
  for this purpose and allocate dynamic flags from bit 8.

* Reorder ATA_FLAG_* such that feature flags determined during driver
  initiailization are on bits 0:15, dynamic flags on 16:23 and LLDD
  specific flags on 24:31.

* Kill trailing white space and lower-case an one line comment for
  consistency.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Signed-off-by: Jeff Garzik <jeff@garzik.org>
---
 include/linux/libata.h | 61 ++++++++++++++++++++++++++------------------------
 1 file changed, 32 insertions(+), 29 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/libata.h b/include/linux/libata.h
index e20b0bfbd5f2..b7488a31e320 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -120,9 +120,10 @@ enum {
 	ATA_SHT_USE_CLUSTERING	= 1,
 
 	/* struct ata_device stuff */
-	ATA_DFLAG_LBA48		= (1 << 0), /* device supports LBA48 */
-	ATA_DFLAG_PIO		= (1 << 1), /* device currently in PIO mode */
-	ATA_DFLAG_LBA		= (1 << 2), /* device supports LBA */
+	ATA_DFLAG_LBA		= (1 << 0), /* device supports LBA */
+	ATA_DFLAG_LBA48		= (1 << 1), /* device supports LBA48 */
+
+	ATA_DFLAG_PIO		= (1 << 8), /* device currently in PIO mode */
 
 	ATA_DEV_UNKNOWN		= 0,	/* unknown device */
 	ATA_DEV_ATA		= 1,	/* ATA device */
@@ -132,32 +133,34 @@ enum {
 	ATA_DEV_NONE		= 5,	/* no device */
 
 	/* struct ata_port flags */
-	ATA_FLAG_SLAVE_POSS	= (1 << 1), /* host supports slave dev */
+	ATA_FLAG_SLAVE_POSS	= (1 << 0), /* host supports slave dev */
 					    /* (doesn't imply presence) */
-	ATA_FLAG_PORT_DISABLED	= (1 << 2), /* port is disabled, ignore it */
-	ATA_FLAG_SATA		= (1 << 3),
-	ATA_FLAG_NO_LEGACY	= (1 << 4), /* no legacy mode check */
-	ATA_FLAG_SRST		= (1 << 5), /* (obsolete) use ATA SRST, not E.D.D. */
-	ATA_FLAG_MMIO		= (1 << 6), /* use MMIO, not PIO */
-	ATA_FLAG_SATA_RESET	= (1 << 7), /* (obsolete) use COMRESET */
-	ATA_FLAG_PIO_DMA	= (1 << 8), /* PIO cmds via DMA */
-	ATA_FLAG_NOINTR		= (1 << 9), /* FIXME: Remove this once
-					     * proper HSM is in place. */
-	ATA_FLAG_DEBUGMSG	= (1 << 10),
-	ATA_FLAG_NO_ATAPI	= (1 << 11), /* No ATAPI support */
-
-	ATA_FLAG_SUSPENDED	= (1 << 12), /* port is suspended */
-
-	ATA_FLAG_PIO_LBA48	= (1 << 13), /* Host DMA engine is LBA28 only */
-	ATA_FLAG_IRQ_MASK	= (1 << 14), /* Mask IRQ in PIO xfers */
-
-	ATA_FLAG_FLUSH_PORT_TASK = (1 << 15), /* Flush port task */
-
-	ATA_QCFLAG_ACTIVE	= (1 << 1), /* cmd not yet ack'd to scsi lyer */
-	ATA_QCFLAG_SG		= (1 << 3), /* have s/g table? */
-	ATA_QCFLAG_SINGLE	= (1 << 4), /* no s/g, just a single buffer */
+	ATA_FLAG_SATA		= (1 << 1),
+	ATA_FLAG_NO_LEGACY	= (1 << 2), /* no legacy mode check */
+	ATA_FLAG_MMIO		= (1 << 3), /* use MMIO, not PIO */
+	ATA_FLAG_SRST		= (1 << 4), /* (obsolete) use ATA SRST, not E.D.D. */
+	ATA_FLAG_SATA_RESET	= (1 << 5), /* (obsolete) use COMRESET */
+	ATA_FLAG_NO_ATAPI	= (1 << 6), /* No ATAPI support */
+	ATA_FLAG_PIO_DMA	= (1 << 7), /* PIO cmds via DMA */
+	ATA_FLAG_PIO_LBA48	= (1 << 8), /* Host DMA engine is LBA28 only */
+	ATA_FLAG_IRQ_MASK	= (1 << 9), /* Mask IRQ in PIO xfers */
+
+	ATA_FLAG_NOINTR		= (1 << 16), /* FIXME: Remove this once
+					      * proper HSM is in place. */
+	ATA_FLAG_DEBUGMSG	= (1 << 17),
+	ATA_FLAG_FLUSH_PORT_TASK = (1 << 18), /* flush port task */
+
+	ATA_FLAG_PORT_DISABLED	= (1 << 19), /* port is disabled, ignore it */
+	ATA_FLAG_SUSPENDED	= (1 << 20), /* port is suspended */
+
+	/* bits 24:31 of ap->flags are reserved for LLDD specific flags */
+
+	/* struct ata_queued_cmd flags */
+	ATA_QCFLAG_ACTIVE	= (1 << 0), /* cmd not yet ack'd to scsi lyer */
+	ATA_QCFLAG_SG		= (1 << 1), /* have s/g table? */
+	ATA_QCFLAG_SINGLE	= (1 << 2), /* no s/g, just a single buffer */
 	ATA_QCFLAG_DMAMAP	= ATA_QCFLAG_SG | ATA_QCFLAG_SINGLE,
-	ATA_QCFLAG_EH_SCHEDULED = (1 << 5), /* EH scheduled */
+	ATA_QCFLAG_EH_SCHEDULED = (1 << 3), /* EH scheduled */
 
 	/* host set flags */
 	ATA_HOST_SIMPLEX	= (1 << 0),	/* Host is simplex, one DMA channel per host_set only */
@@ -206,8 +209,8 @@ enum {
 	/* size of buffer to pad xfers ending on unaligned boundaries */
 	ATA_DMA_PAD_SZ		= 4,
 	ATA_DMA_PAD_BUF_SZ	= ATA_DMA_PAD_SZ * ATA_MAX_QUEUE,
-	
-	/* Masks for port functions */
+
+	/* masks for port functions */
 	ATA_PORT_PRIMARY	= (1 << 0),
 	ATA_PORT_SECONDARY	= (1 << 1),
 
-- 
cgit v1.2.3


From 198e0fed9e59461fc1890dd8b75ec72d14638873 Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Sun, 2 Apr 2006 18:51:52 +0900
Subject: [PATCH] libata: rename ATA_FLAG_PORT_DISABLED to ATA_FLAG_DISABLED

Rename ATA_FLAG_PORT_DISABLED to ATA_FLAG_DISABLED for consistency.
(ATA_FLAG_* are always about ports).

Signed-off-by: Tejun Heo <htejun@gmail.com>
Signed-off-by: Jeff Garzik <jeff@garzik.org>
---
 drivers/scsi/libata-core.c  | 18 +++++++++---------
 drivers/scsi/libata-scsi.c  |  2 +-
 drivers/scsi/pdc_adma.c     |  4 ++--
 drivers/scsi/sata_mv.c      |  2 +-
 drivers/scsi/sata_nv.c      |  2 +-
 drivers/scsi/sata_promise.c |  2 +-
 drivers/scsi/sata_qstor.c   |  4 ++--
 drivers/scsi/sata_sil24.c   |  2 +-
 drivers/scsi/sata_sx4.c     |  2 +-
 drivers/scsi/sata_vsc.c     |  2 +-
 include/linux/libata.h      |  2 +-
 11 files changed, 21 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c
index f39352a3ac11..53226b16355c 100644
--- a/drivers/scsi/libata-core.c
+++ b/drivers/scsi/libata-core.c
@@ -1045,7 +1045,7 @@ ata_exec_internal(struct ata_port *ap, struct ata_device *dev,
 	 *
 	 * Kill the following code as soon as those drivers are fixed.
 	 */
-	if (ap->flags & ATA_FLAG_PORT_DISABLED) {
+	if (ap->flags & ATA_FLAG_DISABLED) {
 		err_mask |= AC_ERR_SYSTEM;
 		ata_port_probe(ap);
 	}
@@ -1395,7 +1395,7 @@ static int ata_bus_probe(struct ata_port *ap)
 	} else {
 		ap->ops->phy_reset(ap);
 
-		if (!(ap->flags & ATA_FLAG_PORT_DISABLED))
+		if (!(ap->flags & ATA_FLAG_DISABLED))
 			for (i = 0; i < ATA_MAX_DEVICES; i++)
 				classes[i] = ap->device[i].class;
 
@@ -1491,7 +1491,7 @@ static int ata_bus_probe(struct ata_port *ap)
 
 void ata_port_probe(struct ata_port *ap)
 {
-	ap->flags &= ~ATA_FLAG_PORT_DISABLED;
+	ap->flags &= ~ATA_FLAG_DISABLED;
 }
 
 /**
@@ -1565,7 +1565,7 @@ void __sata_phy_reset(struct ata_port *ap)
 	else
 		ata_port_disable(ap);
 
-	if (ap->flags & ATA_FLAG_PORT_DISABLED)
+	if (ap->flags & ATA_FLAG_DISABLED)
 		return;
 
 	if (ata_busy_sleep(ap, ATA_TMOUT_BOOT_QUICK, ATA_TMOUT_BOOT)) {
@@ -1590,7 +1590,7 @@ void __sata_phy_reset(struct ata_port *ap)
 void sata_phy_reset(struct ata_port *ap)
 {
 	__sata_phy_reset(ap);
-	if (ap->flags & ATA_FLAG_PORT_DISABLED)
+	if (ap->flags & ATA_FLAG_DISABLED)
 		return;
 	ata_bus_reset(ap);
 }
@@ -1629,7 +1629,7 @@ void ata_port_disable(struct ata_port *ap)
 {
 	ap->device[0].class = ATA_DEV_NONE;
 	ap->device[1].class = ATA_DEV_NONE;
-	ap->flags |= ATA_FLAG_PORT_DISABLED;
+	ap->flags |= ATA_FLAG_DISABLED;
 }
 
 /**
@@ -2251,7 +2251,7 @@ static unsigned int ata_bus_softreset(struct ata_port *ap,
  *	Obtains host_set lock.
  *
  *	SIDE EFFECTS:
- *	Sets ATA_FLAG_PORT_DISABLED if bus reset fails.
+ *	Sets ATA_FLAG_DISABLED if bus reset fails.
  */
 
 void ata_bus_reset(struct ata_port *ap)
@@ -4468,7 +4468,7 @@ irqreturn_t ata_interrupt (int irq, void *dev_instance, struct pt_regs *regs)
 
 		ap = host_set->ports[i];
 		if (ap &&
-		    !(ap->flags & (ATA_FLAG_PORT_DISABLED | ATA_FLAG_NOINTR))) {
+		    !(ap->flags & (ATA_FLAG_DISABLED | ATA_FLAG_NOINTR))) {
 			struct ata_queued_cmd *qc;
 
 			qc = ata_qc_from_tag(ap, ap->active_tag);
@@ -4689,7 +4689,7 @@ static void ata_host_init(struct ata_port *ap, struct Scsi_Host *host,
 	host->unique_id = ata_unique_id++;
 	host->max_cmd_len = 12;
 
-	ap->flags = ATA_FLAG_PORT_DISABLED;
+	ap->flags = ATA_FLAG_DISABLED;
 	ap->id = host->unique_id;
 	ap->host = host;
 	ap->ctl = ATA_DEVCTL_OBS;
diff --git a/drivers/scsi/libata-scsi.c b/drivers/scsi/libata-scsi.c
index bcbf71e9895b..096bdffbde11 100644
--- a/drivers/scsi/libata-scsi.c
+++ b/drivers/scsi/libata-scsi.c
@@ -2728,7 +2728,7 @@ void ata_scsi_scan_host(struct ata_port *ap)
 	struct ata_device *dev;
 	unsigned int i;
 
-	if (ap->flags & ATA_FLAG_PORT_DISABLED)
+	if (ap->flags & ATA_FLAG_DISABLED)
 		return;
 
 	for (i = 0; i < ATA_MAX_DEVICES; i++) {
diff --git a/drivers/scsi/pdc_adma.c b/drivers/scsi/pdc_adma.c
index 3c85c4b66e19..d64073dd028f 100644
--- a/drivers/scsi/pdc_adma.c
+++ b/drivers/scsi/pdc_adma.c
@@ -456,7 +456,7 @@ static inline unsigned int adma_intr_pkt(struct ata_host_set *host_set)
 			continue;
 		handled = 1;
 		adma_enter_reg_mode(ap);
-		if (ap->flags & (ATA_FLAG_PORT_DISABLED | ATA_FLAG_NOINTR))
+		if (ap->flags & (ATA_FLAG_DISABLED | ATA_FLAG_NOINTR))
 			continue;
 		pp = ap->private_data;
 		if (!pp || pp->state != adma_state_pkt)
@@ -481,7 +481,7 @@ static inline unsigned int adma_intr_mmio(struct ata_host_set *host_set)
 	for (port_no = 0; port_no < host_set->n_ports; ++port_no) {
 		struct ata_port *ap;
 		ap = host_set->ports[port_no];
-		if (ap && (!(ap->flags & (ATA_FLAG_PORT_DISABLED | ATA_FLAG_NOINTR)))) {
+		if (ap && (!(ap->flags & (ATA_FLAG_DISABLED | ATA_FLAG_NOINTR)))) {
 			struct ata_queued_cmd *qc;
 			struct adma_port_priv *pp = ap->private_data;
 			if (!pp || pp->state != adma_state_mmio)
diff --git a/drivers/scsi/sata_mv.c b/drivers/scsi/sata_mv.c
index 0f7d334aadcc..e9152f850003 100644
--- a/drivers/scsi/sata_mv.c
+++ b/drivers/scsi/sata_mv.c
@@ -1397,7 +1397,7 @@ static void mv_host_intr(struct ata_host_set *host_set, u32 relevant,
 			}
 		}
 
-		if (ap->flags & (ATA_FLAG_PORT_DISABLED | ATA_FLAG_NOINTR))
+		if (ap->flags & (ATA_FLAG_DISABLED | ATA_FLAG_NOINTR))
 			continue;
 
 		err_mask = ac_err_mask(ata_status);
diff --git a/drivers/scsi/sata_nv.c b/drivers/scsi/sata_nv.c
index f77bf183dfab..72721ac482d8 100644
--- a/drivers/scsi/sata_nv.c
+++ b/drivers/scsi/sata_nv.c
@@ -280,7 +280,7 @@ static irqreturn_t nv_interrupt (int irq, void *dev_instance,
 
 		ap = host_set->ports[i];
 		if (ap &&
-		    !(ap->flags & (ATA_FLAG_PORT_DISABLED | ATA_FLAG_NOINTR))) {
+		    !(ap->flags & (ATA_FLAG_DISABLED | ATA_FLAG_NOINTR))) {
 			struct ata_queued_cmd *qc;
 
 			qc = ata_qc_from_tag(ap, ap->active_tag);
diff --git a/drivers/scsi/sata_promise.c b/drivers/scsi/sata_promise.c
index cc928c68a479..9557c7aa45e0 100644
--- a/drivers/scsi/sata_promise.c
+++ b/drivers/scsi/sata_promise.c
@@ -534,7 +534,7 @@ static irqreturn_t pdc_interrupt (int irq, void *dev_instance, struct pt_regs *r
 		ap = host_set->ports[i];
 		tmp = mask & (1 << (i + 1));
 		if (tmp && ap &&
-		    !(ap->flags & (ATA_FLAG_PORT_DISABLED | ATA_FLAG_NOINTR))) {
+		    !(ap->flags & (ATA_FLAG_DISABLED | ATA_FLAG_NOINTR))) {
 			struct ata_queued_cmd *qc;
 
 			qc = ata_qc_from_tag(ap, ap->active_tag);
diff --git a/drivers/scsi/sata_qstor.c b/drivers/scsi/sata_qstor.c
index 9ffe1ef0d205..8ef042a09448 100644
--- a/drivers/scsi/sata_qstor.c
+++ b/drivers/scsi/sata_qstor.c
@@ -396,7 +396,7 @@ static inline unsigned int qs_intr_pkt(struct ata_host_set *host_set)
 					sff1, sff0, port_no, sHST, sDST);
 			handled = 1;
 			if (ap && !(ap->flags &
-				    (ATA_FLAG_PORT_DISABLED|ATA_FLAG_NOINTR))) {
+				    (ATA_FLAG_DISABLED|ATA_FLAG_NOINTR))) {
 				struct ata_queued_cmd *qc;
 				struct qs_port_priv *pp = ap->private_data;
 				if (!pp || pp->state != qs_state_pkt)
@@ -429,7 +429,7 @@ static inline unsigned int qs_intr_mmio(struct ata_host_set *host_set)
 		struct ata_port *ap;
 		ap = host_set->ports[port_no];
 		if (ap &&
-		    !(ap->flags & (ATA_FLAG_PORT_DISABLED | ATA_FLAG_NOINTR))) {
+		    !(ap->flags & (ATA_FLAG_DISABLED | ATA_FLAG_NOINTR))) {
 			struct ata_queued_cmd *qc;
 			struct qs_port_priv *pp = ap->private_data;
 			if (!pp || pp->state != qs_state_mmio)
diff --git a/drivers/scsi/sata_sil24.c b/drivers/scsi/sata_sil24.c
index 068c98a4111b..c34f6dabf418 100644
--- a/drivers/scsi/sata_sil24.c
+++ b/drivers/scsi/sata_sil24.c
@@ -770,7 +770,7 @@ static irqreturn_t sil24_interrupt(int irq, void *dev_instance, struct pt_regs *
 	for (i = 0; i < host_set->n_ports; i++)
 		if (status & (1 << i)) {
 			struct ata_port *ap = host_set->ports[i];
-			if (ap && !(ap->flags & ATA_FLAG_PORT_DISABLED)) {
+			if (ap && !(ap->flags & ATA_FLAG_DISABLED)) {
 				sil24_host_intr(host_set->ports[i]);
 				handled++;
 			} else
diff --git a/drivers/scsi/sata_sx4.c b/drivers/scsi/sata_sx4.c
index ae70f60c7c0d..3af28ef76fd9 100644
--- a/drivers/scsi/sata_sx4.c
+++ b/drivers/scsi/sata_sx4.c
@@ -834,7 +834,7 @@ static irqreturn_t pdc20621_interrupt (int irq, void *dev_instance, struct pt_re
 		tmp = mask & (1 << i);
 		VPRINTK("seq %u, port_no %u, ap %p, tmp %x\n", i, port_no, ap, tmp);
 		if (tmp && ap &&
-		    !(ap->flags & (ATA_FLAG_PORT_DISABLED | ATA_FLAG_NOINTR))) {
+		    !(ap->flags & (ATA_FLAG_DISABLED | ATA_FLAG_NOINTR))) {
 			struct ata_queued_cmd *qc;
 
 			qc = ata_qc_from_tag(ap, ap->active_tag);
diff --git a/drivers/scsi/sata_vsc.c b/drivers/scsi/sata_vsc.c
index 836bbbb26ff2..cecc1f76256b 100644
--- a/drivers/scsi/sata_vsc.c
+++ b/drivers/scsi/sata_vsc.c
@@ -222,7 +222,7 @@ static irqreturn_t vsc_sata_interrupt (int irq, void *dev_instance,
 			ap = host_set->ports[i];
 
 			if (ap && !(ap->flags &
-				    (ATA_FLAG_PORT_DISABLED|ATA_FLAG_NOINTR))) {
+				    (ATA_FLAG_DISABLED|ATA_FLAG_NOINTR))) {
 				struct ata_queued_cmd *qc;
 
 				qc = ata_qc_from_tag(ap, ap->active_tag);
diff --git a/include/linux/libata.h b/include/linux/libata.h
index b7488a31e320..890262f44d0a 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -150,7 +150,7 @@ enum {
 	ATA_FLAG_DEBUGMSG	= (1 << 17),
 	ATA_FLAG_FLUSH_PORT_TASK = (1 << 18), /* flush port task */
 
-	ATA_FLAG_PORT_DISABLED	= (1 << 19), /* port is disabled, ignore it */
+	ATA_FLAG_DISABLED	= (1 << 19), /* port is disabled, ignore it */
 	ATA_FLAG_SUSPENDED	= (1 << 20), /* port is suspended */
 
 	/* bits 24:31 of ap->flags are reserved for LLDD specific flags */
-- 
cgit v1.2.3


From ea1dd4e13010eb9dd5ffb4bfabbb472bc238bebb Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Sun, 2 Apr 2006 18:51:53 +0900
Subject: [PATCH] libata: clear only affected flags during ata_dev_configure()

ata_dev_configure() should not clear dynamic device flags determined
elsewhere.  Lower eight bits are reserved for feature flags, define
ATA_DFLAG_CFG_MASK and clear only those bits before configuring
device.  Without this patch, ATA_DFLAG_PIO gets turned off during
revalidation making PIO mode unuseable.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Signed-off-by: Jeff Garzik <jeff@garzik.org>
---
 drivers/scsi/libata-core.c | 2 +-
 include/linux/libata.h     | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c
index 53226b16355c..985283cbcbdf 100644
--- a/drivers/scsi/libata-core.c
+++ b/drivers/scsi/libata-core.c
@@ -1241,7 +1241,7 @@ static int ata_dev_configure(struct ata_port *ap, struct ata_device *dev,
 		       id[84], id[85], id[86], id[87], id[88]);
 
 	/* initialize to-be-configured parameters */
-	dev->flags = 0;
+	dev->flags &= ~ATA_DFLAG_CFG_MASK;
 	dev->max_sectors = 0;
 	dev->cdb_len = 0;
 	dev->n_sectors = 0;
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 890262f44d0a..cbbc821fe22c 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -122,6 +122,7 @@ enum {
 	/* struct ata_device stuff */
 	ATA_DFLAG_LBA		= (1 << 0), /* device supports LBA */
 	ATA_DFLAG_LBA48		= (1 << 1), /* device supports LBA48 */
+	ATA_DFLAG_CFG_MASK	= (1 << 8) - 1,
 
 	ATA_DFLAG_PIO		= (1 << 8), /* device currently in PIO mode */
 
-- 
cgit v1.2.3


From 2719736779da2c7fbb17d3de16c817b429bfeb9c Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Sun, 2 Apr 2006 18:51:53 +0900
Subject: [PATCH] libata: add ATA_QCFLAG_IO

Add a new qc flag ATA_QCFLAG_IO.  This flag gets set for normal IO
commands originating from SCSI midlayer.  This information will be
used by EH to determine transfer speed reconfiguration.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Signed-off-by: Jeff Garzik <jeff@garzik.org>
---
 drivers/scsi/libata-scsi.c | 1 +
 include/linux/libata.h     | 3 ++-
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/scsi/libata-scsi.c b/drivers/scsi/libata-scsi.c
index 096bdffbde11..234e1cadb070 100644
--- a/drivers/scsi/libata-scsi.c
+++ b/drivers/scsi/libata-scsi.c
@@ -1188,6 +1188,7 @@ static unsigned int ata_scsi_rw_xlat(struct ata_queued_cmd *qc, const u8 *scsicm
 	u64 block;
 	u32 n_block;
 
+	qc->flags |= ATA_QCFLAG_IO;
 	tf->flags |= ATA_TFLAG_ISADDR | ATA_TFLAG_DEVICE;
 
 	if (scsicmd[0] == WRITE_10 || scsicmd[0] == WRITE_6 ||
diff --git a/include/linux/libata.h b/include/linux/libata.h
index cbbc821fe22c..a6d829cb0567 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -161,7 +161,8 @@ enum {
 	ATA_QCFLAG_SG		= (1 << 1), /* have s/g table? */
 	ATA_QCFLAG_SINGLE	= (1 << 2), /* no s/g, just a single buffer */
 	ATA_QCFLAG_DMAMAP	= ATA_QCFLAG_SG | ATA_QCFLAG_SINGLE,
-	ATA_QCFLAG_EH_SCHEDULED = (1 << 3), /* EH scheduled */
+	ATA_QCFLAG_IO		= (1 << 3), /* standard IO command */
+	ATA_QCFLAG_EH_SCHEDULED = (1 << 4), /* EH scheduled */
 
 	/* host set flags */
 	ATA_HOST_SIMPLEX	= (1 << 0),	/* Host is simplex, one DMA channel per host_set only */
-- 
cgit v1.2.3


From ece1d63619df010b8c4f08e43755e2a03f3b6eed Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Sun, 2 Apr 2006 18:51:53 +0900
Subject: [PATCH] libata: separate out libata-eh.c

A lot of EH codes are about to be added to libata.  Separate out
libata-eh.c.  ata_scsi_timed_out(), ata_scsi_error(),
ata_qc_timeout(), ata_eng_timeout(), ata_eh_qc_complete() and
ata_eh_qc_retry() are moved.  No code is changed by this patch.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Signed-off-by: Jeff Garzik <jeff@garzik.org>
---
 drivers/scsi/Makefile      |   2 +-
 drivers/scsi/libata-core.c | 102 +-----------------
 drivers/scsi/libata-eh.c   | 264 +++++++++++++++++++++++++++++++++++++++++++++
 drivers/scsi/libata-scsi.c | 130 ----------------------
 drivers/scsi/libata.h      |   4 +-
 include/linux/libata.h     |  12 ++-
 6 files changed, 281 insertions(+), 233 deletions(-)
 create mode 100644 drivers/scsi/libata-eh.c

(limited to 'include/linux')

diff --git a/drivers/scsi/Makefile b/drivers/scsi/Makefile
index e513c3158ad9..503f189dab3b 100644
--- a/drivers/scsi/Makefile
+++ b/drivers/scsi/Makefile
@@ -165,7 +165,7 @@ ncr53c8xx-flags-$(CONFIG_SCSI_ZALON) \
 CFLAGS_ncr53c8xx.o	:= $(ncr53c8xx-flags-y) $(ncr53c8xx-flags-m)
 zalon7xx-objs	:= zalon.o ncr53c8xx.o
 NCR_Q720_mod-objs	:= NCR_Q720.o ncr53c8xx.o
-libata-objs	:= libata-core.o libata-scsi.o libata-bmdma.o
+libata-objs	:= libata-core.o libata-scsi.o libata-bmdma.o libata-eh.o
 oktagon_esp_mod-objs	:= oktagon_esp.o oktagon_io.o
 
 # Files generated that shall be removed upon make clean
diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c
index 66b48b11fa0e..186a9ce4f072 100644
--- a/drivers/scsi/libata-core.c
+++ b/drivers/scsi/libata-core.c
@@ -4015,99 +4015,6 @@ err_out:
 	ata_poll_qc_complete(qc);
 }
 
-/**
- *	ata_qc_timeout - Handle timeout of queued command
- *	@qc: Command that timed out
- *
- *	Some part of the kernel (currently, only the SCSI layer)
- *	has noticed that the active command on port @ap has not
- *	completed after a specified length of time.  Handle this
- *	condition by disabling DMA (if necessary) and completing
- *	transactions, with error if necessary.
- *
- *	This also handles the case of the "lost interrupt", where
- *	for some reason (possibly hardware bug, possibly driver bug)
- *	an interrupt was not delivered to the driver, even though the
- *	transaction completed successfully.
- *
- *	LOCKING:
- *	Inherited from SCSI layer (none, can sleep)
- */
-
-static void ata_qc_timeout(struct ata_queued_cmd *qc)
-{
-	struct ata_port *ap = qc->ap;
-	struct ata_host_set *host_set = ap->host_set;
-	u8 host_stat = 0, drv_stat;
-	unsigned long flags;
-
-	DPRINTK("ENTER\n");
-
-	ap->hsm_task_state = HSM_ST_IDLE;
-
-	spin_lock_irqsave(&host_set->lock, flags);
-
-	switch (qc->tf.protocol) {
-
-	case ATA_PROT_DMA:
-	case ATA_PROT_ATAPI_DMA:
-		host_stat = ap->ops->bmdma_status(ap);
-
-		/* before we do anything else, clear DMA-Start bit */
-		ap->ops->bmdma_stop(qc);
-
-		/* fall through */
-
-	default:
-		ata_altstatus(ap);
-		drv_stat = ata_chk_status(ap);
-
-		/* ack bmdma irq events */
-		ap->ops->irq_clear(ap);
-
-		printk(KERN_ERR "ata%u: command 0x%x timeout, stat 0x%x host_stat 0x%x\n",
-		       ap->id, qc->tf.command, drv_stat, host_stat);
-
-		/* complete taskfile transaction */
-		qc->err_mask |= ac_err_mask(drv_stat);
-		break;
-	}
-
-	spin_unlock_irqrestore(&host_set->lock, flags);
-
-	ata_eh_qc_complete(qc);
-
-	DPRINTK("EXIT\n");
-}
-
-/**
- *	ata_eng_timeout - Handle timeout of queued command
- *	@ap: Port on which timed-out command is active
- *
- *	Some part of the kernel (currently, only the SCSI layer)
- *	has noticed that the active command on port @ap has not
- *	completed after a specified length of time.  Handle this
- *	condition by disabling DMA (if necessary) and completing
- *	transactions, with error if necessary.
- *
- *	This also handles the case of the "lost interrupt", where
- *	for some reason (possibly hardware bug, possibly driver bug)
- *	an interrupt was not delivered to the driver, even though the
- *	transaction completed successfully.
- *
- *	LOCKING:
- *	Inherited from SCSI layer (none, can sleep)
- */
-
-void ata_eng_timeout(struct ata_port *ap)
-{
-	DPRINTK("ENTER\n");
-
-	ata_qc_timeout(ata_qc_from_tag(ap, ap->active_tag));
-
-	DPRINTK("EXIT\n");
-}
-
 /**
  *	ata_qc_new - Request an available ATA command, for queueing
  *	@ap: Port associated with device @dev
@@ -5145,7 +5052,6 @@ EXPORT_SYMBOL_GPL(ata_sg_init);
 EXPORT_SYMBOL_GPL(ata_sg_init_one);
 EXPORT_SYMBOL_GPL(__ata_qc_complete);
 EXPORT_SYMBOL_GPL(ata_qc_issue_prot);
-EXPORT_SYMBOL_GPL(ata_eng_timeout);
 EXPORT_SYMBOL_GPL(ata_tf_load);
 EXPORT_SYMBOL_GPL(ata_tf_read);
 EXPORT_SYMBOL_GPL(ata_noop_dev_select);
@@ -5185,15 +5091,12 @@ EXPORT_SYMBOL_GPL(ata_busy_sleep);
 EXPORT_SYMBOL_GPL(ata_port_queue_task);
 EXPORT_SYMBOL_GPL(ata_scsi_ioctl);
 EXPORT_SYMBOL_GPL(ata_scsi_queuecmd);
-EXPORT_SYMBOL_GPL(ata_scsi_error);
 EXPORT_SYMBOL_GPL(ata_scsi_slave_config);
 EXPORT_SYMBOL_GPL(ata_scsi_release);
 EXPORT_SYMBOL_GPL(ata_host_intr);
 EXPORT_SYMBOL_GPL(ata_id_string);
 EXPORT_SYMBOL_GPL(ata_id_c_string);
 EXPORT_SYMBOL_GPL(ata_scsi_simulate);
-EXPORT_SYMBOL_GPL(ata_eh_qc_complete);
-EXPORT_SYMBOL_GPL(ata_eh_qc_retry);
 
 EXPORT_SYMBOL_GPL(ata_pio_need_iordy);
 EXPORT_SYMBOL_GPL(ata_timing_compute);
@@ -5215,3 +5118,8 @@ EXPORT_SYMBOL_GPL(ata_device_suspend);
 EXPORT_SYMBOL_GPL(ata_device_resume);
 EXPORT_SYMBOL_GPL(ata_scsi_device_suspend);
 EXPORT_SYMBOL_GPL(ata_scsi_device_resume);
+
+EXPORT_SYMBOL_GPL(ata_scsi_error);
+EXPORT_SYMBOL_GPL(ata_eng_timeout);
+EXPORT_SYMBOL_GPL(ata_eh_qc_complete);
+EXPORT_SYMBOL_GPL(ata_eh_qc_retry);
diff --git a/drivers/scsi/libata-eh.c b/drivers/scsi/libata-eh.c
new file mode 100644
index 000000000000..e73f5612aea8
--- /dev/null
+++ b/drivers/scsi/libata-eh.c
@@ -0,0 +1,264 @@
+/*
+ *  libata-eh.c - libata error handling
+ *
+ *  Maintained by:  Jeff Garzik <jgarzik@pobox.com>
+ *    		    Please ALWAYS copy linux-ide@vger.kernel.org
+ *		    on emails.
+ *
+ *  Copyright 2006 Tejun Heo <htejun@gmail.com>
+ *
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License as
+ *  published by the Free Software Foundation; either version 2, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139,
+ *  USA.
+ *
+ *
+ *  libata documentation is available via 'make {ps|pdf}docs',
+ *  as Documentation/DocBook/libata.*
+ *
+ *  Hardware documentation available from http://www.t13.org/ and
+ *  http://www.sata-io.org/
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/kernel.h>
+#include <scsi/scsi.h>
+#include <scsi/scsi_host.h>
+#include <scsi/scsi_eh.h>
+#include <scsi/scsi_device.h>
+#include <scsi/scsi_cmnd.h>
+
+#include <linux/libata.h>
+
+#include "libata.h"
+
+/**
+ *	ata_scsi_timed_out - SCSI layer time out callback
+ *	@cmd: timed out SCSI command
+ *
+ *	Handles SCSI layer timeout.  We race with normal completion of
+ *	the qc for @cmd.  If the qc is already gone, we lose and let
+ *	the scsi command finish (EH_HANDLED).  Otherwise, the qc has
+ *	timed out and EH should be invoked.  Prevent ata_qc_complete()
+ *	from finishing it by setting EH_SCHEDULED and return
+ *	EH_NOT_HANDLED.
+ *
+ *	LOCKING:
+ *	Called from timer context
+ *
+ *	RETURNS:
+ *	EH_HANDLED or EH_NOT_HANDLED
+ */
+enum scsi_eh_timer_return ata_scsi_timed_out(struct scsi_cmnd *cmd)
+{
+	struct Scsi_Host *host = cmd->device->host;
+	struct ata_port *ap = (struct ata_port *) &host->hostdata[0];
+	unsigned long flags;
+	struct ata_queued_cmd *qc;
+	enum scsi_eh_timer_return ret = EH_HANDLED;
+
+	DPRINTK("ENTER\n");
+
+	spin_lock_irqsave(&ap->host_set->lock, flags);
+	qc = ata_qc_from_tag(ap, ap->active_tag);
+	if (qc) {
+		WARN_ON(qc->scsicmd != cmd);
+		qc->flags |= ATA_QCFLAG_EH_SCHEDULED;
+		qc->err_mask |= AC_ERR_TIMEOUT;
+		ret = EH_NOT_HANDLED;
+	}
+	spin_unlock_irqrestore(&ap->host_set->lock, flags);
+
+	DPRINTK("EXIT, ret=%d\n", ret);
+	return ret;
+}
+
+/**
+ *	ata_scsi_error - SCSI layer error handler callback
+ *	@host: SCSI host on which error occurred
+ *
+ *	Handles SCSI-layer-thrown error events.
+ *
+ *	LOCKING:
+ *	Inherited from SCSI layer (none, can sleep)
+ *
+ *	RETURNS:
+ *	Zero.
+ */
+int ata_scsi_error(struct Scsi_Host *host)
+{
+	struct ata_port *ap = (struct ata_port *)&host->hostdata[0];
+
+	DPRINTK("ENTER\n");
+
+	/* synchronize with IRQ handler and port task */
+	spin_unlock_wait(&ap->host_set->lock);
+	ata_port_flush_task(ap);
+
+	WARN_ON(ata_qc_from_tag(ap, ap->active_tag) == NULL);
+
+	ap->ops->eng_timeout(ap);
+
+	WARN_ON(host->host_failed || !list_empty(&host->eh_cmd_q));
+
+	scsi_eh_flush_done_q(&ap->eh_done_q);
+
+	DPRINTK("EXIT\n");
+	return 0;
+}
+
+/**
+ *	ata_qc_timeout - Handle timeout of queued command
+ *	@qc: Command that timed out
+ *
+ *	Some part of the kernel (currently, only the SCSI layer)
+ *	has noticed that the active command on port @ap has not
+ *	completed after a specified length of time.  Handle this
+ *	condition by disabling DMA (if necessary) and completing
+ *	transactions, with error if necessary.
+ *
+ *	This also handles the case of the "lost interrupt", where
+ *	for some reason (possibly hardware bug, possibly driver bug)
+ *	an interrupt was not delivered to the driver, even though the
+ *	transaction completed successfully.
+ *
+ *	LOCKING:
+ *	Inherited from SCSI layer (none, can sleep)
+ */
+static void ata_qc_timeout(struct ata_queued_cmd *qc)
+{
+	struct ata_port *ap = qc->ap;
+	struct ata_host_set *host_set = ap->host_set;
+	u8 host_stat = 0, drv_stat;
+	unsigned long flags;
+
+	DPRINTK("ENTER\n");
+
+	ap->hsm_task_state = HSM_ST_IDLE;
+
+	spin_lock_irqsave(&host_set->lock, flags);
+
+	switch (qc->tf.protocol) {
+
+	case ATA_PROT_DMA:
+	case ATA_PROT_ATAPI_DMA:
+		host_stat = ap->ops->bmdma_status(ap);
+
+		/* before we do anything else, clear DMA-Start bit */
+		ap->ops->bmdma_stop(qc);
+
+		/* fall through */
+
+	default:
+		ata_altstatus(ap);
+		drv_stat = ata_chk_status(ap);
+
+		/* ack bmdma irq events */
+		ap->ops->irq_clear(ap);
+
+		printk(KERN_ERR "ata%u: command 0x%x timeout, stat 0x%x host_stat 0x%x\n",
+		       ap->id, qc->tf.command, drv_stat, host_stat);
+
+		/* complete taskfile transaction */
+		qc->err_mask |= ac_err_mask(drv_stat);
+		break;
+	}
+
+	spin_unlock_irqrestore(&host_set->lock, flags);
+
+	ata_eh_qc_complete(qc);
+
+	DPRINTK("EXIT\n");
+}
+
+/**
+ *	ata_eng_timeout - Handle timeout of queued command
+ *	@ap: Port on which timed-out command is active
+ *
+ *	Some part of the kernel (currently, only the SCSI layer)
+ *	has noticed that the active command on port @ap has not
+ *	completed after a specified length of time.  Handle this
+ *	condition by disabling DMA (if necessary) and completing
+ *	transactions, with error if necessary.
+ *
+ *	This also handles the case of the "lost interrupt", where
+ *	for some reason (possibly hardware bug, possibly driver bug)
+ *	an interrupt was not delivered to the driver, even though the
+ *	transaction completed successfully.
+ *
+ *	LOCKING:
+ *	Inherited from SCSI layer (none, can sleep)
+ */
+void ata_eng_timeout(struct ata_port *ap)
+{
+	DPRINTK("ENTER\n");
+
+	ata_qc_timeout(ata_qc_from_tag(ap, ap->active_tag));
+
+	DPRINTK("EXIT\n");
+}
+
+static void ata_eh_scsidone(struct scsi_cmnd *scmd)
+{
+	/* nada */
+}
+
+static void __ata_eh_qc_complete(struct ata_queued_cmd *qc)
+{
+	struct ata_port *ap = qc->ap;
+	struct scsi_cmnd *scmd = qc->scsicmd;
+	unsigned long flags;
+
+	spin_lock_irqsave(&ap->host_set->lock, flags);
+	qc->scsidone = ata_eh_scsidone;
+	__ata_qc_complete(qc);
+	WARN_ON(ata_tag_valid(qc->tag));
+	spin_unlock_irqrestore(&ap->host_set->lock, flags);
+
+	scsi_eh_finish_cmd(scmd, &ap->eh_done_q);
+}
+
+/**
+ *	ata_eh_qc_complete - Complete an active ATA command from EH
+ *	@qc: Command to complete
+ *
+ *	Indicate to the mid and upper layers that an ATA command has
+ *	completed.  To be used from EH.
+ */
+void ata_eh_qc_complete(struct ata_queued_cmd *qc)
+{
+	struct scsi_cmnd *scmd = qc->scsicmd;
+	scmd->retries = scmd->allowed;
+	__ata_eh_qc_complete(qc);
+}
+
+/**
+ *	ata_eh_qc_retry - Tell midlayer to retry an ATA command after EH
+ *	@qc: Command to retry
+ *
+ *	Indicate to the mid and upper layers that an ATA command
+ *	should be retried.  To be used from EH.
+ *
+ *	SCSI midlayer limits the number of retries to scmd->allowed.
+ *	scmd->retries is decremented for commands which get retried
+ *	due to unrelated failures (qc->err_mask is zero).
+ */
+void ata_eh_qc_retry(struct ata_queued_cmd *qc)
+{
+	struct scsi_cmnd *scmd = qc->scsicmd;
+	if (!qc->err_mask && scmd->retries)
+		scmd->retries--;
+	__ata_eh_qc_complete(qc);
+}
diff --git a/drivers/scsi/libata-scsi.c b/drivers/scsi/libata-scsi.c
index 44008150d310..745fc263feeb 100644
--- a/drivers/scsi/libata-scsi.c
+++ b/drivers/scsi/libata-scsi.c
@@ -53,7 +53,6 @@
 typedef unsigned int (*ata_xlat_func_t)(struct ata_queued_cmd *qc, const u8 *scsicmd);
 static struct ata_device *
 ata_scsi_find_dev(struct ata_port *ap, const struct scsi_device *scsidev);
-enum scsi_eh_timer_return ata_scsi_timed_out(struct scsi_cmnd *cmd);
 
 #define RW_RECOVERY_MPAGE 0x1
 #define RW_RECOVERY_MPAGE_LEN 12
@@ -713,135 +712,6 @@ int ata_scsi_slave_config(struct scsi_device *sdev)
 	return 0;	/* scsi layer doesn't check return value, sigh */
 }
 
-/**
- *	ata_scsi_timed_out - SCSI layer time out callback
- *	@cmd: timed out SCSI command
- *
- *	Handles SCSI layer timeout.  We race with normal completion of
- *	the qc for @cmd.  If the qc is already gone, we lose and let
- *	the scsi command finish (EH_HANDLED).  Otherwise, the qc has
- *	timed out and EH should be invoked.  Prevent ata_qc_complete()
- *	from finishing it by setting EH_SCHEDULED and return
- *	EH_NOT_HANDLED.
- *
- *	LOCKING:
- *	Called from timer context
- *
- *	RETURNS:
- *	EH_HANDLED or EH_NOT_HANDLED
- */
-enum scsi_eh_timer_return ata_scsi_timed_out(struct scsi_cmnd *cmd)
-{
-	struct Scsi_Host *host = cmd->device->host;
-	struct ata_port *ap = (struct ata_port *) &host->hostdata[0];
-	unsigned long flags;
-	struct ata_queued_cmd *qc;
-	enum scsi_eh_timer_return ret = EH_HANDLED;
-
-	DPRINTK("ENTER\n");
-
-	spin_lock_irqsave(&ap->host_set->lock, flags);
-	qc = ata_qc_from_tag(ap, ap->active_tag);
-	if (qc) {
-		WARN_ON(qc->scsicmd != cmd);
-		qc->flags |= ATA_QCFLAG_EH_SCHEDULED;
-		qc->err_mask |= AC_ERR_TIMEOUT;
-		ret = EH_NOT_HANDLED;
-	}
-	spin_unlock_irqrestore(&ap->host_set->lock, flags);
-
-	DPRINTK("EXIT, ret=%d\n", ret);
-	return ret;
-}
-
-/**
- *	ata_scsi_error - SCSI layer error handler callback
- *	@host: SCSI host on which error occurred
- *
- *	Handles SCSI-layer-thrown error events.
- *
- *	LOCKING:
- *	Inherited from SCSI layer (none, can sleep)
- *
- *	RETURNS:
- *	Zero.
- */
-
-int ata_scsi_error(struct Scsi_Host *host)
-{
-	struct ata_port *ap = (struct ata_port *)&host->hostdata[0];
-
-	DPRINTK("ENTER\n");
-
-	/* synchronize with IRQ handler and port task */
-	spin_unlock_wait(&ap->host_set->lock);
-	ata_port_flush_task(ap);
-
-	WARN_ON(ata_qc_from_tag(ap, ap->active_tag) == NULL);
-
-	ap->ops->eng_timeout(ap);
-
-	WARN_ON(host->host_failed || !list_empty(&host->eh_cmd_q));
-
-	scsi_eh_flush_done_q(&ap->eh_done_q);
-
-	DPRINTK("EXIT\n");
-	return 0;
-}
-
-static void ata_eh_scsidone(struct scsi_cmnd *scmd)
-{
-	/* nada */
-}
-
-static void __ata_eh_qc_complete(struct ata_queued_cmd *qc)
-{
-	struct ata_port *ap = qc->ap;
-	struct scsi_cmnd *scmd = qc->scsicmd;
-	unsigned long flags;
-
-	spin_lock_irqsave(&ap->host_set->lock, flags);
-	qc->scsidone = ata_eh_scsidone;
-	__ata_qc_complete(qc);
-	WARN_ON(ata_tag_valid(qc->tag));
-	spin_unlock_irqrestore(&ap->host_set->lock, flags);
-
-	scsi_eh_finish_cmd(scmd, &ap->eh_done_q);
-}
-
-/**
- *	ata_eh_qc_complete - Complete an active ATA command from EH
- *	@qc: Command to complete
- *
- *	Indicate to the mid and upper layers that an ATA command has
- *	completed.  To be used from EH.
- */
-void ata_eh_qc_complete(struct ata_queued_cmd *qc)
-{
-	struct scsi_cmnd *scmd = qc->scsicmd;
-	scmd->retries = scmd->allowed;
-	__ata_eh_qc_complete(qc);
-}
-
-/**
- *	ata_eh_qc_retry - Tell midlayer to retry an ATA command after EH
- *	@qc: Command to retry
- *
- *	Indicate to the mid and upper layers that an ATA command
- *	should be retried.  To be used from EH.
- *
- *	SCSI midlayer limits the number of retries to scmd->allowed.
- *	scmd->retries is decremented for commands which get retried
- *	due to unrelated failures (qc->err_mask is zero).
- */
-void ata_eh_qc_retry(struct ata_queued_cmd *qc)
-{
-	struct scsi_cmnd *scmd = qc->scsicmd;
-	if (!qc->err_mask && scmd->retries)
-		scmd->retries--;
-	__ata_eh_qc_complete(qc);
-}
-
 /**
  *	ata_scsi_start_stop_xlat - Translate SCSI START STOP UNIT command
  *	@qc: Storage for translated ATA taskfile
diff --git a/drivers/scsi/libata.h b/drivers/scsi/libata.h
index 1c755b14521a..75e9bd5833da 100644
--- a/drivers/scsi/libata.h
+++ b/drivers/scsi/libata.h
@@ -60,7 +60,6 @@ extern int ata_cmd_ioctl(struct scsi_device *scsidev, void __user *arg);
 extern struct scsi_transport_template ata_scsi_transport_template;
 
 extern void ata_scsi_scan_host(struct ata_port *ap);
-extern int ata_scsi_error(struct Scsi_Host *host);
 extern unsigned int ata_scsiop_inq_std(struct ata_scsi_args *args, u8 *rbuf,
 			       unsigned int buflen);
 
@@ -90,4 +89,7 @@ extern void ata_scsi_rbuf_fill(struct ata_scsi_args *args,
                         unsigned int (*actor) (struct ata_scsi_args *args,
                                            u8 *rbuf, unsigned int buflen));
 
+/* libata-eh.c */
+extern enum scsi_eh_timer_return ata_scsi_timed_out(struct scsi_cmnd *cmd);
+
 #endif /* __LIBATA_H__ */
diff --git a/include/linux/libata.h b/include/linux/libata.h
index a6d829cb0567..75bdee09c307 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -531,9 +531,6 @@ extern void ata_host_set_remove(struct ata_host_set *host_set);
 extern int ata_scsi_detect(struct scsi_host_template *sht);
 extern int ata_scsi_ioctl(struct scsi_device *dev, int cmd, void __user *arg);
 extern int ata_scsi_queuecmd(struct scsi_cmnd *cmd, void (*done)(struct scsi_cmnd *));
-extern int ata_scsi_error(struct Scsi_Host *host);
-extern void ata_eh_qc_complete(struct ata_queued_cmd *qc);
-extern void ata_eh_qc_retry(struct ata_queued_cmd *qc);
 extern int ata_scsi_release(struct Scsi_Host *host);
 extern unsigned int ata_host_intr(struct ata_port *ap, struct ata_queued_cmd *qc);
 extern int ata_scsi_device_resume(struct scsi_device *);
@@ -582,7 +579,6 @@ extern void ata_bmdma_stop(struct ata_queued_cmd *qc);
 extern u8   ata_bmdma_status(struct ata_port *ap);
 extern void ata_bmdma_irq_clear(struct ata_port *ap);
 extern void __ata_qc_complete(struct ata_queued_cmd *qc);
-extern void ata_eng_timeout(struct ata_port *ap);
 extern void ata_scsi_simulate(struct ata_port *ap, struct ata_device *dev,
 			      struct scsi_cmnd *cmd,
 			      void (*done)(struct scsi_cmnd *));
@@ -637,6 +633,14 @@ extern int pci_test_config_bits(struct pci_dev *pdev, const struct pci_bits *bit
 extern unsigned long ata_pci_default_filter(const struct ata_port *, struct ata_device *, unsigned long);
 #endif /* CONFIG_PCI */
 
+/*
+ * EH
+ */
+extern int ata_scsi_error(struct Scsi_Host *host);
+extern void ata_eng_timeout(struct ata_port *ap);
+extern void ata_eh_qc_complete(struct ata_queued_cmd *qc);
+extern void ata_eh_qc_retry(struct ata_queued_cmd *qc);
+
 
 static inline int
 ata_sg_is_last(struct scatterlist *sg, struct ata_queued_cmd *qc)
-- 
cgit v1.2.3


From 95de719adc94392a95c3c4d0a2d6b8b1ea39d236 Mon Sep 17 00:00:00 2001
From: Albert Lee <albertcc@tw.ibm.com>
Date: Tue, 4 Apr 2006 10:57:18 +0800
Subject: [PATCH] libata: convert ATAPI_ENABLE_DMADIR to module parameter

Convert the ATAPI_ENABLE_DMADIR compile time option needed
by some SATA-PATA bridge to runtime module parameter.

Signed-off-by: Albert Lee <albertcc@tw.ibm.com>
Signed-off-by: Jeff Garzik <jeff@garzik.org>
---
 drivers/scsi/libata-core.c | 4 ++++
 drivers/scsi/libata-scsi.c | 6 ++----
 drivers/scsi/libata.h      | 1 +
 include/linux/libata.h     | 1 -
 4 files changed, 7 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c
index 966be3034bf0..3387fe35c54f 100644
--- a/drivers/scsi/libata-core.c
+++ b/drivers/scsi/libata-core.c
@@ -76,6 +76,10 @@ int atapi_enabled = 1;
 module_param(atapi_enabled, int, 0444);
 MODULE_PARM_DESC(atapi_enabled, "Enable discovery of ATAPI devices (0=off, 1=on)");
 
+int atapi_dmadir = 0;
+module_param(atapi_dmadir, int, 0444);
+MODULE_PARM_DESC(atapi_dmadir, "Enable ATAPI DMADIR bridge support (0=off, 1=on)");
+
 int libata_fua = 0;
 module_param_named(fua, libata_fua, int, 0444);
 MODULE_PARM_DESC(fua, "FUA support (0=off, 1=on)");
diff --git a/drivers/scsi/libata-scsi.c b/drivers/scsi/libata-scsi.c
index 745fc263feeb..c9c001490293 100644
--- a/drivers/scsi/libata-scsi.c
+++ b/drivers/scsi/libata-scsi.c
@@ -2163,11 +2163,9 @@ static unsigned int atapi_xlat(struct ata_queued_cmd *qc, const u8 *scsicmd)
 		qc->tf.protocol = ATA_PROT_ATAPI_DMA;
 		qc->tf.feature |= ATAPI_PKT_DMA;
 
-#ifdef ATAPI_ENABLE_DMADIR
-		/* some SATA bridges need us to indicate data xfer direction */
-		if (cmd->sc_data_direction != DMA_TO_DEVICE)
+		if (atapi_dmadir && (cmd->sc_data_direction != DMA_TO_DEVICE))
+			/* some SATA bridges need us to indicate data xfer direction */
 			qc->tf.feature |= ATAPI_DMADIR;
-#endif
 	}
 
 	qc->nbytes = cmd->bufflen;
diff --git a/drivers/scsi/libata.h b/drivers/scsi/libata.h
index e3bd1ddb5387..652c08e3808e 100644
--- a/drivers/scsi/libata.h
+++ b/drivers/scsi/libata.h
@@ -41,6 +41,7 @@ struct ata_scsi_args {
 
 /* libata-core.c */
 extern int atapi_enabled;
+extern int atapi_dmadir;
 extern int libata_fua;
 extern struct ata_queued_cmd *ata_qc_new_init(struct ata_port *ap,
 				      struct ata_device *dev);
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 75bdee09c307..03231cb6b406 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -44,7 +44,6 @@
 #undef ATA_NDEBUG		/* define to disable quick runtime checks */
 #undef ATA_ENABLE_PATA		/* define to enable PATA support in some
 				 * low-level drivers */
-#undef ATAPI_ENABLE_DMADIR	/* enables ATAPI DMADIR bridge support */
 
 
 /* note: prints function name for you */
-- 
cgit v1.2.3


From 381544bba3ae6f2f1004b267da34f840b469033c Mon Sep 17 00:00:00 2001
From: Jeff Garzik <jeff@garzik.org>
Date: Tue, 11 Apr 2006 13:04:39 -0400
Subject: libata: Fix EH merge difference between this branch and upstream.

---
 drivers/scsi/libata-eh.c | 3 +--
 drivers/scsi/libata.h    | 1 +
 include/linux/libata.h   | 1 -
 3 files changed, 2 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/scsi/libata-eh.c b/drivers/scsi/libata-eh.c
index e73f5612aea8..b518654b9d60 100644
--- a/drivers/scsi/libata-eh.c
+++ b/drivers/scsi/libata-eh.c
@@ -97,7 +97,7 @@ enum scsi_eh_timer_return ata_scsi_timed_out(struct scsi_cmnd *cmd)
  *	RETURNS:
  *	Zero.
  */
-int ata_scsi_error(struct Scsi_Host *host)
+void ata_scsi_error(struct Scsi_Host *host)
 {
 	struct ata_port *ap = (struct ata_port *)&host->hostdata[0];
 
@@ -116,7 +116,6 @@ int ata_scsi_error(struct Scsi_Host *host)
 	scsi_eh_flush_done_q(&ap->eh_done_q);
 
 	DPRINTK("EXIT\n");
-	return 0;
 }
 
 /**
diff --git a/drivers/scsi/libata.h b/drivers/scsi/libata.h
index 652c08e3808e..b18d377588dc 100644
--- a/drivers/scsi/libata.h
+++ b/drivers/scsi/libata.h
@@ -105,5 +105,6 @@ extern void ata_scsi_rbuf_fill(struct ata_scsi_args *args,
 
 /* libata-eh.c */
 extern enum scsi_eh_timer_return ata_scsi_timed_out(struct scsi_cmnd *cmd);
+extern void ata_scsi_error(struct Scsi_Host *host);
 
 #endif /* __LIBATA_H__ */
diff --git a/include/linux/libata.h b/include/linux/libata.h
index a7161d42d18f..2564bc514bca 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -637,7 +637,6 @@ extern unsigned long ata_pci_default_filter(const struct ata_port *, struct ata_
 /*
  * EH
  */
-extern int ata_scsi_error(struct Scsi_Host *host);
 extern void ata_eng_timeout(struct ata_port *ap);
 extern void ata_eh_qc_complete(struct ata_queued_cmd *qc);
 extern void ata_eh_qc_retry(struct ata_queued_cmd *qc);
-- 
cgit v1.2.3


From 35bb94b116e1fd4959ef0d3187458b5820eac8c4 Mon Sep 17 00:00:00 2001
From: Jeff Garzik <jeff@garzik.org>
Date: Tue, 11 Apr 2006 13:12:34 -0400
Subject: libata: Add helper ata_shost_to_port()

---
 drivers/scsi/libata-core.c | 4 ++--
 drivers/scsi/libata-eh.c   | 4 ++--
 drivers/scsi/libata-scsi.c | 8 ++++----
 drivers/scsi/sata_svw.c    | 2 +-
 include/linux/libata.h     | 6 ++++++
 5 files changed, 15 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c
index 3387fe35c54f..6420062f2c1e 100644
--- a/drivers/scsi/libata-core.c
+++ b/drivers/scsi/libata-core.c
@@ -4659,7 +4659,7 @@ static struct ata_port * ata_host_add(const struct ata_probe_ent *ent,
 
 	host->transportt = &ata_scsi_transport_template;
 
-	ap = (struct ata_port *) &host->hostdata[0];
+	ap = ata_shost_to_port(host);
 
 	ata_host_init(ap, host, host_set, ent, port_no);
 
@@ -4872,7 +4872,7 @@ void ata_host_set_remove(struct ata_host_set *host_set)
 
 int ata_scsi_release(struct Scsi_Host *host)
 {
-	struct ata_port *ap = (struct ata_port *) &host->hostdata[0];
+	struct ata_port *ap = ata_shost_to_port(host);
 	int i;
 
 	DPRINTK("ENTER\n");
diff --git a/drivers/scsi/libata-eh.c b/drivers/scsi/libata-eh.c
index b518654b9d60..c31b13fd5307 100644
--- a/drivers/scsi/libata-eh.c
+++ b/drivers/scsi/libata-eh.c
@@ -64,7 +64,7 @@
 enum scsi_eh_timer_return ata_scsi_timed_out(struct scsi_cmnd *cmd)
 {
 	struct Scsi_Host *host = cmd->device->host;
-	struct ata_port *ap = (struct ata_port *) &host->hostdata[0];
+	struct ata_port *ap = ata_shost_to_port(host);
 	unsigned long flags;
 	struct ata_queued_cmd *qc;
 	enum scsi_eh_timer_return ret = EH_HANDLED;
@@ -99,7 +99,7 @@ enum scsi_eh_timer_return ata_scsi_timed_out(struct scsi_cmnd *cmd)
  */
 void ata_scsi_error(struct Scsi_Host *host)
 {
-	struct ata_port *ap = (struct ata_port *)&host->hostdata[0];
+	struct ata_port *ap = ata_shost_to_port(host);
 
 	DPRINTK("ENTER\n");
 
diff --git a/drivers/scsi/libata-scsi.c b/drivers/scsi/libata-scsi.c
index 5ae7613bc157..9871f8272df0 100644
--- a/drivers/scsi/libata-scsi.c
+++ b/drivers/scsi/libata-scsi.c
@@ -395,7 +395,7 @@ void ata_dump_status(unsigned id, struct ata_taskfile *tf)
 
 int ata_scsi_device_resume(struct scsi_device *sdev)
 {
-	struct ata_port *ap = (struct ata_port *) &sdev->host->hostdata[0];
+	struct ata_port *ap = ata_shost_to_port(sdev->host);
 	struct ata_device *dev = &ap->device[sdev->id];
 
 	return ata_device_resume(ap, dev);
@@ -403,7 +403,7 @@ int ata_scsi_device_resume(struct scsi_device *sdev)
 
 int ata_scsi_device_suspend(struct scsi_device *sdev, pm_message_t state)
 {
-	struct ata_port *ap = (struct ata_port *) &sdev->host->hostdata[0];
+	struct ata_port *ap = ata_shost_to_port(sdev->host);
 	struct ata_device *dev = &ap->device[sdev->id];
 
 	return ata_device_suspend(ap, dev, state);
@@ -704,7 +704,7 @@ int ata_scsi_slave_config(struct scsi_device *sdev)
 		struct ata_port *ap;
 		struct ata_device *dev;
 
-		ap = (struct ata_port *) &sdev->host->hostdata[0];
+		ap = ata_shost_to_port(sdev->host);
 		dev = &ap->device[sdev->id];
 
 		ata_scsi_dev_config(sdev, dev);
@@ -2478,7 +2478,7 @@ int ata_scsi_queuecmd(struct scsi_cmnd *cmd, void (*done)(struct scsi_cmnd *))
 	struct scsi_device *scsidev = cmd->device;
 	struct Scsi_Host *shost = scsidev->host;
 
-	ap = (struct ata_port *) &shost->hostdata[0];
+	ap = ata_shost_to_port(shost);
 
 	spin_unlock(shost->host_lock);
 	spin_lock(&ap->host_set->lock);
diff --git a/drivers/scsi/sata_svw.c b/drivers/scsi/sata_svw.c
index e15c693e0539..d5eb5375e265 100644
--- a/drivers/scsi/sata_svw.c
+++ b/drivers/scsi/sata_svw.c
@@ -257,7 +257,7 @@ static int k2_sata_proc_info(struct Scsi_Host *shost, char *page, char **start,
 	int len, index;
 
 	/* Find  the ata_port */
-	ap = (struct ata_port *) &shost->hostdata[0];
+	ap = ata_shost_to_port(shost);
 	if (ap == NULL)
 		return 0;
 
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 2564bc514bca..fe0a1dcc76c2 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -33,6 +33,7 @@
 #include <asm/io.h>
 #include <linux/ata.h>
 #include <linux/workqueue.h>
+#include <scsi/scsi_host.h>
 
 /*
  * compile-time options: to be removed as soon as all the drivers are
@@ -977,4 +978,9 @@ static inline void ata_pad_free(struct ata_port *ap, struct device *dev)
 	dma_free_coherent(dev, ATA_DMA_PAD_BUF_SZ, ap->pad, ap->pad_dma);
 }
 
+static inline struct ata_port *ata_shost_to_port(struct Scsi_Host *host)
+{
+	return (struct ata_port *) &host->hostdata[0];
+}
+
 #endif /* __LINUX_LIBATA_H__ */
-- 
cgit v1.2.3


From 2bf2cb26b2512c6a609bb152982c388329bedff6 Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Tue, 11 Apr 2006 22:16:45 +0900
Subject: [PATCH] libata: kill @verbose from ata_reset_fn_t

@verbose was added to ata_reset_fn_t because AHCI complained during
probing if no device was attached to the port.  However, muting
failure message isn't the correct approach.  Reset methods are
responsible for detecting no device condition and finishing
successfully.  Now that AHCI softreset is fixed, kill @verbose.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Signed-off-by: Jeff Garzik <jeff@garzik.org>
---
 drivers/scsi/ahci.c        | 13 +++++--------
 drivers/scsi/libata-core.c | 36 ++++++++++++------------------------
 drivers/scsi/libata.h      |  6 ++----
 drivers/scsi/sata_sil24.c  |  8 +++-----
 include/linux/libata.h     |  8 +++-----
 5 files changed, 25 insertions(+), 46 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/scsi/ahci.c b/drivers/scsi/ahci.c
index 20fc0846e0be..0e7fb9bf2cd1 100644
--- a/drivers/scsi/ahci.c
+++ b/drivers/scsi/ahci.c
@@ -534,7 +534,7 @@ static int ahci_poll_register(void __iomem *reg, u32 mask, u32 val,
 	return -1;
 }
 
-static int ahci_softreset(struct ata_port *ap, int verbose, unsigned int *class)
+static int ahci_softreset(struct ata_port *ap, unsigned int *class)
 {
 	struct ahci_host_priv *hpriv = ap->host_set->private_data;
 	struct ahci_port_priv *pp = ap->private_data;
@@ -646,22 +646,19 @@ static int ahci_softreset(struct ata_port *ap, int verbose, unsigned int *class)
  fail_restart:
 	ahci_start_engine(ap);
  fail:
-	if (verbose)
-		printk(KERN_ERR "ata%u: softreset failed (%s)\n",
-		       ap->id, reason);
-	else
-		DPRINTK("EXIT, rc=%d reason=\"%s\"\n", rc, reason);
+	printk(KERN_ERR "ata%u: softreset failed (%s)\n",
+	       ap->id, reason);
 	return rc;
 }
 
-static int ahci_hardreset(struct ata_port *ap, int verbose, unsigned int *class)
+static int ahci_hardreset(struct ata_port *ap, unsigned int *class)
 {
 	int rc;
 
 	DPRINTK("ENTER\n");
 
 	ahci_stop_engine(ap);
-	rc = sata_std_hardreset(ap, verbose, class);
+	rc = sata_std_hardreset(ap, class);
 	ahci_start_engine(ap);
 
 	if (rc == 0)
diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c
index f37179623efa..92b5077ac052 100644
--- a/drivers/scsi/libata-core.c
+++ b/drivers/scsi/libata-core.c
@@ -2387,7 +2387,6 @@ void ata_std_probeinit(struct ata_port *ap)
 /**
  *	ata_std_softreset - reset host port via ATA SRST
  *	@ap: port to reset
- *	@verbose: fail verbosely
  *	@classes: resulting classes of attached devices
  *
  *	Reset host port using ATA SRST.  This function is to be used
@@ -2399,7 +2398,7 @@ void ata_std_probeinit(struct ata_port *ap)
  *	RETURNS:
  *	0 on success, -errno otherwise.
  */
-int ata_std_softreset(struct ata_port *ap, int verbose, unsigned int *classes)
+int ata_std_softreset(struct ata_port *ap, unsigned int *classes)
 {
 	unsigned int slave_possible = ap->flags & ATA_FLAG_SLAVE_POSS;
 	unsigned int devmask = 0, err_mask;
@@ -2425,12 +2424,8 @@ int ata_std_softreset(struct ata_port *ap, int verbose, unsigned int *classes)
 	DPRINTK("about to softreset, devmask=%x\n", devmask);
 	err_mask = ata_bus_softreset(ap, devmask);
 	if (err_mask) {
-		if (verbose)
-			printk(KERN_ERR "ata%u: SRST failed (err_mask=0x%x)\n",
-			       ap->id, err_mask);
-		else
-			DPRINTK("EXIT, softreset failed (err_mask=0x%x)\n",
-				err_mask);
+		printk(KERN_ERR "ata%u: SRST failed (err_mask=0x%x)\n",
+		       ap->id, err_mask);
 		return -EIO;
 	}
 
@@ -2447,7 +2442,6 @@ int ata_std_softreset(struct ata_port *ap, int verbose, unsigned int *classes)
 /**
  *	sata_std_hardreset - reset host port via SATA phy reset
  *	@ap: port to reset
- *	@verbose: fail verbosely
  *	@class: resulting class of attached device
  *
  *	SATA phy-reset host port using DET bits of SControl register.
@@ -2460,7 +2454,7 @@ int ata_std_softreset(struct ata_port *ap, int verbose, unsigned int *classes)
  *	RETURNS:
  *	0 on success, -errno otherwise.
  */
-int sata_std_hardreset(struct ata_port *ap, int verbose, unsigned int *class)
+int sata_std_hardreset(struct ata_port *ap, unsigned int *class)
 {
 	u32 scontrol;
 
@@ -2500,11 +2494,8 @@ int sata_std_hardreset(struct ata_port *ap, int verbose, unsigned int *class)
 	}
 
 	if (ata_busy_sleep(ap, ATA_TMOUT_BOOT_QUICK, ATA_TMOUT_BOOT)) {
-		if (verbose)
-			printk(KERN_ERR "ata%u: COMRESET failed "
-			       "(device not ready)\n", ap->id);
-		else
-			DPRINTK("EXIT, device not ready\n");
+		printk(KERN_ERR "ata%u: COMRESET failed "
+		       "(device not ready)\n", ap->id);
 		return -EIO;
 	}
 
@@ -2592,16 +2583,15 @@ int ata_std_probe_reset(struct ata_port *ap, unsigned int *classes)
 				     ata_std_postreset, classes);
 }
 
-int ata_do_reset(struct ata_port *ap,
-		 ata_reset_fn_t reset, ata_postreset_fn_t postreset,
-		 int verbose, unsigned int *classes)
+int ata_do_reset(struct ata_port *ap, ata_reset_fn_t reset,
+		 ata_postreset_fn_t postreset, unsigned int *classes)
 {
 	int i, rc;
 
 	for (i = 0; i < ATA_MAX_DEVICES; i++)
 		classes[i] = ATA_DEV_UNKNOWN;
 
-	rc = reset(ap, verbose, classes);
+	rc = reset(ap, classes);
 	if (rc)
 		return rc;
 
@@ -2645,8 +2635,6 @@ int ata_do_reset(struct ata_port *ap,
  *	- If classification is supported, fill classes[] with
  *	  recognized class codes.
  *	- If classification is not supported, leave classes[] alone.
- *	- If verbose is non-zero, print error message on failure;
- *	  otherwise, shut up.
  *
  *	LOCKING:
  *	Kernel thread context (may sleep)
@@ -2666,7 +2654,7 @@ int ata_drive_probe_reset(struct ata_port *ap, ata_probeinit_fn_t probeinit,
 		probeinit(ap);
 
 	if (softreset && !ata_set_sata_spd_needed(ap)) {
-		rc = ata_do_reset(ap, softreset, postreset, 0, classes);
+		rc = ata_do_reset(ap, softreset, postreset, classes);
 		if (rc == 0 && classes[0] != ATA_DEV_UNKNOWN)
 			goto done;
 		printk(KERN_INFO "ata%u: softreset failed, will try "
@@ -2678,7 +2666,7 @@ int ata_drive_probe_reset(struct ata_port *ap, ata_probeinit_fn_t probeinit,
 		goto done;
 
 	while (1) {
-		rc = ata_do_reset(ap, hardreset, postreset, 0, classes);
+		rc = ata_do_reset(ap, hardreset, postreset, classes);
 		if (rc == 0) {
 			if (classes[0] != ATA_DEV_UNKNOWN)
 				goto done;
@@ -2699,7 +2687,7 @@ int ata_drive_probe_reset(struct ata_port *ap, ata_probeinit_fn_t probeinit,
 		       ap->id);
 		ssleep(5);
 
-		rc = ata_do_reset(ap, softreset, postreset, 0, classes);
+		rc = ata_do_reset(ap, softreset, postreset, classes);
 	}
 
  done:
diff --git a/drivers/scsi/libata.h b/drivers/scsi/libata.h
index b18d377588dc..3f8b0a863781 100644
--- a/drivers/scsi/libata.h
+++ b/drivers/scsi/libata.h
@@ -56,10 +56,8 @@ extern int ata_set_sata_spd_needed(struct ata_port *ap);
 extern int ata_down_xfermask_limit(struct ata_port *ap, struct ata_device *dev,
 				   int force_pio0);
 extern int ata_set_mode(struct ata_port *ap, struct ata_device **r_failed_dev);
-extern int ata_do_reset(struct ata_port *ap,
-			ata_reset_fn_t reset,
-			ata_postreset_fn_t postreset,
-			int verbose, unsigned int *classes);
+extern int ata_do_reset(struct ata_port *ap, ata_reset_fn_t reset,
+			ata_postreset_fn_t postreset, unsigned int *classes);
 extern void ata_qc_free(struct ata_queued_cmd *qc);
 extern void ata_qc_issue(struct ata_queued_cmd *qc);
 extern int ata_check_atapi_dma(struct ata_queued_cmd *qc);
diff --git a/drivers/scsi/sata_sil24.c b/drivers/scsi/sata_sil24.c
index 9aa7493ea146..26d84e094b1d 100644
--- a/drivers/scsi/sata_sil24.c
+++ b/drivers/scsi/sata_sil24.c
@@ -426,8 +426,7 @@ static void sil24_tf_read(struct ata_port *ap, struct ata_taskfile *tf)
 	*tf = pp->tf;
 }
 
-static int sil24_softreset(struct ata_port *ap, int verbose,
-			   unsigned int *class)
+static int sil24_softreset(struct ata_port *ap, unsigned int *class)
 {
 	void __iomem *port = (void __iomem *)ap->ioaddr.cmd_addr;
 	struct sil24_port_priv *pp = ap->private_data;
@@ -489,13 +488,12 @@ static int sil24_softreset(struct ata_port *ap, int verbose,
 	return 0;
 }
 
-static int sil24_hardreset(struct ata_port *ap, int verbose,
-			   unsigned int *class)
+static int sil24_hardreset(struct ata_port *ap, unsigned int *class)
 {
 	unsigned int dummy_class;
 
 	/* sil24 doesn't report device signature after hard reset */
-	return sata_std_hardreset(ap, verbose, &dummy_class);
+	return sata_std_hardreset(ap, &dummy_class);
 }
 
 static int sil24_probe_reset(struct ata_port *ap, unsigned int *classes)
diff --git a/include/linux/libata.h b/include/linux/libata.h
index fe0a1dcc76c2..d5fd5c06e755 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -252,7 +252,7 @@ struct ata_queued_cmd;
 /* typedefs */
 typedef void (*ata_qc_cb_t) (struct ata_queued_cmd *qc);
 typedef void (*ata_probeinit_fn_t)(struct ata_port *);
-typedef int (*ata_reset_fn_t)(struct ata_port *, int, unsigned int *);
+typedef int (*ata_reset_fn_t)(struct ata_port *, unsigned int *);
 typedef void (*ata_postreset_fn_t)(struct ata_port *ap, unsigned int *);
 
 struct ata_ioports {
@@ -509,10 +509,8 @@ extern int ata_drive_probe_reset(struct ata_port *ap,
 			ata_reset_fn_t softreset, ata_reset_fn_t hardreset,
 			ata_postreset_fn_t postreset, unsigned int *classes);
 extern void ata_std_probeinit(struct ata_port *ap);
-extern int ata_std_softreset(struct ata_port *ap, int verbose,
-			     unsigned int *classes);
-extern int sata_std_hardreset(struct ata_port *ap, int verbose,
-			      unsigned int *class);
+extern int ata_std_softreset(struct ata_port *ap, unsigned int *classes);
+extern int sata_std_hardreset(struct ata_port *ap, unsigned int *class);
 extern void ata_std_postreset(struct ata_port *ap, unsigned int *classes);
 extern int ata_dev_revalidate(struct ata_port *ap, struct ata_device *dev,
 			      int post_reset);
-- 
cgit v1.2.3


From c22daff41001e9ccead87179ac0547f85447139e Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Tue, 11 Apr 2006 22:22:29 +0900
Subject: [PATCH] libata: implement ata_wait_register()

As waiting for some register bits to change seems to be a common
operation shared by some controllers, implement helper function
ata_wait_register().  This function also takes care of register write
flushing.

Note that the condition is inverted, the wait is over when the masked
value does NOT match @val.  As we're waiting for bits to change, this
test is more powerful and allows the function to be used in more
places.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Signed-off-by: Jeff Garzik <jeff@garzik.org>
---
 drivers/scsi/libata-core.c | 47 ++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/libata.h     |  3 +++
 2 files changed, 50 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c
index 2d76ce23728f..0075fe7404d5 100644
--- a/drivers/scsi/libata-core.c
+++ b/drivers/scsi/libata-core.c
@@ -5029,6 +5029,52 @@ int ata_ratelimit(void)
 	return rc;
 }
 
+/**
+ *	ata_wait_register - wait until register value changes
+ *	@reg: IO-mapped register
+ *	@mask: Mask to apply to read register value
+ *	@val: Wait condition
+ *	@interval_msec: polling interval in milliseconds
+ *	@timeout_msec: timeout in milliseconds
+ *
+ *	Waiting for some bits of register to change is a common
+ *	operation for ATA controllers.  This function reads 32bit LE
+ *	IO-mapped register @reg and tests for the following condition.
+ *
+ *	(*@reg & mask) != val
+ *
+ *	If the condition is met, it returns; otherwise, the process is
+ *	repeated after @interval_msec until timeout.
+ *
+ *	LOCKING:
+ *	Kernel thread context (may sleep)
+ *
+ *	RETURNS:
+ *	The final register value.
+ */
+u32 ata_wait_register(void __iomem *reg, u32 mask, u32 val,
+		      unsigned long interval_msec,
+		      unsigned long timeout_msec)
+{
+	unsigned long timeout;
+	u32 tmp;
+
+	tmp = ioread32(reg);
+
+	/* Calculate timeout _after_ the first read to make sure
+	 * preceding writes reach the controller before starting to
+	 * eat away the timeout.
+	 */
+	timeout = jiffies + (timeout_msec * HZ) / 1000;
+
+	while ((tmp & mask) == val && time_before(jiffies, timeout)) {
+		msleep(interval_msec);
+		tmp = ioread32(reg);
+	}
+
+	return tmp;
+}
+
 /*
  * libata is essentially a library of internal helper functions for
  * low-level ATA host controller drivers.  As such, the API/ABI is
@@ -5079,6 +5125,7 @@ EXPORT_SYMBOL_GPL(ata_dev_classify);
 EXPORT_SYMBOL_GPL(ata_dev_pair);
 EXPORT_SYMBOL_GPL(ata_port_disable);
 EXPORT_SYMBOL_GPL(ata_ratelimit);
+EXPORT_SYMBOL_GPL(ata_wait_register);
 EXPORT_SYMBOL_GPL(ata_busy_sleep);
 EXPORT_SYMBOL_GPL(ata_port_queue_task);
 EXPORT_SYMBOL_GPL(ata_scsi_ioctl);
diff --git a/include/linux/libata.h b/include/linux/libata.h
index d5fd5c06e755..dd5bcb5d29b5 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -543,6 +543,9 @@ extern unsigned int ata_busy_sleep(struct ata_port *ap,
 				   unsigned long timeout);
 extern void ata_port_queue_task(struct ata_port *ap, void (*fn)(void *),
 				void *data, unsigned long delay);
+extern u32 ata_wait_register(void __iomem *reg, u32 mask, u32 val,
+			     unsigned long interval_msec,
+			     unsigned long timeout_msec);
 
 /*
  * Default driver ops implementations
-- 
cgit v1.2.3


From 499a86af41cf5a4bf811726841bbc49c0e96fd35 Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Tue, 11 Apr 2006 22:32:18 +0900
Subject: [PATCH] libata: export ata_set_sata_spd()

This will be used by LLDD hardreset implementation.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Signed-off-by: Jeff Garzik <jeff@garzik.org>
---
 drivers/scsi/libata-core.c | 3 ++-
 include/linux/libata.h     | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c
index 588578dc6caf..509178c3700c 100644
--- a/drivers/scsi/libata-core.c
+++ b/drivers/scsi/libata-core.c
@@ -1738,7 +1738,7 @@ int ata_set_sata_spd_needed(struct ata_port *ap)
  *	0 if spd doesn't need to be changed, 1 if spd has been
  *	changed.  -EOPNOTSUPP if SCR registers are inaccessible.
  */
-static int ata_set_sata_spd(struct ata_port *ap)
+int ata_set_sata_spd(struct ata_port *ap)
 {
 	u32 scontrol;
 
@@ -5113,6 +5113,7 @@ EXPORT_SYMBOL_GPL(ata_bmdma_irq_clear);
 EXPORT_SYMBOL_GPL(ata_bmdma_status);
 EXPORT_SYMBOL_GPL(ata_bmdma_stop);
 EXPORT_SYMBOL_GPL(ata_port_probe);
+EXPORT_SYMBOL_GPL(ata_set_sata_spd);
 EXPORT_SYMBOL_GPL(sata_phy_reset);
 EXPORT_SYMBOL_GPL(__sata_phy_reset);
 EXPORT_SYMBOL_GPL(ata_bus_reset);
diff --git a/include/linux/libata.h b/include/linux/libata.h
index dd5bcb5d29b5..d35b1e3bb7e0 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -504,6 +504,7 @@ extern void ata_port_probe(struct ata_port *);
 extern void __sata_phy_reset(struct ata_port *ap);
 extern void sata_phy_reset(struct ata_port *ap);
 extern void ata_bus_reset(struct ata_port *ap);
+extern int ata_set_sata_spd(struct ata_port *ap);
 extern int ata_drive_probe_reset(struct ata_port *ap,
 			ata_probeinit_fn_t probeinit,
 			ata_reset_fn_t softreset, ata_reset_fn_t hardreset,
-- 
cgit v1.2.3


From 7fe1e133bf45b0fe70491ed3d4c5b491feff7aa8 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Fri, 21 Apr 2006 13:12:44 +0100
Subject: [RBTREE] Add accessor macros for colour and parent fields of rb_node

This is in preparation for merging those fields into a single
'unsigned long', because using a whole machine-word for a single bit
of colour information is wasteful.

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 include/linux/rbtree.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h
index 4b7cc4fe366d..ffee81ce7b6f 100644
--- a/include/linux/rbtree.h
+++ b/include/linux/rbtree.h
@@ -107,6 +107,15 @@ struct rb_node
 	struct rb_node *rb_left;
 };
 
+#define rb_parent(r)		((r)->rb_parent)
+#define rb_set_parent(r,p)	do { (r)->rb_parent = p; } while (0)
+#define rb_colour(r)		((r)->rb_colour)
+#define rb_is_red(r)		((r)->colour == RB_RED)
+#define rb_is_black(r)		((r)->colour == RB_BLACK)
+#define rb_set_red(r)		do { (r)->colour = RB_RED; } while (0)
+#define rb_set_black(r)		do { (r)->colour = RB_BLACK; } while (0)
+#define rb_set_colour(r,c)	do { (r)->colour = (c); } while (0)
+
 struct rb_root
 {
 	struct rb_node *rb_node;
-- 
cgit v1.2.3


From 55a981027fc393c86de2c4e7836c9515088a9a58 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Fri, 21 Apr 2006 13:35:51 +0100
Subject: [RBTREE] Merge colour and parent fields of struct rb_node.

We only used a single bit for colour information, so having a whole
machine word of space allocated for it was a bit wasteful. Instead,
store it in the lowest bit of the 'parent' pointer, since that was
always going to be aligned anyway.

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 include/linux/rbtree.h |  32 +++++----
 lib/rbtree.c           | 178 +++++++++++++++++++++++++------------------------
 2 files changed, 109 insertions(+), 101 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h
index ffee81ce7b6f..748be50329d8 100644
--- a/include/linux/rbtree.h
+++ b/include/linux/rbtree.h
@@ -99,28 +99,35 @@ static inline struct page * rb_insert_page_cache(struct inode * inode,
 
 struct rb_node
 {
-	struct rb_node *rb_parent;
-	int rb_color;
+	unsigned long  rb_parent_colour;
 #define	RB_RED		0
 #define	RB_BLACK	1
 	struct rb_node *rb_right;
 	struct rb_node *rb_left;
 };
 
-#define rb_parent(r)		((r)->rb_parent)
-#define rb_set_parent(r,p)	do { (r)->rb_parent = p; } while (0)
-#define rb_colour(r)		((r)->rb_colour)
-#define rb_is_red(r)		((r)->colour == RB_RED)
-#define rb_is_black(r)		((r)->colour == RB_BLACK)
-#define rb_set_red(r)		do { (r)->colour = RB_RED; } while (0)
-#define rb_set_black(r)		do { (r)->colour = RB_BLACK; } while (0)
-#define rb_set_colour(r,c)	do { (r)->colour = (c); } while (0)
-
 struct rb_root
 {
 	struct rb_node *rb_node;
 };
 
+
+#define rb_parent(r)   ((struct rb_node *)((r)->rb_parent_colour & ~3))
+#define rb_colour(r)   ((r)->rb_parent_colour & 1)
+#define rb_is_red(r)   (!rb_colour(r))
+#define rb_is_black(r) rb_colour(r)
+#define rb_set_red(r)  do { (r)->rb_parent_colour &= ~1; } while (0)
+#define rb_set_black(r)  do { (r)->rb_parent_colour |= 1; } while (0)
+
+static inline void rb_set_parent(struct rb_node *rb, struct rb_node *p)
+{
+	rb->rb_parent_colour = (rb->rb_parent_colour & 3) | (unsigned long)p;
+}
+static inline void rb_set_colour(struct rb_node *rb, int colour)
+{
+	rb->rb_parent_colour = (rb->rb_parent_colour & ~1) | colour;
+}
+
 #define RB_ROOT	(struct rb_root) { NULL, }
 #define	rb_entry(ptr, type, member) container_of(ptr, type, member)
 
@@ -140,8 +147,7 @@ extern void rb_replace_node(struct rb_node *victim, struct rb_node *new,
 static inline void rb_link_node(struct rb_node * node, struct rb_node * parent,
 				struct rb_node ** rb_link)
 {
-	node->rb_parent = parent;
-	node->rb_color = RB_RED;
+	node->rb_parent_colour = (unsigned long )parent;
 	node->rb_left = node->rb_right = NULL;
 
 	*rb_link = node;
diff --git a/lib/rbtree.c b/lib/rbtree.c
index 63473e04f18a..4a7173cad149 100644
--- a/lib/rbtree.c
+++ b/lib/rbtree.c
@@ -26,60 +26,66 @@
 static void __rb_rotate_left(struct rb_node *node, struct rb_root *root)
 {
 	struct rb_node *right = node->rb_right;
+	struct rb_node *parent = rb_parent(node);
 
 	if ((node->rb_right = right->rb_left))
-		right->rb_left->rb_parent = node;
+		rb_set_parent(right->rb_left, node);
 	right->rb_left = node;
 
-	if ((right->rb_parent = node->rb_parent))
+	rb_set_parent(right, parent);
+
+	if (parent)
 	{
-		if (node == node->rb_parent->rb_left)
-			node->rb_parent->rb_left = right;
+		if (node == parent->rb_left)
+			parent->rb_left = right;
 		else
-			node->rb_parent->rb_right = right;
+			parent->rb_right = right;
 	}
 	else
 		root->rb_node = right;
-	node->rb_parent = right;
+	rb_set_parent(node, right);
 }
 
 static void __rb_rotate_right(struct rb_node *node, struct rb_root *root)
 {
 	struct rb_node *left = node->rb_left;
+	struct rb_node *parent = rb_parent(node);
 
 	if ((node->rb_left = left->rb_right))
-		left->rb_right->rb_parent = node;
+		rb_set_parent(left->rb_right, node);
 	left->rb_right = node;
 
-	if ((left->rb_parent = node->rb_parent))
+	rb_set_parent(left, parent);
+
+	if (parent)
 	{
-		if (node == node->rb_parent->rb_right)
-			node->rb_parent->rb_right = left;
+		if (node == parent->rb_right)
+			parent->rb_right = left;
 		else
-			node->rb_parent->rb_left = left;
+			parent->rb_left = left;
 	}
 	else
 		root->rb_node = left;
-	node->rb_parent = left;
+	rb_set_parent(node, left);
 }
 
 void rb_insert_color(struct rb_node *node, struct rb_root *root)
 {
 	struct rb_node *parent, *gparent;
 
-	while ((parent = node->rb_parent) && parent->rb_color == RB_RED)
+	while ((parent = rb_parent(node)) && rb_is_red(parent))
 	{
-		gparent = parent->rb_parent;
+		gparent = rb_parent(parent);
 
 		if (parent == gparent->rb_left)
 		{
 			{
 				register struct rb_node *uncle = gparent->rb_right;
-				if (uncle && uncle->rb_color == RB_RED)
+				if (uncle && rb_is_red(uncle))
 				{
-					uncle->rb_color = RB_BLACK;
-					parent->rb_color = RB_BLACK;
-					gparent->rb_color = RB_RED;
+					rb_set_black(uncle);
+					rb_set_black(parent);
+					rb_set_red(gparent);
 					node = gparent;
 					continue;
 				}
@@ -94,17 +100,17 @@ void rb_insert_color(struct rb_node *node, struct rb_root *root)
 				node = tmp;
 			}
 
-			parent->rb_color = RB_BLACK;
-			gparent->rb_color = RB_RED;
+			rb_set_black(parent);
+			rb_set_red(gparent);
 			__rb_rotate_right(gparent, root);
 		} else {
 			{
 				register struct rb_node *uncle = gparent->rb_left;
-				if (uncle && uncle->rb_color == RB_RED)
+				if (uncle && rb_is_red(uncle))
 				{
-					uncle->rb_color = RB_BLACK;
-					parent->rb_color = RB_BLACK;
-					gparent->rb_color = RB_RED;
+					rb_set_black(uncle);
+					rb_set_black(parent);
+					rb_set_red(gparent);
 					node = gparent;
 					continue;
 				}
@@ -119,13 +125,13 @@ void rb_insert_color(struct rb_node *node, struct rb_root *root)
 				node = tmp;
 			}
 
-			parent->rb_color = RB_BLACK;
-			gparent->rb_color = RB_RED;
+			rb_set_black(parent);
+			rb_set_red(gparent);
 			__rb_rotate_left(gparent, root);
 		}
 	}
 
-	root->rb_node->rb_color = RB_BLACK;
+	rb_set_black(root->rb_node);
 }
 EXPORT_SYMBOL(rb_insert_color);
 
@@ -134,43 +140,40 @@ static void __rb_erase_color(struct rb_node *node, struct rb_node *parent,
 {
 	struct rb_node *other;
 
-	while ((!node || node->rb_color == RB_BLACK) && node != root->rb_node)
+	while ((!node || rb_is_black(node)) && node != root->rb_node)
 	{
 		if (parent->rb_left == node)
 		{
 			other = parent->rb_right;
-			if (other->rb_color == RB_RED)
+			if (rb_is_red(other))
 			{
-				other->rb_color = RB_BLACK;
-				parent->rb_color = RB_RED;
+				rb_set_black(other);
+				rb_set_red(parent);
 				__rb_rotate_left(parent, root);
 				other = parent->rb_right;
 			}
-			if ((!other->rb_left ||
-			     other->rb_left->rb_color == RB_BLACK)
-			    && (!other->rb_right ||
-				other->rb_right->rb_color == RB_BLACK))
+			if ((!other->rb_left || rb_is_black(other->rb_left)) &&
+			    (!other->rb_right || rb_is_black(other->rb_right)))
 			{
-				other->rb_color = RB_RED;
+				rb_set_red(other);
 				node = parent;
-				parent = node->rb_parent;
+				parent = rb_parent(node);
 			}
 			else
 			{
-				if (!other->rb_right ||
-				    other->rb_right->rb_color == RB_BLACK)
+				if (!other->rb_right || rb_is_black(other->rb_right))
 				{
-					register struct rb_node *o_left;
+					struct rb_node *o_left;
 					if ((o_left = other->rb_left))
-						o_left->rb_color = RB_BLACK;
-					other->rb_color = RB_RED;
+						rb_set_black(o_left);
+					rb_set_red(other);
 					__rb_rotate_right(other, root);
 					other = parent->rb_right;
 				}
-				other->rb_color = parent->rb_color;
-				parent->rb_color = RB_BLACK;
+				rb_set_colour(other, rb_colour(parent));
+				rb_set_black(parent);
 				if (other->rb_right)
-					other->rb_right->rb_color = RB_BLACK;
+					rb_set_black(other->rb_right);
 				__rb_rotate_left(parent, root);
 				node = root->rb_node;
 				break;
@@ -179,38 +182,35 @@ static void __rb_erase_color(struct rb_node *node, struct rb_node *parent,
 		else
 		{
 			other = parent->rb_left;
-			if (other->rb_color == RB_RED)
+			if (rb_is_red(other))
 			{
-				other->rb_color = RB_BLACK;
-				parent->rb_color = RB_RED;
+				rb_set_black(other);
+				rb_set_red(parent);
 				__rb_rotate_right(parent, root);
 				other = parent->rb_left;
 			}
-			if ((!other->rb_left ||
-			     other->rb_left->rb_color == RB_BLACK)
-			    && (!other->rb_right ||
-				other->rb_right->rb_color == RB_BLACK))
+			if ((!other->rb_left || rb_is_black(other->rb_left)) &&
+			    (!other->rb_right || rb_is_black(other->rb_right)))
 			{
-				other->rb_color = RB_RED;
+				rb_set_red(other);
 				node = parent;
-				parent = node->rb_parent;
+				parent = rb_parent(node);
 			}
 			else
 			{
-				if (!other->rb_left ||
-				    other->rb_left->rb_color == RB_BLACK)
+				if (!other->rb_left || rb_is_black(other->rb_left))
 				{
 					register struct rb_node *o_right;
 					if ((o_right = other->rb_right))
-						o_right->rb_color = RB_BLACK;
-					other->rb_color = RB_RED;
+						rb_set_black(o_right);
+					rb_set_red(other);
 					__rb_rotate_left(other, root);
 					other = parent->rb_left;
 				}
-				other->rb_color = parent->rb_color;
-				parent->rb_color = RB_BLACK;
+				rb_set_colour(other, rb_colour(parent));
+				rb_set_black(parent);
 				if (other->rb_left)
-					other->rb_left->rb_color = RB_BLACK;
+					rb_set_black(other->rb_left);
 				__rb_rotate_right(parent, root);
 				node = root->rb_node;
 				break;
@@ -218,7 +218,7 @@ static void __rb_erase_color(struct rb_node *node, struct rb_node *parent,
 		}
 	}
 	if (node)
-		node->rb_color = RB_BLACK;
+		rb_set_black(node);
 }
 
 void rb_erase(struct rb_node *node, struct rb_root *root)
@@ -238,43 +238,41 @@ void rb_erase(struct rb_node *node, struct rb_root *root)
 		while ((left = node->rb_left) != NULL)
 			node = left;
 		child = node->rb_right;
-		parent = node->rb_parent;
-		color = node->rb_color;
+		parent = rb_parent(node);
+		color = rb_colour(node);
 
 		if (child)
-			child->rb_parent = parent;
-
-		if (node->rb_parent == old) {
+			rb_set_parent(child, parent);
+		if (parent == old) {
 			parent->rb_right = child;
 			parent = node;
-		} else			
+		} else
 			parent->rb_left = child;
 
-		node->rb_parent = old->rb_parent;
-		node->rb_color = old->rb_color;
+		node->rb_parent_colour = old->rb_parent_colour;
 		node->rb_right = old->rb_right;
 		node->rb_left = old->rb_left;
 
-		if (old->rb_parent)
+		if (rb_parent(old))
 		{
-			if (old->rb_parent->rb_left == old)
-				old->rb_parent->rb_left = node;
+			if (rb_parent(old)->rb_left == old)
+				rb_parent(old)->rb_left = node;
 			else
-				old->rb_parent->rb_right = node;
+				rb_parent(old)->rb_right = node;
 		} else
 			root->rb_node = node;
 
-		old->rb_left->rb_parent = node;
+		rb_set_parent(old->rb_left, node);
 		if (old->rb_right)
-			old->rb_right->rb_parent = node;
+			rb_set_parent(old->rb_right, node);
 		goto color;
 	}
 
-	parent = node->rb_parent;
-	color = node->rb_color;
+	parent = rb_parent(node);
+	color = rb_colour(node);
 
 	if (child)
-		child->rb_parent = parent;
+		rb_set_parent(child, parent);
 	if (parent)
 	{
 		if (parent->rb_left == node)
@@ -322,6 +320,8 @@ EXPORT_SYMBOL(rb_last);
 
 struct rb_node *rb_next(struct rb_node *node)
 {
+	struct rb_node *parent;
+
 	/* If we have a right-hand child, go down and then left as far
 	   as we can. */
 	if (node->rb_right) {
@@ -337,15 +337,17 @@ struct rb_node *rb_next(struct rb_node *node)
 	   ancestor is a right-hand child of its parent, keep going
 	   up. First time it's a left-hand child of its parent, said
 	   parent is our 'next' node. */
-	while (node->rb_parent && node == node->rb_parent->rb_right)
-		node = node->rb_parent;
+	while ((parent = rb_parent(node)) && node == parent->rb_right)
+		node = parent;
 
-	return node->rb_parent;
+	return parent;
 }
 EXPORT_SYMBOL(rb_next);
 
 struct rb_node *rb_prev(struct rb_node *node)
 {
+	struct rb_node *parent;
+
 	/* If we have a left-hand child, go down and then right as far
 	   as we can. */
 	if (node->rb_left) {
@@ -357,17 +359,17 @@ struct rb_node *rb_prev(struct rb_node *node)
 
 	/* No left-hand children. Go up till we find an ancestor which
 	   is a right-hand child of its parent */
-	while (node->rb_parent && node == node->rb_parent->rb_left)
-		node = node->rb_parent;
+	while ((parent = rb_parent(node)) && node == parent->rb_left)
+		node = parent;
 
-	return node->rb_parent;
+	return parent;
 }
 EXPORT_SYMBOL(rb_prev);
 
 void rb_replace_node(struct rb_node *victim, struct rb_node *new,
 		     struct rb_root *root)
 {
-	struct rb_node *parent = victim->rb_parent;
+	struct rb_node *parent = rb_parent(victim);
 
 	/* Set the surrounding nodes to point to the replacement */
 	if (parent) {
@@ -379,9 +381,9 @@ void rb_replace_node(struct rb_node *victim, struct rb_node *new,
 		root->rb_node = new;
 	}
 	if (victim->rb_left)
-		victim->rb_left->rb_parent = new;
+		rb_set_parent(victim->rb_left, new);
 	if (victim->rb_right)
-		victim->rb_right->rb_parent = new;
+		rb_set_parent(victim->rb_right, new);
 
 	/* Copy the pointers/colour from the victim to the replacement */
 	*new = *victim;
-- 
cgit v1.2.3


From e977145aeaad23d443686f2a2d5b32800d1607c5 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Fri, 21 Apr 2006 23:15:39 +0100
Subject: [RBTREE] Add explicit alignment to sizeof(long) for struct rb_node.

Seems like a strange requirement, but allegedly it was necessary for
struct address_space on CRIS, because it otherwise ended up being only
byte-aligned. It's harmless enough, and easier to just do it than to
prove it isn't necessary... although I really ought to dig out my etrax
board and test it some time.

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 include/linux/rbtree.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h
index 748be50329d8..3cc30b0ab828 100644
--- a/include/linux/rbtree.h
+++ b/include/linux/rbtree.h
@@ -104,7 +104,8 @@ struct rb_node
 #define	RB_BLACK	1
 	struct rb_node *rb_right;
 	struct rb_node *rb_left;
-};
+} __attribute__((aligned(sizeof(long))));
+    /* The alignment might seem pointless, but allegedly CRIS needs it */
 
 struct rb_root
 {
-- 
cgit v1.2.3


From ed198cb49750fd9ec564e9f1df66c10efea605f1 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Sat, 22 Apr 2006 02:38:50 +0100
Subject: [RBTREE] Update hrtimers to use rb_parent() accessor macro.

Also switch it to use the same method of using off-tree nodes as
everyone else now does -- set them to point to themselves.

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 include/linux/hrtimer.h | 2 +-
 kernel/hrtimer.c        | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 306acf1dc6d5..7d2a1b974c5e 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -127,7 +127,7 @@ extern ktime_t hrtimer_get_next_event(void);
 
 static inline int hrtimer_active(const struct hrtimer *timer)
 {
-	return timer->node.rb_parent != HRTIMER_INACTIVE;
+	return rb_parent(&timer->node) != &timer->node;
 }
 
 /* Forward a hrtimer so it expires after now: */
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index d2a7296c8251..04ab27ddfd90 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -393,7 +393,7 @@ static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base)
 	if (base->first == &timer->node)
 		base->first = rb_next(&timer->node);
 	rb_erase(&timer->node, &base->active);
-	timer->node.rb_parent = HRTIMER_INACTIVE;
+	rb_set_parent(&timer->node, &timer->node);
 }
 
 /*
@@ -578,7 +578,7 @@ void hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
 		clock_id = CLOCK_MONOTONIC;
 
 	timer->base = &bases[clock_id];
-	timer->node.rb_parent = HRTIMER_INACTIVE;
+	rb_set_parent(&timer->node, &timer->node);
 }
 
 /**
-- 
cgit v1.2.3


From a1ff0eafce81a58861534926722a70f211b04faa Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Tue, 25 Apr 2006 13:57:44 +0100
Subject: Include <linux/jiffies.h> from linux/acct.h only in kernel-private
 part.

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 include/linux/acct.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/acct.h b/include/linux/acct.h
index 9a66401073fc..255b11293a8d 100644
--- a/include/linux/acct.h
+++ b/include/linux/acct.h
@@ -16,7 +16,6 @@
 #define _LINUX_ACCT_H
 
 #include <linux/types.h>
-#include <linux/jiffies.h>
 
 #include <asm/param.h>
 #include <asm/byteorder.h>
@@ -165,6 +164,7 @@ typedef struct acct acct_t;
 #endif	/* __KERNEL */
 
 #ifdef __KERNEL__
+#include <linux/jiffies.h>
 /*
  * Yet another set of HZ to *HZ helper functions.
  * See <linux/jiffies.h> for the original.
-- 
cgit v1.2.3


From 72b9760b65cbe0d24e668c34c8fefb2ba417f14b Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Tue, 25 Apr 2006 13:58:23 +0100
Subject: Don't include agp_backend.h in user-visible part of agpgart.h

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 include/linux/agpgart.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/agpgart.h b/include/linux/agpgart.h
index 6d59c8efe3be..bfb8ec791b7b 100644
--- a/include/linux/agpgart.h
+++ b/include/linux/agpgart.h
@@ -27,8 +27,6 @@
 #ifndef _AGP_H
 #define _AGP_H 1
 
-#include <linux/agp_backend.h>
-
 #define AGPIOC_BASE       'A'
 #define AGPIOC_INFO       _IOR (AGPIOC_BASE, 0, struct agp_info*)
 #define AGPIOC_ACQUIRE    _IO  (AGPIOC_BASE, 1)
@@ -112,6 +110,7 @@ typedef struct _agp_unbind {
 
 #else				/* __KERNEL__ */
 #include <linux/mutex.h>
+#include <linux/agp_backend.h>
 
 #define AGPGART_MINOR 175
 
-- 
cgit v1.2.3


From 25478bb26f2be1504112b764047105811a52c3cb Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Tue, 25 Apr 2006 13:59:30 +0100
Subject: Use __KERNEL__ to hide kernel-private bits of linux/gameport.h

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 include/linux/gameport.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/gameport.h b/include/linux/gameport.h
index 71e7b2847cb3..2cdba0c23957 100644
--- a/include/linux/gameport.h
+++ b/include/linux/gameport.h
@@ -9,6 +9,7 @@
  * the Free Software Foundation.
  */
 
+#ifdef __KERNEL__
 #include <asm/io.h>
 #include <linux/list.h>
 #include <linux/mutex.h>
@@ -154,6 +155,8 @@ static inline void gameport_register_driver(struct gameport_driver *drv)
 
 void gameport_unregister_driver(struct gameport_driver *drv);
 
+#endif /* __KERNEL__ */
+
 #define GAMEPORT_MODE_DISABLED		0
 #define GAMEPORT_MODE_RAW		1
 #define GAMEPORT_MODE_COOKED		2
@@ -169,6 +172,8 @@ void gameport_unregister_driver(struct gameport_driver *drv);
 #define GAMEPORT_ID_VENDOR_GRAVIS	0x0009
 #define GAMEPORT_ID_VENDOR_GUILLEMOT	0x000a
 
+#ifdef __KERNEL__
+
 static inline void gameport_trigger(struct gameport *gameport)
 {
 	if (gameport->trigger)
@@ -219,4 +224,5 @@ static inline void gameport_set_poll_interval(struct gameport *gameport, unsigne
 void gameport_start_polling(struct gameport *gameport);
 void gameport_stop_polling(struct gameport *gameport);
 
+#endif /* __KERNEL__ */
 #endif
-- 
cgit v1.2.3


From f2999e4ea41d6ec6252d3b6d275b40d468a3c07e Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Tue, 25 Apr 2006 14:07:02 +0100
Subject: Export only the appropriate GS_xxx flags to userspace from
 generic_serial.h

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 include/linux/generic_serial.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/generic_serial.h b/include/linux/generic_serial.h
index 652611a4bdcd..e25384561955 100644
--- a/include/linux/generic_serial.h
+++ b/include/linux/generic_serial.h
@@ -12,6 +12,7 @@
 #ifndef GENERIC_SERIAL_H
 #define GENERIC_SERIAL_H
 
+#ifdef __KERNEL__
 #include <linux/mutex.h>
 
 struct real_driver {
@@ -54,6 +55,7 @@ struct gs_port {
   spinlock_t              driver_lock;
 };
 
+#endif /* __KERNEL__ */
 
 /* Flags */
 /* Warning: serial.h defines some ASYNC_ flags, they say they are "only"
@@ -75,7 +77,7 @@ struct gs_port {
 #define GS_DEBUG_FLOW    0x00000020
 #define GS_DEBUG_WRITE   0x00000040
 
-
+#ifdef __KERNEL__
 void gs_put_char(struct tty_struct *tty, unsigned char ch);
 int  gs_write(struct tty_struct *tty, 
              const unsigned char *buf, int count);
@@ -94,5 +96,5 @@ int  gs_init_port(struct gs_port *port);
 int  gs_setserial(struct gs_port *port, struct serial_struct __user *sp);
 int  gs_getserial(struct gs_port *port, struct serial_struct __user *sp);
 void gs_got_break(struct gs_port *port);
-
+#endif /* __KERNEL__ */
 #endif
-- 
cgit v1.2.3


From 34186efc17025520a53a48468338003d238a77d7 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Tue, 25 Apr 2006 14:07:57 +0100
Subject: Include various private files only from within __KERNEL__ in genhd.h

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 include/linux/genhd.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 2ef845b35175..3498a0c68184 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -9,13 +9,7 @@
  *		<drew@colorado.edu>
  */
 
-#include <linux/config.h>
 #include <linux/types.h>
-#include <linux/major.h>
-#include <linux/device.h>
-#include <linux/smp.h>
-#include <linux/string.h>
-#include <linux/fs.h>
 
 enum {
 /* These three have identical behaviour; use the second one if DOS FDISK gets
@@ -61,6 +55,12 @@ struct partition {
 #endif
 
 #ifdef __KERNEL__
+#include <linux/major.h>
+#include <linux/device.h>
+#include <linux/smp.h>
+#include <linux/string.h>
+#include <linux/fs.h>
+
 struct partition {
 	unsigned char boot_ind;		/* 0x80 - active */
 	unsigned char head;		/* starting head */
-- 
cgit v1.2.3


From 5a570cc0a41bd316afc91ba2c7151fed70d10b31 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Tue, 25 Apr 2006 14:10:40 +0100
Subject: Sanitise linux/i2c-algo-ite.h for userspace consumption

It doesn't need to include i2c.h, because a forward declaration of
struct i2c_adapter is perfectly sufficient. And it can be inside
#ifdef __KERNEL__ along with the kernel-internal structure definition.

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 include/linux/i2c-algo-ite.h | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/i2c-algo-ite.h b/include/linux/i2c-algo-ite.h
index 26a8b89855f1..0073fe96c76e 100644
--- a/include/linux/i2c-algo-ite.h
+++ b/include/linux/i2c-algo-ite.h
@@ -29,7 +29,7 @@
 #ifndef I2C_ALGO_ITE_H
 #define I2C_ALGO_ITE_H 1
 
-#include <linux/i2c.h>
+#include <linux/types.h>
 
 /* Example of a sequential read request:
 	struct i2c_iic_msg s_msg; 
@@ -49,6 +49,9 @@ struct i2c_iic_msg {
 	char *buf;	/* pointer to msg data */
 };
 
+#ifdef __KERNEL__
+struct i2c_adapter;
+
 struct i2c_algo_iic_data {
 	void *data;		/* private data for lolevel routines	*/
 	void (*setiic) (void *data, int ctl, int val);
@@ -65,5 +68,5 @@ struct i2c_algo_iic_data {
 
 int i2c_iic_add_bus(struct i2c_adapter *);
 int i2c_iic_del_bus(struct i2c_adapter *);
-
+#endif /* __KERNEL__ */
 #endif /* I2C_ALGO_ITE_H */
-- 
cgit v1.2.3


From a1b9298e55d2395be4ac25de3340b6eee01c6f67 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Tue, 25 Apr 2006 14:14:52 +0100
Subject: Sanitise linux/i2c.h for userspace consumption

It was unconditionally including a whole bunch of headers which aren't
user-visible, and also exposing a lot of private internal stuff of its
own. Also fix some legacy character set to UTF-8 while we're at it.

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 include/linux/i2c.h | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/i2c.h b/include/linux/i2c.h
index 1635ee25918f..0510430e00db 100644
--- a/include/linux/i2c.h
+++ b/include/linux/i2c.h
@@ -20,14 +20,15 @@
     Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.		     */
 /* ------------------------------------------------------------------------- */
 
-/* With some changes from Ky�sti M�lkki <kmalkki@cc.hut.fi> and
+/* With some changes from Kyösti Mälkki <kmalkki@cc.hut.fi> and
    Frodo Looijaard <frodol@dds.nl> */
 
 #ifndef _LINUX_I2C_H
 #define _LINUX_I2C_H
 
-#include <linux/module.h>
 #include <linux/types.h>
+#ifdef __KERNEL__ 
+#include <linux/module.h>
 #include <linux/i2c-id.h>
 #include <linux/mod_devicetable.h>
 #include <linux/device.h>	/* for struct device */
@@ -354,6 +355,7 @@ static inline int i2c_adapter_id(struct i2c_adapter *adap)
 {
 	return adap->nr;
 }
+#endif /* __KERNEL__ */
 
 /*
  * I2C Message - used for pure i2c transaction, also from /dev interface
@@ -469,6 +471,7 @@ union i2c_smbus_data {
 #define I2C_SMBUS	0x0720	/* SMBus-level access */
 
 /* ----- I2C-DEV: char device interface stuff ------------------------- */
+#ifdef __KERNEL__
 
 #define I2C_MAJOR	89		/* Device major number		*/
 
@@ -646,5 +649,5 @@ static unsigned short *forces[] = { force, force_##chip1,		\
 				    force_##chip6, force_##chip7,	\
 				    force_##chip8, NULL };		\
 I2C_CLIENT_INSMOD_COMMON
-
+#endif /* __KERNEL__ */
 #endif /* _LINUX_I2C_H */
-- 
cgit v1.2.3


From 2e0e1f9f1c478ee14fb60524024f7b730df76912 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Tue, 25 Apr 2006 14:15:44 +0100
Subject: Don't include <linux/device.h> from user-visible part of linux/ipmi.h

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 include/linux/ipmi.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/ipmi.h b/include/linux/ipmi.h
index 0a84b56935c2..5653b2f23b6a 100644
--- a/include/linux/ipmi.h
+++ b/include/linux/ipmi.h
@@ -36,7 +36,6 @@
 
 #include <linux/ipmi_msgdefs.h>
 #include <linux/compiler.h>
-#include <linux/device.h>
 
 /*
  * This file describes an interface to an IPMI driver.  You have to
@@ -210,6 +209,7 @@ struct kernel_ipmi_msg
  */
 #include <linux/list.h>
 #include <linux/module.h>
+#include <linux/device.h>
 
 #ifdef CONFIG_PROC_FS
 #include <linux/proc_fs.h>
-- 
cgit v1.2.3


From 8e442735ae6e2e1c857fb0c746027da8d8e40a81 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Tue, 25 Apr 2006 14:16:14 +0100
Subject: Remove gratuitous inclusion of <linux/pci.h> from linux/isdn/tpam.h

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 include/linux/isdn/tpam.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/isdn/tpam.h b/include/linux/isdn/tpam.h
index 9f65bea49d11..d18dd0dc570d 100644
--- a/include/linux/isdn/tpam.h
+++ b/include/linux/isdn/tpam.h
@@ -26,7 +26,6 @@
 #define _TPAM_H_
 
 #include <linux/types.h>
-#include <linux/pci.h>
 
 /* IOCTL commands */
 #define TPAM_CMD_DSPLOAD	0x0001
-- 
cgit v1.2.3


From 9cdcb56636717ccb935dc66c5d56681eaa5941c1 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Tue, 25 Apr 2006 14:18:07 +0100
Subject: Sanitise linux/mman.h for userspace consumption

It only really needs to define a few constants and include <asm/mman.h>
when it's used by userspace. Move the rest within #ifdef __KERNEL__

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 include/linux/mman.h | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mman.h b/include/linux/mman.h
index 18a5689ef748..4ad21c5863fd 100644
--- a/include/linux/mman.h
+++ b/include/linux/mman.h
@@ -1,10 +1,6 @@
 #ifndef _LINUX_MMAN_H
 #define _LINUX_MMAN_H
 
-#include <linux/config.h>
-#include <linux/mm.h>
-
-#include <asm/atomic.h>
 #include <asm/mman.h>
 
 #define MREMAP_MAYMOVE	1
@@ -13,6 +9,13 @@
 #define OVERCOMMIT_GUESS		0
 #define OVERCOMMIT_ALWAYS		1
 #define OVERCOMMIT_NEVER		2
+
+#ifdef __KERNEL__
+#include <linux/config.h>
+#include <linux/mm.h>
+
+#include <asm/atomic.h>
+
 extern int sysctl_overcommit_memory;
 extern int sysctl_overcommit_ratio;
 extern atomic_t vm_committed_space;
@@ -63,5 +66,5 @@ calc_vm_flag_bits(unsigned long flags)
 	       _calc_vm_trans(flags, MAP_EXECUTABLE, VM_EXECUTABLE) |
 	       _calc_vm_trans(flags, MAP_LOCKED,     VM_LOCKED    );
 }
-
+#endif /* __KERNEL__ */
 #endif /* _LINUX_MMAN_H */
-- 
cgit v1.2.3


From 58908d093e77224973b3f7bf54470d51949ff110 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Tue, 25 Apr 2006 14:26:26 +0100
Subject: Don't include private files from user-visible part of linux/ncp_fs.h

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 include/linux/ncp_fs.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ncp_fs.h b/include/linux/ncp_fs.h
index 96dc237b8f03..b208f0cd556b 100644
--- a/include/linux/ncp_fs.h
+++ b/include/linux/ncp_fs.h
@@ -12,8 +12,6 @@
 #include <linux/in.h>
 #include <linux/types.h>
 
-#include <linux/ncp_fs_i.h>
-#include <linux/ncp_fs_sb.h>
 #include <linux/ipx.h>
 #include <linux/ncp_no.h>
 
@@ -146,7 +144,8 @@ struct ncp_nls_ioctl
 
 #ifdef __KERNEL__
 
-#include <linux/config.h>
+#include <linux/ncp_fs_i.h>
+#include <linux/ncp_fs_sb.h>
 
 /* undef because public define in umsdos_fs.h (ncp_fs.h isn't public) */
 #undef PRINTK
-- 
cgit v1.2.3


From 77597ad663f9e2d40a89c6e27824701bb5fabb83 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Tue, 25 Apr 2006 14:26:46 +0100
Subject: Don't include <linux/list.h> from user-visible part of linux/msg.h

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 include/linux/msg.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/msg.h b/include/linux/msg.h
index 903e0ab8101f..acc7c174ff00 100644
--- a/include/linux/msg.h
+++ b/include/linux/msg.h
@@ -2,7 +2,6 @@
 #define _LINUX_MSG_H
 
 #include <linux/ipc.h>
-#include <linux/list.h>
 
 /* ipcs ctl commands */
 #define MSG_STAT 11
@@ -63,6 +62,7 @@ struct msginfo {
 #define MSGSEG (__MSGSEG <= 0xffff ? __MSGSEG : 0xffff)
 
 #ifdef __KERNEL__
+#include <linux/list.h>
 
 /* one msg_msg structure for each message */
 struct msg_msg {
-- 
cgit v1.2.3


From eacf17bdbc8e6f24fe46cd7e10fb9a657f060d08 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Tue, 25 Apr 2006 14:46:09 +0100
Subject: Don't include <linux/stringify> from user-visible part of linux/net.h

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 include/linux/net.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/net.h b/include/linux/net.h
index 84a490e5f0a1..c88d7cf7f6b7 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -20,7 +20,6 @@
 
 #include <linux/config.h>
 #include <linux/wait.h>
-#include <linux/stringify.h>
 #include <asm/socket.h>
 
 struct poll_table_struct;
@@ -57,6 +56,7 @@ typedef enum {
 #define __SO_ACCEPTCON	(1 << 16)	/* performed a listen		*/
 
 #ifdef __KERNEL__
+#include <linux/stringify.h>
 
 #define SOCK_ASYNC_NOSPACE	0
 #define SOCK_ASYNC_WAITDATA	1
-- 
cgit v1.2.3


From 997b7af2fe0810ca82a2f801a295218b51426e5a Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Tue, 25 Apr 2006 14:51:45 +0100
Subject: Don't include private headers from user-visible parts of
 include/linux/nfs*.h

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 include/linux/nfs.h    |  8 ++++----
 include/linux/nfs4.h   |  6 +++---
 include/linux/nfs_fs.h | 39 +++++++++++++++++++--------------------
 3 files changed, 26 insertions(+), 27 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/nfs.h b/include/linux/nfs.h
index ca2ffa6ae1d5..54af92c1c70b 100644
--- a/include/linux/nfs.h
+++ b/include/linux/nfs.h
@@ -7,9 +7,6 @@
 #ifndef _LINUX_NFS_H
 #define _LINUX_NFS_H
 
-#include <linux/sunrpc/msg_prot.h>
-#include <linux/string.h>
-
 #define NFS_PROGRAM	100003
 #define NFS_PORT	2049
 #define NFS_MAXDATA	8192
@@ -129,7 +126,10 @@ enum nfs_ftype {
 	NFFIFO = 8
 };
 
-#if defined(__KERNEL__)
+#ifdef __KERNEL__
+#include <linux/sunrpc/msg_prot.h>
+#include <linux/string.h>
+
 /*
  * This is the kernel NFS client file handle representation
  */
diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h
index 0c1c306cdaec..1059e6d69d3b 100644
--- a/include/linux/nfs4.h
+++ b/include/linux/nfs4.h
@@ -14,7 +14,6 @@
 #define _LINUX_NFS4_H
 
 #include <linux/types.h>
-#include <linux/list.h>
 
 #define NFS4_VERIFIER_SIZE	8
 #define NFS4_FHSIZE		128
@@ -97,6 +96,9 @@ enum nfs4_acl_whotype {
 	NFS4_ACL_WHO_EVERYONE,
 };
 
+#ifdef __KERNEL__
+#include <linux/list.h>
+
 struct nfs4_ace {
 	uint32_t	type;
 	uint32_t	flag;
@@ -345,8 +347,6 @@ enum lock_type4 {
 #define NFS4_MINOR_VERSION 0
 #define NFS4_DEBUG 1
 
-#ifdef __KERNEL__
-
 /* Index of predefined Linux client operations */
 
 enum {
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index c71227dd4389..7e079f8ce18b 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -9,26 +9,6 @@
 #ifndef _LINUX_NFS_FS_H
 #define _LINUX_NFS_FS_H
 
-#include <linux/config.h>
-#include <linux/in.h>
-#include <linux/mm.h>
-#include <linux/pagemap.h>
-#include <linux/rwsem.h>
-#include <linux/wait.h>
-
-#include <linux/nfs_fs_sb.h>
-
-#include <linux/sunrpc/debug.h>
-#include <linux/sunrpc/auth.h>
-#include <linux/sunrpc/clnt.h>
-
-#include <linux/nfs.h>
-#include <linux/nfs2.h>
-#include <linux/nfs3.h>
-#include <linux/nfs4.h>
-#include <linux/nfs_xdr.h>
-#include <linux/rwsem.h>
-#include <linux/mempool.h>
 
 /*
  * Enable debugging support for nfs client.
@@ -63,6 +43,25 @@
 #define FLUSH_NOCOMMIT		32	/* Don't send the NFSv3/v4 COMMIT */
 
 #ifdef __KERNEL__
+#include <linux/in.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/rwsem.h>
+#include <linux/wait.h>
+
+#include <linux/nfs_fs_sb.h>
+
+#include <linux/sunrpc/debug.h>
+#include <linux/sunrpc/auth.h>
+#include <linux/sunrpc/clnt.h>
+
+#include <linux/nfs.h>
+#include <linux/nfs2.h>
+#include <linux/nfs3.h>
+#include <linux/nfs4.h>
+#include <linux/nfs_xdr.h>
+#include <linux/rwsem.h>
+#include <linux/mempool.h>
 
 /*
  * NFSv3/v4 Access mode cache entry
-- 
cgit v1.2.3


From 0409d3a332fc4347efba535a5003943f2a4aa1ca Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Tue, 25 Apr 2006 14:52:13 +0100
Subject: Don't include private headers from user-visible parts of
 linux/quota.h

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 include/linux/quota.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/quota.h b/include/linux/quota.h
index 2dab71e1c3d1..b8fbf26eb885 100644
--- a/include/linux/quota.h
+++ b/include/linux/quota.h
@@ -37,8 +37,6 @@
 
 #include <linux/errno.h>
 #include <linux/types.h>
-#include <linux/spinlock.h>
-#include <linux/mutex.h>
 
 #define __DQUOT_VERSION__	"dquot_6.5.1"
 #define __DQUOT_NUM_VERSION__	6*10000+5*100+1
@@ -133,6 +131,8 @@ struct if_dqinfo {
 };
 
 #ifdef __KERNEL__
+#include <linux/spinlock.h>
+#include <linux/mutex.h>
 
 #include <linux/dqblk_xfs.h>
 #include <linux/dqblk_v1.h>
-- 
cgit v1.2.3


From 98ca79d52bc34b8dfff729bc8559dbb918c9d02a Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Tue, 25 Apr 2006 14:52:51 +0100
Subject: Don't include <linux/list.h> from user-visible part of
 reiserfs_xattr.h

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 include/linux/reiserfs_xattr.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/reiserfs_xattr.h b/include/linux/reiserfs_xattr.h
index 5353afb11db3..d42603dafc7c 100644
--- a/include/linux/reiserfs_xattr.h
+++ b/include/linux/reiserfs_xattr.h
@@ -3,7 +3,6 @@
 */
 
 #include <linux/config.h>
-#include <linux/init.h>
 #include <linux/xattr.h>
 
 /* Magic value in header */
@@ -15,6 +14,7 @@ struct reiserfs_xattr_header {
 };
 
 #ifdef __KERNEL__
+#include <linux/init.h>
 
 struct reiserfs_xattr_handler {
 	char *prefix;
-- 
cgit v1.2.3


From a3b6714e1744a5e841753d74aca1de5972f24e6d Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Tue, 25 Apr 2006 14:54:40 +0100
Subject: Partially sanitise linux/sched.h for userspace consumption

For now, just make sure all inclusion of private header files is done
within #ifdef __KERNEL__. There'll be more to clean up later.

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 include/linux/sched.h | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 29b7d4f87d20..2e05e402df4f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1,9 +1,9 @@
 #ifndef _LINUX_SCHED_H
 #define _LINUX_SCHED_H
 
+#ifdef __KERNEL__
 #include <asm/param.h>	/* for HZ */
 
-#include <linux/config.h>
 #include <linux/capability.h>
 #include <linux/threads.h>
 #include <linux/kernel.h>
@@ -37,6 +37,15 @@
 #include <linux/rcupdate.h>
 #include <linux/futex.h>
 
+#include <linux/time.h>
+#include <linux/param.h>
+#include <linux/resource.h>
+#include <linux/timer.h>
+#include <linux/hrtimer.h>
+
+#include <asm/processor.h>
+#endif
+
 #include <linux/auxvec.h>	/* For AT_VECTOR_SIZE */
 
 struct exec_domain;
@@ -103,13 +112,6 @@ extern unsigned long nr_uninterruptible(void);
 extern unsigned long nr_active(void);
 extern unsigned long nr_iowait(void);
 
-#include <linux/time.h>
-#include <linux/param.h>
-#include <linux/resource.h>
-#include <linux/timer.h>
-#include <linux/hrtimer.h>
-
-#include <asm/processor.h>
 
 /*
  * Task state bitmask. NOTE! These bits are also
-- 
cgit v1.2.3


From 8ffbc759a5b655feb69435c4dfa857c391f9dcc8 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Tue, 25 Apr 2006 14:55:13 +0100
Subject: Don't include <asm/atomic.h> from user-visible part of linux/sem.h

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 include/linux/sem.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sem.h b/include/linux/sem.h
index 3c1f1120fe88..9aaffb0b1d81 100644
--- a/include/linux/sem.h
+++ b/include/linux/sem.h
@@ -2,7 +2,6 @@
 #define _LINUX_SEM_H
 
 #include <linux/ipc.h>
-#include <asm/atomic.h>
 
 /* semop flags */
 #define SEM_UNDO        0x1000  /* undo the operation on exit */
@@ -78,6 +77,7 @@ struct  seminfo {
 #define SEMUSZ  20		/* sizeof struct sem_undo */
 
 #ifdef __KERNEL__
+#include <asm/atomic.h>
 
 struct task_struct;
 
-- 
cgit v1.2.3


From 7ab2febd4d3c6f50545cee11a116536a09748d59 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Tue, 25 Apr 2006 14:55:46 +0100
Subject: Don't include private headers from user-visible part of
 linux/signal.h

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 include/linux/signal.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/signal.h b/include/linux/signal.h
index 162a8fd10b29..4b42df3860ed 100644
--- a/include/linux/signal.h
+++ b/include/linux/signal.h
@@ -1,12 +1,12 @@
 #ifndef _LINUX_SIGNAL_H
 #define _LINUX_SIGNAL_H
 
-#include <linux/list.h>
-#include <linux/spinlock.h>
 #include <asm/signal.h>
 #include <asm/siginfo.h>
 
 #ifdef __KERNEL__
+#include <linux/list.h>
+#include <linux/spinlock.h>
 
 /*
  * These values of sa_flags are used only by the kernel as part of the
-- 
cgit v1.2.3


From 468db83658f776ec87a953778f18611301668148 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Tue, 25 Apr 2006 15:00:56 +0100
Subject: Don't include <linux/spinlock.h> from user-visible part of
 linux/wanrouter.h

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 include/linux/wanrouter.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/wanrouter.h b/include/linux/wanrouter.h
index 1b6b76a4eb54..2cd05013edfc 100644
--- a/include/linux/wanrouter.h
+++ b/include/linux/wanrouter.h
@@ -44,8 +44,6 @@
 * Jan 02, 1997	Gene Kozin	Initial version (based on wanpipe.h).
 *****************************************************************************/
 
-#include <linux/spinlock.h>       /* Support for SMP Locking */
-
 #ifndef	_ROUTER_H
 #define	_ROUTER_H
 
@@ -457,6 +455,8 @@ typedef struct wanif_conf
 #include <linux/fs.h>		/* support for device drivers */
 #include <linux/proc_fs.h>	/* proc filesystem pragmatics */
 #include <linux/netdevice.h>	/* support for network drivers */
+#include <linux/spinlock.h>     /* Support for SMP Locking */
+
 /*----------------------------------------------------------------------------
  * WAN device data space.
  */
-- 
cgit v1.2.3


From eae19a762de975e109394b1edcba6587323c7d1a Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Tue, 25 Apr 2006 15:14:50 +0100
Subject: Don't export CONFIG_COMPAT stuff in linux/usbdevice_fs.h to userspace

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 include/linux/usbdevice_fs.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/usbdevice_fs.h b/include/linux/usbdevice_fs.h
index 8859f0b41543..7b7aadb69092 100644
--- a/include/linux/usbdevice_fs.h
+++ b/include/linux/usbdevice_fs.h
@@ -123,6 +123,7 @@ struct usbdevfs_hub_portinfo {
 	char port [127];	/* e.g. port 3 connects to device 27 */
 };
 
+#ifdef __KERNEL__
 #ifdef CONFIG_COMPAT
 #include <linux/compat.h>
 struct usbdevfs_urb32 {
@@ -147,6 +148,7 @@ struct usbdevfs_ioctl32 {
 	compat_caddr_t data;
 };
 #endif
+#endif /* __KERNEL__ */
 
 #define USBDEVFS_CONTROL           _IOWR('U', 0, struct usbdevfs_ctrltransfer)
 #define USBDEVFS_BULK              _IOWR('U', 2, struct usbdevfs_bulktransfer)
-- 
cgit v1.2.3


From 1af042271f9bf7601f7ecf4d328ccde3a44d2c72 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Tue, 25 Apr 2006 15:16:44 +0100
Subject: Sanitise linux/sunrpc/debug.h for userspace consumption

Move some inclusion of private header files and the definition of
RPC_DEBUG inside the existing #ifdef __KERNEL__

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 include/linux/sunrpc/debug.h | 25 ++++++++++++-------------
 1 file changed, 12 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/debug.h b/include/linux/sunrpc/debug.h
index 1a42d902bc11..e0cae8deb465 100644
--- a/include/linux/sunrpc/debug.h
+++ b/include/linux/sunrpc/debug.h
@@ -9,19 +9,6 @@
 #ifndef _LINUX_SUNRPC_DEBUG_H_
 #define _LINUX_SUNRPC_DEBUG_H_
 
-#include <linux/config.h>
-
-#include <linux/timer.h>
-#include <linux/workqueue.h>
-
-/*
- * Enable RPC debugging/profiling.
- */
-#ifdef CONFIG_SYSCTL
-#define  RPC_DEBUG
-#endif
-/* #define  RPC_PROFILE */
-
 /*
  * RPC debug facilities
  */
@@ -40,6 +27,18 @@
 #define RPCDBG_ALL		0x7fff
 
 #ifdef __KERNEL__
+#include <linux/config.h>
+
+#include <linux/timer.h>
+#include <linux/workqueue.h>
+
+/*
+ * Enable RPC debugging/profiling.
+ */
+#ifdef CONFIG_SYSCTL
+#define  RPC_DEBUG
+#endif
+/* #define  RPC_PROFILE */
 
 /*
  * Debugging macros etc
-- 
cgit v1.2.3


From 19b3bd667b6a4fc4c164c743492cec08d91d74a5 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Tue, 25 Apr 2006 15:18:05 +0100
Subject: Don't include private headers from user-visible part of
 linux/smb_fs.h

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 include/linux/smb_fs.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/smb_fs.h b/include/linux/smb_fs.h
index 621a3d3662f3..367d6c3e8ed4 100644
--- a/include/linux/smb_fs.h
+++ b/include/linux/smb_fs.h
@@ -10,8 +10,6 @@
 #define _LINUX_SMB_FS_H
 
 #include <linux/smb.h>
-#include <linux/smb_fs_i.h>
-#include <linux/smb_fs_sb.h>
 
 /*
  * ioctl commands
@@ -24,6 +22,8 @@
 
 
 #ifdef __KERNEL__
+#include <linux/smb_fs_i.h>
+#include <linux/smb_fs_sb.h>
 
 #include <linux/fs.h>
 #include <linux/pagemap.h>
-- 
cgit v1.2.3


From 52a78c1cae382ff5684f3970848676de12449745 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Tue, 25 Apr 2006 15:18:31 +0100
Subject: Don't include private headers from user-visible part of
 linux/ext2_fs.h

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 include/linux/ext2_fs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/ext2_fs.h b/include/linux/ext2_fs.h
index f7bd1c7ebefb..facf34e98954 100644
--- a/include/linux/ext2_fs.h
+++ b/include/linux/ext2_fs.h
@@ -17,7 +17,6 @@
 #define _LINUX_EXT2_FS_H
 
 #include <linux/types.h>
-#include <linux/ext2_fs_sb.h>
 
 /*
  * The second extended filesystem constants/structures
@@ -70,6 +69,7 @@
 #define EXT2_SUPER_MAGIC	0xEF53
 
 #ifdef __KERNEL__
+#include <linux/ext2_fs_sb.h>
 static inline struct ext2_sb_info *EXT2_SB(struct super_block *sb)
 {
 	return sb->s_fs_info;
-- 
cgit v1.2.3


From d85004eb15a635b3937e91d1dbadb1d37541983c Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Tue, 25 Apr 2006 15:18:46 +0100
Subject: Don't include private headers from user-visible part of
 linux/ext3_fs.h

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 include/linux/ext3_fs.h | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ext3_fs.h b/include/linux/ext3_fs.h
index 3ade6a4e3bdd..f327a3b5dfbe 100644
--- a/include/linux/ext3_fs.h
+++ b/include/linux/ext3_fs.h
@@ -17,11 +17,6 @@
 #define _LINUX_EXT3_FS_H
 
 #include <linux/types.h>
-#include <linux/ext3_fs_i.h>
-#include <linux/ext3_fs_sb.h>
-
-
-struct statfs;
 
 /*
  * The second extended filesystem constants/structures
@@ -487,6 +482,8 @@ struct ext3_super_block {
 };
 
 #ifdef __KERNEL__
+#include <linux/ext3_fs_i.h>
+#include <linux/ext3_fs_sb.h>
 static inline struct ext3_sb_info * EXT3_SB(struct super_block *sb)
 {
 	return sb->s_fs_info;
-- 
cgit v1.2.3


From 089f26d5e31b7bf42a9a8fefec08b30cd27f4b0e Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Tue, 25 Apr 2006 15:29:01 +0100
Subject: Don't include <linux/config.h> and <linux/linkage.h> from
 linux/socket.h

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 include/linux/socket.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/socket.h b/include/linux/socket.h
index 9ab2ddd80221..361409094649 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -18,8 +18,6 @@ struct __kernel_sockaddr_storage {
 
 #if defined(__KERNEL__) || !defined(__GLIBC__) || (__GLIBC__ < 2)
 
-#include <linux/config.h>		/* for CONFIG_COMPAT */
-#include <linux/linkage.h>
 #include <asm/socket.h>			/* arch-dependent defines	*/
 #include <linux/sockios.h>		/* the SIOCxxx I/O controls	*/
 #include <linux/uio.h>			/* iovec support		*/
-- 
cgit v1.2.3


From 62c4f0a2d5a188f73a94f2cb8ea0dba3e7cf0a7f Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Wed, 26 Apr 2006 12:56:16 +0100
Subject: Don't include linux/config.h from anywhere else in include/

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 include/acpi/platform/aclinux.h                      | 1 -
 include/acpi/processor.h                             | 1 -
 include/asm-alpha/bitops.h                           | 1 -
 include/asm-alpha/cache.h                            | 1 -
 include/asm-alpha/cacheflush.h                       | 1 -
 include/asm-alpha/core_cia.h                         | 1 -
 include/asm-alpha/core_t2.h                          | 1 -
 include/asm-alpha/dma-mapping.h                      | 1 -
 include/asm-alpha/dma.h                              | 1 -
 include/asm-alpha/floppy.h                           | 1 -
 include/asm-alpha/hardirq.h                          | 1 -
 include/asm-alpha/hw_irq.h                           | 1 -
 include/asm-alpha/ide.h                              | 1 -
 include/asm-alpha/io.h                               | 1 -
 include/asm-alpha/irq.h                              | 1 -
 include/asm-alpha/kmap_types.h                       | 1 -
 include/asm-alpha/machvec.h                          | 1 -
 include/asm-alpha/mmu_context.h                      | 1 -
 include/asm-alpha/mmzone.h                           | 1 -
 include/asm-alpha/page.h                             | 1 -
 include/asm-alpha/param.h                            | 1 -
 include/asm-alpha/pgalloc.h                          | 1 -
 include/asm-alpha/pgtable.h                          | 1 -
 include/asm-alpha/serial.h                           | 1 -
 include/asm-alpha/smp.h                              | 1 -
 include/asm-alpha/spinlock.h                         | 1 -
 include/asm-alpha/system.h                           | 1 -
 include/asm-alpha/tlbflush.h                         | 1 -
 include/asm-arm/apm.h                                | 1 -
 include/asm-arm/arch-aaec2000/memory.h               | 1 -
 include/asm-arm/arch-cl7500/acornfb.h                | 1 -
 include/asm-arm/arch-clps711x/hardware.h             | 1 -
 include/asm-arm/arch-clps711x/memory.h               | 1 -
 include/asm-arm/arch-clps711x/uncompress.h           | 1 -
 include/asm-arm/arch-ebsa285/hardware.h              | 1 -
 include/asm-arm/arch-ebsa285/memory.h                | 1 -
 include/asm-arm/arch-ebsa285/vmalloc.h               | 1 -
 include/asm-arm/arch-integrator/smp.h                | 1 -
 include/asm-arm/arch-iop3xx/memory.h                 | 1 -
 include/asm-arm/arch-iop3xx/timex.h                  | 1 -
 include/asm-arm/arch-iop3xx/uncompress.h             | 1 -
 include/asm-arm/arch-ixp4xx/dma.h                    | 1 -
 include/asm-arm/arch-lh7a40x/constants.h             | 1 -
 include/asm-arm/arch-lh7a40x/irqs.h                  | 1 -
 include/asm-arm/arch-lh7a40x/registers.h             | 1 -
 include/asm-arm/arch-omap/board.h                    | 1 -
 include/asm-arm/arch-omap/hardware.h                 | 1 -
 include/asm-arm/arch-omap/system.h                   | 1 -
 include/asm-arm/arch-omap/uncompress.h               | 1 -
 include/asm-arm/arch-pxa/idp.h                       | 1 -
 include/asm-arm/arch-pxa/irqs.h                      | 1 -
 include/asm-arm/arch-pxa/pxa-regs.h                  | 1 -
 include/asm-arm/arch-pxa/timex.h                     | 1 -
 include/asm-arm/arch-realview/smp.h                  | 1 -
 include/asm-arm/arch-s3c2410/dma.h                   | 1 -
 include/asm-arm/arch-s3c2410/uncompress.h            | 1 -
 include/asm-arm/arch-sa1100/assabet.h                | 1 -
 include/asm-arm/arch-sa1100/cerf.h                   | 1 -
 include/asm-arm/arch-sa1100/collie.h                 | 1 -
 include/asm-arm/arch-sa1100/dma.h                    | 1 -
 include/asm-arm/arch-sa1100/hardware.h               | 1 -
 include/asm-arm/arch-sa1100/ide.h                    | 1 -
 include/asm-arm/arch-sa1100/irqs.h                   | 1 -
 include/asm-arm/arch-sa1100/memory.h                 | 1 -
 include/asm-arm/arch-sa1100/system.h                 | 1 -
 include/asm-arm/atomic.h                             | 1 -
 include/asm-arm/bug.h                                | 1 -
 include/asm-arm/cacheflush.h                         | 1 -
 include/asm-arm/cpu.h                                | 1 -
 include/asm-arm/dma-mapping.h                        | 1 -
 include/asm-arm/dma.h                                | 1 -
 include/asm-arm/elf.h                                | 1 -
 include/asm-arm/fpstate.h                            | 1 -
 include/asm-arm/glue.h                               | 1 -
 include/asm-arm/hardirq.h                            | 1 -
 include/asm-arm/hardware/dec21285.h                  | 1 -
 include/asm-arm/hardware/iomd.h                      | 1 -
 include/asm-arm/leds.h                               | 1 -
 include/asm-arm/mach/serial_at91rm9200.h             | 1 -
 include/asm-arm/mach/serial_sa1100.h                 | 1 -
 include/asm-arm/memory.h                             | 1 -
 include/asm-arm/page.h                               | 1 -
 include/asm-arm/pci.h                                | 1 -
 include/asm-arm/proc-fns.h                           | 1 -
 include/asm-arm/ptrace.h                             | 1 -
 include/asm-arm/smp.h                                | 1 -
 include/asm-arm/system.h                             | 1 -
 include/asm-arm/tlbflush.h                           | 1 -
 include/asm-arm26/atomic.h                           | 1 -
 include/asm-arm26/bug.h                              | 1 -
 include/asm-arm26/dma.h                              | 1 -
 include/asm-arm26/hardirq.h                          | 1 -
 include/asm-arm26/hardware.h                         | 1 -
 include/asm-arm26/io.h                               | 1 -
 include/asm-arm26/leds.h                             | 1 -
 include/asm-arm26/mach-types.h                       | 1 -
 include/asm-arm26/page.h                             | 1 -
 include/asm-arm26/pgtable.h                          | 1 -
 include/asm-arm26/serial.h                           | 1 -
 include/asm-arm26/smp.h                              | 1 -
 include/asm-arm26/sysirq.h                           | 1 -
 include/asm-arm26/system.h                           | 1 -
 include/asm-cris/arch-v10/io.h                       | 1 -
 include/asm-cris/arch-v10/page.h                     | 1 -
 include/asm-cris/arch-v10/system.h                   | 1 -
 include/asm-cris/arch-v32/io.h                       | 1 -
 include/asm-cris/arch-v32/irq.h                      | 1 -
 include/asm-cris/arch-v32/page.h                     | 1 -
 include/asm-cris/arch-v32/processor.h                | 1 -
 include/asm-cris/arch-v32/system.h                   | 1 -
 include/asm-cris/eshlibld.h                          | 1 -
 include/asm-cris/etraxgpio.h                         | 1 -
 include/asm-cris/fasttimer.h                         | 1 -
 include/asm-cris/page.h                              | 1 -
 include/asm-cris/pci.h                               | 1 -
 include/asm-cris/pgtable.h                           | 1 -
 include/asm-cris/processor.h                         | 1 -
 include/asm-cris/rtc.h                               | 1 -
 include/asm-cris/tlbflush.h                          | 1 -
 include/asm-frv/atomic.h                             | 1 -
 include/asm-frv/bitops.h                             | 1 -
 include/asm-frv/bug.h                                | 1 -
 include/asm-frv/cache.h                              | 1 -
 include/asm-frv/dma.h                                | 1 -
 include/asm-frv/elf.h                                | 1 -
 include/asm-frv/fpu.h                                | 1 -
 include/asm-frv/hardirq.h                            | 1 -
 include/asm-frv/highmem.h                            | 1 -
 include/asm-frv/ide.h                                | 1 -
 include/asm-frv/io.h                                 | 1 -
 include/asm-frv/irq.h                                | 1 -
 include/asm-frv/mmu_context.h                        | 1 -
 include/asm-frv/page.h                               | 1 -
 include/asm-frv/pci.h                                | 1 -
 include/asm-frv/pgalloc.h                            | 1 -
 include/asm-frv/pgtable.h                            | 1 -
 include/asm-frv/processor.h                          | 1 -
 include/asm-frv/segment.h                            | 1 -
 include/asm-frv/serial.h                             | 1 -
 include/asm-frv/smp.h                                | 1 -
 include/asm-frv/system.h                             | 1 -
 include/asm-frv/tlbflush.h                           | 1 -
 include/asm-frv/types.h                              | 1 -
 include/asm-frv/unaligned.h                          | 1 -
 include/asm-frv/virtconvert.h                        | 1 -
 include/asm-generic/bug.h                            | 1 -
 include/asm-generic/dma-mapping.h                    | 1 -
 include/asm-generic/fcntl.h                          | 1 -
 include/asm-generic/local.h                          | 1 -
 include/asm-generic/tlb.h                            | 1 -
 include/asm-h8300/bitops.h                           | 1 -
 include/asm-h8300/dma.h                              | 1 -
 include/asm-h8300/elf.h                              | 1 -
 include/asm-h8300/hardirq.h                          | 1 -
 include/asm-h8300/io.h                               | 1 -
 include/asm-h8300/keyboard.h                         | 1 -
 include/asm-h8300/mmu_context.h                      | 1 -
 include/asm-h8300/page.h                             | 1 -
 include/asm-h8300/page_offset.h                      | 1 -
 include/asm-h8300/param.h                            | 1 -
 include/asm-h8300/pgtable.h                          | 1 -
 include/asm-h8300/processor.h                        | 1 -
 include/asm-h8300/semaphore-helper.h                 | 1 -
 include/asm-h8300/shm.h                              | 1 -
 include/asm-h8300/system.h                           | 1 -
 include/asm-h8300/unaligned.h                        | 1 -
 include/asm-h8300/virtconvert.h                      | 1 -
 include/asm-i386/apic.h                              | 1 -
 include/asm-i386/atomic.h                            | 1 -
 include/asm-i386/bitops.h                            | 1 -
 include/asm-i386/bug.h                               | 1 -
 include/asm-i386/bugs.h                              | 1 -
 include/asm-i386/byteorder.h                         | 1 -
 include/asm-i386/cache.h                             | 1 -
 include/asm-i386/dma.h                               | 1 -
 include/asm-i386/fixmap.h                            | 1 -
 include/asm-i386/hardirq.h                           | 1 -
 include/asm-i386/highmem.h                           | 1 -
 include/asm-i386/hpet.h                              | 1 -
 include/asm-i386/hw_irq.h                            | 1 -
 include/asm-i386/ide.h                               | 1 -
 include/asm-i386/io.h                                | 1 -
 include/asm-i386/io_apic.h                           | 1 -
 include/asm-i386/irq.h                               | 1 -
 include/asm-i386/kmap_types.h                        | 1 -
 include/asm-i386/mach-summit/mach_apic.h             | 1 -
 include/asm-i386/mmu_context.h                       | 1 -
 include/asm-i386/mtrr.h                              | 1 -
 include/asm-i386/page.h                              | 1 -
 include/asm-i386/param.h                             | 1 -
 include/asm-i386/pci.h                               | 1 -
 include/asm-i386/pgalloc.h                           | 1 -
 include/asm-i386/pgtable.h                           | 1 -
 include/asm-i386/processor.h                         | 1 -
 include/asm-i386/serial.h                            | 1 -
 include/asm-i386/smp.h                               | 1 -
 include/asm-i386/spinlock.h                          | 1 -
 include/asm-i386/string.h                            | 1 -
 include/asm-i386/system.h                            | 1 -
 include/asm-i386/thread_info.h                       | 1 -
 include/asm-i386/timex.h                             | 1 -
 include/asm-i386/tlbflush.h                          | 1 -
 include/asm-i386/types.h                             | 1 -
 include/asm-i386/uaccess.h                           | 1 -
 include/asm-ia64/asmmacro.h                          | 1 -
 include/asm-ia64/cache.h                             | 1 -
 include/asm-ia64/delay.h                             | 1 -
 include/asm-ia64/dma-mapping.h                       | 1 -
 include/asm-ia64/dma.h                               | 1 -
 include/asm-ia64/elf.h                               | 1 -
 include/asm-ia64/hardirq.h                           | 1 -
 include/asm-ia64/ia32.h                              | 1 -
 include/asm-ia64/ide.h                               | 1 -
 include/asm-ia64/intrinsics.h                        | 1 -
 include/asm-ia64/kmap_types.h                        | 1 -
 include/asm-ia64/machvec.h                           | 1 -
 include/asm-ia64/meminit.h                           | 1 -
 include/asm-ia64/nodedata.h                          | 1 -
 include/asm-ia64/numa.h                              | 1 -
 include/asm-ia64/page.h                              | 1 -
 include/asm-ia64/param.h                             | 1 -
 include/asm-ia64/percpu.h                            | 1 -
 include/asm-ia64/pgalloc.h                           | 1 -
 include/asm-ia64/pgtable.h                           | 1 -
 include/asm-ia64/processor.h                         | 1 -
 include/asm-ia64/ptrace.h                            | 1 -
 include/asm-ia64/smp.h                               | 1 -
 include/asm-ia64/sn/simulator.h                      | 1 -
 include/asm-ia64/sn/sn_cpuid.h                       | 1 -
 include/asm-ia64/sn/sn_sal.h                         | 1 -
 include/asm-ia64/sn/xpc.h                            | 1 -
 include/asm-ia64/string.h                            | 1 -
 include/asm-ia64/system.h                            | 1 -
 include/asm-ia64/tlb.h                               | 1 -
 include/asm-ia64/tlbflush.h                          | 1 -
 include/asm-ia64/unistd.h                            | 1 -
 include/asm-m32r/assembler.h                         | 1 -
 include/asm-m32r/atomic.h                            | 1 -
 include/asm-m32r/bitops.h                            | 1 -
 include/asm-m32r/cacheflush.h                        | 1 -
 include/asm-m32r/hardirq.h                           | 1 -
 include/asm-m32r/ide.h                               | 1 -
 include/asm-m32r/irq.h                               | 1 -
 include/asm-m32r/kmap_types.h                        | 1 -
 include/asm-m32r/m32104ut/m32104ut_pld.h             | 1 -
 include/asm-m32r/m32700ut/m32700ut_lan.h             | 1 -
 include/asm-m32r/m32700ut/m32700ut_lcd.h             | 1 -
 include/asm-m32r/m32700ut/m32700ut_pld.h             | 1 -
 include/asm-m32r/m32r.h                              | 1 -
 include/asm-m32r/mmu.h                               | 1 -
 include/asm-m32r/mmu_context.h                       | 2 --
 include/asm-m32r/opsput/opsput_lan.h                 | 1 -
 include/asm-m32r/opsput/opsput_lcd.h                 | 1 -
 include/asm-m32r/opsput/opsput_pld.h                 | 1 -
 include/asm-m32r/page.h                              | 1 -
 include/asm-m32r/pgalloc.h                           | 1 -
 include/asm-m32r/pgtable-2level.h                    | 1 -
 include/asm-m32r/pgtable.h                           | 1 -
 include/asm-m32r/processor.h                         | 1 -
 include/asm-m32r/ptrace.h                            | 1 -
 include/asm-m32r/rtc.h                               | 1 -
 include/asm-m32r/semaphore.h                         | 1 -
 include/asm-m32r/serial.h                            | 1 -
 include/asm-m32r/sigcontext.h                        | 1 -
 include/asm-m32r/smp.h                               | 1 -
 include/asm-m32r/spinlock.h                          | 1 -
 include/asm-m32r/system.h                            | 1 -
 include/asm-m32r/timex.h                             | 1 -
 include/asm-m32r/tlbflush.h                          | 1 -
 include/asm-m32r/uaccess.h                           | 1 -
 include/asm-m68k/atomic.h                            | 1 -
 include/asm-m68k/bug.h                               | 1 -
 include/asm-m68k/dma-mapping.h                       | 1 -
 include/asm-m68k/dma.h                               | 1 -
 include/asm-m68k/dvma.h                              | 1 -
 include/asm-m68k/elf.h                               | 1 -
 include/asm-m68k/entry.h                             | 1 -
 include/asm-m68k/fpu.h                               | 1 -
 include/asm-m68k/hardirq.h                           | 1 -
 include/asm-m68k/ide.h                               | 1 -
 include/asm-m68k/io.h                                | 1 -
 include/asm-m68k/irq.h                               | 1 -
 include/asm-m68k/mc146818rtc.h                       | 1 -
 include/asm-m68k/mmu_context.h                       | 1 -
 include/asm-m68k/motorola_pgtable.h                  | 1 -
 include/asm-m68k/openprom.h                          | 1 -
 include/asm-m68k/page.h                              | 1 -
 include/asm-m68k/page_offset.h                       | 1 -
 include/asm-m68k/pgalloc.h                           | 1 -
 include/asm-m68k/pgtable.h                           | 1 -
 include/asm-m68k/processor.h                         | 1 -
 include/asm-m68k/semaphore-helper.h                  | 1 -
 include/asm-m68k/serial.h                            | 1 -
 include/asm-m68k/setup.h                             | 1 -
 include/asm-m68k/shm.h                               | 1 -
 include/asm-m68k/system.h                            | 1 -
 include/asm-m68k/tlbflush.h                          | 1 -
 include/asm-m68k/virtconvert.h                       | 1 -
 include/asm-m68knommu/bitops.h                       | 1 -
 include/asm-m68knommu/coldfire.h                     | 1 -
 include/asm-m68knommu/commproc.h                     | 1 -
 include/asm-m68knommu/dma-mapping.h                  | 1 -
 include/asm-m68knommu/dma.h                          | 1 -
 include/asm-m68knommu/elf.h                          | 1 -
 include/asm-m68knommu/elia.h                         | 1 -
 include/asm-m68knommu/entry.h                        | 1 -
 include/asm-m68knommu/fpu.h                          | 1 -
 include/asm-m68knommu/hardirq.h                      | 1 -
 include/asm-m68knommu/io.h                           | 1 -
 include/asm-m68knommu/irq.h                          | 1 -
 include/asm-m68knommu/m5206sim.h                     | 1 -
 include/asm-m68knommu/m520xsim.h                     | 1 -
 include/asm-m68knommu/m523xsim.h                     | 1 -
 include/asm-m68knommu/m5272sim.h                     | 1 -
 include/asm-m68knommu/m527xsim.h                     | 1 -
 include/asm-m68knommu/m528xsim.h                     | 1 -
 include/asm-m68knommu/mcfcache.h                     | 1 -
 include/asm-m68knommu/mcfdma.h                       | 1 -
 include/asm-m68knommu/mcfmbus.h                      | 1 -
 include/asm-m68knommu/mcfne.h                        | 1 -
 include/asm-m68knommu/mcfpci.h                       | 1 -
 include/asm-m68knommu/mcfpit.h                       | 1 -
 include/asm-m68knommu/mcfsim.h                       | 1 -
 include/asm-m68knommu/mcfsmc.h                       | 1 -
 include/asm-m68knommu/mcftimer.h                     | 1 -
 include/asm-m68knommu/mcfuart.h                      | 1 -
 include/asm-m68knommu/mcfwdebug.h                    | 1 -
 include/asm-m68knommu/mmu_context.h                  | 1 -
 include/asm-m68knommu/nettel.h                       | 1 -
 include/asm-m68knommu/page.h                         | 1 -
 include/asm-m68knommu/page_offset.h                  | 1 -
 include/asm-m68knommu/param.h                        | 1 -
 include/asm-m68knommu/pgtable.h                      | 1 -
 include/asm-m68knommu/processor.h                    | 1 -
 include/asm-m68knommu/semaphore-helper.h             | 1 -
 include/asm-m68knommu/system.h                       | 1 -
 include/asm-m68knommu/unaligned.h                    | 1 -
 include/asm-mips/a.out.h                             | 1 -
 include/asm-mips/addrspace.h                         | 1 -
 include/asm-mips/arc/types.h                         | 1 -
 include/asm-mips/asm.h                               | 1 -
 include/asm-mips/asmmacro.h                          | 1 -
 include/asm-mips/atomic.h                            | 1 -
 include/asm-mips/bcache.h                            | 1 -
 include/asm-mips/bitops.h                            | 1 -
 include/asm-mips/bug.h                               | 1 -
 include/asm-mips/bugs.h                              | 1 -
 include/asm-mips/byteorder.h                         | 1 -
 include/asm-mips/cache.h                             | 1 -
 include/asm-mips/checksum.h                          | 1 -
 include/asm-mips/cpu-features.h                      | 1 -
 include/asm-mips/cpu-info.h                          | 1 -
 include/asm-mips/ddb5xxx/ddb5477.h                   | 1 -
 include/asm-mips/ddb5xxx/ddb5xxx.h                   | 1 -
 include/asm-mips/debug.h                             | 1 -
 include/asm-mips/dec/prom.h                          | 1 -
 include/asm-mips/delay.h                             | 1 -
 include/asm-mips/dma.h                               | 1 -
 include/asm-mips/elf.h                               | 1 -
 include/asm-mips/fcntl.h                             | 1 -
 include/asm-mips/fixmap.h                            | 1 -
 include/asm-mips/fpu.h                               | 1 -
 include/asm-mips/futex.h                             | 1 -
 include/asm-mips/hazards.h                           | 1 -
 include/asm-mips/highmem.h                           | 1 -
 include/asm-mips/interrupt.h                         | 1 -
 include/asm-mips/io.h                                | 1 -
 include/asm-mips/ip32/machine.h                      | 1 -
 include/asm-mips/irq.h                               | 1 -
 include/asm-mips/isadep.h                            | 1 -
 include/asm-mips/jmr3927/irq.h                       | 1 -
 include/asm-mips/kmap_types.h                        | 1 -
 include/asm-mips/local.h                             | 1 -
 include/asm-mips/mach-au1x00/au1000.h                | 1 -
 include/asm-mips/mach-au1x00/au1xxx.h                | 1 -
 include/asm-mips/mach-au1x00/au1xxx_dbdma.h          | 1 -
 include/asm-mips/mach-au1x00/au1xxx_ide.h            | 1 -
 include/asm-mips/mach-au1x00/au1xxx_psc.h            | 1 -
 include/asm-mips/mach-au1x00/ioremap.h               | 1 -
 include/asm-mips/mach-cobalt/cpu-feature-overrides.h | 1 -
 include/asm-mips/mach-db1x00/db1x00.h                | 1 -
 include/asm-mips/mach-generic/ide.h                  | 1 -
 include/asm-mips/mach-generic/kmalloc.h              | 1 -
 include/asm-mips/mach-generic/spaces.h               | 1 -
 include/asm-mips/mach-ip22/spaces.h                  | 1 -
 include/asm-mips/mach-ip32/cpu-feature-overrides.h   | 1 -
 include/asm-mips/mach-ip32/kmalloc.h                 | 1 -
 include/asm-mips/mach-mips/cpu-feature-overrides.h   | 1 -
 include/asm-mips/mach-mips/irq.h                     | 1 -
 include/asm-mips/mach-pb1x00/pb1550.h                | 1 -
 include/asm-mips/mach-sim/cpu-feature-overrides.h    | 1 -
 include/asm-mips/mips-boards/generic.h               | 1 -
 include/asm-mips/mipsregs.h                          | 1 -
 include/asm-mips/mmu_context.h                       | 1 -
 include/asm-mips/mmzone.h                            | 1 -
 include/asm-mips/module.h                            | 1 -
 include/asm-mips/msgbuf.h                            | 1 -
 include/asm-mips/paccess.h                           | 1 -
 include/asm-mips/page.h                              | 1 -
 include/asm-mips/pci.h                               | 1 -
 include/asm-mips/pgalloc.h                           | 1 -
 include/asm-mips/pgtable-32.h                        | 1 -
 include/asm-mips/pgtable-64.h                        | 1 -
 include/asm-mips/pgtable-bits.h                      | 1 -
 include/asm-mips/pgtable.h                           | 1 -
 include/asm-mips/prefetch.h                          | 1 -
 include/asm-mips/processor.h                         | 1 -
 include/asm-mips/ptrace.h                            | 1 -
 include/asm-mips/reg.h                               | 1 -
 include/asm-mips/resource.h                          | 1 -
 include/asm-mips/serial.h                            | 1 -
 include/asm-mips/sgiarcs.h                           | 1 -
 include/asm-mips/sibyte/board.h                      | 1 -
 include/asm-mips/sibyte/carmel.h                     | 1 -
 include/asm-mips/sibyte/sentosa.h                    | 1 -
 include/asm-mips/sibyte/swarm.h                      | 1 -
 include/asm-mips/siginfo.h                           | 1 -
 include/asm-mips/signal.h                            | 1 -
 include/asm-mips/sim.h                               | 1 -
 include/asm-mips/smp.h                               | 1 -
 include/asm-mips/sn/addrs.h                          | 1 -
 include/asm-mips/sn/agent.h                          | 1 -
 include/asm-mips/sn/arch.h                           | 1 -
 include/asm-mips/sn/io.h                             | 1 -
 include/asm-mips/sn/klconfig.h                       | 1 -
 include/asm-mips/sn/kldir.h                          | 1 -
 include/asm-mips/sn/launch.h                         | 1 -
 include/asm-mips/sn/mapped_kernel.h                  | 1 -
 include/asm-mips/sn/sn0/addrs.h                      | 1 -
 include/asm-mips/sn/sn0/arch.h                       | 1 -
 include/asm-mips/sn/sn0/hubmd.h                      | 1 -
 include/asm-mips/stackframe.h                        | 1 -
 include/asm-mips/string.h                            | 1 -
 include/asm-mips/system.h                            | 1 -
 include/asm-mips/thread_info.h                       | 1 -
 include/asm-mips/tlbflush.h                          | 1 -
 include/asm-mips/tx4927/toshiba_rbtx4927.h           | 1 -
 include/asm-mips/types.h                             | 1 -
 include/asm-mips/uaccess.h                           | 1 -
 include/asm-mips/unistd.h                            | 1 -
 include/asm-mips/vr41xx/vrc4173.h                    | 1 -
 include/asm-mips/war.h                               | 1 -
 include/asm-mips/wbflush.h                           | 1 -
 include/asm-parisc/atomic.h                          | 1 -
 include/asm-parisc/cache.h                           | 1 -
 include/asm-parisc/cacheflush.h                      | 1 -
 include/asm-parisc/dma-mapping.h                     | 1 -
 include/asm-parisc/dma.h                             | 1 -
 include/asm-parisc/io.h                              | 1 -
 include/asm-parisc/irq.h                             | 1 -
 include/asm-parisc/kmap_types.h                      | 1 -
 include/asm-parisc/page.h                            | 1 -
 include/asm-parisc/param.h                           | 1 -
 include/asm-parisc/pci.h                             | 1 -
 include/asm-parisc/pdc.h                             | 1 -
 include/asm-parisc/pgtable.h                         | 1 -
 include/asm-parisc/processor.h                       | 1 -
 include/asm-parisc/psw.h                             | 1 -
 include/asm-parisc/smp.h                             | 1 -
 include/asm-parisc/system.h                          | 1 -
 include/asm-parisc/tlbflush.h                        | 1 -
 include/asm-powerpc/abs_addr.h                       | 1 -
 include/asm-powerpc/cache.h                          | 1 -
 include/asm-powerpc/dma-mapping.h                    | 1 -
 include/asm-powerpc/dma.h                            | 1 -
 include/asm-powerpc/eeh.h                            | 1 -
 include/asm-powerpc/floppy.h                         | 1 -
 include/asm-powerpc/hw_irq.h                         | 1 -
 include/asm-powerpc/ide.h                            | 1 -
 include/asm-powerpc/iommu.h                          | 1 -
 include/asm-powerpc/irq.h                            | 1 -
 include/asm-powerpc/iseries/iseries_io.h             | 1 -
 include/asm-powerpc/machdep.h                        | 1 -
 include/asm-powerpc/mmzone.h                         | 1 -
 include/asm-powerpc/paca.h                           | 1 -
 include/asm-powerpc/page.h                           | 1 -
 include/asm-powerpc/pgtable.h                        | 1 -
 include/asm-powerpc/ppc_asm.h                        | 1 -
 include/asm-powerpc/prom.h                           | 1 -
 include/asm-powerpc/smp.h                            | 1 -
 include/asm-powerpc/smu.h                            | 1 -
 include/asm-powerpc/spu.h                            | 1 -
 include/asm-powerpc/thread_info.h                    | 1 -
 include/asm-powerpc/time.h                           | 1 -
 include/asm-powerpc/timex.h                          | 1 -
 include/asm-powerpc/tlb.h                            | 1 -
 include/asm-powerpc/tlbflush.h                       | 1 -
 include/asm-powerpc/topology.h                       | 1 -
 include/asm-powerpc/types.h                          | 1 -
 include/asm-powerpc/unistd.h                         | 1 -
 include/asm-powerpc/vga.h                            | 1 -
 include/asm-powerpc/vio.h                            | 1 -
 include/asm-ppc/amigahw.h                            | 1 -
 include/asm-ppc/bootinfo.h                           | 1 -
 include/asm-ppc/commproc.h                           | 1 -
 include/asm-ppc/ibm403.h                             | 1 -
 include/asm-ppc/ibm44x.h                             | 1 -
 include/asm-ppc/ibm4xx.h                             | 1 -
 include/asm-ppc/io.h                                 | 1 -
 include/asm-ppc/machdep.h                            | 1 -
 include/asm-ppc/mmu.h                                | 1 -
 include/asm-ppc/mmu_context.h                        | 1 -
 include/asm-ppc/mpc8260.h                            | 1 -
 include/asm-ppc/mpc83xx.h                            | 1 -
 include/asm-ppc/mpc85xx.h                            | 1 -
 include/asm-ppc/mpc8xx.h                             | 1 -
 include/asm-ppc/mv64x60.h                            | 1 -
 include/asm-ppc/ocp.h                                | 1 -
 include/asm-ppc/open_pic.h                           | 1 -
 include/asm-ppc/page.h                               | 2 --
 include/asm-ppc/pc_serial.h                          | 1 -
 include/asm-ppc/pgalloc.h                            | 1 -
 include/asm-ppc/pgtable.h                            | 1 -
 include/asm-ppc/ppc4xx_dma.h                         | 1 -
 include/asm-ppc/ppc4xx_pic.h                         | 1 -
 include/asm-ppc/serial.h                             | 1 -
 include/asm-ppc/smp.h                                | 1 -
 include/asm-ppc/time.h                               | 1 -
 include/asm-s390/bitops.h                            | 1 -
 include/asm-s390/debug.h                             | 1 -
 include/asm-s390/hardirq.h                           | 1 -
 include/asm-s390/idals.h                             | 1 -
 include/asm-s390/local.h                             | 1 -
 include/asm-s390/lowcore.h                           | 1 -
 include/asm-s390/pgalloc.h                           | 1 -
 include/asm-s390/ptrace.h                            | 1 -
 include/asm-s390/sfp-machine.h                       | 1 -
 include/asm-s390/smp.h                               | 1 -
 include/asm-s390/system.h                            | 1 -
 include/asm-s390/tlbflush.h                          | 1 -
 include/asm-s390/types.h                             | 1 -
 include/asm-s390/unistd.h                            | 1 -
 include/asm-sh/bug.h                                 | 1 -
 include/asm-sh/checksum.h                            | 1 -
 include/asm-sh/dma-mapping.h                         | 1 -
 include/asm-sh/dma.h                                 | 1 -
 include/asm-sh/fixmap.h                              | 1 -
 include/asm-sh/hardirq.h                             | 1 -
 include/asm-sh/hd64461/hd64461.h                     | 1 -
 include/asm-sh/hd64465/hd64465.h                     | 1 -
 include/asm-sh/ide.h                                 | 1 -
 include/asm-sh/io.h                                  | 1 -
 include/asm-sh/irq.h                                 | 1 -
 include/asm-sh/keyboard.h                            | 1 -
 include/asm-sh/kmap_types.h                          | 1 -
 include/asm-sh/machvec.h                             | 1 -
 include/asm-sh/machvec_init.h                        | 1 -
 include/asm-sh/mpc1211/dma.h                         | 1 -
 include/asm-sh/overdrive/overdrive.h                 | 1 -
 include/asm-sh/page.h                                | 1 -
 include/asm-sh/pgtable.h                             | 1 -
 include/asm-sh/serial.h                              | 1 -
 include/asm-sh/smp.h                                 | 1 -
 include/asm-sh/system.h                              | 1 -
 include/asm-sh/types.h                               | 1 -
 include/asm-sh/watchdog.h                            | 1 -
 include/asm-sh64/bug.h                               | 1 -
 include/asm-sh64/dma-mapping.h                       | 1 -
 include/asm-sh64/hardirq.h                           | 1 -
 include/asm-sh64/ide.h                               | 1 -
 include/asm-sh64/irq.h                               | 1 -
 include/asm-sh64/mmu_context.h                       | 1 -
 include/asm-sh64/page.h                              | 1 -
 include/asm-sh64/param.h                             | 1 -
 include/asm-sh64/pgtable.h                           | 1 -
 include/asm-sh64/system.h                            | 1 -
 include/asm-sparc/asmmacro.h                         | 1 -
 include/asm-sparc/atomic.h                           | 1 -
 include/asm-sparc/bugs.h                             | 1 -
 include/asm-sparc/cacheflush.h                       | 1 -
 include/asm-sparc/delay.h                            | 1 -
 include/asm-sparc/dma-mapping.h                      | 1 -
 include/asm-sparc/dma.h                              | 1 -
 include/asm-sparc/elf.h                              | 1 -
 include/asm-sparc/fixmap.h                           | 1 -
 include/asm-sparc/hardirq.h                          | 1 -
 include/asm-sparc/ide.h                              | 1 -
 include/asm-sparc/irq.h                              | 1 -
 include/asm-sparc/mostek.h                           | 1 -
 include/asm-sparc/page.h                             | 1 -
 include/asm-sparc/pgalloc.h                          | 1 -
 include/asm-sparc/pgtable.h                          | 1 -
 include/asm-sparc/sfp-machine.h                      | 1 -
 include/asm-sparc/smp.h                              | 1 -
 include/asm-sparc/system.h                           | 2 --
 include/asm-sparc/timer.h                            | 1 -
 include/asm-sparc/tlbflush.h                         | 1 -
 include/asm-sparc/vac-ops.h                          | 1 -
 include/asm-sparc/winmacro.h                         | 1 -
 include/asm-sparc64/atomic.h                         | 1 -
 include/asm-sparc64/bitops.h                         | 1 -
 include/asm-sparc64/bugs.h                           | 1 -
 include/asm-sparc64/cacheflush.h                     | 1 -
 include/asm-sparc64/delay.h                          | 1 -
 include/asm-sparc64/dma-mapping.h                    | 1 -
 include/asm-sparc64/dma.h                            | 1 -
 include/asm-sparc64/floppy.h                         | 1 -
 include/asm-sparc64/ide.h                            | 1 -
 include/asm-sparc64/irq.h                            | 1 -
 include/asm-sparc64/kprobes.h                        | 1 -
 include/asm-sparc64/mc146818rtc.h                    | 1 -
 include/asm-sparc64/mmu.h                            | 1 -
 include/asm-sparc64/oplib.h                          | 1 -
 include/asm-sparc64/page.h                           | 1 -
 include/asm-sparc64/param.h                          | 1 -
 include/asm-sparc64/pgalloc.h                        | 1 -
 include/asm-sparc64/pgtable.h                        | 1 -
 include/asm-sparc64/processor.h                      | 1 -
 include/asm-sparc64/siginfo.h                        | 1 -
 include/asm-sparc64/signal.h                         | 1 -
 include/asm-sparc64/smp.h                            | 1 -
 include/asm-sparc64/spinlock.h                       | 1 -
 include/asm-sparc64/system.h                         | 1 -
 include/asm-sparc64/timer.h                          | 1 -
 include/asm-sparc64/tlb.h                            | 1 -
 include/asm-sparc64/tlbflush.h                       | 1 -
 include/asm-sparc64/ttable.h                         | 1 -
 include/asm-um/a.out.h                               | 1 -
 include/asm-um/cache.h                               | 1 -
 include/asm-um/elf-ppc.h                             | 1 -
 include/asm-um/fixmap.h                              | 1 -
 include/asm-um/hardirq.h                             | 1 -
 include/asm-um/linkage.h                             | 1 -
 include/asm-um/mmu_context.h                         | 1 -
 include/asm-um/page.h                                | 1 -
 include/asm-um/pgalloc.h                             | 1 -
 include/asm-um/processor-generic.h                   | 1 -
 include/asm-um/ptrace-generic.h                      | 1 -
 include/asm-um/smp.h                                 | 1 -
 include/asm-um/thread_info.h                         | 1 -
 include/asm-v850/atomic.h                            | 1 -
 include/asm-v850/bitops.h                            | 1 -
 include/asm-v850/dma-mapping.h                       | 1 -
 include/asm-v850/hardirq.h                           | 1 -
 include/asm-v850/machdep.h                           | 1 -
 include/asm-v850/pgtable.h                           | 1 -
 include/asm-v850/processor.h                         | 1 -
 include/asm-v850/serial.h                            | 1 -
 include/asm-v850/v850e_uart.h                        | 1 -
 include/asm-x86_64/apic.h                            | 1 -
 include/asm-x86_64/atomic.h                          | 1 -
 include/asm-x86_64/bitops.h                          | 1 -
 include/asm-x86_64/bugs.h                            | 1 -
 include/asm-x86_64/cache.h                           | 1 -
 include/asm-x86_64/calling.h                         | 1 -
 include/asm-x86_64/dma-mapping.h                     | 1 -
 include/asm-x86_64/dma.h                             | 1 -
 include/asm-x86_64/dwarf2.h                          | 1 -
 include/asm-x86_64/fixmap.h                          | 1 -
 include/asm-x86_64/hardirq.h                         | 1 -
 include/asm-x86_64/hw_irq.h                          | 1 -
 include/asm-x86_64/ia32.h                            | 1 -
 include/asm-x86_64/io.h                              | 1 -
 include/asm-x86_64/io_apic.h                         | 1 -
 include/asm-x86_64/mmu_context.h                     | 1 -
 include/asm-x86_64/mmzone.h                          | 1 -
 include/asm-x86_64/mtrr.h                            | 1 -
 include/asm-x86_64/page.h                            | 1 -
 include/asm-x86_64/param.h                           | 1 -
 include/asm-x86_64/pci.h                             | 1 -
 include/asm-x86_64/processor.h                       | 1 -
 include/asm-x86_64/serial.h                          | 1 -
 include/asm-x86_64/smp.h                             | 1 -
 include/asm-x86_64/spinlock.h                        | 1 -
 include/asm-x86_64/swiotlb.h                         | 1 -
 include/asm-x86_64/system.h                          | 1 -
 include/asm-x86_64/tlbflush.h                        | 1 -
 include/asm-x86_64/topology.h                        | 1 -
 include/asm-x86_64/uaccess.h                         | 1 -
 include/asm-xtensa/atomic.h                          | 1 -
 include/asm-xtensa/checksum.h                        | 1 -
 include/asm-xtensa/delay.h                           | 1 -
 include/asm-xtensa/dma.h                             | 1 -
 include/asm-xtensa/hardirq.h                         | 1 -
 include/asm-xtensa/ide.h                             | 1 -
 include/asm-xtensa/io.h                              | 1 -
 include/asm-xtensa/irq.h                             | 1 -
 include/asm-xtensa/mmu_context.h                     | 1 -
 include/asm-xtensa/page.h                            | 1 -
 include/asm-xtensa/pgalloc.h                         | 1 -
 include/asm-xtensa/platform.h                        | 1 -
 include/asm-xtensa/system.h                          | 1 -
 include/linux/acct.h                                 | 1 -
 include/linux/acpi.h                                 | 1 -
 include/linux/amba/clcd.h                            | 1 -
 include/linux/atmdev.h                               | 1 -
 include/linux/blkdev.h                               | 1 -
 include/linux/blktrace_api.h                         | 1 -
 include/linux/blockgroup_lock.h                      | 1 -
 include/linux/cache.h                                | 1 -
 include/linux/coda.h                                 | 1 -
 include/linux/compat.h                               | 1 -
 include/linux/cpufreq.h                              | 1 -
 include/linux/crypto.h                               | 1 -
 include/linux/cyclomx.h                              | 1 -
 include/linux/dcookies.h                             | 1 -
 include/linux/devfs_fs_kernel.h                      | 1 -
 include/linux/device.h                               | 1 -
 include/linux/dmi.h                                  | 1 -
 include/linux/dnotify.h                              | 1 -
 include/linux/errqueue.h                             | 1 -
 include/linux/fs.h                                   | 1 -
 include/linux/ftape.h                                | 1 -
 include/linux/gfp.h                                  | 1 -
 include/linux/hardirq.h                              | 1 -
 include/linux/highmem.h                              | 1 -
 include/linux/highuid.h                              | 1 -
 include/linux/ide.h                                  | 1 -
 include/linux/if_frad.h                              | 1 -
 include/linux/if_tr.h                                | 1 -
 include/linux/init.h                                 | 1 -
 include/linux/inotify.h                              | 1 -
 include/linux/interrupt.h                            | 1 -
 include/linux/ipv6.h                                 | 1 -
 include/linux/irq.h                                  | 1 -
 include/linux/irq_cpustat.h                          | 1 -
 include/linux/isapnp.h                               | 1 -
 include/linux/isdn.h                                 | 1 -
 include/linux/isdn_ppp.h                             | 1 -
 include/linux/isdnif.h                               | 1 -
 include/linux/kallsyms.h                             | 1 -
 include/linux/kernel_stat.h                          | 1 -
 include/linux/kmod.h                                 | 1 -
 include/linux/kprobes.h                              | 1 -
 include/linux/linkage.h                              | 1 -
 include/linux/lockd/lockd.h                          | 1 -
 include/linux/lockd/nlm.h                            | 1 -
 include/linux/mempolicy.h                            | 1 -
 include/linux/migrate.h                              | 1 -
 include/linux/mm.h                                   | 1 -
 include/linux/mman.h                                 | 1 -
 include/linux/mmzone.h                               | 1 -
 include/linux/module.h                               | 1 -
 include/linux/mtd/cfi.h                              | 1 -
 include/linux/mtd/map.h                              | 1 -
 include/linux/mtd/mtd.h                              | 1 -
 include/linux/mtd/nand.h                             | 1 -
 include/linux/mtd/physmap.h                          | 1 -
 include/linux/mtd/xip.h                              | 1 -
 include/linux/net.h                                  | 1 -
 include/linux/netdevice.h                            | 1 -
 include/linux/netfilter.h                            | 1 -
 include/linux/netfilter_arp.h                        | 1 -
 include/linux/netfilter_bridge.h                     | 1 -
 include/linux/netfilter_ipv4.h                       | 1 -
 include/linux/netfilter_ipv4/ip_conntrack.h          | 1 -
 include/linux/netfilter_ipv4/listhelp.h              | 1 -
 include/linux/nfsd/nfsd.h                            | 1 -
 include/linux/nfsd/nfsfh.h                           | 1 -
 include/linux/nfsd/syscall.h                         | 1 -
 include/linux/numa.h                                 | 1 -
 include/linux/parport.h                              | 1 -
 include/linux/pci.h                                  | 1 -
 include/linux/percpu_counter.h                       | 1 -
 include/linux/pm.h                                   | 1 -
 include/linux/pm_legacy.h                            | 1 -
 include/linux/pmu.h                                  | 1 -
 include/linux/preempt.h                              | 1 -
 include/linux/proc_fs.h                              | 1 -
 include/linux/profile.h                              | 1 -
 include/linux/quotaops.h                             | 1 -
 include/linux/reiserfs_xattr.h                       | 1 -
 include/linux/relay.h                                | 1 -
 include/linux/rio.h                                  | 1 -
 include/linux/rio_drv.h                              | 1 -
 include/linux/rmap.h                                 | 1 -
 include/linux/rtnetlink.h                            | 1 -
 include/linux/rwsem.h                                | 1 -
 include/linux/scc.h                                  | 1 -
 include/linux/seccomp.h                              | 1 -
 include/linux/seqlock.h                              | 1 -
 include/linux/serialP.h                              | 1 -
 include/linux/serial_core.h                          | 1 -
 include/linux/skbuff.h                               | 1 -
 include/linux/slab.h                                 | 1 -
 include/linux/smp.h                                  | 1 -
 include/linux/smp_lock.h                             | 1 -
 include/linux/spinlock.h                             | 1 -
 include/linux/stop_machine.h                         | 1 -
 include/linux/sunrpc/auth.h                          | 1 -
 include/linux/sunrpc/debug.h                         | 1 -
 include/linux/sunrpc/stats.h                         | 1 -
 include/linux/suspend.h                              | 1 -
 include/linux/swap.h                                 | 1 -
 include/linux/syscalls.h                             | 1 -
 include/linux/sysrq.h                                | 1 -
 include/linux/tcp.h                                  | 1 -
 include/linux/threads.h                              | 1 -
 include/linux/timer.h                                | 1 -
 include/linux/timex.h                                | 1 -
 include/linux/tty.h                                  | 1 -
 include/linux/types.h                                | 1 -
 include/linux/udp.h                                  | 1 -
 include/linux/usb.h                                  | 1 -
 include/linux/usb_usual.h                            | 1 -
 include/linux/vt_buffer.h                            | 1 -
 include/linux/vt_kern.h                              | 1 -
 include/linux/wait.h                                 | 1 -
 include/net/addrconf.h                               | 1 -
 include/net/af_unix.h                                | 1 -
 include/net/ax25.h                                   | 1 -
 include/net/compat.h                                 | 1 -
 include/net/dst.h                                    | 1 -
 include/net/icmp.h                                   | 1 -
 include/net/inet6_hashtables.h                       | 1 -
 include/net/inet_hashtables.h                        | 1 -
 include/net/inet_sock.h                              | 1 -
 include/net/inet_timewait_sock.h                     | 1 -
 include/net/ip.h                                     | 1 -
 include/net/ip_fib.h                                 | 1 -
 include/net/ip_mp_alg.h                              | 1 -
 include/net/ip_vs.h                                  | 1 -
 include/net/ipv6.h                                   | 1 -
 include/net/irda/irda.h                              | 1 -
 include/net/irda/irda_device.h                       | 1 -
 include/net/irda/irlap.h                             | 1 -
 include/net/irda/irlmp.h                             | 1 -
 include/net/irda/irlmp_frame.h                       | 1 -
 include/net/irda/qos.h                               | 1 -
 include/net/ndisc.h                                  | 1 -
 include/net/netfilter/nf_conntrack.h                 | 1 -
 include/net/pkt_act.h                                | 1 -
 include/net/protocol.h                               | 1 -
 include/net/raw.h                                    | 1 -
 include/net/red.h                                    | 1 -
 include/net/route.h                                  | 1 -
 include/net/sch_generic.h                            | 1 -
 include/net/sctp/sctp.h                              | 1 -
 include/net/sock.h                                   | 1 -
 include/net/tcp.h                                    | 1 -
 include/pcmcia/ss.h                                  | 1 -
 include/scsi/scsi_transport_fc.h                     | 1 -
 include/scsi/scsi_transport_spi.h                    | 1 -
 include/sound/driver.h                               | 1 -
 include/video/edid.h                                 | 1 -
 include/video/vga.h                                  | 1 -
 836 files changed, 839 deletions(-)

(limited to 'include/linux')

diff --git a/include/acpi/platform/aclinux.h b/include/acpi/platform/aclinux.h
index 2e6d54569ee8..3c6a6205853a 100644
--- a/include/acpi/platform/aclinux.h
+++ b/include/acpi/platform/aclinux.h
@@ -49,7 +49,6 @@
 
 #ifdef __KERNEL__
 
-#include <linux/config.h>
 #include <linux/string.h>
 #include <linux/kernel.h>
 #include <linux/ctype.h>
diff --git a/include/acpi/processor.h b/include/acpi/processor.h
index badf0277b1be..ef7d83a41470 100644
--- a/include/acpi/processor.h
+++ b/include/acpi/processor.h
@@ -2,7 +2,6 @@
 #define __ACPI_PROCESSOR_H
 
 #include <linux/kernel.h>
-#include <linux/config.h>
 
 #include <asm/acpi.h>
 
diff --git a/include/asm-alpha/bitops.h b/include/asm-alpha/bitops.h
index 3f88715e811e..4b6ef7f21b93 100644
--- a/include/asm-alpha/bitops.h
+++ b/include/asm-alpha/bitops.h
@@ -1,7 +1,6 @@
 #ifndef _ALPHA_BITOPS_H
 #define _ALPHA_BITOPS_H
 
-#include <linux/config.h>
 #include <asm/compiler.h>
 
 /*
diff --git a/include/asm-alpha/cache.h b/include/asm-alpha/cache.h
index e6d4d1695e25..f199e69a5d0b 100644
--- a/include/asm-alpha/cache.h
+++ b/include/asm-alpha/cache.h
@@ -4,7 +4,6 @@
 #ifndef __ARCH_ALPHA_CACHE_H
 #define __ARCH_ALPHA_CACHE_H
 
-#include <linux/config.h>
 
 /* Bytes per L1 (data) cache line. */
 #if defined(CONFIG_ALPHA_GENERIC) || defined(CONFIG_ALPHA_EV6)
diff --git a/include/asm-alpha/cacheflush.h b/include/asm-alpha/cacheflush.h
index 3fc6ef726d8c..805640b41078 100644
--- a/include/asm-alpha/cacheflush.h
+++ b/include/asm-alpha/cacheflush.h
@@ -1,7 +1,6 @@
 #ifndef _ALPHA_CACHEFLUSH_H
 #define _ALPHA_CACHEFLUSH_H
 
-#include <linux/config.h>
 #include <linux/mm.h>
 
 /* Caches aren't brain-dead on the Alpha. */
diff --git a/include/asm-alpha/core_cia.h b/include/asm-alpha/core_cia.h
index 3a70d68bfce8..9e0516c0ca27 100644
--- a/include/asm-alpha/core_cia.h
+++ b/include/asm-alpha/core_cia.h
@@ -4,7 +4,6 @@
 /* Define to experiment with fitting everything into one 512MB HAE window.  */
 #define CIA_ONE_HAE_WINDOW 1
 
-#include <linux/config.h>
 #include <linux/types.h>
 #include <asm/compiler.h>
 
diff --git a/include/asm-alpha/core_t2.h b/include/asm-alpha/core_t2.h
index 5c1c40338c82..dba70c62a16c 100644
--- a/include/asm-alpha/core_t2.h
+++ b/include/asm-alpha/core_t2.h
@@ -1,7 +1,6 @@
 #ifndef __ALPHA_T2__H__
 #define __ALPHA_T2__H__
 
-#include <linux/config.h>
 #include <linux/types.h>
 #include <linux/spinlock.h>
 #include <asm/compiler.h>
diff --git a/include/asm-alpha/dma-mapping.h b/include/asm-alpha/dma-mapping.h
index 62d0d6681aa9..b9ff4d8cb33a 100644
--- a/include/asm-alpha/dma-mapping.h
+++ b/include/asm-alpha/dma-mapping.h
@@ -1,7 +1,6 @@
 #ifndef _ALPHA_DMA_MAPPING_H
 #define _ALPHA_DMA_MAPPING_H
 
-#include <linux/config.h>
 
 #ifdef CONFIG_PCI
 
diff --git a/include/asm-alpha/dma.h b/include/asm-alpha/dma.h
index 683afaa3deed..87cfdbdf08fc 100644
--- a/include/asm-alpha/dma.h
+++ b/include/asm-alpha/dma.h
@@ -18,7 +18,6 @@
 #ifndef _ASM_DMA_H
 #define _ASM_DMA_H
 
-#include <linux/config.h>
 #include <linux/spinlock.h>
 #include <asm/io.h>
 
diff --git a/include/asm-alpha/floppy.h b/include/asm-alpha/floppy.h
index 289a00d51a90..e177d4180f83 100644
--- a/include/asm-alpha/floppy.h
+++ b/include/asm-alpha/floppy.h
@@ -10,7 +10,6 @@
 #ifndef __ASM_ALPHA_FLOPPY_H
 #define __ASM_ALPHA_FLOPPY_H
 
-#include <linux/config.h>
 
 #define fd_inb(port)			inb_p(port)
 #define fd_outb(value,port)		outb_p(value,port)
diff --git a/include/asm-alpha/hardirq.h b/include/asm-alpha/hardirq.h
index 7bb6a36c96a1..d953e234daa8 100644
--- a/include/asm-alpha/hardirq.h
+++ b/include/asm-alpha/hardirq.h
@@ -1,7 +1,6 @@
 #ifndef _ALPHA_HARDIRQ_H
 #define _ALPHA_HARDIRQ_H
 
-#include <linux/config.h>
 #include <linux/threads.h>
 #include <linux/cache.h>
 
diff --git a/include/asm-alpha/hw_irq.h b/include/asm-alpha/hw_irq.h
index a310b9efc906..ca9d43b63502 100644
--- a/include/asm-alpha/hw_irq.h
+++ b/include/asm-alpha/hw_irq.h
@@ -1,7 +1,6 @@
 #ifndef _ALPHA_HW_IRQ_H
 #define _ALPHA_HW_IRQ_H
 
-#include <linux/config.h>
 
 static inline void hw_resend_irq(struct hw_interrupt_type *h, unsigned int i) {}
 
diff --git a/include/asm-alpha/ide.h b/include/asm-alpha/ide.h
index 6126afe27380..2a5cc0b367ab 100644
--- a/include/asm-alpha/ide.h
+++ b/include/asm-alpha/ide.h
@@ -13,7 +13,6 @@
 
 #ifdef __KERNEL__
 
-#include <linux/config.h>
 
 #define IDE_ARCH_OBSOLETE_DEFAULTS
 
diff --git a/include/asm-alpha/io.h b/include/asm-alpha/io.h
index 3ebbeee753e9..f5ae98c25d1f 100644
--- a/include/asm-alpha/io.h
+++ b/include/asm-alpha/io.h
@@ -3,7 +3,6 @@
 
 #ifdef __KERNEL__
 
-#include <linux/config.h>
 #include <linux/kernel.h>
 #include <asm/compiler.h>
 #include <asm/system.h>
diff --git a/include/asm-alpha/irq.h b/include/asm-alpha/irq.h
index 566db720000a..f6de033718a0 100644
--- a/include/asm-alpha/irq.h
+++ b/include/asm-alpha/irq.h
@@ -8,7 +8,6 @@
  */
 
 #include <linux/linkage.h>
-#include <linux/config.h>
 
 #if   defined(CONFIG_ALPHA_GENERIC)
 
diff --git a/include/asm-alpha/kmap_types.h b/include/asm-alpha/kmap_types.h
index 3d10cd3ea75f..3e6735a34c57 100644
--- a/include/asm-alpha/kmap_types.h
+++ b/include/asm-alpha/kmap_types.h
@@ -3,7 +3,6 @@
 
 /* Dummy header just to define km_type. */
 
-#include <linux/config.h>
 
 #ifdef CONFIG_DEBUG_HIGHMEM
 # define D(n) __KM_FENCE_##n ,
diff --git a/include/asm-alpha/machvec.h b/include/asm-alpha/machvec.h
index ece166a203ec..aced22f91752 100644
--- a/include/asm-alpha/machvec.h
+++ b/include/asm-alpha/machvec.h
@@ -1,7 +1,6 @@
 #ifndef __ALPHA_MACHVEC_H
 #define __ALPHA_MACHVEC_H 1
 
-#include <linux/config.h>
 #include <linux/types.h>
 
 /*
diff --git a/include/asm-alpha/mmu_context.h b/include/asm-alpha/mmu_context.h
index 0c017fc181c1..fe249e9d3360 100644
--- a/include/asm-alpha/mmu_context.h
+++ b/include/asm-alpha/mmu_context.h
@@ -7,7 +7,6 @@
  * Copyright (C) 1996, Linus Torvalds
  */
 
-#include <linux/config.h>
 #include <asm/system.h>
 #include <asm/machvec.h>
 #include <asm/compiler.h>
diff --git a/include/asm-alpha/mmzone.h b/include/asm-alpha/mmzone.h
index 192d80c875b0..64d0ab98fcd8 100644
--- a/include/asm-alpha/mmzone.h
+++ b/include/asm-alpha/mmzone.h
@@ -5,7 +5,6 @@
 #ifndef _ASM_MMZONE_H_
 #define _ASM_MMZONE_H_
 
-#include <linux/config.h>
 #include <asm/smp.h>
 
 struct bootmem_data_t; /* stupid forward decl. */
diff --git a/include/asm-alpha/page.h b/include/asm-alpha/page.h
index 61bcf70b5eac..8c7cd50d4eae 100644
--- a/include/asm-alpha/page.h
+++ b/include/asm-alpha/page.h
@@ -1,7 +1,6 @@
 #ifndef _ALPHA_PAGE_H
 #define _ALPHA_PAGE_H
 
-#include <linux/config.h>
 #include <asm/pal.h>
 
 /* PAGE_SHIFT determines the page size */
diff --git a/include/asm-alpha/param.h b/include/asm-alpha/param.h
index 3ed0b3b02e52..214e7996346f 100644
--- a/include/asm-alpha/param.h
+++ b/include/asm-alpha/param.h
@@ -5,7 +5,6 @@
    hardware ignores reprogramming.  We also need userland buy-in to the 
    change in HZ, since this is visible in the wait4 resources etc.  */
 
-#include <linux/config.h>
 
 #ifndef HZ
 # ifndef CONFIG_ALPHA_RAWHIDE
diff --git a/include/asm-alpha/pgalloc.h b/include/asm-alpha/pgalloc.h
index 308475642913..471864e8d4c3 100644
--- a/include/asm-alpha/pgalloc.h
+++ b/include/asm-alpha/pgalloc.h
@@ -1,7 +1,6 @@
 #ifndef _ALPHA_PGALLOC_H
 #define _ALPHA_PGALLOC_H
 
-#include <linux/config.h>
 #include <linux/mm.h>
 #include <linux/mmzone.h>
 
diff --git a/include/asm-alpha/pgtable.h b/include/asm-alpha/pgtable.h
index a985cd29b6db..93eaa58b7961 100644
--- a/include/asm-alpha/pgtable.h
+++ b/include/asm-alpha/pgtable.h
@@ -10,7 +10,6 @@
  * This hopefully works with any standard Alpha page-size, as defined
  * in <asm/page.h> (currently 8192).
  */
-#include <linux/config.h>
 #include <linux/mmzone.h>
 
 #include <asm/page.h>
diff --git a/include/asm-alpha/serial.h b/include/asm-alpha/serial.h
index 7e4b2987d453..9d263e8d8ccc 100644
--- a/include/asm-alpha/serial.h
+++ b/include/asm-alpha/serial.h
@@ -2,7 +2,6 @@
  * include/asm-alpha/serial.h
  */
 
-#include <linux/config.h>
 
 /*
  * This assumes you have a 1.8432 MHz clock for your UART.
diff --git a/include/asm-alpha/smp.h b/include/asm-alpha/smp.h
index 9950706abdf8..06fb6c119671 100644
--- a/include/asm-alpha/smp.h
+++ b/include/asm-alpha/smp.h
@@ -1,7 +1,6 @@
 #ifndef __ASM_SMP_H
 #define __ASM_SMP_H
 
-#include <linux/config.h>
 #include <linux/threads.h>
 #include <linux/cpumask.h>
 #include <linux/bitops.h>
diff --git a/include/asm-alpha/spinlock.h b/include/asm-alpha/spinlock.h
index 8197c69eff44..0c294c9b0c55 100644
--- a/include/asm-alpha/spinlock.h
+++ b/include/asm-alpha/spinlock.h
@@ -1,7 +1,6 @@
 #ifndef _ALPHA_SPINLOCK_H
 #define _ALPHA_SPINLOCK_H
 
-#include <linux/config.h>
 #include <asm/system.h>
 #include <linux/kernel.h>
 #include <asm/current.h>
diff --git a/include/asm-alpha/system.h b/include/asm-alpha/system.h
index f3b7b1a59c56..03e9c0e5ed74 100644
--- a/include/asm-alpha/system.h
+++ b/include/asm-alpha/system.h
@@ -1,7 +1,6 @@
 #ifndef __ALPHA_SYSTEM_H
 #define __ALPHA_SYSTEM_H
 
-#include <linux/config.h>
 #include <asm/pal.h>
 #include <asm/page.h>
 #include <asm/barrier.h>
diff --git a/include/asm-alpha/tlbflush.h b/include/asm-alpha/tlbflush.h
index 9d484c1fdc82..1ca3ed3bd6d3 100644
--- a/include/asm-alpha/tlbflush.h
+++ b/include/asm-alpha/tlbflush.h
@@ -1,7 +1,6 @@
 #ifndef _ALPHA_TLBFLUSH_H
 #define _ALPHA_TLBFLUSH_H
 
-#include <linux/config.h>
 #include <linux/mm.h>
 #include <asm/compiler.h>
 
diff --git a/include/asm-arm/apm.h b/include/asm-arm/apm.h
index 3a50eb759c28..d09113b37e4a 100644
--- a/include/asm-arm/apm.h
+++ b/include/asm-arm/apm.h
@@ -13,7 +13,6 @@
 #ifndef ARM_ASM_SA1100_APM_H
 #define ARM_ASM_SA1100_APM_H
 
-#include <linux/config.h>
 #include <linux/apm_bios.h>
 
 /*
diff --git a/include/asm-arm/arch-aaec2000/memory.h b/include/asm-arm/arch-aaec2000/memory.h
index d8209f8911d6..24b51cccde8f 100644
--- a/include/asm-arm/arch-aaec2000/memory.h
+++ b/include/asm-arm/arch-aaec2000/memory.h
@@ -11,7 +11,6 @@
 #ifndef __ASM_ARCH_MEMORY_H
 #define __ASM_ARCH_MEMORY_H
 
-#include <linux/config.h>
 
 #define PHYS_OFFSET	UL(0xf0000000)
 
diff --git a/include/asm-arm/arch-cl7500/acornfb.h b/include/asm-arm/arch-cl7500/acornfb.h
index 3867231a4470..aea6330c9745 100644
--- a/include/asm-arm/arch-cl7500/acornfb.h
+++ b/include/asm-arm/arch-cl7500/acornfb.h
@@ -1,4 +1,3 @@
-#include <linux/config.h>
 #define acornfb_valid_pixrate(var) (var->pixclock >= 39325 && var->pixclock <= 40119)
 
 static inline void
diff --git a/include/asm-arm/arch-clps711x/hardware.h b/include/asm-arm/arch-clps711x/hardware.h
index 1386871e1a5a..0fdbe72fff2a 100644
--- a/include/asm-arm/arch-clps711x/hardware.h
+++ b/include/asm-arm/arch-clps711x/hardware.h
@@ -22,7 +22,6 @@
 #ifndef __ASM_ARCH_HARDWARE_H
 #define __ASM_ARCH_HARDWARE_H
 
-#include <linux/config.h>
 
 #define CLPS7111_VIRT_BASE	0xff000000
 #define CLPS7111_BASE		CLPS7111_VIRT_BASE
diff --git a/include/asm-arm/arch-clps711x/memory.h b/include/asm-arm/arch-clps711x/memory.h
index 61d8717406ce..c6e8dcf674de 100644
--- a/include/asm-arm/arch-clps711x/memory.h
+++ b/include/asm-arm/arch-clps711x/memory.h
@@ -20,7 +20,6 @@
 #ifndef __ASM_ARCH_MEMORY_H
 #define __ASM_ARCH_MEMORY_H
 
-#include <linux/config.h>
 
 /*
  * Physical DRAM offset.
diff --git a/include/asm-arm/arch-clps711x/uncompress.h b/include/asm-arm/arch-clps711x/uncompress.h
index 07157b7e4b20..03d233ae87ce 100644
--- a/include/asm-arm/arch-clps711x/uncompress.h
+++ b/include/asm-arm/arch-clps711x/uncompress.h
@@ -17,7 +17,6 @@
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
-#include <linux/config.h>
 #include <asm/arch/io.h>
 #include <asm/hardware.h>
 #include <asm/hardware/clps7111.h>
diff --git a/include/asm-arm/arch-ebsa285/hardware.h b/include/asm-arm/arch-ebsa285/hardware.h
index ec51fe92483b..daad8ee2d194 100644
--- a/include/asm-arm/arch-ebsa285/hardware.h
+++ b/include/asm-arm/arch-ebsa285/hardware.h
@@ -12,7 +12,6 @@
 #ifndef __ASM_ARCH_HARDWARE_H
 #define __ASM_ARCH_HARDWARE_H
 
-#include <linux/config.h>
 #include <asm/arch/memory.h>
 
 #ifdef CONFIG_ARCH_FOOTBRIDGE
diff --git a/include/asm-arm/arch-ebsa285/memory.h b/include/asm-arm/arch-ebsa285/memory.h
index 99181ffc7e27..cbd7ae64bcc9 100644
--- a/include/asm-arm/arch-ebsa285/memory.h
+++ b/include/asm-arm/arch-ebsa285/memory.h
@@ -19,7 +19,6 @@
 #ifndef __ASM_ARCH_MEMORY_H
 #define __ASM_ARCH_MEMORY_H
 
-#include <linux/config.h>
 
 #if defined(CONFIG_FOOTBRIDGE_ADDIN)
 /*
diff --git a/include/asm-arm/arch-ebsa285/vmalloc.h b/include/asm-arm/arch-ebsa285/vmalloc.h
index d1ca955ce434..02598200997d 100644
--- a/include/asm-arm/arch-ebsa285/vmalloc.h
+++ b/include/asm-arm/arch-ebsa285/vmalloc.h
@@ -6,7 +6,6 @@
  * published by the Free Software Foundation.
  */
 
-#include <linux/config.h>
 
 #ifdef CONFIG_ARCH_FOOTBRIDGE
 #define VMALLOC_END       (PAGE_OFFSET + 0x30000000)
diff --git a/include/asm-arm/arch-integrator/smp.h b/include/asm-arm/arch-integrator/smp.h
index da6981efdc39..ab2c79bb9505 100644
--- a/include/asm-arm/arch-integrator/smp.h
+++ b/include/asm-arm/arch-integrator/smp.h
@@ -1,7 +1,6 @@
 #ifndef ASMARM_ARCH_SMP_H
 #define ASMARM_ARCH_SMP_H
 
-#include <linux/config.h>
 
 #include <asm/hardware.h>
 #include <asm/io.h>
diff --git a/include/asm-arm/arch-iop3xx/memory.h b/include/asm-arm/arch-iop3xx/memory.h
index bc62f4b13235..e43ebd984745 100644
--- a/include/asm-arm/arch-iop3xx/memory.h
+++ b/include/asm-arm/arch-iop3xx/memory.h
@@ -5,7 +5,6 @@
 #ifndef __ASM_ARCH_MEMORY_H
 #define __ASM_ARCH_MEMORY_H
 
-#include <linux/config.h>
 #include <asm/hardware.h>
 
 /*
diff --git a/include/asm-arm/arch-iop3xx/timex.h b/include/asm-arm/arch-iop3xx/timex.h
index 472badb451c4..14ca8d0f7b29 100644
--- a/include/asm-arm/arch-iop3xx/timex.h
+++ b/include/asm-arm/arch-iop3xx/timex.h
@@ -3,7 +3,6 @@
  *
  * IOP3xx architecture timex specifications
  */
-#include <linux/config.h>
 #include <asm/hardware.h>
 
 #if defined(CONFIG_ARCH_IQ80321) || defined(CONFIG_ARCH_IQ31244)
diff --git a/include/asm-arm/arch-iop3xx/uncompress.h b/include/asm-arm/arch-iop3xx/uncompress.h
index c98eb6254b1f..fbdd5af644fe 100644
--- a/include/asm-arm/arch-iop3xx/uncompress.h
+++ b/include/asm-arm/arch-iop3xx/uncompress.h
@@ -1,7 +1,6 @@
 /*
  *  linux/include/asm-arm/arch-iop3xx/uncompress.h
  */
-#include <linux/config.h>
 #include <asm/types.h>
 #include <asm/mach-types.h>
 #include <linux/serial_reg.h>
diff --git a/include/asm-arm/arch-ixp4xx/dma.h b/include/asm-arm/arch-ixp4xx/dma.h
index b1a071ecebc8..789f7f53c357 100644
--- a/include/asm-arm/arch-ixp4xx/dma.h
+++ b/include/asm-arm/arch-ixp4xx/dma.h
@@ -11,7 +11,6 @@
 #ifndef __ASM_ARCH_DMA_H
 #define __ASM_ARCH_DMA_H
 
-#include <linux/config.h>
 #include <linux/device.h>
 #include <linux/pci.h>
 #include <asm/page.h>
diff --git a/include/asm-arm/arch-lh7a40x/constants.h b/include/asm-arm/arch-lh7a40x/constants.h
index 52c1cb9c39c6..267d1145c3f9 100644
--- a/include/asm-arm/arch-lh7a40x/constants.h
+++ b/include/asm-arm/arch-lh7a40x/constants.h
@@ -12,7 +12,6 @@
 #ifndef __ASM_ARCH_CONSTANTS_H
 #define __ASM_ARCH_CONSTANTS_H
 
-#include <linux/config.h>
 
 /* Addressing constants */
 
diff --git a/include/asm-arm/arch-lh7a40x/irqs.h b/include/asm-arm/arch-lh7a40x/irqs.h
index f91f3e59f3ab..189908b2b79a 100644
--- a/include/asm-arm/arch-lh7a40x/irqs.h
+++ b/include/asm-arm/arch-lh7a40x/irqs.h
@@ -18,7 +18,6 @@
 #ifndef __ASM_ARCH_IRQS_H
 #define __ASM_ARCH_IRQS_H
 
-#include <linux/config.h>
 
 #define FIQ_START	80
 
diff --git a/include/asm-arm/arch-lh7a40x/registers.h b/include/asm-arm/arch-lh7a40x/registers.h
index 2edb22e35450..3b0d4fcd36f7 100644
--- a/include/asm-arm/arch-lh7a40x/registers.h
+++ b/include/asm-arm/arch-lh7a40x/registers.h
@@ -9,7 +9,6 @@
  *
  */
 
-#include <linux/config.h>
 #include <asm/arch/constants.h>
 
 #ifndef __ASM_ARCH_REGISTERS_H
diff --git a/include/asm-arm/arch-omap/board.h b/include/asm-arm/arch-omap/board.h
index 6d6240a4681c..dfdbf06fd646 100644
--- a/include/asm-arm/arch-omap/board.h
+++ b/include/asm-arm/arch-omap/board.h
@@ -10,7 +10,6 @@
 #ifndef _OMAP_BOARD_H
 #define _OMAP_BOARD_H
 
-#include <linux/config.h>
 #include <linux/types.h>
 
 /* Different peripheral ids */
diff --git a/include/asm-arm/arch-omap/hardware.h b/include/asm-arm/arch-omap/hardware.h
index 7909b729826c..c7d9e857795d 100644
--- a/include/asm-arm/arch-omap/hardware.h
+++ b/include/asm-arm/arch-omap/hardware.h
@@ -37,7 +37,6 @@
 #define __ASM_ARCH_OMAP_HARDWARE_H
 
 #include <asm/sizes.h>
-#include <linux/config.h>
 #ifndef __ASSEMBLER__
 #include <asm/types.h>
 #include <asm/arch/cpu.h>
diff --git a/include/asm-arm/arch-omap/system.h b/include/asm-arm/arch-omap/system.h
index 67970d1a2020..ac2bfa433f06 100644
--- a/include/asm-arm/arch-omap/system.h
+++ b/include/asm-arm/arch-omap/system.h
@@ -4,7 +4,6 @@
  */
 #ifndef __ASM_ARCH_SYSTEM_H
 #define __ASM_ARCH_SYSTEM_H
-#include <linux/config.h>
 #include <linux/clk.h>
 
 #include <asm/mach-types.h>
diff --git a/include/asm-arm/arch-omap/uncompress.h b/include/asm-arm/arch-omap/uncompress.h
index ca2c8bec82e7..aca0adfef1b8 100644
--- a/include/asm-arm/arch-omap/uncompress.h
+++ b/include/asm-arm/arch-omap/uncompress.h
@@ -17,7 +17,6 @@
  * kind, whether express or implied.
  */
 
-#include <linux/config.h>
 #include <linux/types.h>
 #include <linux/serial_reg.h>
 #include <asm/arch/serial.h>
diff --git a/include/asm-arm/arch-pxa/idp.h b/include/asm-arm/arch-pxa/idp.h
index e7ef497417bb..b6952534a4e1 100644
--- a/include/asm-arm/arch-pxa/idp.h
+++ b/include/asm-arm/arch-pxa/idp.h
@@ -15,7 +15,6 @@
  *             Changes for 2.6 kernel.
  */
 
-#include <linux/config.h>
 
 /*
  * Note: this file must be safe to include in assembly files
diff --git a/include/asm-arm/arch-pxa/irqs.h b/include/asm-arm/arch-pxa/irqs.h
index 67af238a8f8e..f3bc70eee35b 100644
--- a/include/asm-arm/arch-pxa/irqs.h
+++ b/include/asm-arm/arch-pxa/irqs.h
@@ -10,7 +10,6 @@
  * published by the Free Software Foundation.
  */
 
-#include <linux/config.h>
 
 #ifdef CONFIG_PXA27x
 #define PXA_IRQ_SKIP	0
diff --git a/include/asm-arm/arch-pxa/pxa-regs.h b/include/asm-arm/arch-pxa/pxa-regs.h
index c8f53a71c076..6650d4decaeb 100644
--- a/include/asm-arm/arch-pxa/pxa-regs.h
+++ b/include/asm-arm/arch-pxa/pxa-regs.h
@@ -13,7 +13,6 @@
 #ifndef __PXA_REGS_H
 #define __PXA_REGS_H
 
-#include <linux/config.h>
 
 /*
  * PXA Chip selects
diff --git a/include/asm-arm/arch-pxa/timex.h b/include/asm-arm/arch-pxa/timex.h
index aa125ec56a32..2473bb51d0a6 100644
--- a/include/asm-arm/arch-pxa/timex.h
+++ b/include/asm-arm/arch-pxa/timex.h
@@ -10,7 +10,6 @@
  * published by the Free Software Foundation.
  */
 
-#include <linux/config.h>
 
 #if defined(CONFIG_PXA25x)
 /* PXA250/210 timer base */
diff --git a/include/asm-arm/arch-realview/smp.h b/include/asm-arm/arch-realview/smp.h
index fc87783e8e8b..515819efd046 100644
--- a/include/asm-arm/arch-realview/smp.h
+++ b/include/asm-arm/arch-realview/smp.h
@@ -1,7 +1,6 @@
 #ifndef ASMARM_ARCH_SMP_H
 #define ASMARM_ARCH_SMP_H
 
-#include <linux/config.h>
 
 #include <asm/hardware/gic.h>
 
diff --git a/include/asm-arm/arch-s3c2410/dma.h b/include/asm-arm/arch-s3c2410/dma.h
index b011e14f3bc6..72964f9b8414 100644
--- a/include/asm-arm/arch-s3c2410/dma.h
+++ b/include/asm-arm/arch-s3c2410/dma.h
@@ -18,7 +18,6 @@
 #ifndef __ASM_ARCH_DMA_H
 #define __ASM_ARCH_DMA_H __FILE__
 
-#include <linux/config.h>
 #include <linux/sysdev.h>
 #include "hardware.h"
 
diff --git a/include/asm-arm/arch-s3c2410/uncompress.h b/include/asm-arm/arch-s3c2410/uncompress.h
index a6f6a0e44afa..0ecb8103fa70 100644
--- a/include/asm-arm/arch-s3c2410/uncompress.h
+++ b/include/asm-arm/arch-s3c2410/uncompress.h
@@ -22,7 +22,6 @@
 #ifndef __ASM_ARCH_UNCOMPRESS_H
 #define __ASM_ARCH_UNCOMPRESS_H
 
-#include <linux/config.h>
 
 /* defines for UART registers */
 #include "asm/arch/regs-serial.h"
diff --git a/include/asm-arm/arch-sa1100/assabet.h b/include/asm-arm/arch-sa1100/assabet.h
index 1f59b368c3f6..d6a1bb5b4944 100644
--- a/include/asm-arm/arch-sa1100/assabet.h
+++ b/include/asm-arm/arch-sa1100/assabet.h
@@ -12,7 +12,6 @@
 #ifndef __ASM_ARCH_ASSABET_H
 #define __ASM_ARCH_ASSABET_H
 
-#include <linux/config.h>
 
 /* System Configuration Register flags */
 
diff --git a/include/asm-arm/arch-sa1100/cerf.h b/include/asm-arm/arch-sa1100/cerf.h
index 356d5ba88991..9a19c3d07c1e 100644
--- a/include/asm-arm/arch-sa1100/cerf.h
+++ b/include/asm-arm/arch-sa1100/cerf.h
@@ -10,7 +10,6 @@
 #ifndef _INCLUDE_CERF_H_
 #define _INCLUDE_CERF_H_
 
-#include <linux/config.h>
 
 #define CERF_ETH_IO			0xf0000000
 #define CERF_ETH_IRQ IRQ_GPIO26
diff --git a/include/asm-arm/arch-sa1100/collie.h b/include/asm-arm/arch-sa1100/collie.h
index d49e5ff63ca4..14a344aa3cc7 100644
--- a/include/asm-arm/arch-sa1100/collie.h
+++ b/include/asm-arm/arch-sa1100/collie.h
@@ -13,7 +13,6 @@
 #ifndef __ASM_ARCH_COLLIE_H
 #define __ASM_ARCH_COLLIE_H
 
-#include <linux/config.h>
 
 #define COLLIE_SCP_CHARGE_ON	SCOOP_GPCR_PA11
 #define COLLIE_SCP_DIAG_BOOT1	SCOOP_GPCR_PA12
diff --git a/include/asm-arm/arch-sa1100/dma.h b/include/asm-arm/arch-sa1100/dma.h
index 02575d72ac6b..6b7917a2e77a 100644
--- a/include/asm-arm/arch-sa1100/dma.h
+++ b/include/asm-arm/arch-sa1100/dma.h
@@ -10,7 +10,6 @@
 #ifndef __ASM_ARCH_DMA_H
 #define __ASM_ARCH_DMA_H
 
-#include <linux/config.h>
 #include "hardware.h"
 
 
diff --git a/include/asm-arm/arch-sa1100/hardware.h b/include/asm-arm/arch-sa1100/hardware.h
index ee008a5484f3..1abd7cfc8bce 100644
--- a/include/asm-arm/arch-sa1100/hardware.h
+++ b/include/asm-arm/arch-sa1100/hardware.h
@@ -12,7 +12,6 @@
 #ifndef __ASM_ARCH_HARDWARE_H
 #define __ASM_ARCH_HARDWARE_H
 
-#include <linux/config.h>
 
 #define UNCACHEABLE_ADDR	0xfa050000
 
diff --git a/include/asm-arm/arch-sa1100/ide.h b/include/asm-arm/arch-sa1100/ide.h
index 2153538069c7..98b10bcf9f1b 100644
--- a/include/asm-arm/arch-sa1100/ide.h
+++ b/include/asm-arm/arch-sa1100/ide.h
@@ -9,7 +9,6 @@
  *              architectures.
  */
 
-#include <linux/config.h>
 #include <asm/irq.h>
 #include <asm/hardware.h>
 #include <asm/mach-types.h>
diff --git a/include/asm-arm/arch-sa1100/irqs.h b/include/asm-arm/arch-sa1100/irqs.h
index eabd3be3d705..d7940683efb1 100644
--- a/include/asm-arm/arch-sa1100/irqs.h
+++ b/include/asm-arm/arch-sa1100/irqs.h
@@ -7,7 +7,6 @@
  *
  * 2001/11/14	RMK	Cleaned up and standardised a lot of the IRQs.
  */
-#include <linux/config.h>
 
 #define	IRQ_GPIO0		0
 #define	IRQ_GPIO1		1
diff --git a/include/asm-arm/arch-sa1100/memory.h b/include/asm-arm/arch-sa1100/memory.h
index a29fac1387ca..1ff172dc8e33 100644
--- a/include/asm-arm/arch-sa1100/memory.h
+++ b/include/asm-arm/arch-sa1100/memory.h
@@ -7,7 +7,6 @@
 #ifndef __ASM_ARCH_MEMORY_H
 #define __ASM_ARCH_MEMORY_H
 
-#include <linux/config.h>
 #include <asm/sizes.h>
 
 /*
diff --git a/include/asm-arm/arch-sa1100/system.h b/include/asm-arm/arch-sa1100/system.h
index 0f0612f79b2b..aef91e3b63fe 100644
--- a/include/asm-arm/arch-sa1100/system.h
+++ b/include/asm-arm/arch-sa1100/system.h
@@ -3,7 +3,6 @@
  *
  * Copyright (c) 1999 Nicolas Pitre <nico@cam.org>
  */
-#include <linux/config.h>
 #include <asm/hardware.h>
 
 static inline void arch_idle(void)
diff --git a/include/asm-arm/atomic.h b/include/asm-arm/atomic.h
index 3d7283d84405..4b0ce3e7de9a 100644
--- a/include/asm-arm/atomic.h
+++ b/include/asm-arm/atomic.h
@@ -11,7 +11,6 @@
 #ifndef __ASM_ARM_ATOMIC_H
 #define __ASM_ARM_ATOMIC_H
 
-#include <linux/config.h>
 #include <linux/compiler.h>
 
 typedef struct { volatile int counter; } atomic_t;
diff --git a/include/asm-arm/bug.h b/include/asm-arm/bug.h
index 7fb02138f585..0e36fd5d87df 100644
--- a/include/asm-arm/bug.h
+++ b/include/asm-arm/bug.h
@@ -1,7 +1,6 @@
 #ifndef _ASMARM_BUG_H
 #define _ASMARM_BUG_H
 
-#include <linux/config.h>
 
 #ifdef CONFIG_BUG
 #ifdef CONFIG_DEBUG_BUGVERBOSE
diff --git a/include/asm-arm/cacheflush.h b/include/asm-arm/cacheflush.h
index 746be56b1b70..fe0c744e0266 100644
--- a/include/asm-arm/cacheflush.h
+++ b/include/asm-arm/cacheflush.h
@@ -10,7 +10,6 @@
 #ifndef _ASMARM_CACHEFLUSH_H
 #define _ASMARM_CACHEFLUSH_H
 
-#include <linux/config.h>
 #include <linux/sched.h>
 #include <linux/mm.h>
 
diff --git a/include/asm-arm/cpu.h b/include/asm-arm/cpu.h
index 751bc7462074..715426b9b08e 100644
--- a/include/asm-arm/cpu.h
+++ b/include/asm-arm/cpu.h
@@ -10,7 +10,6 @@
 #ifndef __ASM_ARM_CPU_H
 #define __ASM_ARM_CPU_H
 
-#include <linux/config.h>
 #include <linux/percpu.h>
 
 struct cpuinfo_arm {
diff --git a/include/asm-arm/dma-mapping.h b/include/asm-arm/dma-mapping.h
index 63ca7412a462..55eb4dc3253d 100644
--- a/include/asm-arm/dma-mapping.h
+++ b/include/asm-arm/dma-mapping.h
@@ -3,7 +3,6 @@
 
 #ifdef __KERNEL__
 
-#include <linux/config.h>
 #include <linux/mm.h> /* need struct page */
 
 #include <asm/scatterlist.h>
diff --git a/include/asm-arm/dma.h b/include/asm-arm/dma.h
index 49c01e2bf7c8..9f2c5305c260 100644
--- a/include/asm-arm/dma.h
+++ b/include/asm-arm/dma.h
@@ -3,7 +3,6 @@
 
 typedef unsigned int dmach_t;
 
-#include <linux/config.h>
 #include <linux/spinlock.h>
 #include <asm/system.h>
 #include <asm/scatterlist.h>
diff --git a/include/asm-arm/elf.h b/include/asm-arm/elf.h
index 2d44b42d1847..71061ca5c5d0 100644
--- a/include/asm-arm/elf.h
+++ b/include/asm-arm/elf.h
@@ -1,7 +1,6 @@
 #ifndef __ASMARM_ELF_H
 #define __ASMARM_ELF_H
 
-#include <linux/config.h>
 
 /*
  * ELF register definitions..
diff --git a/include/asm-arm/fpstate.h b/include/asm-arm/fpstate.h
index 52bae088a185..132c3c5628b2 100644
--- a/include/asm-arm/fpstate.h
+++ b/include/asm-arm/fpstate.h
@@ -11,7 +11,6 @@
 #ifndef __ASM_ARM_FPSTATE_H
 #define __ASM_ARM_FPSTATE_H
 
-#include <linux/config.h>
 
 #ifndef __ASSEMBLY__
 
diff --git a/include/asm-arm/glue.h b/include/asm-arm/glue.h
index 223e0d6c41be..0cc5d3b10ce2 100644
--- a/include/asm-arm/glue.h
+++ b/include/asm-arm/glue.h
@@ -15,7 +15,6 @@
  */
 #ifdef __KERNEL__
 
-#include <linux/config.h>
 
 #ifdef __STDC__
 #define ____glue(name,fn)	name##fn
diff --git a/include/asm-arm/hardirq.h b/include/asm-arm/hardirq.h
index 1cbb173bf5b1..182310b99195 100644
--- a/include/asm-arm/hardirq.h
+++ b/include/asm-arm/hardirq.h
@@ -1,7 +1,6 @@
 #ifndef __ASM_HARDIRQ_H
 #define __ASM_HARDIRQ_H
 
-#include <linux/config.h>
 #include <linux/cache.h>
 #include <linux/threads.h>
 #include <asm/irq.h>
diff --git a/include/asm-arm/hardware/dec21285.h b/include/asm-arm/hardware/dec21285.h
index 6685e3fb97b1..546f7077be9c 100644
--- a/include/asm-arm/hardware/dec21285.h
+++ b/include/asm-arm/hardware/dec21285.h
@@ -18,7 +18,6 @@
 #define DC21285_PCI_IO			0x7c000000
 #define DC21285_PCI_MEM			0x80000000
 
-#include <linux/config.h>
 #ifndef __ASSEMBLY__
 #include <asm/hardware.h>
 #define DC21285_IO(x)		((volatile unsigned long *)(ARMCSR_BASE+(x)))
diff --git a/include/asm-arm/hardware/iomd.h b/include/asm-arm/hardware/iomd.h
index 82fa2c279a18..396e55ad06c6 100644
--- a/include/asm-arm/hardware/iomd.h
+++ b/include/asm-arm/hardware/iomd.h
@@ -13,7 +13,6 @@
 #ifndef __ASMARM_HARDWARE_IOMD_H
 #define __ASMARM_HARDWARE_IOMD_H
 
-#include <linux/config.h>
 
 #ifndef __ASSEMBLY__
 
diff --git a/include/asm-arm/leds.h b/include/asm-arm/leds.h
index 88ce4124f854..12290ea55801 100644
--- a/include/asm-arm/leds.h
+++ b/include/asm-arm/leds.h
@@ -13,7 +13,6 @@
 #ifndef ASM_ARM_LEDS_H
 #define ASM_ARM_LEDS_H
 
-#include <linux/config.h>
 
 typedef enum {
 	led_idle_start,
diff --git a/include/asm-arm/mach/serial_at91rm9200.h b/include/asm-arm/mach/serial_at91rm9200.h
index 98f4b0cb883c..a0269de12079 100644
--- a/include/asm-arm/mach/serial_at91rm9200.h
+++ b/include/asm-arm/mach/serial_at91rm9200.h
@@ -7,7 +7,6 @@
  *
  *  Low level machine dependent UART functions.
  */
-#include <linux/config.h>
 
 struct uart_port;
 
diff --git a/include/asm-arm/mach/serial_sa1100.h b/include/asm-arm/mach/serial_sa1100.h
index 9162018585df..20c22bb218d9 100644
--- a/include/asm-arm/mach/serial_sa1100.h
+++ b/include/asm-arm/mach/serial_sa1100.h
@@ -7,7 +7,6 @@
  *
  * Low level machine dependent UART functions.
  */
-#include <linux/config.h>
 
 struct uart_port;
 struct uart_info;
diff --git a/include/asm-arm/memory.h b/include/asm-arm/memory.h
index 209289407595..731e321a57d1 100644
--- a/include/asm-arm/memory.h
+++ b/include/asm-arm/memory.h
@@ -22,7 +22,6 @@
 #define UL(x) (x)
 #endif
 
-#include <linux/config.h>
 #include <linux/compiler.h>
 #include <asm/arch/memory.h>
 #include <asm/sizes.h>
diff --git a/include/asm-arm/page.h b/include/asm-arm/page.h
index a404d2bf0c68..66cfeb5290ea 100644
--- a/include/asm-arm/page.h
+++ b/include/asm-arm/page.h
@@ -10,7 +10,6 @@
 #ifndef _ASMARM_PAGE_H
 #define _ASMARM_PAGE_H
 
-#include <linux/config.h>
 
 /* PAGE_SHIFT determines the page size */
 #define PAGE_SHIFT		12
diff --git a/include/asm-arm/pci.h b/include/asm-arm/pci.h
index ead3ced38cb8..f21abd4ddac6 100644
--- a/include/asm-arm/pci.h
+++ b/include/asm-arm/pci.h
@@ -2,7 +2,6 @@
 #define ASMARM_PCI_H
 
 #ifdef __KERNEL__
-#include <linux/config.h>
 #include <asm-generic/pci-dma-compat.h>
 
 #include <asm/hardware.h> /* for PCIBIOS_MIN_* */
diff --git a/include/asm-arm/proc-fns.h b/include/asm-arm/proc-fns.h
index 106045edb862..e9310895e79d 100644
--- a/include/asm-arm/proc-fns.h
+++ b/include/asm-arm/proc-fns.h
@@ -13,7 +13,6 @@
 
 #ifdef __KERNEL__
 
-#include <linux/config.h>
 
 /*
  * Work out if we need multiple CPU support
diff --git a/include/asm-arm/ptrace.h b/include/asm-arm/ptrace.h
index 77adb7fa169b..2bebe3dc0a30 100644
--- a/include/asm-arm/ptrace.h
+++ b/include/asm-arm/ptrace.h
@@ -10,7 +10,6 @@
 #ifndef __ASM_ARM_PTRACE_H
 #define __ASM_ARM_PTRACE_H
 
-#include <linux/config.h>
 
 #define PTRACE_GETREGS		12
 #define PTRACE_SETREGS		13
diff --git a/include/asm-arm/smp.h b/include/asm-arm/smp.h
index fe45f7f61223..f67acce387e7 100644
--- a/include/asm-arm/smp.h
+++ b/include/asm-arm/smp.h
@@ -10,7 +10,6 @@
 #ifndef __ASM_ARM_SMP_H
 #define __ASM_ARM_SMP_H
 
-#include <linux/config.h>
 #include <linux/threads.h>
 #include <linux/cpumask.h>
 #include <linux/thread_info.h>
diff --git a/include/asm-arm/system.h b/include/asm-arm/system.h
index 95b3abf4851b..f5eafd7ed8fa 100644
--- a/include/asm-arm/system.h
+++ b/include/asm-arm/system.h
@@ -3,7 +3,6 @@
 
 #ifdef __KERNEL__
 
-#include <linux/config.h>
 
 #define CPU_ARCH_UNKNOWN	0
 #define CPU_ARCH_ARMv3		1
diff --git a/include/asm-arm/tlbflush.h b/include/asm-arm/tlbflush.h
index 728992451dd1..d97fc76189a5 100644
--- a/include/asm-arm/tlbflush.h
+++ b/include/asm-arm/tlbflush.h
@@ -10,7 +10,6 @@
 #ifndef _ASMARM_TLBFLUSH_H
 #define _ASMARM_TLBFLUSH_H
 
-#include <linux/config.h>
 
 #ifndef CONFIG_MMU
 
diff --git a/include/asm-arm26/atomic.h b/include/asm-arm26/atomic.h
index 1552c8653990..97e944fe1cff 100644
--- a/include/asm-arm26/atomic.h
+++ b/include/asm-arm26/atomic.h
@@ -20,7 +20,6 @@
 #ifndef __ASM_ARM_ATOMIC_H
 #define __ASM_ARM_ATOMIC_H
 
-#include <linux/config.h>
 
 #ifdef CONFIG_SMP
 #error SMP is NOT supported
diff --git a/include/asm-arm26/bug.h b/include/asm-arm26/bug.h
index 7177c7399967..8545d58b0475 100644
--- a/include/asm-arm26/bug.h
+++ b/include/asm-arm26/bug.h
@@ -1,7 +1,6 @@
 #ifndef _ASMARM_BUG_H
 #define _ASMARM_BUG_H
 
-#include <linux/config.h>
 
 #ifdef CONFIG_BUG
 #ifdef CONFIG_DEBUG_BUGVERBOSE
diff --git a/include/asm-arm26/dma.h b/include/asm-arm26/dma.h
index 995e223e43a1..4326ba85eb72 100644
--- a/include/asm-arm26/dma.h
+++ b/include/asm-arm26/dma.h
@@ -3,7 +3,6 @@
 
 typedef unsigned int dmach_t;
 
-#include <linux/config.h>
 #include <linux/spinlock.h>
 #include <asm/system.h>
 #include <asm/memory.h>
diff --git a/include/asm-arm26/hardirq.h b/include/asm-arm26/hardirq.h
index 87c19d2bb6a8..e717742ffce0 100644
--- a/include/asm-arm26/hardirq.h
+++ b/include/asm-arm26/hardirq.h
@@ -1,7 +1,6 @@
 #ifndef __ASM_HARDIRQ_H
 #define __ASM_HARDIRQ_H
 
-#include <linux/config.h>
 #include <linux/cache.h>
 #include <linux/threads.h>
 #include <asm/irq.h>
diff --git a/include/asm-arm26/hardware.h b/include/asm-arm26/hardware.h
index 82fc55e2a009..801df0bde8b7 100644
--- a/include/asm-arm26/hardware.h
+++ b/include/asm-arm26/hardware.h
@@ -16,7 +16,6 @@
 #ifndef __ASM_HARDWARE_H
 #define __ASM_HARDWARE_H
 
-#include <linux/config.h>
 
 
 /*
diff --git a/include/asm-arm26/io.h b/include/asm-arm26/io.h
index 02f94d88a124..2aa033bd0678 100644
--- a/include/asm-arm26/io.h
+++ b/include/asm-arm26/io.h
@@ -22,7 +22,6 @@
 
 #ifdef __KERNEL__
 
-#include <linux/config.h>
 #include <linux/types.h>
 #include <asm/byteorder.h>
 #include <asm/memory.h>
diff --git a/include/asm-arm26/leds.h b/include/asm-arm26/leds.h
index 88ce4124f854..12290ea55801 100644
--- a/include/asm-arm26/leds.h
+++ b/include/asm-arm26/leds.h
@@ -13,7 +13,6 @@
 #ifndef ASM_ARM_LEDS_H
 #define ASM_ARM_LEDS_H
 
-#include <linux/config.h>
 
 typedef enum {
 	led_idle_start,
diff --git a/include/asm-arm26/mach-types.h b/include/asm-arm26/mach-types.h
index b34045b78128..0aeaedcbac96 100644
--- a/include/asm-arm26/mach-types.h
+++ b/include/asm-arm26/mach-types.h
@@ -6,7 +6,6 @@
 #ifndef __ASM_ARM_MACH_TYPE_H
 #define __ASM_ARM_MACH_TYPE_H
 
-#include <linux/config.h>
 
 #ifndef __ASSEMBLY__
 extern unsigned int __machine_arch_type;
diff --git a/include/asm-arm26/page.h b/include/asm-arm26/page.h
index d3f23ac4d468..fa19de28fda0 100644
--- a/include/asm-arm26/page.h
+++ b/include/asm-arm26/page.h
@@ -1,7 +1,6 @@
 #ifndef _ASMARM_PAGE_H
 #define _ASMARM_PAGE_H
 
-#include <linux/config.h>
 
 #ifdef __KERNEL__
 #ifndef __ASSEMBLY__
diff --git a/include/asm-arm26/pgtable.h b/include/asm-arm26/pgtable.h
index a590250277f8..19ac9101a6bb 100644
--- a/include/asm-arm26/pgtable.h
+++ b/include/asm-arm26/pgtable.h
@@ -13,7 +13,6 @@
 
 #include <asm-generic/4level-fixup.h>
 
-#include <linux/config.h>
 #include <asm/memory.h>
 
 /*
diff --git a/include/asm-arm26/serial.h b/include/asm-arm26/serial.h
index 5fc747d1b501..dd86a716cb0b 100644
--- a/include/asm-arm26/serial.h
+++ b/include/asm-arm26/serial.h
@@ -14,7 +14,6 @@
 #ifndef __ASM_SERIAL_H
 #define __ASM_SERIAL_H
 
-#include <linux/config.h>
 
 /*
  * This assumes you have a 1.8432 MHz clock for your UART.
diff --git a/include/asm-arm26/smp.h b/include/asm-arm26/smp.h
index 5ca771631fd8..38349ec8b61b 100644
--- a/include/asm-arm26/smp.h
+++ b/include/asm-arm26/smp.h
@@ -1,7 +1,6 @@
 #ifndef __ASM_SMP_H
 #define __ASM_SMP_H
 
-#include <linux/config.h>
 
 #ifdef CONFIG_SMP
 #error SMP not supported
diff --git a/include/asm-arm26/sysirq.h b/include/asm-arm26/sysirq.h
index cad250c7b9ec..81dca90d9a3f 100644
--- a/include/asm-arm26/sysirq.h
+++ b/include/asm-arm26/sysirq.h
@@ -11,7 +11,6 @@
  *   04-04-1998 PJB     Merged arc and a5k versions
  */
 
-#include <linux/config.h>
 
 #if defined(CONFIG_ARCH_A5K)
 #define IRQ_PRINTER             0
diff --git a/include/asm-arm26/system.h b/include/asm-arm26/system.h
index 702884926a55..d1f69d706198 100644
--- a/include/asm-arm26/system.h
+++ b/include/asm-arm26/system.h
@@ -3,7 +3,6 @@
 
 #ifdef __KERNEL__
 
-#include <linux/config.h>
 
 /*
  * This is used to ensure the compiler did actually allocate the register we
diff --git a/include/asm-cris/arch-v10/io.h b/include/asm-cris/arch-v10/io.h
index dd39198ec67d..11ef5b53d84e 100644
--- a/include/asm-cris/arch-v10/io.h
+++ b/include/asm-cris/arch-v10/io.h
@@ -2,7 +2,6 @@
 #define _ASM_ARCH_CRIS_IO_H
 
 #include <asm/arch/svinto.h>
-#include <linux/config.h>
 
 /* Etrax shadow registers - which live in arch/cris/kernel/shadows.c */
 
diff --git a/include/asm-cris/arch-v10/page.h b/include/asm-cris/arch-v10/page.h
index 407e6e68f49e..7d8307aed7f3 100644
--- a/include/asm-cris/arch-v10/page.h
+++ b/include/asm-cris/arch-v10/page.h
@@ -1,7 +1,6 @@
 #ifndef _CRIS_ARCH_PAGE_H
 #define _CRIS_ARCH_PAGE_H
 
-#include <linux/config.h>
 
 #ifdef __KERNEL__
 
diff --git a/include/asm-cris/arch-v10/system.h b/include/asm-cris/arch-v10/system.h
index 1ac7b639b1b0..4a9cd36c9e16 100644
--- a/include/asm-cris/arch-v10/system.h
+++ b/include/asm-cris/arch-v10/system.h
@@ -1,7 +1,6 @@
 #ifndef __ASM_CRIS_ARCH_SYSTEM_H
 #define __ASM_CRIS_ARCH_SYSTEM_H
 
-#include <linux/config.h>
 
 /* read the CPU version register */
 
diff --git a/include/asm-cris/arch-v32/io.h b/include/asm-cris/arch-v32/io.h
index 043c9ce5294e..5efe4d949001 100644
--- a/include/asm-cris/arch-v32/io.h
+++ b/include/asm-cris/arch-v32/io.h
@@ -4,7 +4,6 @@
 #include <asm/arch/hwregs/reg_map.h>
 #include <asm/arch/hwregs/reg_rdwr.h>
 #include <asm/arch/hwregs/gio_defs.h>
-#include <linux/config.h>
 
 enum crisv32_io_dir
 {
diff --git a/include/asm-cris/arch-v32/irq.h b/include/asm-cris/arch-v32/irq.h
index d35aa8174c2f..eeb0a80262c8 100644
--- a/include/asm-cris/arch-v32/irq.h
+++ b/include/asm-cris/arch-v32/irq.h
@@ -1,7 +1,6 @@
 #ifndef _ASM_ARCH_IRQ_H
 #define _ASM_ARCH_IRQ_H
 
-#include <linux/config.h>
 #include "hwregs/intr_vect.h"
 
 /* Number of non-cpu interrupts. */
diff --git a/include/asm-cris/arch-v32/page.h b/include/asm-cris/arch-v32/page.h
index 77827bc17cca..fa454fe12425 100644
--- a/include/asm-cris/arch-v32/page.h
+++ b/include/asm-cris/arch-v32/page.h
@@ -1,7 +1,6 @@
 #ifndef _ASM_CRIS_ARCH_PAGE_H
 #define _ASM_CRIS_ARCH_PAGE_H
 
-#include <linux/config.h>
 
 #ifdef __KERNEL__
 
diff --git a/include/asm-cris/arch-v32/processor.h b/include/asm-cris/arch-v32/processor.h
index 32bf2e538ced..5553b0cd02bf 100644
--- a/include/asm-cris/arch-v32/processor.h
+++ b/include/asm-cris/arch-v32/processor.h
@@ -1,7 +1,6 @@
 #ifndef _ASM_CRIS_ARCH_PROCESSOR_H
 #define _ASM_CRIS_ARCH_PROCESSOR_H
 
-#include <linux/config.h>
 
 /* Return current instruction pointer. */
 #define current_text_addr() \
diff --git a/include/asm-cris/arch-v32/system.h b/include/asm-cris/arch-v32/system.h
index a3d75d581e2f..d20e2d6d64a3 100644
--- a/include/asm-cris/arch-v32/system.h
+++ b/include/asm-cris/arch-v32/system.h
@@ -1,7 +1,6 @@
 #ifndef _ASM_CRIS_ARCH_SYSTEM_H
 #define _ASM_CRIS_ARCH_SYSTEM_H
 
-#include <linux/config.h>
 
 /* Read the CPU version register. */
 static inline unsigned long rdvr(void)
diff --git a/include/asm-cris/eshlibld.h b/include/asm-cris/eshlibld.h
index 2b577cde17eb..10ce36cf79a9 100644
--- a/include/asm-cris/eshlibld.h
+++ b/include/asm-cris/eshlibld.h
@@ -32,7 +32,6 @@
 /* We have dependencies all over the place for the host system
    for xsim being a linux system, so let's not pretend anything
    else with #ifdef:s here until fixed.  */
-#include <linux/config.h>
 #include <linux/limits.h>
 
 /* Maybe do sanity checking if file input. */
diff --git a/include/asm-cris/etraxgpio.h b/include/asm-cris/etraxgpio.h
index 80ee10f70d43..5d0028dba7c6 100644
--- a/include/asm-cris/etraxgpio.h
+++ b/include/asm-cris/etraxgpio.h
@@ -25,7 +25,6 @@
 #ifndef _ASM_ETRAXGPIO_H
 #define _ASM_ETRAXGPIO_H
 
-#include <linux/config.h>
 /* etraxgpio _IOC_TYPE, bits 8 to 15 in ioctl cmd */
 #ifdef CONFIG_ETRAX_ARCH_V10
 #define ETRAXGPIO_IOCTYPE 43
diff --git a/include/asm-cris/fasttimer.h b/include/asm-cris/fasttimer.h
index 69522028baa5..a3a77132ce32 100644
--- a/include/asm-cris/fasttimer.h
+++ b/include/asm-cris/fasttimer.h
@@ -5,7 +5,6 @@
  * This may be useful in other OS than Linux so use 2 space indentation...
  * Copyright (C) 2000, 2002 Axis Communications AB
  */
-#include <linux/config.h>
 #include <linux/time.h> /* struct timeval */
 #include <linux/timex.h>
 
diff --git a/include/asm-cris/page.h b/include/asm-cris/page.h
index 3787633e6209..81832e9e157f 100644
--- a/include/asm-cris/page.h
+++ b/include/asm-cris/page.h
@@ -1,7 +1,6 @@
 #ifndef _CRIS_PAGE_H
 #define _CRIS_PAGE_H
 
-#include <linux/config.h>
 #include <asm/arch/page.h>
 
 /* PAGE_SHIFT determines the page size */
diff --git a/include/asm-cris/pci.h b/include/asm-cris/pci.h
index 2064bc1de074..b2ac8a331da1 100644
--- a/include/asm-cris/pci.h
+++ b/include/asm-cris/pci.h
@@ -1,7 +1,6 @@
 #ifndef __ASM_CRIS_PCI_H
 #define __ASM_CRIS_PCI_H
 
-#include <linux/config.h>
 
 #ifdef __KERNEL__
 #include <linux/mm.h>		/* for struct page */
diff --git a/include/asm-cris/pgtable.h b/include/asm-cris/pgtable.h
index 70a832514f62..5d76c1c0d6c9 100644
--- a/include/asm-cris/pgtable.h
+++ b/include/asm-cris/pgtable.h
@@ -9,7 +9,6 @@
 #include <asm-generic/pgtable-nopmd.h>
 
 #ifndef __ASSEMBLY__
-#include <linux/config.h>
 #include <linux/sched.h>
 #include <asm/mmu.h>
 #endif
diff --git a/include/asm-cris/processor.h b/include/asm-cris/processor.h
index 961e2bceadbc..568da1deceb9 100644
--- a/include/asm-cris/processor.h
+++ b/include/asm-cris/processor.h
@@ -10,7 +10,6 @@
 #ifndef __ASM_CRIS_PROCESSOR_H
 #define __ASM_CRIS_PROCESSOR_H
 
-#include <linux/config.h>
 #include <asm/system.h>
 #include <asm/page.h>
 #include <asm/ptrace.h>
diff --git a/include/asm-cris/rtc.h b/include/asm-cris/rtc.h
index 97c13039834a..cb4bf9217fee 100644
--- a/include/asm-cris/rtc.h
+++ b/include/asm-cris/rtc.h
@@ -4,7 +4,6 @@
 #define __RTC_H__
 
 
-#include <linux/config.h>
 
 #ifdef CONFIG_ETRAX_DS1302
    /* Dallas DS1302 clock/calendar register numbers. */
diff --git a/include/asm-cris/tlbflush.h b/include/asm-cris/tlbflush.h
index c52238005b55..0569612477e3 100644
--- a/include/asm-cris/tlbflush.h
+++ b/include/asm-cris/tlbflush.h
@@ -1,7 +1,6 @@
 #ifndef _CRIS_TLBFLUSH_H
 #define _CRIS_TLBFLUSH_H
 
-#include <linux/config.h>
 #include <linux/mm.h>
 #include <asm/processor.h>
 #include <asm/pgtable.h>
diff --git a/include/asm-frv/atomic.h b/include/asm-frv/atomic.h
index 5d9f84bfdcad..9a4ff03c3969 100644
--- a/include/asm-frv/atomic.h
+++ b/include/asm-frv/atomic.h
@@ -14,7 +14,6 @@
 #ifndef _ASM_ATOMIC_H
 #define _ASM_ATOMIC_H
 
-#include <linux/config.h>
 #include <linux/types.h>
 #include <asm/spr-regs.h>
 
diff --git a/include/asm-frv/bitops.h b/include/asm-frv/bitops.h
index 6344d06390b9..980ae1b0cd28 100644
--- a/include/asm-frv/bitops.h
+++ b/include/asm-frv/bitops.h
@@ -14,7 +14,6 @@
 #ifndef _ASM_BITOPS_H
 #define _ASM_BITOPS_H
 
-#include <linux/config.h>
 #include <linux/compiler.h>
 #include <asm/byteorder.h>
 #include <asm/system.h>
diff --git a/include/asm-frv/bug.h b/include/asm-frv/bug.h
index 451712cc3060..6b1b44d71028 100644
--- a/include/asm-frv/bug.h
+++ b/include/asm-frv/bug.h
@@ -11,7 +11,6 @@
 #ifndef _ASM_BUG_H
 #define _ASM_BUG_H
 
-#include <linux/config.h>
 #include <linux/linkage.h>
 
 #ifdef CONFIG_BUG
diff --git a/include/asm-frv/cache.h b/include/asm-frv/cache.h
index cf69b6373b34..2797163b8f4f 100644
--- a/include/asm-frv/cache.h
+++ b/include/asm-frv/cache.h
@@ -12,7 +12,6 @@
 #ifndef __ASM_CACHE_H
 #define __ASM_CACHE_H
 
-#include <linux/config.h>
 
 /* bytes per L1 cache line */
 #define L1_CACHE_SHIFT		(CONFIG_FRV_L1_CACHE_SHIFT)
diff --git a/include/asm-frv/dma.h b/include/asm-frv/dma.h
index d8f9a2f21521..18d6bb8f84fc 100644
--- a/include/asm-frv/dma.h
+++ b/include/asm-frv/dma.h
@@ -14,7 +14,6 @@
 
 //#define DMA_DEBUG 1
 
-#include <linux/config.h>
 #include <linux/interrupt.h>
 
 #undef MAX_DMA_CHANNELS		/* don't use kernel/dma.c */
diff --git a/include/asm-frv/elf.h b/include/asm-frv/elf.h
index 7d2098f0476b..38656da00e40 100644
--- a/include/asm-frv/elf.h
+++ b/include/asm-frv/elf.h
@@ -12,7 +12,6 @@
 #ifndef __ASM_ELF_H
 #define __ASM_ELF_H
 
-#include <linux/config.h>
 #include <asm/ptrace.h>
 #include <asm/user.h>
 
diff --git a/include/asm-frv/fpu.h b/include/asm-frv/fpu.h
index b1178f8ca5ce..d73c60b56641 100644
--- a/include/asm-frv/fpu.h
+++ b/include/asm-frv/fpu.h
@@ -1,7 +1,6 @@
 #ifndef __ASM_FPU_H
 #define __ASM_FPU_H
 
-#include <linux/config.h>
 
 /*
  * MAX floating point unit state size (FSAVE/FRESTORE)
diff --git a/include/asm-frv/hardirq.h b/include/asm-frv/hardirq.h
index 685123981e8b..7581b5a7559a 100644
--- a/include/asm-frv/hardirq.h
+++ b/include/asm-frv/hardirq.h
@@ -12,7 +12,6 @@
 #ifndef __ASM_HARDIRQ_H
 #define __ASM_HARDIRQ_H
 
-#include <linux/config.h>
 #include <linux/threads.h>
 #include <linux/irq.h>
 
diff --git a/include/asm-frv/highmem.h b/include/asm-frv/highmem.h
index 295f74a57f22..cfbf7d3a1feb 100644
--- a/include/asm-frv/highmem.h
+++ b/include/asm-frv/highmem.h
@@ -17,7 +17,6 @@
 
 #ifdef __KERNEL__
 
-#include <linux/config.h>
 #include <linux/init.h>
 #include <asm/mem-layout.h>
 #include <asm/spr-regs.h>
diff --git a/include/asm-frv/ide.h b/include/asm-frv/ide.h
index ae031eaa3dd2..f0bd2cb250c1 100644
--- a/include/asm-frv/ide.h
+++ b/include/asm-frv/ide.h
@@ -14,7 +14,6 @@
 
 #ifdef __KERNEL__
 
-#include <linux/config.h>
 #include <asm/setup.h>
 #include <asm/io.h>
 #include <asm/irq.h>
diff --git a/include/asm-frv/io.h b/include/asm-frv/io.h
index 01247cb2bc39..b56eba59e3cd 100644
--- a/include/asm-frv/io.h
+++ b/include/asm-frv/io.h
@@ -17,7 +17,6 @@
 
 #ifdef __KERNEL__
 
-#include <linux/config.h>
 #include <linux/types.h>
 #include <asm/virtconvert.h>
 #include <asm/string.h>
diff --git a/include/asm-frv/irq.h b/include/asm-frv/irq.h
index 2c16d8dc02fd..58b619215a50 100644
--- a/include/asm-frv/irq.h
+++ b/include/asm-frv/irq.h
@@ -12,7 +12,6 @@
 #ifndef _ASM_IRQ_H_
 #define _ASM_IRQ_H_
 
-#include <linux/config.h>
 
 /*
  * the system has an on-CPU PIC and another PIC on the FPGA and other PICs on other peripherals,
diff --git a/include/asm-frv/mmu_context.h b/include/asm-frv/mmu_context.h
index 4fb9ea3c5bc9..72edcaaccd5d 100644
--- a/include/asm-frv/mmu_context.h
+++ b/include/asm-frv/mmu_context.h
@@ -12,7 +12,6 @@
 #ifndef _ASM_MMU_CONTEXT_H
 #define _ASM_MMU_CONTEXT_H
 
-#include <linux/config.h>
 #include <asm/setup.h>
 #include <asm/page.h>
 #include <asm/pgalloc.h>
diff --git a/include/asm-frv/page.h b/include/asm-frv/page.h
index dc0f7e08a4c2..134cc0cdf6c2 100644
--- a/include/asm-frv/page.h
+++ b/include/asm-frv/page.h
@@ -3,7 +3,6 @@
 
 #ifdef __KERNEL__
 
-#include <linux/config.h>
 #include <asm/virtconvert.h>
 #include <asm/mem-layout.h>
 #include <asm/sections.h>
diff --git a/include/asm-frv/pci.h b/include/asm-frv/pci.h
index 598b0c6b695d..f35a4511e7b9 100644
--- a/include/asm-frv/pci.h
+++ b/include/asm-frv/pci.h
@@ -13,7 +13,6 @@
 #ifndef ASM_PCI_H
 #define	ASM_PCI_H
 
-#include <linux/config.h>
 #include <linux/mm.h>
 #include <asm/scatterlist.h>
 #include <asm-generic/pci-dma-compat.h>
diff --git a/include/asm-frv/pgalloc.h b/include/asm-frv/pgalloc.h
index 1bd28f41bfa8..ce982a6c610f 100644
--- a/include/asm-frv/pgalloc.h
+++ b/include/asm-frv/pgalloc.h
@@ -15,7 +15,6 @@
 #ifndef _ASM_PGALLOC_H
 #define _ASM_PGALLOC_H
 
-#include <linux/config.h>
 #include <asm/setup.h>
 #include <asm/virtconvert.h>
 
diff --git a/include/asm-frv/pgtable.h b/include/asm-frv/pgtable.h
index d1c3b182c691..7af7485e889e 100644
--- a/include/asm-frv/pgtable.h
+++ b/include/asm-frv/pgtable.h
@@ -16,7 +16,6 @@
 #ifndef _ASM_PGTABLE_H
 #define _ASM_PGTABLE_H
 
-#include <linux/config.h>
 #include <asm/mem-layout.h>
 #include <asm/setup.h>
 #include <asm/processor.h>
diff --git a/include/asm-frv/processor.h b/include/asm-frv/processor.h
index 5228c18b7f78..1c4dba1c5f57 100644
--- a/include/asm-frv/processor.h
+++ b/include/asm-frv/processor.h
@@ -12,7 +12,6 @@
 #ifndef _ASM_PROCESSOR_H
 #define _ASM_PROCESSOR_H
 
-#include <linux/config.h>
 #include <asm/mem-layout.h>
 
 #ifndef __ASSEMBLY__
diff --git a/include/asm-frv/segment.h b/include/asm-frv/segment.h
index 61222f00dfc1..e3616a6f941d 100644
--- a/include/asm-frv/segment.h
+++ b/include/asm-frv/segment.h
@@ -12,7 +12,6 @@
 #ifndef _ASM_SEGMENT_H
 #define _ASM_SEGMENT_H
 
-#include <linux/config.h>
 
 #ifndef __ASSEMBLY__
 
diff --git a/include/asm-frv/serial.h b/include/asm-frv/serial.h
index 6917d556a1e1..dbb825998689 100644
--- a/include/asm-frv/serial.h
+++ b/include/asm-frv/serial.h
@@ -6,7 +6,6 @@
  *
  * Based on linux/include/asm-i386/serial.h
  */
-#include <linux/config.h>
 #include <asm/serial-regs.h>
 
 /*
diff --git a/include/asm-frv/smp.h b/include/asm-frv/smp.h
index 5ca771631fd8..38349ec8b61b 100644
--- a/include/asm-frv/smp.h
+++ b/include/asm-frv/smp.h
@@ -1,7 +1,6 @@
 #ifndef __ASM_SMP_H
 #define __ASM_SMP_H
 
-#include <linux/config.h>
 
 #ifdef CONFIG_SMP
 #error SMP not supported
diff --git a/include/asm-frv/system.h b/include/asm-frv/system.h
index 1734ed91bcdc..351863dfd06e 100644
--- a/include/asm-frv/system.h
+++ b/include/asm-frv/system.h
@@ -12,7 +12,6 @@
 #ifndef _ASM_SYSTEM_H
 #define _ASM_SYSTEM_H
 
-#include <linux/config.h> /* get configuration macros */
 #include <linux/linkage.h>
 #include <asm/atomic.h>
 
diff --git a/include/asm-frv/tlbflush.h b/include/asm-frv/tlbflush.h
index bc3462625084..da3a3179a85d 100644
--- a/include/asm-frv/tlbflush.h
+++ b/include/asm-frv/tlbflush.h
@@ -12,7 +12,6 @@
 #ifndef _ASM_TLBFLUSH_H
 #define _ASM_TLBFLUSH_H
 
-#include <linux/config.h>
 #include <linux/mm.h>
 #include <asm/processor.h>
 
diff --git a/include/asm-frv/types.h b/include/asm-frv/types.h
index 2560f596a75d..1b6d1923b25b 100644
--- a/include/asm-frv/types.h
+++ b/include/asm-frv/types.h
@@ -46,7 +46,6 @@ typedef unsigned long long __u64;
 
 #ifndef __ASSEMBLY__
 
-#include <linux/config.h>
 
 typedef signed char s8;
 typedef unsigned char u8;
diff --git a/include/asm-frv/unaligned.h b/include/asm-frv/unaligned.h
index a0d199bf01d9..dc8e9c9bf6bd 100644
--- a/include/asm-frv/unaligned.h
+++ b/include/asm-frv/unaligned.h
@@ -12,7 +12,6 @@
 #ifndef _ASM_UNALIGNED_H
 #define _ASM_UNALIGNED_H
 
-#include <linux/config.h>
 
 /*
  * Unaligned accesses on uClinux can't be performed in a fault handler - the
diff --git a/include/asm-frv/virtconvert.h b/include/asm-frv/virtconvert.h
index a29a0aec291f..59788fa2a813 100644
--- a/include/asm-frv/virtconvert.h
+++ b/include/asm-frv/virtconvert.h
@@ -17,7 +17,6 @@
 
 #ifdef __KERNEL__
 
-#include <linux/config.h>
 #include <asm/setup.h>
 
 #ifdef CONFIG_MMU
diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h
index 1a565a9d2fa7..0cd9711895fa 100644
--- a/include/asm-generic/bug.h
+++ b/include/asm-generic/bug.h
@@ -2,7 +2,6 @@
 #define _ASM_GENERIC_BUG_H
 
 #include <linux/compiler.h>
-#include <linux/config.h>
 
 #ifdef CONFIG_BUG
 #ifndef HAVE_ARCH_BUG
diff --git a/include/asm-generic/dma-mapping.h b/include/asm-generic/dma-mapping.h
index 1b356207712c..b541e48cc545 100644
--- a/include/asm-generic/dma-mapping.h
+++ b/include/asm-generic/dma-mapping.h
@@ -7,7 +7,6 @@
 #ifndef _ASM_GENERIC_DMA_MAPPING_H
 #define _ASM_GENERIC_DMA_MAPPING_H
 
-#include <linux/config.h>
 
 #ifdef CONFIG_PCI
 
diff --git a/include/asm-generic/fcntl.h b/include/asm-generic/fcntl.h
index b663520dcdc4..c154b9d6e7e5 100644
--- a/include/asm-generic/fcntl.h
+++ b/include/asm-generic/fcntl.h
@@ -1,7 +1,6 @@
 #ifndef _ASM_GENERIC_FCNTL_H
 #define _ASM_GENERIC_FCNTL_H
 
-#include <linux/config.h>
 #include <linux/types.h>
 
 /* open/fcntl - O_SYNC is only implemented on blocks devices and on files
diff --git a/include/asm-generic/local.h b/include/asm-generic/local.h
index 9291c24f5819..ab469297272c 100644
--- a/include/asm-generic/local.h
+++ b/include/asm-generic/local.h
@@ -1,7 +1,6 @@
 #ifndef _ASM_GENERIC_LOCAL_H
 #define _ASM_GENERIC_LOCAL_H
 
-#include <linux/config.h>
 #include <linux/percpu.h>
 #include <linux/hardirq.h>
 #include <asm/atomic.h>
diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index cdd4145243cd..867d9008fafa 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -13,7 +13,6 @@
 #ifndef _ASM_GENERIC__TLB_H
 #define _ASM_GENERIC__TLB_H
 
-#include <linux/config.h>
 #include <linux/swap.h>
 #include <asm/pgalloc.h>
 #include <asm/tlbflush.h>
diff --git a/include/asm-h8300/bitops.h b/include/asm-h8300/bitops.h
index 574f57b6c4d1..d76299c98b81 100644
--- a/include/asm-h8300/bitops.h
+++ b/include/asm-h8300/bitops.h
@@ -6,7 +6,6 @@
  * Copyright 2002, Yoshinori Sato
  */
 
-#include <linux/config.h>
 #include <linux/compiler.h>
 #include <asm/system.h>
 
diff --git a/include/asm-h8300/dma.h b/include/asm-h8300/dma.h
index 3708681b7ddc..3edbaaaedf5b 100644
--- a/include/asm-h8300/dma.h
+++ b/include/asm-h8300/dma.h
@@ -1,7 +1,6 @@
 #ifndef _H8300_DMA_H
 #define _H8300_DMA_H 
  
-#include <linux/config.h>
 
 /*
  * Set number of channels of DMA on ColdFire for different implementations.
diff --git a/include/asm-h8300/elf.h b/include/asm-h8300/elf.h
index f4af1553a55f..7ba6a0af447c 100644
--- a/include/asm-h8300/elf.h
+++ b/include/asm-h8300/elf.h
@@ -5,7 +5,6 @@
  * ELF register definitions..
  */
 
-#include <linux/config.h>
 #include <asm/ptrace.h>
 #include <asm/user.h>
 
diff --git a/include/asm-h8300/hardirq.h b/include/asm-h8300/hardirq.h
index e961bfe201b8..18fa7931e09f 100644
--- a/include/asm-h8300/hardirq.h
+++ b/include/asm-h8300/hardirq.h
@@ -2,7 +2,6 @@
 #define __H8300_HARDIRQ_H
 
 #include <linux/kernel.h>
-#include <linux/config.h>
 #include <linux/threads.h>
 #include <linux/interrupt.h>
 #include <linux/irq.h>
diff --git a/include/asm-h8300/io.h b/include/asm-h8300/io.h
index 1773e373e9c6..91b7487cb7ae 100644
--- a/include/asm-h8300/io.h
+++ b/include/asm-h8300/io.h
@@ -3,7 +3,6 @@
 
 #ifdef __KERNEL__
 
-#include <linux/config.h>
 #include <asm/virtconvert.h>
 
 #if defined(CONFIG_H83007) || defined(CONFIG_H83068)
diff --git a/include/asm-h8300/keyboard.h b/include/asm-h8300/keyboard.h
index b05d11387ae5..fbad65e8a5c0 100644
--- a/include/asm-h8300/keyboard.h
+++ b/include/asm-h8300/keyboard.h
@@ -7,7 +7,6 @@
 #ifndef _H8300_KEYBOARD_H
 #define _H8300_KEYBOARD_H
 
-#include <linux/config.h>
 
 /* dummy i.e. no real keyboard */
 #define kbd_setkeycode(x...)	(-ENOSYS)
diff --git a/include/asm-h8300/mmu_context.h b/include/asm-h8300/mmu_context.h
index 23b555b7b4b9..855721a5dcc9 100644
--- a/include/asm-h8300/mmu_context.h
+++ b/include/asm-h8300/mmu_context.h
@@ -1,7 +1,6 @@
 #ifndef __H8300_MMU_CONTEXT_H
 #define __H8300_MMU_CONTEXT_H
 
-#include <linux/config.h>
 #include <asm/setup.h>
 #include <asm/page.h>
 #include <asm/pgalloc.h>
diff --git a/include/asm-h8300/page.h b/include/asm-h8300/page.h
index 6472c9f88227..f9f9d3eea8ed 100644
--- a/include/asm-h8300/page.h
+++ b/include/asm-h8300/page.h
@@ -1,7 +1,6 @@
 #ifndef _H8300_PAGE_H
 #define _H8300_PAGE_H
 
-#include <linux/config.h>
 
 /* PAGE_SHIFT determines the page size */
 
diff --git a/include/asm-h8300/page_offset.h b/include/asm-h8300/page_offset.h
index 8cc6e17218a8..f8706463008c 100644
--- a/include/asm-h8300/page_offset.h
+++ b/include/asm-h8300/page_offset.h
@@ -1,4 +1,3 @@
 
-#include <linux/config.h>
 #define PAGE_OFFSET_RAW		0x00000000
 
diff --git a/include/asm-h8300/param.h b/include/asm-h8300/param.h
index 126dddf72359..c25806ed1fb3 100644
--- a/include/asm-h8300/param.h
+++ b/include/asm-h8300/param.h
@@ -1,7 +1,6 @@
 #ifndef _H8300_PARAM_H
 #define _H8300_PARAM_H
 
-#include <linux/config.h>
 
 #ifndef HZ
 #define HZ 100
diff --git a/include/asm-h8300/pgtable.h b/include/asm-h8300/pgtable.h
index f6e296fc1297..8b7c6857998b 100644
--- a/include/asm-h8300/pgtable.h
+++ b/include/asm-h8300/pgtable.h
@@ -3,7 +3,6 @@
 
 #include <asm-generic/4level-fixup.h>
 
-#include <linux/config.h>
 #include <linux/slab.h>
 #include <asm/processor.h>
 #include <asm/page.h>
diff --git a/include/asm-h8300/processor.h b/include/asm-h8300/processor.h
index c6f0a7108ef3..c7e2f454b83a 100644
--- a/include/asm-h8300/processor.h
+++ b/include/asm-h8300/processor.h
@@ -17,7 +17,6 @@
  */
 #define current_text_addr() ({ __label__ _l; _l: &&_l;})
 
-#include <linux/config.h>
 #include <asm/segment.h>
 #include <asm/fpu.h>
 #include <asm/ptrace.h>
diff --git a/include/asm-h8300/semaphore-helper.h b/include/asm-h8300/semaphore-helper.h
index 29e0fbf1acb7..4fea36be5fd8 100644
--- a/include/asm-h8300/semaphore-helper.h
+++ b/include/asm-h8300/semaphore-helper.h
@@ -10,7 +10,6 @@
  * m68k version by Andreas Schwab
  */
 
-#include <linux/config.h>
 #include <linux/errno.h>
 
 /*
diff --git a/include/asm-h8300/shm.h b/include/asm-h8300/shm.h
index bec758524839..ed6623c0545d 100644
--- a/include/asm-h8300/shm.h
+++ b/include/asm-h8300/shm.h
@@ -1,7 +1,6 @@
 #ifndef _H8300_SHM_H
 #define _H8300_SHM_H
 
-#include <linux/config.h>
 
 /* format of page table entries that correspond to shared memory pages
    currently out in swap space (see also mm/swap.c):
diff --git a/include/asm-h8300/system.h b/include/asm-h8300/system.h
index 8e81cf665e75..134e0929fce5 100644
--- a/include/asm-h8300/system.h
+++ b/include/asm-h8300/system.h
@@ -1,7 +1,6 @@
 #ifndef _H8300_SYSTEM_H
 #define _H8300_SYSTEM_H
 
-#include <linux/config.h> /* get configuration macros */
 #include <linux/linkage.h>
 
 /*
diff --git a/include/asm-h8300/unaligned.h b/include/asm-h8300/unaligned.h
index 8a93961173c3..ffb67f472070 100644
--- a/include/asm-h8300/unaligned.h
+++ b/include/asm-h8300/unaligned.h
@@ -1,7 +1,6 @@
 #ifndef __H8300_UNALIGNED_H
 #define __H8300_UNALIGNED_H
 
-#include <linux/config.h>
 
 /* Use memmove here, so gcc does not insert a __builtin_memcpy. */
 
diff --git a/include/asm-h8300/virtconvert.h b/include/asm-h8300/virtconvert.h
index 3b344c1dfe0f..ee7d5ea10065 100644
--- a/include/asm-h8300/virtconvert.h
+++ b/include/asm-h8300/virtconvert.h
@@ -7,7 +7,6 @@
 
 #ifdef __KERNEL__
 
-#include <linux/config.h>
 #include <asm/setup.h>
 #include <asm/page.h>
 
diff --git a/include/asm-i386/apic.h b/include/asm-i386/apic.h
index 288233fd77d7..cc9b940fb7e8 100644
--- a/include/asm-i386/apic.h
+++ b/include/asm-i386/apic.h
@@ -1,7 +1,6 @@
 #ifndef __ASM_APIC_H
 #define __ASM_APIC_H
 
-#include <linux/config.h>
 #include <linux/pm.h>
 #include <asm/fixmap.h>
 #include <asm/apicdef.h>
diff --git a/include/asm-i386/atomic.h b/include/asm-i386/atomic.h
index 4ddce5296a78..4f061fa73794 100644
--- a/include/asm-i386/atomic.h
+++ b/include/asm-i386/atomic.h
@@ -1,7 +1,6 @@
 #ifndef __ARCH_I386_ATOMIC__
 #define __ARCH_I386_ATOMIC__
 
-#include <linux/config.h>
 #include <linux/compiler.h>
 #include <asm/processor.h>
 
diff --git a/include/asm-i386/bitops.h b/include/asm-i386/bitops.h
index 08deaeee6be9..1c780fa1e762 100644
--- a/include/asm-i386/bitops.h
+++ b/include/asm-i386/bitops.h
@@ -5,7 +5,6 @@
  * Copyright 1992, Linus Torvalds.
  */
 
-#include <linux/config.h>
 #include <linux/compiler.h>
 #include <asm/alternative.h>
 
diff --git a/include/asm-i386/bug.h b/include/asm-i386/bug.h
index 8f79de19eb94..8062cdbf2587 100644
--- a/include/asm-i386/bug.h
+++ b/include/asm-i386/bug.h
@@ -1,7 +1,6 @@
 #ifndef _I386_BUG_H
 #define _I386_BUG_H
 
-#include <linux/config.h>
 
 /*
  * Tell the user there is some problem.
diff --git a/include/asm-i386/bugs.h b/include/asm-i386/bugs.h
index 50233e0345fb..2a9e4ee5904d 100644
--- a/include/asm-i386/bugs.h
+++ b/include/asm-i386/bugs.h
@@ -17,7 +17,6 @@
  *	void check_bugs(void);
  */
 
-#include <linux/config.h>
 #include <linux/init.h>
 #include <asm/processor.h>
 #include <asm/i387.h>
diff --git a/include/asm-i386/byteorder.h b/include/asm-i386/byteorder.h
index a0d73f48d5be..a45470a8b74a 100644
--- a/include/asm-i386/byteorder.h
+++ b/include/asm-i386/byteorder.h
@@ -8,7 +8,6 @@
 
 /* For avoiding bswap on i386 */
 #ifdef __KERNEL__
-#include <linux/config.h>
 #endif
 
 static __inline__ __attribute_const__ __u32 ___arch__swab32(__u32 x)
diff --git a/include/asm-i386/cache.h b/include/asm-i386/cache.h
index ca15c9c665cf..57c62f414158 100644
--- a/include/asm-i386/cache.h
+++ b/include/asm-i386/cache.h
@@ -4,7 +4,6 @@
 #ifndef __ARCH_I386_CACHE_H
 #define __ARCH_I386_CACHE_H
 
-#include <linux/config.h>
 
 /* L1 cache line size */
 #define L1_CACHE_SHIFT	(CONFIG_X86_L1_CACHE_SHIFT)
diff --git a/include/asm-i386/dma.h b/include/asm-i386/dma.h
index f24b2bba2831..d23aac8e1a50 100644
--- a/include/asm-i386/dma.h
+++ b/include/asm-i386/dma.h
@@ -8,7 +8,6 @@
 #ifndef _ASM_DMA_H
 #define _ASM_DMA_H
 
-#include <linux/config.h>
 #include <linux/spinlock.h>	/* And spinlocks */
 #include <asm/io.h>		/* need byte IO */
 #include <linux/delay.h>
diff --git a/include/asm-i386/fixmap.h b/include/asm-i386/fixmap.h
index cfb1c61d3b9c..f7e068f4d2f9 100644
--- a/include/asm-i386/fixmap.h
+++ b/include/asm-i386/fixmap.h
@@ -13,7 +13,6 @@
 #ifndef _ASM_FIXMAP_H
 #define _ASM_FIXMAP_H
 
-#include <linux/config.h>
 
 /* used by vmalloc.c, vsyscall.lds.S.
  *
diff --git a/include/asm-i386/hardirq.h b/include/asm-i386/hardirq.h
index ee754d359734..0e358dc405f8 100644
--- a/include/asm-i386/hardirq.h
+++ b/include/asm-i386/hardirq.h
@@ -1,7 +1,6 @@
 #ifndef __ASM_HARDIRQ_H
 #define __ASM_HARDIRQ_H
 
-#include <linux/config.h>
 #include <linux/threads.h>
 #include <linux/irq.h>
 
diff --git a/include/asm-i386/highmem.h b/include/asm-i386/highmem.h
index 0fd331306b60..e9a34ebc25d5 100644
--- a/include/asm-i386/highmem.h
+++ b/include/asm-i386/highmem.h
@@ -20,7 +20,6 @@
 
 #ifdef __KERNEL__
 
-#include <linux/config.h>
 #include <linux/interrupt.h>
 #include <linux/threads.h>
 #include <asm/kmap_types.h>
diff --git a/include/asm-i386/hpet.h b/include/asm-i386/hpet.h
index 7f1a8a6ee32f..af5d435519d1 100644
--- a/include/asm-i386/hpet.h
+++ b/include/asm-i386/hpet.h
@@ -27,7 +27,6 @@
 #include <asm/processor.h>
 
 #include <linux/timex.h>
-#include <linux/config.h>
 
 #include <asm/fixmap.h>
 
diff --git a/include/asm-i386/hw_irq.h b/include/asm-i386/hw_irq.h
index 622815bf3243..95d3fd090298 100644
--- a/include/asm-i386/hw_irq.h
+++ b/include/asm-i386/hw_irq.h
@@ -12,7 +12,6 @@
  *	<tomsoft@informatik.tu-chemnitz.de>
  */
 
-#include <linux/config.h>
 #include <linux/profile.h>
 #include <asm/atomic.h>
 #include <asm/irq.h>
diff --git a/include/asm-i386/ide.h b/include/asm-i386/ide.h
index 454440193eac..73465d2892b9 100644
--- a/include/asm-i386/ide.h
+++ b/include/asm-i386/ide.h
@@ -13,7 +13,6 @@
 
 #ifdef __KERNEL__
 
-#include <linux/config.h>
 
 #ifndef MAX_HWIFS
 # ifdef CONFIG_BLK_DEV_IDEPCI
diff --git a/include/asm-i386/io.h b/include/asm-i386/io.h
index 79670bb4b0c7..b3724fe93ff1 100644
--- a/include/asm-i386/io.h
+++ b/include/asm-i386/io.h
@@ -1,7 +1,6 @@
 #ifndef _ASM_IO_H
 #define _ASM_IO_H
 
-#include <linux/config.h>
 #include <linux/string.h>
 #include <linux/compiler.h>
 
diff --git a/include/asm-i386/io_apic.h b/include/asm-i386/io_apic.h
index 51c4e5fe6062..7d3e82d4b69d 100644
--- a/include/asm-i386/io_apic.h
+++ b/include/asm-i386/io_apic.h
@@ -1,7 +1,6 @@
 #ifndef __ASM_IO_APIC_H
 #define __ASM_IO_APIC_H
 
-#include <linux/config.h>
 #include <asm/types.h>
 #include <asm/mpspec.h>
 
diff --git a/include/asm-i386/irq.h b/include/asm-i386/irq.h
index 5169d7af456f..331726b41128 100644
--- a/include/asm-i386/irq.h
+++ b/include/asm-i386/irq.h
@@ -10,7 +10,6 @@
  *	<tomsoft@informatik.tu-chemnitz.de>
  */
 
-#include <linux/config.h>
 #include <linux/sched.h>
 /* include comes from machine specific directory */
 #include "irq_vectors.h"
diff --git a/include/asm-i386/kmap_types.h b/include/asm-i386/kmap_types.h
index 6886a0c3fedf..806aae3c5338 100644
--- a/include/asm-i386/kmap_types.h
+++ b/include/asm-i386/kmap_types.h
@@ -1,7 +1,6 @@
 #ifndef _ASM_KMAP_TYPES_H
 #define _ASM_KMAP_TYPES_H
 
-#include <linux/config.h>
 
 #ifdef CONFIG_DEBUG_HIGHMEM
 # define D(n) __KM_FENCE_##n ,
diff --git a/include/asm-i386/mach-summit/mach_apic.h b/include/asm-i386/mach-summit/mach_apic.h
index 3d6d12937e1f..9fd073286289 100644
--- a/include/asm-i386/mach-summit/mach_apic.h
+++ b/include/asm-i386/mach-summit/mach_apic.h
@@ -1,7 +1,6 @@
 #ifndef __ASM_MACH_APIC_H
 #define __ASM_MACH_APIC_H
 
-#include <linux/config.h>
 #include <asm/smp.h>
 
 #define esr_disable (1)
diff --git a/include/asm-i386/mmu_context.h b/include/asm-i386/mmu_context.h
index bf08218357ea..62b7bf184094 100644
--- a/include/asm-i386/mmu_context.h
+++ b/include/asm-i386/mmu_context.h
@@ -1,7 +1,6 @@
 #ifndef __I386_SCHED_H
 #define __I386_SCHED_H
 
-#include <linux/config.h>
 #include <asm/desc.h>
 #include <asm/atomic.h>
 #include <asm/pgalloc.h>
diff --git a/include/asm-i386/mtrr.h b/include/asm-i386/mtrr.h
index 64cf937c7e33..5a46de08efea 100644
--- a/include/asm-i386/mtrr.h
+++ b/include/asm-i386/mtrr.h
@@ -23,7 +23,6 @@
 #ifndef _LINUX_MTRR_H
 #define _LINUX_MTRR_H
 
-#include <linux/config.h>
 #include <linux/ioctl.h>
 #include <linux/errno.h>
 
diff --git a/include/asm-i386/page.h b/include/asm-i386/page.h
index 30f52a2263ba..85f35e67020f 100644
--- a/include/asm-i386/page.h
+++ b/include/asm-i386/page.h
@@ -12,7 +12,6 @@
 #ifdef __KERNEL__
 #ifndef __ASSEMBLY__
 
-#include <linux/config.h>
 
 #ifdef CONFIG_X86_USE_3DNOW
 
diff --git a/include/asm-i386/param.h b/include/asm-i386/param.h
index 095580f3a45c..745dc5bd0fbc 100644
--- a/include/asm-i386/param.h
+++ b/include/asm-i386/param.h
@@ -2,7 +2,6 @@
 #define _ASMi386_PARAM_H
 
 #ifdef __KERNEL__
-# include <linux/config.h>
 # define HZ		CONFIG_HZ	/* Internal kernel timer frequency */
 # define USER_HZ	100		/* .. some user interfaces are in "ticks" */
 # define CLOCKS_PER_SEC		(USER_HZ)	/* like times() */
diff --git a/include/asm-i386/pci.h b/include/asm-i386/pci.h
index 78c85985aee3..64b6d0baedbc 100644
--- a/include/asm-i386/pci.h
+++ b/include/asm-i386/pci.h
@@ -1,7 +1,6 @@
 #ifndef __i386_PCI_H
 #define __i386_PCI_H
 
-#include <linux/config.h>
 
 #ifdef __KERNEL__
 #include <linux/mm.h>		/* for struct page */
diff --git a/include/asm-i386/pgalloc.h b/include/asm-i386/pgalloc.h
index 0380c3dc1f7e..4b1e61359f89 100644
--- a/include/asm-i386/pgalloc.h
+++ b/include/asm-i386/pgalloc.h
@@ -1,7 +1,6 @@
 #ifndef _I386_PGALLOC_H
 #define _I386_PGALLOC_H
 
-#include <linux/config.h>
 #include <asm/fixmap.h>
 #include <linux/threads.h>
 #include <linux/mm.h>		/* for struct page */
diff --git a/include/asm-i386/pgtable.h b/include/asm-i386/pgtable.h
index ee056c41a9fb..248bd80a69c5 100644
--- a/include/asm-i386/pgtable.h
+++ b/include/asm-i386/pgtable.h
@@ -1,7 +1,6 @@
 #ifndef _I386_PGTABLE_H
 #define _I386_PGTABLE_H
 
-#include <linux/config.h>
 
 /*
  * The Linux memory management assumes a three-level page table setup. On
diff --git a/include/asm-i386/processor.h b/include/asm-i386/processor.h
index 805f0dcda468..4df3818e4122 100644
--- a/include/asm-i386/processor.h
+++ b/include/asm-i386/processor.h
@@ -17,7 +17,6 @@
 #include <asm/msr.h>
 #include <asm/system.h>
 #include <linux/cache.h>
-#include <linux/config.h>
 #include <linux/threads.h>
 #include <asm/percpu.h>
 #include <linux/cpumask.h>
diff --git a/include/asm-i386/serial.h b/include/asm-i386/serial.h
index e1ecfccb743b..bd67480ca109 100644
--- a/include/asm-i386/serial.h
+++ b/include/asm-i386/serial.h
@@ -2,7 +2,6 @@
  * include/asm-i386/serial.h
  */
 
-#include <linux/config.h>
 
 /*
  * This assumes you have a 1.8432 MHz clock for your UART.
diff --git a/include/asm-i386/smp.h b/include/asm-i386/smp.h
index 61d3ab9db70c..142d10e34ade 100644
--- a/include/asm-i386/smp.h
+++ b/include/asm-i386/smp.h
@@ -5,7 +5,6 @@
  * We need the APIC definitions automatically as part of 'smp.h'
  */
 #ifndef __ASSEMBLY__
-#include <linux/config.h>
 #include <linux/kernel.h>
 #include <linux/threads.h>
 #include <linux/cpumask.h>
diff --git a/include/asm-i386/spinlock.h b/include/asm-i386/spinlock.h
index d76b7693cf1d..04ba30234c48 100644
--- a/include/asm-i386/spinlock.h
+++ b/include/asm-i386/spinlock.h
@@ -4,7 +4,6 @@
 #include <asm/atomic.h>
 #include <asm/rwlock.h>
 #include <asm/page.h>
-#include <linux/config.h>
 #include <linux/compiler.h>
 
 /*
diff --git a/include/asm-i386/string.h b/include/asm-i386/string.h
index bb5f88a27f7a..b9277361954b 100644
--- a/include/asm-i386/string.h
+++ b/include/asm-i386/string.h
@@ -2,7 +2,6 @@
 #define _I386_STRING_H_
 
 #ifdef __KERNEL__
-#include <linux/config.h>
 /*
  * On a 486 or Pentium, we are better off not using the
  * byte string operations. But on a 386 or a PPro the
diff --git a/include/asm-i386/system.h b/include/asm-i386/system.h
index 19cc79c9a35d..0249f912a29c 100644
--- a/include/asm-i386/system.h
+++ b/include/asm-i386/system.h
@@ -1,7 +1,6 @@
 #ifndef __ASM_SYSTEM_H
 #define __ASM_SYSTEM_H
 
-#include <linux/config.h>
 #include <linux/kernel.h>
 #include <asm/segment.h>
 #include <asm/cpufeature.h>
diff --git a/include/asm-i386/thread_info.h b/include/asm-i386/thread_info.h
index 1f7d48c9ba3f..8420ed12491e 100644
--- a/include/asm-i386/thread_info.h
+++ b/include/asm-i386/thread_info.h
@@ -9,7 +9,6 @@
 
 #ifdef __KERNEL__
 
-#include <linux/config.h>
 #include <linux/compiler.h>
 #include <asm/page.h>
 
diff --git a/include/asm-i386/timex.h b/include/asm-i386/timex.h
index 292b5a68f627..d434984303ca 100644
--- a/include/asm-i386/timex.h
+++ b/include/asm-i386/timex.h
@@ -6,7 +6,6 @@
 #ifndef _ASMi386_TIMEX_H
 #define _ASMi386_TIMEX_H
 
-#include <linux/config.h>
 #include <asm/processor.h>
 
 #ifdef CONFIG_X86_ELAN
diff --git a/include/asm-i386/tlbflush.h b/include/asm-i386/tlbflush.h
index ab216e1370ef..d57ca5c540b6 100644
--- a/include/asm-i386/tlbflush.h
+++ b/include/asm-i386/tlbflush.h
@@ -1,7 +1,6 @@
 #ifndef _I386_TLBFLUSH_H
 #define _I386_TLBFLUSH_H
 
-#include <linux/config.h>
 #include <linux/mm.h>
 #include <asm/processor.h>
 
diff --git a/include/asm-i386/types.h b/include/asm-i386/types.h
index e50a08bd7ced..4b4b295ccdb9 100644
--- a/include/asm-i386/types.h
+++ b/include/asm-i386/types.h
@@ -35,7 +35,6 @@ typedef unsigned long long __u64;
 
 #ifndef __ASSEMBLY__
 
-#include <linux/config.h>
 
 typedef signed char s8;
 typedef unsigned char u8;
diff --git a/include/asm-i386/uaccess.h b/include/asm-i386/uaccess.h
index 371457b1ceb6..1ec65523ea5e 100644
--- a/include/asm-i386/uaccess.h
+++ b/include/asm-i386/uaccess.h
@@ -4,7 +4,6 @@
 /*
  * User space memory access functions
  */
-#include <linux/config.h>
 #include <linux/errno.h>
 #include <linux/thread_info.h>
 #include <linux/prefetch.h>
diff --git a/include/asm-ia64/asmmacro.h b/include/asm-ia64/asmmacro.h
index edf2cebb2969..c22b4658fc61 100644
--- a/include/asm-ia64/asmmacro.h
+++ b/include/asm-ia64/asmmacro.h
@@ -6,7 +6,6 @@
  *	David Mosberger-Tang <davidm@hpl.hp.com>
  */
 
-#include <linux/config.h>
 
 #define ENTRY(name)				\
 	.align 32;				\
diff --git a/include/asm-ia64/cache.h b/include/asm-ia64/cache.h
index f0a104db8f20..e7482bd628ff 100644
--- a/include/asm-ia64/cache.h
+++ b/include/asm-ia64/cache.h
@@ -1,7 +1,6 @@
 #ifndef _ASM_IA64_CACHE_H
 #define _ASM_IA64_CACHE_H
 
-#include <linux/config.h>
 
 /*
  * Copyright (C) 1998-2000 Hewlett-Packard Co
diff --git a/include/asm-ia64/delay.h b/include/asm-ia64/delay.h
index bba702076391..a30a62f235e1 100644
--- a/include/asm-ia64/delay.h
+++ b/include/asm-ia64/delay.h
@@ -12,7 +12,6 @@
  * Copyright (C) 1999 Don Dugger <don.dugger@intel.com>
  */
 
-#include <linux/config.h>
 #include <linux/kernel.h>
 #include <linux/sched.h>
 #include <linux/compiler.h>
diff --git a/include/asm-ia64/dma-mapping.h b/include/asm-ia64/dma-mapping.h
index df67d40801de..99a8f8e1218c 100644
--- a/include/asm-ia64/dma-mapping.h
+++ b/include/asm-ia64/dma-mapping.h
@@ -5,7 +5,6 @@
  * Copyright (C) 2003-2004 Hewlett-Packard Co
  *	David Mosberger-Tang <davidm@hpl.hp.com>
  */
-#include <linux/config.h>
 #include <asm/machvec.h>
 
 #define dma_alloc_coherent	platform_dma_alloc_coherent
diff --git a/include/asm-ia64/dma.h b/include/asm-ia64/dma.h
index 3be1b4925e18..dad3a735df8b 100644
--- a/include/asm-ia64/dma.h
+++ b/include/asm-ia64/dma.h
@@ -6,7 +6,6 @@
  *	David Mosberger-Tang <davidm@hpl.hp.com>
  */
 
-#include <linux/config.h>
 
 #include <asm/io.h>		/* need byte IO */
 
diff --git a/include/asm-ia64/elf.h b/include/asm-ia64/elf.h
index 446fce036fd9..25f9835d5459 100644
--- a/include/asm-ia64/elf.h
+++ b/include/asm-ia64/elf.h
@@ -8,7 +8,6 @@
  *	David Mosberger-Tang <davidm@hpl.hp.com>
  */
 
-#include <linux/config.h>
 
 #include <asm/fpu.h>
 #include <asm/page.h>
diff --git a/include/asm-ia64/hardirq.h b/include/asm-ia64/hardirq.h
index 33ef8f096d95..140e495b8e0e 100644
--- a/include/asm-ia64/hardirq.h
+++ b/include/asm-ia64/hardirq.h
@@ -6,7 +6,6 @@
  *	David Mosberger-Tang <davidm@hpl.hp.com>
  */
 
-#include <linux/config.h>
 
 #include <linux/threads.h>
 #include <linux/irq.h>
diff --git a/include/asm-ia64/ia32.h b/include/asm-ia64/ia32.h
index f8044a1169cd..5ff8d74c3e00 100644
--- a/include/asm-ia64/ia32.h
+++ b/include/asm-ia64/ia32.h
@@ -1,7 +1,6 @@
 #ifndef _ASM_IA64_IA32_H
 #define _ASM_IA64_IA32_H
 
-#include <linux/config.h>
 
 #include <asm/ptrace.h>
 #include <asm/signal.h>
diff --git a/include/asm-ia64/ide.h b/include/asm-ia64/ide.h
index 93f45c5f189f..e928675de352 100644
--- a/include/asm-ia64/ide.h
+++ b/include/asm-ia64/ide.h
@@ -13,7 +13,6 @@
 
 #ifdef __KERNEL__
 
-#include <linux/config.h>
 
 #include <linux/irq.h>
 
diff --git a/include/asm-ia64/intrinsics.h b/include/asm-ia64/intrinsics.h
index 8089f955e5d2..3a95aa432e99 100644
--- a/include/asm-ia64/intrinsics.h
+++ b/include/asm-ia64/intrinsics.h
@@ -9,7 +9,6 @@
  */
 
 #ifndef __ASSEMBLY__
-#include <linux/config.h>
 
 /* include compiler specific intrinsics */
 #include <asm/ia64regs.h>
diff --git a/include/asm-ia64/kmap_types.h b/include/asm-ia64/kmap_types.h
index bc777525fa12..5d1658aa2b3b 100644
--- a/include/asm-ia64/kmap_types.h
+++ b/include/asm-ia64/kmap_types.h
@@ -1,7 +1,6 @@
 #ifndef _ASM_IA64_KMAP_TYPES_H
 #define _ASM_IA64_KMAP_TYPES_H
 
-#include <linux/config.h>
 
 #ifdef CONFIG_DEBUG_HIGHMEM
 # define D(n) __KM_FENCE_##n ,
diff --git a/include/asm-ia64/machvec.h b/include/asm-ia64/machvec.h
index c3e4ed8a3e17..96d46dbfde47 100644
--- a/include/asm-ia64/machvec.h
+++ b/include/asm-ia64/machvec.h
@@ -10,7 +10,6 @@
 #ifndef _ASM_IA64_MACHVEC_H
 #define _ASM_IA64_MACHVEC_H
 
-#include <linux/config.h>
 #include <linux/types.h>
 
 /* forward declarations: */
diff --git a/include/asm-ia64/meminit.h b/include/asm-ia64/meminit.h
index 46501b01a5c5..894bc4d89dc0 100644
--- a/include/asm-ia64/meminit.h
+++ b/include/asm-ia64/meminit.h
@@ -7,7 +7,6 @@
  * for more details.
  */
 
-#include <linux/config.h>
 
 /*
  * Entries defined so far:
diff --git a/include/asm-ia64/nodedata.h b/include/asm-ia64/nodedata.h
index 9978c7ce7549..a140310bf84d 100644
--- a/include/asm-ia64/nodedata.h
+++ b/include/asm-ia64/nodedata.h
@@ -11,7 +11,6 @@
 #ifndef _ASM_IA64_NODEDATA_H
 #define _ASM_IA64_NODEDATA_H
 
-#include <linux/config.h>
 #include <linux/numa.h>
 
 #include <asm/percpu.h>
diff --git a/include/asm-ia64/numa.h b/include/asm-ia64/numa.h
index dae6aeb7b119..e5a8260593a5 100644
--- a/include/asm-ia64/numa.h
+++ b/include/asm-ia64/numa.h
@@ -11,7 +11,6 @@
 #ifndef _ASM_IA64_NUMA_H
 #define _ASM_IA64_NUMA_H
 
-#include <linux/config.h>
 
 #ifdef CONFIG_NUMA
 
diff --git a/include/asm-ia64/page.h b/include/asm-ia64/page.h
index 2087825eefa4..f5a949ec6e1e 100644
--- a/include/asm-ia64/page.h
+++ b/include/asm-ia64/page.h
@@ -7,7 +7,6 @@
  *	David Mosberger-Tang <davidm@hpl.hp.com>
  */
 
-#include <linux/config.h>
 
 #include <asm/intrinsics.h>
 #include <asm/types.h>
diff --git a/include/asm-ia64/param.h b/include/asm-ia64/param.h
index 5e1e0d2d7baf..49c62dd5eccf 100644
--- a/include/asm-ia64/param.h
+++ b/include/asm-ia64/param.h
@@ -19,7 +19,6 @@
 #define MAXHOSTNAMELEN	64	/* max length of hostname */
 
 #ifdef __KERNEL__
-# include <linux/config.h>	/* mustn't include <linux/config.h> outside of #ifdef __KERNEL__ */
 # ifdef CONFIG_IA64_HP_SIM
   /*
    * Yeah, simulating stuff is slow, so let us catch some breath between
diff --git a/include/asm-ia64/percpu.h b/include/asm-ia64/percpu.h
index 2b14dee29ce7..ae357d504fba 100644
--- a/include/asm-ia64/percpu.h
+++ b/include/asm-ia64/percpu.h
@@ -12,7 +12,6 @@
 # define THIS_CPU(var)	(per_cpu__##var)  /* use this to mark accesses to per-CPU variables... */
 #else /* !__ASSEMBLY__ */
 
-#include <linux/config.h>
 
 #include <linux/threads.h>
 
diff --git a/include/asm-ia64/pgalloc.h b/include/asm-ia64/pgalloc.h
index f2f233846476..9cb68e9b377e 100644
--- a/include/asm-ia64/pgalloc.h
+++ b/include/asm-ia64/pgalloc.h
@@ -13,7 +13,6 @@
  * Copyright (C) 2000, Goutham Rao <goutham.rao@intel.com>
  */
 
-#include <linux/config.h>
 
 #include <linux/compiler.h>
 #include <linux/mm.h>
diff --git a/include/asm-ia64/pgtable.h b/include/asm-ia64/pgtable.h
index c0f8144f2349..eaac08d5e0bd 100644
--- a/include/asm-ia64/pgtable.h
+++ b/include/asm-ia64/pgtable.h
@@ -12,7 +12,6 @@
  *	David Mosberger-Tang <davidm@hpl.hp.com>
  */
 
-#include <linux/config.h>
 
 #include <asm/mman.h>
 #include <asm/page.h>
diff --git a/include/asm-ia64/processor.h b/include/asm-ia64/processor.h
index b3bd58e80690..265f4824db0e 100644
--- a/include/asm-ia64/processor.h
+++ b/include/asm-ia64/processor.h
@@ -13,7 +13,6 @@
  * 06/16/00	A. Mallick	added csd/ssd/tssd for ia32 support
  */
 
-#include <linux/config.h>
 
 #include <asm/intrinsics.h>
 #include <asm/kregs.h>
diff --git a/include/asm-ia64/ptrace.h b/include/asm-ia64/ptrace.h
index 9471cdc3f4c0..415abb23b210 100644
--- a/include/asm-ia64/ptrace.h
+++ b/include/asm-ia64/ptrace.h
@@ -54,7 +54,6 @@
  * This is because ar.ec is saved as part of ar.pfs.
  */
 
-#include <linux/config.h>
 
 #include <asm/fpu.h>
 #ifndef ASM_OFFSETS_C
diff --git a/include/asm-ia64/smp.h b/include/asm-ia64/smp.h
index a3914352c995..719ff309ce09 100644
--- a/include/asm-ia64/smp.h
+++ b/include/asm-ia64/smp.h
@@ -10,7 +10,6 @@
 #ifndef _ASM_IA64_SMP_H
 #define _ASM_IA64_SMP_H
 
-#include <linux/config.h>
 #include <linux/init.h>
 #include <linux/threads.h>
 #include <linux/kernel.h>
diff --git a/include/asm-ia64/sn/simulator.h b/include/asm-ia64/sn/simulator.h
index 16a48b5a039c..c3fd3eb25768 100644
--- a/include/asm-ia64/sn/simulator.h
+++ b/include/asm-ia64/sn/simulator.h
@@ -8,7 +8,6 @@
 #ifndef _ASM_IA64_SN_SIMULATOR_H
 #define _ASM_IA64_SN_SIMULATOR_H
 
-#include <linux/config.h>
 
 #define SNMAGIC 0xaeeeeeee8badbeefL
 #define IS_MEDUSA()			({long sn; asm("mov %0=cpuid[%1]" : "=r"(sn) : "r"(2)); sn == SNMAGIC;})
diff --git a/include/asm-ia64/sn/sn_cpuid.h b/include/asm-ia64/sn/sn_cpuid.h
index 749deb2ca6c1..a676dd9ace3e 100644
--- a/include/asm-ia64/sn/sn_cpuid.h
+++ b/include/asm-ia64/sn/sn_cpuid.h
@@ -11,7 +11,6 @@
 #ifndef _ASM_IA64_SN_SN_CPUID_H
 #define _ASM_IA64_SN_SN_CPUID_H
 
-#include <linux/config.h>
 #include <linux/smp.h>
 #include <asm/sn/addrs.h>
 #include <asm/sn/pda.h>
diff --git a/include/asm-ia64/sn/sn_sal.h b/include/asm-ia64/sn/sn_sal.h
index bf4cc867a698..8664c88a1ff5 100644
--- a/include/asm-ia64/sn/sn_sal.h
+++ b/include/asm-ia64/sn/sn_sal.h
@@ -12,7 +12,6 @@
  */
 
 
-#include <linux/config.h>
 #include <asm/sal.h>
 #include <asm/sn/sn_cpuid.h>
 #include <asm/sn/arch.h>
diff --git a/include/asm-ia64/sn/xpc.h b/include/asm-ia64/sn/xpc.h
index aa3b8ace9030..8406f1ef4caf 100644
--- a/include/asm-ia64/sn/xpc.h
+++ b/include/asm-ia64/sn/xpc.h
@@ -15,7 +15,6 @@
 #define _ASM_IA64_SN_XPC_H
 
 
-#include <linux/config.h>
 #include <linux/interrupt.h>
 #include <linux/sysctl.h>
 #include <linux/device.h>
diff --git a/include/asm-ia64/string.h b/include/asm-ia64/string.h
index 43502d3b57e5..85fd65c52a8c 100644
--- a/include/asm-ia64/string.h
+++ b/include/asm-ia64/string.h
@@ -9,7 +9,6 @@
  *	David Mosberger-Tang <davidm@hpl.hp.com>
  */
 
-#include <linux/config.h>	/* remove this once we remove the A-step workaround... */
 
 #define __HAVE_ARCH_STRLEN	1 /* see arch/ia64/lib/strlen.S */
 #define __HAVE_ARCH_MEMSET	1 /* see arch/ia64/lib/memset.S */
diff --git a/include/asm-ia64/system.h b/include/asm-ia64/system.h
index 2f3620593687..65db43ce4de6 100644
--- a/include/asm-ia64/system.h
+++ b/include/asm-ia64/system.h
@@ -12,7 +12,6 @@
  * Copyright (C) 1999 Asit Mallick <asit.k.mallick@intel.com>
  * Copyright (C) 1999 Don Dugger <don.dugger@intel.com>
  */
-#include <linux/config.h>
 
 #include <asm/kregs.h>
 #include <asm/page.h>
diff --git a/include/asm-ia64/tlb.h b/include/asm-ia64/tlb.h
index 834370b9dea1..26edcb750f9f 100644
--- a/include/asm-ia64/tlb.h
+++ b/include/asm-ia64/tlb.h
@@ -37,7 +37,6 @@
  *	}
  *	tlb_finish_mmu(tlb, start, end);	// finish unmap for address space MM
  */
-#include <linux/config.h>
 #include <linux/mm.h>
 #include <linux/pagemap.h>
 #include <linux/swap.h>
diff --git a/include/asm-ia64/tlbflush.h b/include/asm-ia64/tlbflush.h
index a35b323bae4c..cf9acb9bb1fb 100644
--- a/include/asm-ia64/tlbflush.h
+++ b/include/asm-ia64/tlbflush.h
@@ -6,7 +6,6 @@
  *	David Mosberger-Tang <davidm@hpl.hp.com>
  */
 
-#include <linux/config.h>
 
 #include <linux/mm.h>
 
diff --git a/include/asm-ia64/unistd.h b/include/asm-ia64/unistd.h
index a40ebec6aeeb..395e6b2998f2 100644
--- a/include/asm-ia64/unistd.h
+++ b/include/asm-ia64/unistd.h
@@ -293,7 +293,6 @@
 
 #ifdef __KERNEL__
 
-#include <linux/config.h>
 
 #define NR_syscalls			278 /* length of syscall table */
 
diff --git a/include/asm-m32r/assembler.h b/include/asm-m32r/assembler.h
index 1a1aa17edd33..47041d19d4a8 100644
--- a/include/asm-m32r/assembler.h
+++ b/include/asm-m32r/assembler.h
@@ -9,7 +9,6 @@
  * This file contains M32R architecture specific macro definitions.
  */
 
-#include <linux/config.h>
 
 #ifndef __STR
 #ifdef __ASSEMBLY__
diff --git a/include/asm-m32r/atomic.h b/include/asm-m32r/atomic.h
index 3122fe106f05..f5a7d7301c72 100644
--- a/include/asm-m32r/atomic.h
+++ b/include/asm-m32r/atomic.h
@@ -9,7 +9,6 @@
  *    Copyright (C) 2004  Hirokazu Takata <takata at linux-m32r.org>
  */
 
-#include <linux/config.h>
 #include <asm/assembler.h>
 #include <asm/system.h>
 
diff --git a/include/asm-m32r/bitops.h b/include/asm-m32r/bitops.h
index 902a366101a5..66ab672162cd 100644
--- a/include/asm-m32r/bitops.h
+++ b/include/asm-m32r/bitops.h
@@ -11,7 +11,6 @@
  *    Copyright (C) 2004  Hirokazu Takata <takata at linux-m32r.org>
  */
 
-#include <linux/config.h>
 #include <linux/compiler.h>
 #include <asm/assembler.h>
 #include <asm/system.h>
diff --git a/include/asm-m32r/cacheflush.h b/include/asm-m32r/cacheflush.h
index e57427b6e249..8b261b49149e 100644
--- a/include/asm-m32r/cacheflush.h
+++ b/include/asm-m32r/cacheflush.h
@@ -1,7 +1,6 @@
 #ifndef _ASM_M32R_CACHEFLUSH_H
 #define _ASM_M32R_CACHEFLUSH_H
 
-#include <linux/config.h>
 #include <linux/mm.h>
 
 extern void _flush_cache_all(void);
diff --git a/include/asm-m32r/hardirq.h b/include/asm-m32r/hardirq.h
index 5da830ec1587..cb8aa762f235 100644
--- a/include/asm-m32r/hardirq.h
+++ b/include/asm-m32r/hardirq.h
@@ -2,7 +2,6 @@
 #ifndef __ASM_HARDIRQ_H
 #define __ASM_HARDIRQ_H
 
-#include <linux/config.h>
 #include <linux/threads.h>
 #include <linux/irq.h>
 
diff --git a/include/asm-m32r/ide.h b/include/asm-m32r/ide.h
index f7aa96970d18..219a0f74eff3 100644
--- a/include/asm-m32r/ide.h
+++ b/include/asm-m32r/ide.h
@@ -15,7 +15,6 @@
 
 #ifdef __KERNEL__
 
-#include <linux/config.h>
 
 #ifndef MAX_HWIFS
 # ifdef CONFIG_BLK_DEV_IDEPCI
diff --git a/include/asm-m32r/irq.h b/include/asm-m32r/irq.h
index ca943954572a..2f93f4743add 100644
--- a/include/asm-m32r/irq.h
+++ b/include/asm-m32r/irq.h
@@ -2,7 +2,6 @@
 #ifndef _ASM_M32R_IRQ_H
 #define _ASM_M32R_IRQ_H
 
-#include <linux/config.h>
 
 #if defined(CONFIG_PLAT_M32700UT_Alpha) || defined(CONFIG_PLAT_USRV)
 /*
diff --git a/include/asm-m32r/kmap_types.h b/include/asm-m32r/kmap_types.h
index 7429591010b6..0524d89edb0f 100644
--- a/include/asm-m32r/kmap_types.h
+++ b/include/asm-m32r/kmap_types.h
@@ -3,7 +3,6 @@
 
 /* Dummy header just to define km_type. */
 
-#include <linux/config.h>
 
 #ifdef CONFIG_DEBUG_HIGHMEM
 # define D(n) __KM_FENCE_##n ,
diff --git a/include/asm-m32r/m32104ut/m32104ut_pld.h b/include/asm-m32r/m32104ut/m32104ut_pld.h
index a4eac20553df..6ba4ddf7dcf7 100644
--- a/include/asm-m32r/m32104ut/m32104ut_pld.h
+++ b/include/asm-m32r/m32104ut/m32104ut_pld.h
@@ -15,7 +15,6 @@
 #ifndef _M32104UT_M32104UT_PLD_H
 #define _M32104UT_M32104UT_PLD_H
 
-#include <linux/config.h>
 
 #if defined(CONFIG_PLAT_M32104UT)
 #define PLD_PLAT_BASE		0x02c00000
diff --git a/include/asm-m32r/m32700ut/m32700ut_lan.h b/include/asm-m32r/m32700ut/m32700ut_lan.h
index 50545ec9c42c..c050b19e8101 100644
--- a/include/asm-m32r/m32700ut/m32700ut_lan.h
+++ b/include/asm-m32r/m32700ut/m32700ut_lan.h
@@ -15,7 +15,6 @@
 #ifndef _M32700UT_M32700UT_LAN_H
 #define _M32700UT_M32700UT_LAN_H
 
-#include <linux/config.h>
 
 #ifndef __ASSEMBLY__
 /*
diff --git a/include/asm-m32r/m32700ut/m32700ut_lcd.h b/include/asm-m32r/m32700ut/m32700ut_lcd.h
index ede6c77bd5e6..4da4e822e2f3 100644
--- a/include/asm-m32r/m32700ut/m32700ut_lcd.h
+++ b/include/asm-m32r/m32700ut/m32700ut_lcd.h
@@ -15,7 +15,6 @@
 #ifndef _M32700UT_M32700UT_LCD_H
 #define _M32700UT_M32700UT_LCD_H
 
-#include <linux/config.h>
 
 #ifndef __ASSEMBLY__
 /*
diff --git a/include/asm-m32r/m32700ut/m32700ut_pld.h b/include/asm-m32r/m32700ut/m32700ut_pld.h
index f5e479486696..f35f9159acff 100644
--- a/include/asm-m32r/m32700ut/m32700ut_pld.h
+++ b/include/asm-m32r/m32700ut/m32700ut_pld.h
@@ -15,7 +15,6 @@
 #ifndef _M32700UT_M32700UT_PLD_H
 #define _M32700UT_M32700UT_PLD_H
 
-#include <linux/config.h>
 
 #if defined(CONFIG_PLAT_M32700UT_Alpha)
 #define PLD_PLAT_BASE		0x08c00000
diff --git a/include/asm-m32r/m32r.h b/include/asm-m32r/m32r.h
index b133ca61acf1..decfc59907c7 100644
--- a/include/asm-m32r/m32r.h
+++ b/include/asm-m32r/m32r.h
@@ -7,7 +7,6 @@
  * Copyright (C) 2003, 2004  Renesas Technology Corp.
  */
 
-#include <linux/config.h>
 
 /* Chip type */
 #if defined(CONFIG_CHIP_XNUX_MP) || defined(CONFIG_CHIP_XNUX2_MP)
diff --git a/include/asm-m32r/mmu.h b/include/asm-m32r/mmu.h
index 9c00eb78ee50..cf3f6d78ac66 100644
--- a/include/asm-m32r/mmu.h
+++ b/include/asm-m32r/mmu.h
@@ -1,7 +1,6 @@
 #ifndef _ASM_M32R_MMU_H
 #define _ASM_M32R_MMU_H
 
-#include <linux/config.h>
 
 #if !defined(CONFIG_MMU)
 typedef struct {
diff --git a/include/asm-m32r/mmu_context.h b/include/asm-m32r/mmu_context.h
index 3634c5361a9b..542302eb6bcb 100644
--- a/include/asm-m32r/mmu_context.h
+++ b/include/asm-m32r/mmu_context.h
@@ -3,7 +3,6 @@
 
 #ifdef __KERNEL__
 
-#include <linux/config.h>
 
 #include <asm/m32r.h>
 
@@ -15,7 +14,6 @@
 
 #ifndef __ASSEMBLY__
 
-#include <linux/config.h>
 #include <asm/atomic.h>
 #include <asm/pgalloc.h>
 #include <asm/mmu.h>
diff --git a/include/asm-m32r/opsput/opsput_lan.h b/include/asm-m32r/opsput/opsput_lan.h
index 7a2a839eedab..61948296f445 100644
--- a/include/asm-m32r/opsput/opsput_lan.h
+++ b/include/asm-m32r/opsput/opsput_lan.h
@@ -15,7 +15,6 @@
 #ifndef _OPSPUT_OPSPUT_LAN_H
 #define _OPSPUT_OPSPUT_LAN_H
 
-#include <linux/config.h>
 
 #ifndef __ASSEMBLY__
 /*
diff --git a/include/asm-m32r/opsput/opsput_lcd.h b/include/asm-m32r/opsput/opsput_lcd.h
index 3a883e3d7187..44cfd7fe2d88 100644
--- a/include/asm-m32r/opsput/opsput_lcd.h
+++ b/include/asm-m32r/opsput/opsput_lcd.h
@@ -15,7 +15,6 @@
 #ifndef _OPSPUT_OPSPUT_LCD_H
 #define _OPSPUT_OPSPUT_LCD_H
 
-#include <linux/config.h>
 
 #ifndef __ASSEMBLY__
 /*
diff --git a/include/asm-m32r/opsput/opsput_pld.h b/include/asm-m32r/opsput/opsput_pld.h
index 2018e6925035..46296fe1ec1a 100644
--- a/include/asm-m32r/opsput/opsput_pld.h
+++ b/include/asm-m32r/opsput/opsput_pld.h
@@ -15,7 +15,6 @@
 #ifndef _OPSPUT_OPSPUT_PLD_H
 #define _OPSPUT_OPSPUT_PLD_H
 
-#include <linux/config.h>
 
 #define PLD_PLAT_BASE		0x1cc00000
 
diff --git a/include/asm-m32r/page.h b/include/asm-m32r/page.h
index 9ddbc087dbc5..9688be003620 100644
--- a/include/asm-m32r/page.h
+++ b/include/asm-m32r/page.h
@@ -1,7 +1,6 @@
 #ifndef _ASM_M32R_PAGE_H
 #define _ASM_M32R_PAGE_H
 
-#include <linux/config.h>
 
 /* PAGE_SHIFT determines the page size */
 #define PAGE_SHIFT	12
diff --git a/include/asm-m32r/pgalloc.h b/include/asm-m32r/pgalloc.h
index 6da309b6fda7..e09a86c3cadf 100644
--- a/include/asm-m32r/pgalloc.h
+++ b/include/asm-m32r/pgalloc.h
@@ -3,7 +3,6 @@
 
 /* $Id$ */
 
-#include <linux/config.h>
 #include <linux/mm.h>
 
 #include <asm/io.h>
diff --git a/include/asm-m32r/pgtable-2level.h b/include/asm-m32r/pgtable-2level.h
index 861727c20e8f..be0f167e344a 100644
--- a/include/asm-m32r/pgtable-2level.h
+++ b/include/asm-m32r/pgtable-2level.h
@@ -3,7 +3,6 @@
 
 #ifdef __KERNEL__
 
-#include <linux/config.h>
 
 /*
  * traditional M32R two-level paging structure:
diff --git a/include/asm-m32r/pgtable.h b/include/asm-m32r/pgtable.h
index 75740debcd01..1983b7f4527a 100644
--- a/include/asm-m32r/pgtable.h
+++ b/include/asm-m32r/pgtable.h
@@ -20,7 +20,6 @@
 
 #ifndef __ASSEMBLY__
 
-#include <linux/config.h>
 #include <linux/threads.h>
 #include <asm/processor.h>
 #include <asm/addrspace.h>
diff --git a/include/asm-m32r/processor.h b/include/asm-m32r/processor.h
index 09fd1813e780..32755bf136de 100644
--- a/include/asm-m32r/processor.h
+++ b/include/asm-m32r/processor.h
@@ -14,7 +14,6 @@
  */
 
 #include <linux/kernel.h>
-#include <linux/config.h>
 #include <asm/cache.h>
 #include <asm/ptrace.h>  /* pt_regs */
 
diff --git a/include/asm-m32r/ptrace.h b/include/asm-m32r/ptrace.h
index 53c792452dfc..a07fa90314d2 100644
--- a/include/asm-m32r/ptrace.h
+++ b/include/asm-m32r/ptrace.h
@@ -12,7 +12,6 @@
  *   Copyright (C) 2001-2002, 2004  Hirokazu Takata <takata at linux-m32r.org>
  */
 
-#include <linux/config.h>
 #include <asm/m32r.h>		/* M32R_PSW_BSM, M32R_PSW_BPM */
 
 /* 0 - 13 are integer registers (general purpose registers).  */
diff --git a/include/asm-m32r/rtc.h b/include/asm-m32r/rtc.h
index ec3cdf666c68..6b2b837c5978 100644
--- a/include/asm-m32r/rtc.h
+++ b/include/asm-m32r/rtc.h
@@ -4,7 +4,6 @@
 #define __RTC_H__
 
 
-#include <linux/config.h>
 
    /* Dallas DS1302 clock/calendar register numbers. */
 #  define RTC_SECONDS      0
diff --git a/include/asm-m32r/semaphore.h b/include/asm-m32r/semaphore.h
index 81750edc8916..41e45d7b87ef 100644
--- a/include/asm-m32r/semaphore.h
+++ b/include/asm-m32r/semaphore.h
@@ -12,7 +12,6 @@
  * Copyright (C) 2004, 2006  Hirokazu Takata <takata at linux-m32r.org>
  */
 
-#include <linux/config.h>
 #include <linux/wait.h>
 #include <linux/rwsem.h>
 #include <asm/assembler.h>
diff --git a/include/asm-m32r/serial.h b/include/asm-m32r/serial.h
index 1bf480f58493..5ac244c72f15 100644
--- a/include/asm-m32r/serial.h
+++ b/include/asm-m32r/serial.h
@@ -3,7 +3,6 @@
 
 /* include/asm-m32r/serial.h */
 
-#include <linux/config.h>
 
 #define BASE_BAUD	115200
 
diff --git a/include/asm-m32r/sigcontext.h b/include/asm-m32r/sigcontext.h
index 942b8a30937d..73025c0c41a1 100644
--- a/include/asm-m32r/sigcontext.h
+++ b/include/asm-m32r/sigcontext.h
@@ -3,7 +3,6 @@
 
 /* $Id$ */
 
-#include <linux/config.h>
 
 struct sigcontext {
 	/* CPU registers */
diff --git a/include/asm-m32r/smp.h b/include/asm-m32r/smp.h
index 1184293e5712..650d2558c304 100644
--- a/include/asm-m32r/smp.h
+++ b/include/asm-m32r/smp.h
@@ -3,7 +3,6 @@
 
 /* $Id$ */
 
-#include <linux/config.h>
 
 #ifdef CONFIG_SMP
 #ifndef __ASSEMBLY__
diff --git a/include/asm-m32r/spinlock.h b/include/asm-m32r/spinlock.h
index 7de7def28da9..f94c1a673569 100644
--- a/include/asm-m32r/spinlock.h
+++ b/include/asm-m32r/spinlock.h
@@ -9,7 +9,6 @@
  *    Copyright (C) 2004  Hirokazu Takata <takata at linux-m32r.org>
  */
 
-#include <linux/config.h>	/* CONFIG_DEBUG_SPINLOCK, CONFIG_SMP */
 #include <linux/compiler.h>
 #include <asm/atomic.h>
 #include <asm/page.h>
diff --git a/include/asm-m32r/system.h b/include/asm-m32r/system.h
index e55013f378e5..33567e8bfe6b 100644
--- a/include/asm-m32r/system.h
+++ b/include/asm-m32r/system.h
@@ -10,7 +10,6 @@
  * Copyright (C) 2004, 2006  Hirokazu Takata <takata at linux-m32r.org>
  */
 
-#include <linux/config.h>
 #include <asm/assembler.h>
 
 #ifdef __KERNEL__
diff --git a/include/asm-m32r/timex.h b/include/asm-m32r/timex.h
index abf12e7ffbf3..e89bfd17db51 100644
--- a/include/asm-m32r/timex.h
+++ b/include/asm-m32r/timex.h
@@ -9,7 +9,6 @@
  * m32r architecture timex specifications
  */
 
-#include <linux/config.h>
 
 #define CLOCK_TICK_RATE	(CONFIG_BUS_CLOCK / CONFIG_TIMER_DIVIDE)
 #define CLOCK_TICK_FACTOR	20	/* Factor of both 1000000 and CLOCK_TICK_RATE */
diff --git a/include/asm-m32r/tlbflush.h b/include/asm-m32r/tlbflush.h
index bc7c407dbd92..ae4494960593 100644
--- a/include/asm-m32r/tlbflush.h
+++ b/include/asm-m32r/tlbflush.h
@@ -1,7 +1,6 @@
 #ifndef _ASM_M32R_TLBFLUSH_H
 #define _ASM_M32R_TLBFLUSH_H
 
-#include <linux/config.h>
 #include <asm/m32r.h>
 
 /*
diff --git a/include/asm-m32r/uaccess.h b/include/asm-m32r/uaccess.h
index 819cc28a94f7..26e978c7e3b4 100644
--- a/include/asm-m32r/uaccess.h
+++ b/include/asm-m32r/uaccess.h
@@ -11,7 +11,6 @@
 /*
  * User space memory access functions
  */
-#include <linux/config.h>
 #include <linux/errno.h>
 #include <linux/thread_info.h>
 #include <asm/page.h>
diff --git a/include/asm-m68k/atomic.h b/include/asm-m68k/atomic.h
index 732d696d31a6..d5eed64cb833 100644
--- a/include/asm-m68k/atomic.h
+++ b/include/asm-m68k/atomic.h
@@ -1,7 +1,6 @@
 #ifndef __ARCH_M68K_ATOMIC__
 #define __ARCH_M68K_ATOMIC__
 
-#include <linux/config.h>
 
 #include <asm/system.h>	/* local_irq_XXX() */
 
diff --git a/include/asm-m68k/bug.h b/include/asm-m68k/bug.h
index 072ce274d537..7b60776cc966 100644
--- a/include/asm-m68k/bug.h
+++ b/include/asm-m68k/bug.h
@@ -1,7 +1,6 @@
 #ifndef _M68K_BUG_H
 #define _M68K_BUG_H
 
-#include <linux/config.h>
 
 #ifdef CONFIG_BUG
 #ifdef CONFIG_DEBUG_BUGVERBOSE
diff --git a/include/asm-m68k/dma-mapping.h b/include/asm-m68k/dma-mapping.h
index b1920c703d82..dffd59cf1364 100644
--- a/include/asm-m68k/dma-mapping.h
+++ b/include/asm-m68k/dma-mapping.h
@@ -1,7 +1,6 @@
 #ifndef _M68K_DMA_MAPPING_H
 #define _M68K_DMA_MAPPING_H
 
-#include <linux/config.h>
 
 #ifdef CONFIG_PCI
 #include <asm-generic/dma-mapping.h>
diff --git a/include/asm-m68k/dma.h b/include/asm-m68k/dma.h
index d5266a886226..d0c9e61e57b4 100644
--- a/include/asm-m68k/dma.h
+++ b/include/asm-m68k/dma.h
@@ -1,7 +1,6 @@
 #ifndef _M68K_DMA_H
 #define _M68K_DMA_H 1
 
-#include <linux/config.h>
 
 /* it's useless on the m68k, but unfortunately needed by the new
    bootmem allocator (but this should do it for this) */
diff --git a/include/asm-m68k/dvma.h b/include/asm-m68k/dvma.h
index 5978f87b0a8a..e1112de5a5e3 100644
--- a/include/asm-m68k/dvma.h
+++ b/include/asm-m68k/dvma.h
@@ -9,7 +9,6 @@
 #ifndef __M68K_DVMA_H
 #define __M68K_DVMA_H
 
-#include <linux/config.h>
 
 #define DVMA_PAGE_SHIFT	13
 #define DVMA_PAGE_SIZE	(1UL << DVMA_PAGE_SHIFT)
diff --git a/include/asm-m68k/elf.h b/include/asm-m68k/elf.h
index 38bf8347f14d..eb63b85f9336 100644
--- a/include/asm-m68k/elf.h
+++ b/include/asm-m68k/elf.h
@@ -5,7 +5,6 @@
  * ELF register definitions..
  */
 
-#include <linux/config.h>
 #include <asm/ptrace.h>
 #include <asm/user.h>
 
diff --git a/include/asm-m68k/entry.h b/include/asm-m68k/entry.h
index 0396495cd97d..f8f6b185d793 100644
--- a/include/asm-m68k/entry.h
+++ b/include/asm-m68k/entry.h
@@ -1,7 +1,6 @@
 #ifndef __M68K_ENTRY_H
 #define __M68K_ENTRY_H
 
-#include <linux/config.h>
 #include <asm/setup.h>
 #include <asm/page.h>
 
diff --git a/include/asm-m68k/fpu.h b/include/asm-m68k/fpu.h
index 3bcf85065c19..59701d7b4e78 100644
--- a/include/asm-m68k/fpu.h
+++ b/include/asm-m68k/fpu.h
@@ -1,7 +1,6 @@
 #ifndef __M68K_FPU_H
 #define __M68K_FPU_H
 
-#include <linux/config.h>
 
 /*
  * MAX floating point unit state size (FSAVE/FRESTORE)
diff --git a/include/asm-m68k/hardirq.h b/include/asm-m68k/hardirq.h
index 5e1c5826c83d..394ee946015c 100644
--- a/include/asm-m68k/hardirq.h
+++ b/include/asm-m68k/hardirq.h
@@ -1,7 +1,6 @@
 #ifndef __M68K_HARDIRQ_H
 #define __M68K_HARDIRQ_H
 
-#include <linux/config.h>
 #include <linux/threads.h>
 #include <linux/cache.h>
 
diff --git a/include/asm-m68k/ide.h b/include/asm-m68k/ide.h
index 36118fd01867..365f76fb8013 100644
--- a/include/asm-m68k/ide.h
+++ b/include/asm-m68k/ide.h
@@ -31,7 +31,6 @@
 
 #ifdef __KERNEL__
 
-#include <linux/config.h>
 
 #include <asm/setup.h>
 #include <asm/io.h>
diff --git a/include/asm-m68k/io.h b/include/asm-m68k/io.h
index dcfaa352d34c..5e0fcf41804d 100644
--- a/include/asm-m68k/io.h
+++ b/include/asm-m68k/io.h
@@ -23,7 +23,6 @@
 
 #ifdef __KERNEL__
 
-#include <linux/config.h>
 #include <linux/compiler.h>
 #include <asm/raw_io.h>
 #include <asm/virtconvert.h>
diff --git a/include/asm-m68k/irq.h b/include/asm-m68k/irq.h
index 9ac047c400c4..b4f48b2a6a57 100644
--- a/include/asm-m68k/irq.h
+++ b/include/asm-m68k/irq.h
@@ -1,7 +1,6 @@
 #ifndef _M68K_IRQ_H_
 #define _M68K_IRQ_H_
 
-#include <linux/config.h>
 #include <linux/interrupt.h>
 
 /*
diff --git a/include/asm-m68k/mc146818rtc.h b/include/asm-m68k/mc146818rtc.h
index 11442095a8cf..11fe12ddb913 100644
--- a/include/asm-m68k/mc146818rtc.h
+++ b/include/asm-m68k/mc146818rtc.h
@@ -4,7 +4,6 @@
 #ifndef _ASM_MC146818RTC_H
 #define _ASM_MC146818RTC_H
 
-#include <linux/config.h>
 
 #ifdef CONFIG_ATARI
 /* RTC in Atari machines */
diff --git a/include/asm-m68k/mmu_context.h b/include/asm-m68k/mmu_context.h
index 661191d15c81..231d11bd8e32 100644
--- a/include/asm-m68k/mmu_context.h
+++ b/include/asm-m68k/mmu_context.h
@@ -1,7 +1,6 @@
 #ifndef __M68K_MMU_CONTEXT_H
 #define __M68K_MMU_CONTEXT_H
 
-#include <linux/config.h>
 
 static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
 {
diff --git a/include/asm-m68k/motorola_pgtable.h b/include/asm-m68k/motorola_pgtable.h
index 1628723458f5..1ccc7338a54b 100644
--- a/include/asm-m68k/motorola_pgtable.h
+++ b/include/asm-m68k/motorola_pgtable.h
@@ -1,7 +1,6 @@
 #ifndef _MOTOROLA_PGTABLE_H
 #define _MOTOROLA_PGTABLE_H
 
-#include <linux/config.h>
 
 /*
  * Definitions for MMU descriptors
diff --git a/include/asm-m68k/openprom.h b/include/asm-m68k/openprom.h
index efbfb0bec6e2..869ab9176e9f 100644
--- a/include/asm-m68k/openprom.h
+++ b/include/asm-m68k/openprom.h
@@ -8,7 +8,6 @@
  * Copyright (C) 1995 David S. Miller (davem@caip.rutgers.edu)
  */
 
-#include <linux/config.h>
 
 /* Empirical constants... */
 #ifdef CONFIG_SUN3
diff --git a/include/asm-m68k/page.h b/include/asm-m68k/page.h
index f206dfbc1d48..db017f838c29 100644
--- a/include/asm-m68k/page.h
+++ b/include/asm-m68k/page.h
@@ -1,7 +1,6 @@
 #ifndef _M68K_PAGE_H
 #define _M68K_PAGE_H
 
-#include <linux/config.h>
 
 /* PAGE_SHIFT determines the page size */
 #ifndef CONFIG_SUN3
diff --git a/include/asm-m68k/page_offset.h b/include/asm-m68k/page_offset.h
index 86d3c2845ad4..1cbdb7f30ac2 100644
--- a/include/asm-m68k/page_offset.h
+++ b/include/asm-m68k/page_offset.h
@@ -1,4 +1,3 @@
-#include <linux/config.h>
 
 /* This handles the memory map.. */
 #ifndef CONFIG_SUN3
diff --git a/include/asm-m68k/pgalloc.h b/include/asm-m68k/pgalloc.h
index b468b7958aaa..a9cfb4b99d88 100644
--- a/include/asm-m68k/pgalloc.h
+++ b/include/asm-m68k/pgalloc.h
@@ -2,7 +2,6 @@
 #ifndef M68K_PGALLOC_H
 #define M68K_PGALLOC_H
 
-#include <linux/config.h>
 #include <linux/mm.h>
 #include <linux/highmem.h>
 #include <asm/setup.h>
diff --git a/include/asm-m68k/pgtable.h b/include/asm-m68k/pgtable.h
index add129e93fd7..f3aa05377987 100644
--- a/include/asm-m68k/pgtable.h
+++ b/include/asm-m68k/pgtable.h
@@ -3,7 +3,6 @@
 
 #include <asm-generic/4level-fixup.h>
 
-#include <linux/config.h>
 #include <asm/setup.h>
 
 #ifndef __ASSEMBLY__
diff --git a/include/asm-m68k/processor.h b/include/asm-m68k/processor.h
index 7982285e84ed..352799e71f08 100644
--- a/include/asm-m68k/processor.h
+++ b/include/asm-m68k/processor.h
@@ -13,7 +13,6 @@
  */
 #define current_text_addr() ({ __label__ _l; _l: &&_l;})
 
-#include <linux/config.h>
 #include <linux/thread_info.h>
 #include <asm/segment.h>
 #include <asm/fpu.h>
diff --git a/include/asm-m68k/semaphore-helper.h b/include/asm-m68k/semaphore-helper.h
index 1516a642f9a5..eef30ba0b499 100644
--- a/include/asm-m68k/semaphore-helper.h
+++ b/include/asm-m68k/semaphore-helper.h
@@ -9,7 +9,6 @@
  * m68k version by Andreas Schwab
  */
 
-#include <linux/config.h>
 #include <linux/errno.h>
 
 /*
diff --git a/include/asm-m68k/serial.h b/include/asm-m68k/serial.h
index 3fe29f8b0194..2b90d6e69070 100644
--- a/include/asm-m68k/serial.h
+++ b/include/asm-m68k/serial.h
@@ -6,7 +6,6 @@
  *
  */
 
-#include <linux/config.h>
 
 /*
  * This assumes you have a 1.8432 MHz clock for your UART.
diff --git a/include/asm-m68k/setup.h b/include/asm-m68k/setup.h
index a89aa84073e5..7facc9a46e74 100644
--- a/include/asm-m68k/setup.h
+++ b/include/asm-m68k/setup.h
@@ -23,7 +23,6 @@
 #ifndef _M68K_SETUP_H
 #define _M68K_SETUP_H
 
-#include <linux/config.h>
 
 
     /*
diff --git a/include/asm-m68k/shm.h b/include/asm-m68k/shm.h
index 3fa2f368fc1a..fa56ec84a126 100644
--- a/include/asm-m68k/shm.h
+++ b/include/asm-m68k/shm.h
@@ -1,7 +1,6 @@
 #ifndef _M68K_SHM_H
 #define _M68K_SHM_H
 
-#include <linux/config.h>
 
 /* format of page table entries that correspond to shared memory pages
    currently out in swap space (see also mm/swap.c):
diff --git a/include/asm-m68k/system.h b/include/asm-m68k/system.h
index 64d3481df74c..d6dd8052cd6f 100644
--- a/include/asm-m68k/system.h
+++ b/include/asm-m68k/system.h
@@ -1,7 +1,6 @@
 #ifndef _M68K_SYSTEM_H
 #define _M68K_SYSTEM_H
 
-#include <linux/config.h> /* get configuration macros */
 #include <linux/linkage.h>
 #include <linux/kernel.h>
 #include <asm/segment.h>
diff --git a/include/asm-m68k/tlbflush.h b/include/asm-m68k/tlbflush.h
index 8e61ccffe13a..31678831ee47 100644
--- a/include/asm-m68k/tlbflush.h
+++ b/include/asm-m68k/tlbflush.h
@@ -1,7 +1,6 @@
 #ifndef _M68K_TLBFLUSH_H
 #define _M68K_TLBFLUSH_H
 
-#include <linux/config.h>
 
 #ifndef CONFIG_SUN3
 
diff --git a/include/asm-m68k/virtconvert.h b/include/asm-m68k/virtconvert.h
index 8c4e8037b898..83a87c9b1a16 100644
--- a/include/asm-m68k/virtconvert.h
+++ b/include/asm-m68k/virtconvert.h
@@ -7,7 +7,6 @@
 
 #ifdef __KERNEL__
 
-#include <linux/config.h>
 #include <linux/compiler.h>
 #include <asm/setup.h>
 #include <asm/page.h>
diff --git a/include/asm-m68knommu/bitops.h b/include/asm-m68knommu/bitops.h
index 0b68ccd327f7..d7fa7d9c0e0f 100644
--- a/include/asm-m68knommu/bitops.h
+++ b/include/asm-m68knommu/bitops.h
@@ -5,7 +5,6 @@
  * Copyright 1992, Linus Torvalds.
  */
 
-#include <linux/config.h>
 #include <linux/compiler.h>
 #include <asm/byteorder.h>	/* swab32 */
 #include <asm/system.h>		/* save_flags */
diff --git a/include/asm-m68knommu/coldfire.h b/include/asm-m68knommu/coldfire.h
index 6190f77b1e6c..2fabca91df83 100644
--- a/include/asm-m68knommu/coldfire.h
+++ b/include/asm-m68knommu/coldfire.h
@@ -12,7 +12,6 @@
 #define	coldfire_h
 /****************************************************************************/
 
-#include <linux/config.h>
 
 /*
  *	Define the processor support peripherals base address.
diff --git a/include/asm-m68knommu/commproc.h b/include/asm-m68knommu/commproc.h
index e522ca8193a2..0161ebb5d883 100644
--- a/include/asm-m68knommu/commproc.h
+++ b/include/asm-m68knommu/commproc.h
@@ -17,7 +17,6 @@
 #ifndef __CPM_360__
 #define __CPM_360__
 
-#include <linux/config.h>
 
 /* CPM Command register masks: */
 #define CPM_CR_RST	((ushort)0x8000)
diff --git a/include/asm-m68knommu/dma-mapping.h b/include/asm-m68knommu/dma-mapping.h
index a6c42ba48da6..5622b855a577 100644
--- a/include/asm-m68knommu/dma-mapping.h
+++ b/include/asm-m68knommu/dma-mapping.h
@@ -1,7 +1,6 @@
 #ifndef _M68KNOMMU_DMA_MAPPING_H
 #define _M68KNOMMU_DMA_MAPPING_H
 
-#include <linux/config.h>
 
 #ifdef CONFIG_PCI
 #include <asm-generic/dma-mapping.h>
diff --git a/include/asm-m68knommu/dma.h b/include/asm-m68knommu/dma.h
index 43e98c96a5c2..3338001abb40 100644
--- a/include/asm-m68knommu/dma.h
+++ b/include/asm-m68knommu/dma.h
@@ -3,7 +3,6 @@
  
 //#define	DMA_DEBUG	1
 
-#include <linux/config.h>
 
 #ifdef CONFIG_COLDFIRE
 /*
diff --git a/include/asm-m68knommu/elf.h b/include/asm-m68knommu/elf.h
index 9919487703bc..40b1ed6827db 100644
--- a/include/asm-m68knommu/elf.h
+++ b/include/asm-m68knommu/elf.h
@@ -5,7 +5,6 @@
  * ELF register definitions..
  */
 
-#include <linux/config.h>
 #include <asm/ptrace.h>
 #include <asm/user.h>
 
diff --git a/include/asm-m68knommu/elia.h b/include/asm-m68knommu/elia.h
index f18b8e9d8c36..e037d4e2de33 100644
--- a/include/asm-m68knommu/elia.h
+++ b/include/asm-m68knommu/elia.h
@@ -12,7 +12,6 @@
 #define	elia_h
 /****************************************************************************/
 
-#include <linux/config.h>
 #include <asm/coldfire.h>
 
 #ifdef CONFIG_eLIA
diff --git a/include/asm-m68knommu/entry.h b/include/asm-m68knommu/entry.h
index 06f5aa70b0b5..c2553d26273d 100644
--- a/include/asm-m68knommu/entry.h
+++ b/include/asm-m68knommu/entry.h
@@ -1,7 +1,6 @@
 #ifndef __M68KNOMMU_ENTRY_H
 #define __M68KNOMMU_ENTRY_H
 
-#include <linux/config.h>
 #include <asm/setup.h>
 #include <asm/page.h>
 
diff --git a/include/asm-m68knommu/fpu.h b/include/asm-m68knommu/fpu.h
index 225082991a03..b16b2e4fca2a 100644
--- a/include/asm-m68knommu/fpu.h
+++ b/include/asm-m68knommu/fpu.h
@@ -1,7 +1,6 @@
 #ifndef __M68KNOMMU_FPU_H
 #define __M68KNOMMU_FPU_H
 
-#include <linux/config.h>
 
 /*
  * MAX floating point unit state size (FSAVE/FRESTORE)
diff --git a/include/asm-m68knommu/hardirq.h b/include/asm-m68knommu/hardirq.h
index 476180f4cba2..980075bab792 100644
--- a/include/asm-m68knommu/hardirq.h
+++ b/include/asm-m68knommu/hardirq.h
@@ -1,7 +1,6 @@
 #ifndef __M68K_HARDIRQ_H
 #define __M68K_HARDIRQ_H
 
-#include <linux/config.h>
 #include <linux/cache.h>
 #include <linux/threads.h>
 #include <asm/irq.h>
diff --git a/include/asm-m68knommu/io.h b/include/asm-m68knommu/io.h
index e08f2ee4b4a2..8df4cee2a0cd 100644
--- a/include/asm-m68knommu/io.h
+++ b/include/asm-m68knommu/io.h
@@ -3,7 +3,6 @@
 
 #ifdef __KERNEL__
 
-#include <linux/config.h>
 
 /*
  * These are for ISA/PCI shared memory _only_ and should never be used
diff --git a/include/asm-m68knommu/irq.h b/include/asm-m68knommu/irq.h
index 20c48ec858a4..2b408842a30e 100644
--- a/include/asm-m68knommu/irq.h
+++ b/include/asm-m68knommu/irq.h
@@ -1,7 +1,6 @@
 #ifndef _M68K_IRQ_H_
 #define _M68K_IRQ_H_
 
-#include <linux/config.h>
 #include <asm/ptrace.h>
 
 #ifdef CONFIG_COLDFIRE
diff --git a/include/asm-m68knommu/m5206sim.h b/include/asm-m68knommu/m5206sim.h
index d1e7509021c5..7e3594dea88b 100644
--- a/include/asm-m68knommu/m5206sim.h
+++ b/include/asm-m68knommu/m5206sim.h
@@ -12,7 +12,6 @@
 #define	m5206sim_h
 /****************************************************************************/
 
-#include <linux/config.h>
 
 /*
  *	Define the 5206 SIM register set addresses.
diff --git a/include/asm-m68knommu/m520xsim.h b/include/asm-m68knommu/m520xsim.h
index 6dc62869e62b..1dac22ea95ba 100644
--- a/include/asm-m68knommu/m520xsim.h
+++ b/include/asm-m68knommu/m520xsim.h
@@ -11,7 +11,6 @@
 #define m520xsim_h
 /****************************************************************************/
 
-#include <linux/config.h>
 
 /*
  *  Define the 5282 SIM register set addresses.
diff --git a/include/asm-m68knommu/m523xsim.h b/include/asm-m68knommu/m523xsim.h
index 926cfb805df7..bf397313e93f 100644
--- a/include/asm-m68knommu/m523xsim.h
+++ b/include/asm-m68knommu/m523xsim.h
@@ -11,7 +11,6 @@
 #define	m523xsim_h
 /****************************************************************************/
 
-#include <linux/config.h>
 
 /*
  *	Define the 523x SIM register set addresses.
diff --git a/include/asm-m68knommu/m5272sim.h b/include/asm-m68knommu/m5272sim.h
index b40875362f46..6217edc21139 100644
--- a/include/asm-m68knommu/m5272sim.h
+++ b/include/asm-m68knommu/m5272sim.h
@@ -12,7 +12,6 @@
 #define	m5272sim_h
 /****************************************************************************/
 
-#include <linux/config.h>
 
 /*
  *	Define the 5272 SIM register set addresses.
diff --git a/include/asm-m68knommu/m527xsim.h b/include/asm-m68knommu/m527xsim.h
index e7878d0f7d7a..1f63ab3fb3e6 100644
--- a/include/asm-m68knommu/m527xsim.h
+++ b/include/asm-m68knommu/m527xsim.h
@@ -11,7 +11,6 @@
 #define	m527xsim_h
 /****************************************************************************/
 
-#include <linux/config.h>
 
 /*
  *	Define the 5270/5271 SIM register set addresses.
diff --git a/include/asm-m68knommu/m528xsim.h b/include/asm-m68knommu/m528xsim.h
index 610774a17f70..1a3b1ae06b1e 100644
--- a/include/asm-m68knommu/m528xsim.h
+++ b/include/asm-m68knommu/m528xsim.h
@@ -11,7 +11,6 @@
 #define	m528xsim_h
 /****************************************************************************/
 
-#include <linux/config.h>
 
 /*
  *	Define the 5280/5282 SIM register set addresses.
diff --git a/include/asm-m68knommu/mcfcache.h b/include/asm-m68knommu/mcfcache.h
index 9cb401421835..45d1ac57ea82 100644
--- a/include/asm-m68knommu/mcfcache.h
+++ b/include/asm-m68knommu/mcfcache.h
@@ -11,7 +11,6 @@
 #define	__M68KNOMMU_MCFCACHE_H
 /****************************************************************************/
 
-#include <linux/config.h>
 
 /*
  *	The different ColdFire families have different cache arrangments.
diff --git a/include/asm-m68knommu/mcfdma.h b/include/asm-m68knommu/mcfdma.h
index b93f8ba8a248..ea729e81a6be 100644
--- a/include/asm-m68knommu/mcfdma.h
+++ b/include/asm-m68knommu/mcfdma.h
@@ -11,7 +11,6 @@
 #define	mcfdma_h
 /****************************************************************************/
 
-#include <linux/config.h>
 
 /*
  *	Get address specific defines for this Coldfire member.
diff --git a/include/asm-m68knommu/mcfmbus.h b/include/asm-m68knommu/mcfmbus.h
index 4762589e858a..13df9d41bd1a 100644
--- a/include/asm-m68knommu/mcfmbus.h
+++ b/include/asm-m68knommu/mcfmbus.h
@@ -11,7 +11,6 @@
 
 #ifndef mcfmbus_h
 #define mcfmbus_h
-#include <linux/config.h>
 
 
 #define MCFMBUS_BASE		0x280
diff --git a/include/asm-m68knommu/mcfne.h b/include/asm-m68knommu/mcfne.h
index a71b1c8cb4f8..c920ccdb61fe 100644
--- a/include/asm-m68knommu/mcfne.h
+++ b/include/asm-m68knommu/mcfne.h
@@ -18,7 +18,6 @@
 #define	mcfne_h
 /****************************************************************************/
 
-#include <linux/config.h>
 
 /*
  *	Support for NE2000 clones devices in ColdFire based boards.
diff --git a/include/asm-m68knommu/mcfpci.h b/include/asm-m68knommu/mcfpci.h
index d6229047d06e..f1507dd06ec6 100644
--- a/include/asm-m68knommu/mcfpci.h
+++ b/include/asm-m68knommu/mcfpci.h
@@ -12,7 +12,6 @@
 #define	mcfpci_h
 /****************************************************************************/
 
-#include <linux/config.h>
 
 #ifdef CONFIG_PCI
 
diff --git a/include/asm-m68knommu/mcfpit.h b/include/asm-m68knommu/mcfpit.h
index a685f1b45401..0d2672dd518a 100644
--- a/include/asm-m68knommu/mcfpit.h
+++ b/include/asm-m68knommu/mcfpit.h
@@ -11,7 +11,6 @@
 #define	mcfpit_h
 /****************************************************************************/
 
-#include <linux/config.h>
 
 /*
  *	Get address specific defines for the 5270/5271, 5280/5282, and 5208.
diff --git a/include/asm-m68knommu/mcfsim.h b/include/asm-m68knommu/mcfsim.h
index 81d74a31dc43..97a0c2734a72 100644
--- a/include/asm-m68knommu/mcfsim.h
+++ b/include/asm-m68knommu/mcfsim.h
@@ -12,7 +12,6 @@
 #define	mcfsim_h
 /****************************************************************************/
 
-#include <linux/config.h>
 
 /*
  *	Include 5204, 5206/e, 5235, 5249, 5270/5271, 5272, 5280/5282,
diff --git a/include/asm-m68knommu/mcfsmc.h b/include/asm-m68knommu/mcfsmc.h
index 2583900b9591..2d7a4dbd9683 100644
--- a/include/asm-m68knommu/mcfsmc.h
+++ b/include/asm-m68knommu/mcfsmc.h
@@ -17,7 +17,6 @@
  *	allow 8 bit accesses. So this code is 16bit access only.
  */
 
-#include <linux/config.h>
 
 #undef	outb
 #undef	inb
diff --git a/include/asm-m68knommu/mcftimer.h b/include/asm-m68knommu/mcftimer.h
index 0f47164c33a9..68bf33ac10d1 100644
--- a/include/asm-m68knommu/mcftimer.h
+++ b/include/asm-m68knommu/mcftimer.h
@@ -12,7 +12,6 @@
 #define	mcftimer_h
 /****************************************************************************/
 
-#include <linux/config.h>
 
 /*
  *	Get address specific defines for this ColdFire member.
diff --git a/include/asm-m68knommu/mcfuart.h b/include/asm-m68knommu/mcfuart.h
index b016fad83119..8040e43786be 100644
--- a/include/asm-m68knommu/mcfuart.h
+++ b/include/asm-m68knommu/mcfuart.h
@@ -12,7 +12,6 @@
 #define	mcfuart_h
 /****************************************************************************/
 
-#include <linux/config.h>
 
 /*
  *	Define the base address of the UARTS within the MBAR address
diff --git a/include/asm-m68knommu/mcfwdebug.h b/include/asm-m68knommu/mcfwdebug.h
index 6ceae103596b..27f70e45d700 100644
--- a/include/asm-m68knommu/mcfwdebug.h
+++ b/include/asm-m68knommu/mcfwdebug.h
@@ -10,7 +10,6 @@
 #ifndef mcfdebug_h
 #define mcfdebug_h
 /****************************************************************************/
-#include <linux/config.h>
 
 /* Define the debug module registers */
 #define MCFDEBUG_CSR	0x0			/* Configuration status		*/
diff --git a/include/asm-m68knommu/mmu_context.h b/include/asm-m68knommu/mmu_context.h
index 1e080eca9ca8..6c077d3a2572 100644
--- a/include/asm-m68knommu/mmu_context.h
+++ b/include/asm-m68knommu/mmu_context.h
@@ -1,7 +1,6 @@
 #ifndef __M68KNOMMU_MMU_CONTEXT_H
 #define __M68KNOMMU_MMU_CONTEXT_H
 
-#include <linux/config.h>
 #include <asm/setup.h>
 #include <asm/page.h>
 #include <asm/pgalloc.h>
diff --git a/include/asm-m68knommu/nettel.h b/include/asm-m68knommu/nettel.h
index 9bda307e6544..0299f6a2deeb 100644
--- a/include/asm-m68knommu/nettel.h
+++ b/include/asm-m68knommu/nettel.h
@@ -13,7 +13,6 @@
 #define	nettel_h
 /****************************************************************************/
 
-#include <linux/config.h>
 
 /****************************************************************************/
 #ifdef CONFIG_NETtel
diff --git a/include/asm-m68knommu/page.h b/include/asm-m68knommu/page.h
index 942dfbead27f..a22bf5a88160 100644
--- a/include/asm-m68knommu/page.h
+++ b/include/asm-m68knommu/page.h
@@ -1,7 +1,6 @@
 #ifndef _M68KNOMMU_PAGE_H
 #define _M68KNOMMU_PAGE_H
 
-#include <linux/config.h>
 
 /* PAGE_SHIFT determines the page size */
 
diff --git a/include/asm-m68knommu/page_offset.h b/include/asm-m68knommu/page_offset.h
index 2b45645e9b29..8ed6d7b7d9d1 100644
--- a/include/asm-m68knommu/page_offset.h
+++ b/include/asm-m68knommu/page_offset.h
@@ -1,5 +1,4 @@
 
-#include <linux/config.h>
 
 /* This handles the memory map.. */
 
diff --git a/include/asm-m68knommu/param.h b/include/asm-m68knommu/param.h
index 3f57d5db81f5..4c9904d6512e 100644
--- a/include/asm-m68knommu/param.h
+++ b/include/asm-m68knommu/param.h
@@ -1,7 +1,6 @@
 #ifndef _M68KNOMMU_PARAM_H
 #define _M68KNOMMU_PARAM_H
 
-#include <linux/config.h>
 
 #if defined(CONFIG_CLEOPATRA)
 #define	HZ 1000
diff --git a/include/asm-m68knommu/pgtable.h b/include/asm-m68knommu/pgtable.h
index 00893055e6c2..549ad231efad 100644
--- a/include/asm-m68knommu/pgtable.h
+++ b/include/asm-m68knommu/pgtable.h
@@ -7,7 +7,6 @@
  * (C) Copyright 2000-2002, Greg Ungerer <gerg@snapgear.com>
  */
 
-#include <linux/config.h>
 #include <linux/slab.h>
 #include <asm/processor.h>
 #include <asm/page.h>
diff --git a/include/asm-m68knommu/processor.h b/include/asm-m68knommu/processor.h
index ba393b1a023b..278b00bc60c5 100644
--- a/include/asm-m68knommu/processor.h
+++ b/include/asm-m68knommu/processor.h
@@ -13,7 +13,6 @@
  */
 #define current_text_addr() ({ __label__ _l; _l: &&_l;})
 
-#include <linux/config.h>
 #include <linux/threads.h>
 #include <asm/types.h>
 #include <asm/segment.h>
diff --git a/include/asm-m68knommu/semaphore-helper.h b/include/asm-m68knommu/semaphore-helper.h
index a6586417c1c2..43da7bc483c7 100644
--- a/include/asm-m68knommu/semaphore-helper.h
+++ b/include/asm-m68knommu/semaphore-helper.h
@@ -9,7 +9,6 @@
  * m68k version by Andreas Schwab
  */
 
-#include <linux/config.h>
 
 /*
  * These two _must_ execute atomically wrt each other.
diff --git a/include/asm-m68knommu/system.h b/include/asm-m68knommu/system.h
index 6338afc850ba..2bbe2db00a22 100644
--- a/include/asm-m68knommu/system.h
+++ b/include/asm-m68knommu/system.h
@@ -1,7 +1,6 @@
 #ifndef _M68KNOMMU_SYSTEM_H
 #define _M68KNOMMU_SYSTEM_H
 
-#include <linux/config.h> /* get configuration macros */
 #include <linux/linkage.h>
 #include <asm/segment.h>
 #include <asm/entry.h>
diff --git a/include/asm-m68knommu/unaligned.h b/include/asm-m68knommu/unaligned.h
index 8876f034ea64..869e9dd24f54 100644
--- a/include/asm-m68knommu/unaligned.h
+++ b/include/asm-m68knommu/unaligned.h
@@ -1,7 +1,6 @@
 #ifndef __M68K_UNALIGNED_H
 #define __M68K_UNALIGNED_H
 
-#include <linux/config.h>
 
 #ifdef CONFIG_COLDFIRE
 
diff --git a/include/asm-mips/a.out.h b/include/asm-mips/a.out.h
index 2b3dc3bed4da..ef33c3f13484 100644
--- a/include/asm-mips/a.out.h
+++ b/include/asm-mips/a.out.h
@@ -10,7 +10,6 @@
 
 #ifdef __KERNEL__
 
-#include <linux/config.h>
 
 #endif
 
diff --git a/include/asm-mips/addrspace.h b/include/asm-mips/addrspace.h
index 42520cc84b0f..a7d0d26e93c9 100644
--- a/include/asm-mips/addrspace.h
+++ b/include/asm-mips/addrspace.h
@@ -10,7 +10,6 @@
 #ifndef _ASM_ADDRSPACE_H
 #define _ASM_ADDRSPACE_H
 
-#include <linux/config.h>
 #include <spaces.h>
 
 /*
diff --git a/include/asm-mips/arc/types.h b/include/asm-mips/arc/types.h
index bbb725c366fb..b9adcd6f0860 100644
--- a/include/asm-mips/arc/types.h
+++ b/include/asm-mips/arc/types.h
@@ -9,7 +9,6 @@
 #ifndef _ASM_ARC_TYPES_H
 #define _ASM_ARC_TYPES_H
 
-#include <linux/config.h>
 
 #ifdef CONFIG_ARC32
 
diff --git a/include/asm-mips/asm.h b/include/asm-mips/asm.h
index 4b090f3142e0..e3038a4599ee 100644
--- a/include/asm-mips/asm.h
+++ b/include/asm-mips/asm.h
@@ -17,7 +17,6 @@
 #ifndef __ASM_ASM_H
 #define __ASM_ASM_H
 
-#include <linux/config.h>
 #include <asm/sgidefs.h>
 
 #ifndef CAT
diff --git a/include/asm-mips/asmmacro.h b/include/asm-mips/asmmacro.h
index f54aa147ec19..2c42f6b00a49 100644
--- a/include/asm-mips/asmmacro.h
+++ b/include/asm-mips/asmmacro.h
@@ -8,7 +8,6 @@
 #ifndef _ASM_ASMMACRO_H
 #define _ASM_ASMMACRO_H
 
-#include <linux/config.h>
 #include <asm/hazards.h>
 
 #ifdef CONFIG_32BIT
diff --git a/include/asm-mips/atomic.h b/include/asm-mips/atomic.h
index 2c8b853376c9..13d44e14025a 100644
--- a/include/asm-mips/atomic.h
+++ b/include/asm-mips/atomic.h
@@ -17,7 +17,6 @@
  * <linux/spinlock.h> we have to include <linux/spinlock.h> outside the
  * main big wrapper ...
  */
-#include <linux/config.h>
 #include <linux/spinlock.h>
 
 #ifndef _ASM_ATOMIC_H
diff --git a/include/asm-mips/bcache.h b/include/asm-mips/bcache.h
index 446102b34f4e..3646a3f2ed38 100644
--- a/include/asm-mips/bcache.h
+++ b/include/asm-mips/bcache.h
@@ -9,7 +9,6 @@
 #ifndef _ASM_BCACHE_H
 #define _ASM_BCACHE_H
 
-#include <linux/config.h>
 
 /* Some R4000 / R4400 / R4600 / R5000 machines may have a non-dma-coherent,
    chipset implemented caches.  On machines with other CPUs the CPU does the
diff --git a/include/asm-mips/bitops.h b/include/asm-mips/bitops.h
index a1728f8c0705..0e71df31f81c 100644
--- a/include/asm-mips/bitops.h
+++ b/include/asm-mips/bitops.h
@@ -9,7 +9,6 @@
 #ifndef _ASM_BITOPS_H
 #define _ASM_BITOPS_H
 
-#include <linux/config.h>
 #include <linux/compiler.h>
 #include <linux/types.h>
 #include <asm/bug.h>
diff --git a/include/asm-mips/bug.h b/include/asm-mips/bug.h
index 87d49a5bdc63..7b4739dc8f3f 100644
--- a/include/asm-mips/bug.h
+++ b/include/asm-mips/bug.h
@@ -1,7 +1,6 @@
 #ifndef __ASM_BUG_H
 #define __ASM_BUG_H
 
-#include <linux/config.h>
 
 #ifdef CONFIG_BUG
 
diff --git a/include/asm-mips/bugs.h b/include/asm-mips/bugs.h
index cb2ea7c15c7a..0d7f9c1f5546 100644
--- a/include/asm-mips/bugs.h
+++ b/include/asm-mips/bugs.h
@@ -7,7 +7,6 @@
 #ifndef _ASM_BUGS_H
 #define _ASM_BUGS_H
 
-#include <linux/config.h>
 #include <linux/delay.h>
 #include <asm/cpu.h>
 #include <asm/cpu-info.h>
diff --git a/include/asm-mips/byteorder.h b/include/asm-mips/byteorder.h
index aefc02f16fd8..eee83cbdf2b0 100644
--- a/include/asm-mips/byteorder.h
+++ b/include/asm-mips/byteorder.h
@@ -8,7 +8,6 @@
 #ifndef _ASM_BYTEORDER_H
 #define _ASM_BYTEORDER_H
 
-#include <linux/config.h>
 #include <linux/compiler.h>
 #include <asm/types.h>
 
diff --git a/include/asm-mips/cache.h b/include/asm-mips/cache.h
index 55e19f2ff0e0..37f175c42bb5 100644
--- a/include/asm-mips/cache.h
+++ b/include/asm-mips/cache.h
@@ -9,7 +9,6 @@
 #ifndef _ASM_CACHE_H
 #define _ASM_CACHE_H
 
-#include <linux/config.h>
 #include <kmalloc.h>
 
 #define L1_CACHE_SHIFT		CONFIG_MIPS_L1_CACHE_SHIFT
diff --git a/include/asm-mips/checksum.h b/include/asm-mips/checksum.h
index b09f8971e95d..a5e6050ec0f3 100644
--- a/include/asm-mips/checksum.h
+++ b/include/asm-mips/checksum.h
@@ -11,7 +11,6 @@
 #ifndef _ASM_CHECKSUM_H
 #define _ASM_CHECKSUM_H
 
-#include <linux/config.h>
 #include <linux/in6.h>
 
 #include <asm/uaccess.h>
diff --git a/include/asm-mips/cpu-features.h b/include/asm-mips/cpu-features.h
index 254e11ed247b..881ce1f9803d 100644
--- a/include/asm-mips/cpu-features.h
+++ b/include/asm-mips/cpu-features.h
@@ -9,7 +9,6 @@
 #ifndef __ASM_CPU_FEATURES_H
 #define __ASM_CPU_FEATURES_H
 
-#include <linux/config.h>
 
 #include <asm/cpu.h>
 #include <asm/cpu-info.h>
diff --git a/include/asm-mips/cpu-info.h b/include/asm-mips/cpu-info.h
index 6572ac703662..a2f0c8ea9160 100644
--- a/include/asm-mips/cpu-info.h
+++ b/include/asm-mips/cpu-info.h
@@ -12,7 +12,6 @@
 #ifndef __ASM_CPU_INFO_H
 #define __ASM_CPU_INFO_H
 
-#include <linux/config.h>
 #include <asm/cache.h>
 
 #ifdef CONFIG_SGI_IP27
diff --git a/include/asm-mips/ddb5xxx/ddb5477.h b/include/asm-mips/ddb5xxx/ddb5477.h
index a438548e6ef3..c5af4b73fdd7 100644
--- a/include/asm-mips/ddb5xxx/ddb5477.h
+++ b/include/asm-mips/ddb5xxx/ddb5477.h
@@ -17,7 +17,6 @@
 #ifndef __ASM_DDB5XXX_DDB5477_H
 #define __ASM_DDB5XXX_DDB5477_H
 
-#include <linux/config.h>
 
 /*
  * This contains macros that are specific to DDB5477 or renamed from
diff --git a/include/asm-mips/ddb5xxx/ddb5xxx.h b/include/asm-mips/ddb5xxx/ddb5xxx.h
index 873c03f2c5fe..42c274871625 100644
--- a/include/asm-mips/ddb5xxx/ddb5xxx.h
+++ b/include/asm-mips/ddb5xxx/ddb5xxx.h
@@ -18,7 +18,6 @@
 #ifndef __ASM_DDB5XXX_DDB5XXX_H
 #define __ASM_DDB5XXX_DDB5XXX_H
 
-#include <linux/config.h>
 #include <linux/types.h>
 
 /*
diff --git a/include/asm-mips/debug.h b/include/asm-mips/debug.h
index 930f2b75e766..1fd5a2b39445 100644
--- a/include/asm-mips/debug.h
+++ b/include/asm-mips/debug.h
@@ -15,7 +15,6 @@
 #ifndef _ASM_DEBUG_H
 #define _ASM_DEBUG_H
 
-#include <linux/config.h>
 
 /*
  * run-time macros for catching spurious errors.  Eable CONFIG_RUNTIME_DEBUG in
diff --git a/include/asm-mips/dec/prom.h b/include/asm-mips/dec/prom.h
index 1384dd0964b9..b9c8203688d5 100644
--- a/include/asm-mips/dec/prom.h
+++ b/include/asm-mips/dec/prom.h
@@ -15,7 +15,6 @@
 #ifndef _ASM_DEC_PROM_H
 #define _ASM_DEC_PROM_H
 
-#include <linux/config.h>
 #include <linux/types.h>
 
 #include <asm/addrspace.h>
diff --git a/include/asm-mips/delay.h b/include/asm-mips/delay.h
index 64dd45150f64..b2c9ed47508d 100644
--- a/include/asm-mips/delay.h
+++ b/include/asm-mips/delay.h
@@ -10,7 +10,6 @@
 #ifndef _ASM_DELAY_H
 #define _ASM_DELAY_H
 
-#include <linux/config.h>
 #include <linux/param.h>
 #include <linux/smp.h>
 #include <asm/compiler.h>
diff --git a/include/asm-mips/dma.h b/include/asm-mips/dma.h
index 6aaf9939a716..e85849ac165f 100644
--- a/include/asm-mips/dma.h
+++ b/include/asm-mips/dma.h
@@ -12,7 +12,6 @@
 #ifndef _ASM_DMA_H
 #define _ASM_DMA_H
 
-#include <linux/config.h>
 #include <asm/io.h>			/* need byte IO */
 #include <linux/spinlock.h>		/* And spinlocks */
 #include <linux/delay.h>
diff --git a/include/asm-mips/elf.h b/include/asm-mips/elf.h
index bdc9de2df1ef..ebd6bfb19d66 100644
--- a/include/asm-mips/elf.h
+++ b/include/asm-mips/elf.h
@@ -8,7 +8,6 @@
 #ifndef _ASM_ELF_H
 #define _ASM_ELF_H
 
-#include <linux/config.h>
 
 /* ELF header e_flags defines. */
 /* MIPS architecture level. */
diff --git a/include/asm-mips/fcntl.h b/include/asm-mips/fcntl.h
index 43d047a9a6af..787220e6c1fc 100644
--- a/include/asm-mips/fcntl.h
+++ b/include/asm-mips/fcntl.h
@@ -8,7 +8,6 @@
 #ifndef _ASM_FCNTL_H
 #define _ASM_FCNTL_H
 
-#include <linux/config.h>
 
 #define O_APPEND	0x0008
 #define O_SYNC		0x0010
diff --git a/include/asm-mips/fixmap.h b/include/asm-mips/fixmap.h
index 73a3028dd9f9..1cadefbbc037 100644
--- a/include/asm-mips/fixmap.h
+++ b/include/asm-mips/fixmap.h
@@ -13,7 +13,6 @@
 #ifndef _ASM_FIXMAP_H
 #define _ASM_FIXMAP_H
 
-#include <linux/config.h>
 #include <asm/page.h>
 #ifdef CONFIG_HIGHMEM
 #include <linux/threads.h>
diff --git a/include/asm-mips/fpu.h b/include/asm-mips/fpu.h
index b0f50015e252..199e768ff73a 100644
--- a/include/asm-mips/fpu.h
+++ b/include/asm-mips/fpu.h
@@ -10,7 +10,6 @@
 #ifndef _ASM_FPU_H
 #define _ASM_FPU_H
 
-#include <linux/config.h>
 #include <linux/sched.h>
 #include <linux/thread_info.h>
 
diff --git a/include/asm-mips/futex.h b/include/asm-mips/futex.h
index a554089991f2..d71d878990df 100644
--- a/include/asm-mips/futex.h
+++ b/include/asm-mips/futex.h
@@ -3,7 +3,6 @@
 
 #ifdef __KERNEL__
 
-#include <linux/config.h>
 #include <linux/futex.h>
 #include <asm/errno.h>
 #include <asm/uaccess.h>
diff --git a/include/asm-mips/hazards.h b/include/asm-mips/hazards.h
index dadc05188db7..66943c451c1d 100644
--- a/include/asm-mips/hazards.h
+++ b/include/asm-mips/hazards.h
@@ -10,7 +10,6 @@
 #ifndef _ASM_HAZARDS_H
 #define _ASM_HAZARDS_H
 
-#include <linux/config.h>
 
 #ifdef __ASSEMBLY__
 
diff --git a/include/asm-mips/highmem.h b/include/asm-mips/highmem.h
index 8cf598402492..c976bfaaba83 100644
--- a/include/asm-mips/highmem.h
+++ b/include/asm-mips/highmem.h
@@ -19,7 +19,6 @@
 
 #ifdef __KERNEL__
 
-#include <linux/config.h>
 #include <linux/init.h>
 #include <linux/interrupt.h>
 #include <asm/kmap_types.h>
diff --git a/include/asm-mips/interrupt.h b/include/asm-mips/interrupt.h
index 4bb9c06f4410..a99d6867510f 100644
--- a/include/asm-mips/interrupt.h
+++ b/include/asm-mips/interrupt.h
@@ -11,7 +11,6 @@
 #ifndef _ASM_INTERRUPT_H
 #define _ASM_INTERRUPT_H
 
-#include <linux/config.h>
 #include <asm/hazards.h>
 
 __asm__ (
diff --git a/include/asm-mips/io.h b/include/asm-mips/io.h
index 6b17eb9d79a5..df624e1ee6e2 100644
--- a/include/asm-mips/io.h
+++ b/include/asm-mips/io.h
@@ -12,7 +12,6 @@
 #ifndef _ASM_IO_H
 #define _ASM_IO_H
 
-#include <linux/config.h>
 #include <linux/compiler.h>
 #include <linux/kernel.h>
 #include <linux/types.h>
diff --git a/include/asm-mips/ip32/machine.h b/include/asm-mips/ip32/machine.h
index e440fdf4b232..1b631b8da6f8 100644
--- a/include/asm-mips/ip32/machine.h
+++ b/include/asm-mips/ip32/machine.h
@@ -10,7 +10,6 @@
 #ifndef _ASM_IP32_MACHINE_H
 #define _ASM_IP32_MACHINE_H
 
-#include <linux/config.h>
 
 #ifdef CONFIG_SGI_IP32
 
diff --git a/include/asm-mips/irq.h b/include/asm-mips/irq.h
index dde677f02bc0..d35c61776a02 100644
--- a/include/asm-mips/irq.h
+++ b/include/asm-mips/irq.h
@@ -9,7 +9,6 @@
 #ifndef _ASM_IRQ_H
 #define _ASM_IRQ_H
 
-#include <linux/config.h>
 #include <linux/linkage.h>
 
 #include <asm/mipsmtregs.h>
diff --git a/include/asm-mips/isadep.h b/include/asm-mips/isadep.h
index 7bb003511d9e..24c6cda79377 100644
--- a/include/asm-mips/isadep.h
+++ b/include/asm-mips/isadep.h
@@ -5,7 +5,6 @@
  *
  * Copyright (c) 1998 Harald Koerfgen
  */
-#include <linux/config.h>
 
 #ifndef __ASM_ISADEP_H
 #define __ASM_ISADEP_H
diff --git a/include/asm-mips/jmr3927/irq.h b/include/asm-mips/jmr3927/irq.h
index b0c325a22343..fe551f33a74f 100644
--- a/include/asm-mips/jmr3927/irq.h
+++ b/include/asm-mips/jmr3927/irq.h
@@ -12,7 +12,6 @@
 
 #ifndef __ASSEMBLY__
 
-#include <linux/config.h>
 #include <asm/irq.h>
 
 struct tb_irq_space {
diff --git a/include/asm-mips/kmap_types.h b/include/asm-mips/kmap_types.h
index 6886a0c3fedf..806aae3c5338 100644
--- a/include/asm-mips/kmap_types.h
+++ b/include/asm-mips/kmap_types.h
@@ -1,7 +1,6 @@
 #ifndef _ASM_KMAP_TYPES_H
 #define _ASM_KMAP_TYPES_H
 
-#include <linux/config.h>
 
 #ifdef CONFIG_DEBUG_HIGHMEM
 # define D(n) __KM_FENCE_##n ,
diff --git a/include/asm-mips/local.h b/include/asm-mips/local.h
index c38844f615fc..9e2d43bae388 100644
--- a/include/asm-mips/local.h
+++ b/include/asm-mips/local.h
@@ -1,7 +1,6 @@
 #ifndef _ASM_LOCAL_H
 #define _ASM_LOCAL_H
 
-#include <linux/config.h>
 #include <linux/percpu.h>
 #include <asm/atomic.h>
 
diff --git a/include/asm-mips/mach-au1x00/au1000.h b/include/asm-mips/mach-au1x00/au1000.h
index 4686e17c206c..582acd8adb81 100644
--- a/include/asm-mips/mach-au1x00/au1000.h
+++ b/include/asm-mips/mach-au1x00/au1000.h
@@ -35,7 +35,6 @@
 #ifndef _AU1000_H_
 #define _AU1000_H_
 
-#include <linux/config.h>
 
 #ifndef _LANGUAGE_ASSEMBLY
 
diff --git a/include/asm-mips/mach-au1x00/au1xxx.h b/include/asm-mips/mach-au1x00/au1xxx.h
index b7b46dd9b929..947135941033 100644
--- a/include/asm-mips/mach-au1x00/au1xxx.h
+++ b/include/asm-mips/mach-au1x00/au1xxx.h
@@ -23,7 +23,6 @@
 #ifndef _AU1XXX_H_
 #define _AU1XXX_H_
 
-#include <linux/config.h>
 
 #include <asm/mach-au1x00/au1000.h>
 
diff --git a/include/asm-mips/mach-au1x00/au1xxx_dbdma.h b/include/asm-mips/mach-au1x00/au1xxx_dbdma.h
index b327bcd3fee1..d5b38a247e5a 100644
--- a/include/asm-mips/mach-au1x00/au1xxx_dbdma.h
+++ b/include/asm-mips/mach-au1x00/au1xxx_dbdma.h
@@ -34,7 +34,6 @@
 #ifndef _AU1000_DBDMA_H_
 #define _AU1000_DBDMA_H_
 
-#include <linux/config.h>
 
 #ifndef _LANGUAGE_ASSEMBLY
 
diff --git a/include/asm-mips/mach-au1x00/au1xxx_ide.h b/include/asm-mips/mach-au1x00/au1xxx_ide.h
index e867b4ef96d1..301e71300779 100644
--- a/include/asm-mips/mach-au1x00/au1xxx_ide.h
+++ b/include/asm-mips/mach-au1x00/au1xxx_ide.h
@@ -29,7 +29,6 @@
  * Note: for more information, please refer "AMD Alchemy Au1200/Au1550 IDE
  *       Interface and Linux Device Driver" Application Note.
  */
-#include <linux/config.h>
 
 #ifdef CONFIG_BLK_DEV_IDE_AU1XXX_MDMA2_DBDMA
         #define DMA_WAIT_TIMEOUT        100
diff --git a/include/asm-mips/mach-au1x00/au1xxx_psc.h b/include/asm-mips/mach-au1x00/au1xxx_psc.h
index 8e5fb3c7da4d..5c3e2a38ce12 100644
--- a/include/asm-mips/mach-au1x00/au1xxx_psc.h
+++ b/include/asm-mips/mach-au1x00/au1xxx_psc.h
@@ -33,7 +33,6 @@
 #ifndef _AU1000_PSC_H_
 #define _AU1000_PSC_H_
 
-#include <linux/config.h>
 
 /* The PSC base addresses.  */
 #ifdef CONFIG_SOC_AU1550
diff --git a/include/asm-mips/mach-au1x00/ioremap.h b/include/asm-mips/mach-au1x00/ioremap.h
index d3ec6274575a..098fca4289bb 100644
--- a/include/asm-mips/mach-au1x00/ioremap.h
+++ b/include/asm-mips/mach-au1x00/ioremap.h
@@ -9,7 +9,6 @@
 #ifndef __ASM_MACH_AU1X00_IOREMAP_H
 #define __ASM_MACH_AU1X00_IOREMAP_H
 
-#include <linux/config.h>
 #include <linux/types.h>
 
 #ifdef CONFIG_64BIT_PHYS_ADDR
diff --git a/include/asm-mips/mach-cobalt/cpu-feature-overrides.h b/include/asm-mips/mach-cobalt/cpu-feature-overrides.h
index ace8c5ef9701..e0e08fc5d7f7 100644
--- a/include/asm-mips/mach-cobalt/cpu-feature-overrides.h
+++ b/include/asm-mips/mach-cobalt/cpu-feature-overrides.h
@@ -8,7 +8,6 @@
 #ifndef __ASM_COBALT_CPU_FEATURE_OVERRIDES_H
 #define __ASM_COBALT_CPU_FEATURE_OVERRIDES_H
 
-#include <linux/config.h>
 
 #define cpu_has_tlb		1
 #define cpu_has_4kex		1
diff --git a/include/asm-mips/mach-db1x00/db1x00.h b/include/asm-mips/mach-db1x00/db1x00.h
index 7b28b23f91ce..8fbb4b42a8b5 100644
--- a/include/asm-mips/mach-db1x00/db1x00.h
+++ b/include/asm-mips/mach-db1x00/db1x00.h
@@ -28,7 +28,6 @@
 #ifndef __ASM_DB1X00_H
 #define __ASM_DB1X00_H
 
-#include <linux/config.h>
 
 #ifdef CONFIG_MIPS_DB1550
 #define BCSR_KSEG1_ADDR 0xAF000000
diff --git a/include/asm-mips/mach-generic/ide.h b/include/asm-mips/mach-generic/ide.h
index e3315359500a..6eba2e576aaa 100644
--- a/include/asm-mips/mach-generic/ide.h
+++ b/include/asm-mips/mach-generic/ide.h
@@ -15,7 +15,6 @@
 
 #ifdef __KERNEL__
 
-#include <linux/config.h>
 #include <linux/pci.h>
 #include <linux/stddef.h>
 #include <asm/processor.h>
diff --git a/include/asm-mips/mach-generic/kmalloc.h b/include/asm-mips/mach-generic/kmalloc.h
index 373d66dee9d7..410ab5f6c563 100644
--- a/include/asm-mips/mach-generic/kmalloc.h
+++ b/include/asm-mips/mach-generic/kmalloc.h
@@ -1,7 +1,6 @@
 #ifndef __ASM_MACH_GENERIC_KMALLOC_H
 #define __ASM_MACH_GENERIC_KMALLOC_H
 
-#include <linux/config.h>
 
 #ifndef CONFIG_DMA_COHERENT
 /*
diff --git a/include/asm-mips/mach-generic/spaces.h b/include/asm-mips/mach-generic/spaces.h
index b849d8dd7e78..0ae9997bc9a8 100644
--- a/include/asm-mips/mach-generic/spaces.h
+++ b/include/asm-mips/mach-generic/spaces.h
@@ -10,7 +10,6 @@
 #ifndef _ASM_MACH_GENERIC_SPACES_H
 #define _ASM_MACH_GENERIC_SPACES_H
 
-#include <linux/config.h>
 
 #ifdef CONFIG_32BIT
 
diff --git a/include/asm-mips/mach-ip22/spaces.h b/include/asm-mips/mach-ip22/spaces.h
index 8385f716798d..ab20c026fd19 100644
--- a/include/asm-mips/mach-ip22/spaces.h
+++ b/include/asm-mips/mach-ip22/spaces.h
@@ -10,7 +10,6 @@
 #ifndef _ASM_MACH_IP22_SPACES_H
 #define _ASM_MACH_IP22_SPACES_H
 
-#include <linux/config.h>
 
 #ifdef CONFIG_32BIT
 
diff --git a/include/asm-mips/mach-ip32/cpu-feature-overrides.h b/include/asm-mips/mach-ip32/cpu-feature-overrides.h
index 36070b5654ab..5312a11098d9 100644
--- a/include/asm-mips/mach-ip32/cpu-feature-overrides.h
+++ b/include/asm-mips/mach-ip32/cpu-feature-overrides.h
@@ -9,7 +9,6 @@
 #ifndef __ASM_MACH_IP32_CPU_FEATURE_OVERRIDES_H
 #define __ASM_MACH_IP32_CPU_FEATURE_OVERRIDES_H
 
-#include <linux/config.h>
 
 /*
  * R5000 has an interesting "restriction":  ll(d)/sc(d)
diff --git a/include/asm-mips/mach-ip32/kmalloc.h b/include/asm-mips/mach-ip32/kmalloc.h
index 9d2d4d9ac036..f6198a21fba1 100644
--- a/include/asm-mips/mach-ip32/kmalloc.h
+++ b/include/asm-mips/mach-ip32/kmalloc.h
@@ -1,7 +1,6 @@
 #ifndef __ASM_MACH_IP32_KMALLOC_H
 #define __ASM_MACH_IP32_KMALLOC_H
 
-#include <linux/config.h>
 
 #if defined(CONFIG_CPU_R5000) || defined (CONFIG_CPU_RM7000)
 #define ARCH_KMALLOC_MINALIGN	32
diff --git a/include/asm-mips/mach-mips/cpu-feature-overrides.h b/include/asm-mips/mach-mips/cpu-feature-overrides.h
index e06af6c86f86..7efbff50fcdd 100644
--- a/include/asm-mips/mach-mips/cpu-feature-overrides.h
+++ b/include/asm-mips/mach-mips/cpu-feature-overrides.h
@@ -9,7 +9,6 @@
 #ifndef __ASM_MACH_MIPS_CPU_FEATURE_OVERRIDES_H
 #define __ASM_MACH_MIPS_CPU_FEATURE_OVERRIDES_H
 
-#include <linux/config.h>
 
 /*
  * CPU feature overrides for MIPS boards
diff --git a/include/asm-mips/mach-mips/irq.h b/include/asm-mips/mach-mips/irq.h
index f8579696ca54..083d9c512a04 100644
--- a/include/asm-mips/mach-mips/irq.h
+++ b/include/asm-mips/mach-mips/irq.h
@@ -1,7 +1,6 @@
 #ifndef __ASM_MACH_MIPS_IRQ_H
 #define __ASM_MACH_MIPS_IRQ_H
 
-#include <linux/config.h>
 
 #define NR_IRQS	256
 
diff --git a/include/asm-mips/mach-pb1x00/pb1550.h b/include/asm-mips/mach-pb1x00/pb1550.h
index 9578ead11e8a..9a4955ce3b4a 100644
--- a/include/asm-mips/mach-pb1x00/pb1550.h
+++ b/include/asm-mips/mach-pb1x00/pb1550.h
@@ -27,7 +27,6 @@
 #ifndef __ASM_PB1550_H
 #define __ASM_PB1550_H
 
-#include <linux/config.h>
 #include <linux/types.h>
 
 #define DBDMA_AC97_TX_CHAN DSCR_CMD0_PSC1_TX
diff --git a/include/asm-mips/mach-sim/cpu-feature-overrides.h b/include/asm-mips/mach-sim/cpu-feature-overrides.h
index cadbe8eda79c..f86f2751bc0c 100644
--- a/include/asm-mips/mach-sim/cpu-feature-overrides.h
+++ b/include/asm-mips/mach-sim/cpu-feature-overrides.h
@@ -8,7 +8,6 @@
 #ifndef __ASM_MACH_SIM_CPU_FEATURE_OVERRIDES_H
 #define __ASM_MACH_SIM_CPU_FEATURE_OVERRIDES_H
 
-#include <linux/config.h>
 
 /*
  * CPU feature overrides for MIPS boards
diff --git a/include/asm-mips/mips-boards/generic.h b/include/asm-mips/mips-boards/generic.h
index 25b6ffc26623..cad47ce8a7e9 100644
--- a/include/asm-mips/mips-boards/generic.h
+++ b/include/asm-mips/mips-boards/generic.h
@@ -20,7 +20,6 @@
 #ifndef __ASM_MIPS_BOARDS_GENERIC_H
 #define __ASM_MIPS_BOARDS_GENERIC_H
 
-#include <linux/config.h>
 #include <asm/addrspace.h>
 #include <asm/byteorder.h>
 #include <asm/mips-boards/bonito64.h>
diff --git a/include/asm-mips/mipsregs.h b/include/asm-mips/mipsregs.h
index a2ef579f6b1a..87e95b5e27d4 100644
--- a/include/asm-mips/mipsregs.h
+++ b/include/asm-mips/mipsregs.h
@@ -13,7 +13,6 @@
 #ifndef _ASM_MIPSREGS_H
 #define _ASM_MIPSREGS_H
 
-#include <linux/config.h>
 #include <linux/linkage.h>
 #include <asm/hazards.h>
 
diff --git a/include/asm-mips/mmu_context.h b/include/asm-mips/mmu_context.h
index 6e09f4c87211..18b69de87daa 100644
--- a/include/asm-mips/mmu_context.h
+++ b/include/asm-mips/mmu_context.h
@@ -11,7 +11,6 @@
 #ifndef _ASM_MMU_CONTEXT_H
 #define _ASM_MMU_CONTEXT_H
 
-#include <linux/config.h>
 #include <linux/errno.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
diff --git a/include/asm-mips/mmzone.h b/include/asm-mips/mmzone.h
index 7bde4432092b..e132975256b2 100644
--- a/include/asm-mips/mmzone.h
+++ b/include/asm-mips/mmzone.h
@@ -5,7 +5,6 @@
 #ifndef _ASM_MMZONE_H_
 #define _ASM_MMZONE_H_
 
-#include <linux/config.h>
 #include <asm/page.h>
 #include <mmzone.h>
 
diff --git a/include/asm-mips/module.h b/include/asm-mips/module.h
index 2af496c78c12..399d03f1c4fc 100644
--- a/include/asm-mips/module.h
+++ b/include/asm-mips/module.h
@@ -1,7 +1,6 @@
 #ifndef _ASM_MODULE_H
 #define _ASM_MODULE_H
 
-#include <linux/config.h>
 #include <linux/list.h>
 #include <asm/uaccess.h>
 
diff --git a/include/asm-mips/msgbuf.h b/include/asm-mips/msgbuf.h
index a1533959742e..0d6c7f14de31 100644
--- a/include/asm-mips/msgbuf.h
+++ b/include/asm-mips/msgbuf.h
@@ -1,7 +1,6 @@
 #ifndef _ASM_MSGBUF_H
 #define _ASM_MSGBUF_H
 
-#include <linux/config.h>
 
 /*
  * The msqid64_ds structure for the MIPS architecture.
diff --git a/include/asm-mips/paccess.h b/include/asm-mips/paccess.h
index 46f2d23d2697..147844ef103b 100644
--- a/include/asm-mips/paccess.h
+++ b/include/asm-mips/paccess.h
@@ -13,7 +13,6 @@
 #ifndef _ASM_PACCESS_H
 #define _ASM_PACCESS_H
 
-#include <linux/config.h>
 #include <linux/errno.h>
 
 #ifdef CONFIG_32BIT
diff --git a/include/asm-mips/page.h b/include/asm-mips/page.h
index a1eab136ff6c..f2b3314fcabc 100644
--- a/include/asm-mips/page.h
+++ b/include/asm-mips/page.h
@@ -9,7 +9,6 @@
 #ifndef _ASM_PAGE_H
 #define _ASM_PAGE_H
 
-#include <linux/config.h>
 
 #ifdef __KERNEL__
 
diff --git a/include/asm-mips/pci.h b/include/asm-mips/pci.h
index 6c9ad8171a77..c4d68bebdca6 100644
--- a/include/asm-mips/pci.h
+++ b/include/asm-mips/pci.h
@@ -6,7 +6,6 @@
 #ifndef _ASM_PCI_H
 #define _ASM_PCI_H
 
-#include <linux/config.h>
 #include <linux/mm.h>
 
 #ifdef __KERNEL__
diff --git a/include/asm-mips/pgalloc.h b/include/asm-mips/pgalloc.h
index fe1df572318b..582c1fe6cc4a 100644
--- a/include/asm-mips/pgalloc.h
+++ b/include/asm-mips/pgalloc.h
@@ -9,7 +9,6 @@
 #ifndef _ASM_PGALLOC_H
 #define _ASM_PGALLOC_H
 
-#include <linux/config.h>
 #include <linux/highmem.h>
 #include <linux/mm.h>
 
diff --git a/include/asm-mips/pgtable-32.h b/include/asm-mips/pgtable-32.h
index 4d6bc45df594..e1c0e88f03f5 100644
--- a/include/asm-mips/pgtable-32.h
+++ b/include/asm-mips/pgtable-32.h
@@ -9,7 +9,6 @@
 #ifndef _ASM_PGTABLE_32_H
 #define _ASM_PGTABLE_32_H
 
-#include <linux/config.h>
 #include <asm/addrspace.h>
 #include <asm/page.h>
 
diff --git a/include/asm-mips/pgtable-64.h b/include/asm-mips/pgtable-64.h
index 82166b254b27..0ae30d56d019 100644
--- a/include/asm-mips/pgtable-64.h
+++ b/include/asm-mips/pgtable-64.h
@@ -9,7 +9,6 @@
 #ifndef _ASM_PGTABLE_64_H
 #define _ASM_PGTABLE_64_H
 
-#include <linux/config.h>
 #include <linux/linkage.h>
 
 #include <asm/addrspace.h>
diff --git a/include/asm-mips/pgtable-bits.h b/include/asm-mips/pgtable-bits.h
index 01e76e932e3f..7494ba91112a 100644
--- a/include/asm-mips/pgtable-bits.h
+++ b/include/asm-mips/pgtable-bits.h
@@ -10,7 +10,6 @@
 #ifndef _ASM_PGTABLE_BITS_H
 #define _ASM_PGTABLE_BITS_H
 
-#include <linux/config.h>
 
 /*
  * Note that we shift the lower 32bits of each EntryLo[01] entry
diff --git a/include/asm-mips/pgtable.h b/include/asm-mips/pgtable.h
index 702a28fa7a34..d02b47933d7f 100644
--- a/include/asm-mips/pgtable.h
+++ b/include/asm-mips/pgtable.h
@@ -8,7 +8,6 @@
 #ifndef _ASM_PGTABLE_H
 #define _ASM_PGTABLE_H
 
-#include <linux/config.h>
 #ifdef CONFIG_32BIT
 #include <asm/pgtable-32.h>
 #endif
diff --git a/include/asm-mips/prefetch.h b/include/asm-mips/prefetch.h
index 71293ec1657c..17850834ccb0 100644
--- a/include/asm-mips/prefetch.h
+++ b/include/asm-mips/prefetch.h
@@ -8,7 +8,6 @@
 #ifndef __ASM_PREFETCH_H
 #define __ASM_PREFETCH_H
 
-#include <linux/config.h>
 
 /*
  * R5000 and RM5200 implements pref and prefx instructions but they're nops, so
diff --git a/include/asm-mips/processor.h b/include/asm-mips/processor.h
index 0fb75f0762e0..532df530b4ec 100644
--- a/include/asm-mips/processor.h
+++ b/include/asm-mips/processor.h
@@ -11,7 +11,6 @@
 #ifndef _ASM_PROCESSOR_H
 #define _ASM_PROCESSOR_H
 
-#include <linux/config.h>
 #include <linux/cpumask.h>
 #include <linux/threads.h>
 
diff --git a/include/asm-mips/ptrace.h b/include/asm-mips/ptrace.h
index fa9d8713c12a..4113316ee0da 100644
--- a/include/asm-mips/ptrace.h
+++ b/include/asm-mips/ptrace.h
@@ -9,7 +9,6 @@
 #ifndef _ASM_PTRACE_H
 #define _ASM_PTRACE_H
 
-#include <linux/config.h>
 
 #include <asm/isadep.h>
 
diff --git a/include/asm-mips/reg.h b/include/asm-mips/reg.h
index 6173004cc88e..634b55d7e7f6 100644
--- a/include/asm-mips/reg.h
+++ b/include/asm-mips/reg.h
@@ -12,7 +12,6 @@
 #ifndef __ASM_MIPS_REG_H
 #define __ASM_MIPS_REG_H
 
-#include <linux/config.h>
 
 #if defined(CONFIG_32BIT) || defined(WANT_COMPAT_REG_H)
 
diff --git a/include/asm-mips/resource.h b/include/asm-mips/resource.h
index 1fba00c22077..87cb3085269c 100644
--- a/include/asm-mips/resource.h
+++ b/include/asm-mips/resource.h
@@ -9,7 +9,6 @@
 #ifndef _ASM_RESOURCE_H
 #define _ASM_RESOURCE_H
 
-#include <linux/config.h>
 
 /*
  * These five resource limit IDs have a MIPS/Linux-specific ordering,
diff --git a/include/asm-mips/serial.h b/include/asm-mips/serial.h
index 7196ceb0e948..584bd9c0ab2e 100644
--- a/include/asm-mips/serial.h
+++ b/include/asm-mips/serial.h
@@ -9,7 +9,6 @@
 #ifndef _ASM_SERIAL_H
 #define _ASM_SERIAL_H
 
-#include <linux/config.h>
 
 /*
  * This assumes you have a 1.8432 MHz clock for your UART.
diff --git a/include/asm-mips/sgiarcs.h b/include/asm-mips/sgiarcs.h
index 722b77a8c5e5..ddb859d05257 100644
--- a/include/asm-mips/sgiarcs.h
+++ b/include/asm-mips/sgiarcs.h
@@ -12,7 +12,6 @@
 #ifndef _ASM_SGIARCS_H
 #define _ASM_SGIARCS_H
 
-#include <linux/config.h>
 #include <asm/types.h>
 #include <asm/arc/types.h>
 
diff --git a/include/asm-mips/sibyte/board.h b/include/asm-mips/sibyte/board.h
index 900edcbeec37..3dfe29ed42a8 100644
--- a/include/asm-mips/sibyte/board.h
+++ b/include/asm-mips/sibyte/board.h
@@ -19,7 +19,6 @@
 #ifndef _SIBYTE_BOARD_H
 #define _SIBYTE_BOARD_H
 
-#include <linux/config.h>
 
 #if defined(CONFIG_SIBYTE_SWARM) || defined(CONFIG_SIBYTE_PTSWARM) || \
     defined(CONFIG_SIBYTE_CRHONE) || defined(CONFIG_SIBYTE_CRHINE) || \
diff --git a/include/asm-mips/sibyte/carmel.h b/include/asm-mips/sibyte/carmel.h
index b5e7dae19f0f..57c53e62a37a 100644
--- a/include/asm-mips/sibyte/carmel.h
+++ b/include/asm-mips/sibyte/carmel.h
@@ -18,7 +18,6 @@
 #ifndef __ASM_SIBYTE_CARMEL_H
 #define __ASM_SIBYTE_CARMEL_H
 
-#include <linux/config.h>
 
 #include <asm/sibyte/sb1250.h>
 #include <asm/sibyte/sb1250_int.h>
diff --git a/include/asm-mips/sibyte/sentosa.h b/include/asm-mips/sibyte/sentosa.h
index 824605847af4..64c47874f32d 100644
--- a/include/asm-mips/sibyte/sentosa.h
+++ b/include/asm-mips/sibyte/sentosa.h
@@ -18,7 +18,6 @@
 #ifndef __ASM_SIBYTE_SENTOSA_H
 #define __ASM_SIBYTE_SENTOSA_H
 
-#include <linux/config.h>
 #include <asm/sibyte/sb1250.h>
 #include <asm/sibyte/sb1250_int.h>
 
diff --git a/include/asm-mips/sibyte/swarm.h b/include/asm-mips/sibyte/swarm.h
index 06e1d528e03a..86db37e5ad85 100644
--- a/include/asm-mips/sibyte/swarm.h
+++ b/include/asm-mips/sibyte/swarm.h
@@ -18,7 +18,6 @@
 #ifndef __ASM_SIBYTE_SWARM_H
 #define __ASM_SIBYTE_SWARM_H
 
-#include <linux/config.h>
 #include <asm/sibyte/sb1250.h>
 #include <asm/sibyte/sb1250_int.h>
 
diff --git a/include/asm-mips/siginfo.h b/include/asm-mips/siginfo.h
index 2ba313d94a78..2e32949bd674 100644
--- a/include/asm-mips/siginfo.h
+++ b/include/asm-mips/siginfo.h
@@ -9,7 +9,6 @@
 #ifndef _ASM_SIGINFO_H
 #define _ASM_SIGINFO_H
 
-#include <linux/config.h>
 
 #define __ARCH_SIGEV_PREAMBLE_SIZE (sizeof(long) + 2*sizeof(int))
 #undef __ARCH_SI_TRAPNO	/* exception code needs to fill this ...  */
diff --git a/include/asm-mips/signal.h b/include/asm-mips/signal.h
index d8349e4b55ee..a1f3a3fa9bd6 100644
--- a/include/asm-mips/signal.h
+++ b/include/asm-mips/signal.h
@@ -9,7 +9,6 @@
 #ifndef _ASM_SIGNAL_H
 #define _ASM_SIGNAL_H
 
-#include <linux/config.h>
 #include <linux/types.h>
 
 #define _NSIG		128
diff --git a/include/asm-mips/sim.h b/include/asm-mips/sim.h
index 9c2af1b00e19..67c4fe52bb42 100644
--- a/include/asm-mips/sim.h
+++ b/include/asm-mips/sim.h
@@ -9,7 +9,6 @@
 #ifndef _ASM_SIM_H
 #define _ASM_SIM_H
 
-#include <linux/config.h>
 
 #include <asm/asm-offsets.h>
 
diff --git a/include/asm-mips/smp.h b/include/asm-mips/smp.h
index 75c6fe7c2126..ffcb7a336b17 100644
--- a/include/asm-mips/smp.h
+++ b/include/asm-mips/smp.h
@@ -11,7 +11,6 @@
 #ifndef __ASM_SMP_H
 #define __ASM_SMP_H
 
-#include <linux/config.h>
 
 #ifdef CONFIG_SMP
 
diff --git a/include/asm-mips/sn/addrs.h b/include/asm-mips/sn/addrs.h
index 2b5cef1ba37f..3f6891b0c0ea 100644
--- a/include/asm-mips/sn/addrs.h
+++ b/include/asm-mips/sn/addrs.h
@@ -9,7 +9,6 @@
 #ifndef _ASM_SN_ADDRS_H
 #define _ASM_SN_ADDRS_H
 
-#include <linux/config.h>
 
 #ifndef __ASSEMBLY__
 #include <linux/types.h>
diff --git a/include/asm-mips/sn/agent.h b/include/asm-mips/sn/agent.h
index d6df13aaed49..ac4ea85c3a5c 100644
--- a/include/asm-mips/sn/agent.h
+++ b/include/asm-mips/sn/agent.h
@@ -11,7 +11,6 @@
 #ifndef _ASM_SGI_SN_AGENT_H
 #define _ASM_SGI_SN_AGENT_H
 
-#include <linux/config.h>
 #include <linux/topology.h>
 #include <asm/sn/addrs.h>
 #include <asm/sn/arch.h>
diff --git a/include/asm-mips/sn/arch.h b/include/asm-mips/sn/arch.h
index d247a819de7f..51174af6ac52 100644
--- a/include/asm-mips/sn/arch.h
+++ b/include/asm-mips/sn/arch.h
@@ -11,7 +11,6 @@
 #ifndef _ASM_SN_ARCH_H
 #define _ASM_SN_ARCH_H
 
-#include <linux/config.h>
 #include <linux/types.h>
 #include <asm/sn/types.h>
 #ifdef CONFIG_SGI_IP27
diff --git a/include/asm-mips/sn/io.h b/include/asm-mips/sn/io.h
index 13326453efc9..ab2fa8cd2627 100644
--- a/include/asm-mips/sn/io.h
+++ b/include/asm-mips/sn/io.h
@@ -9,7 +9,6 @@
 #ifndef _ASM_SN_IO_H
 #define _ASM_SN_IO_H
 
-#include <linux/config.h>
 #if defined (CONFIG_SGI_IP27)
 #include <asm/sn/sn0/hubio.h>
 #endif
diff --git a/include/asm-mips/sn/klconfig.h b/include/asm-mips/sn/klconfig.h
index 9709ff701d9b..19e0e926be55 100644
--- a/include/asm-mips/sn/klconfig.h
+++ b/include/asm-mips/sn/klconfig.h
@@ -27,7 +27,6 @@
  *      that offsets of existing fields do not change.
  */
 
-#include <linux/config.h>
 #include <linux/types.h>
 #include <asm/sn/types.h>
 
diff --git a/include/asm-mips/sn/kldir.h b/include/asm-mips/sn/kldir.h
index f0efab1672ec..e3e231f0b79d 100644
--- a/include/asm-mips/sn/kldir.h
+++ b/include/asm-mips/sn/kldir.h
@@ -11,7 +11,6 @@
 #ifndef _ASM_SN_KLDIR_H
 #define _ASM_SN_KLDIR_H
 
-#include <linux/config.h>
 
 #if defined(CONFIG_SGI_IO)
 #include <asm/hack.h>
diff --git a/include/asm-mips/sn/launch.h b/include/asm-mips/sn/launch.h
index b67699c0c475..b7c2226312c6 100644
--- a/include/asm-mips/sn/launch.h
+++ b/include/asm-mips/sn/launch.h
@@ -9,7 +9,6 @@
 #ifndef _ASM_SN_LAUNCH_H
 #define _ASM_SN_LAUNCH_H
 
-#include <linux/config.h>
 #include <asm/sn/types.h>
 #include <asm/sn/addrs.h>
 
diff --git a/include/asm-mips/sn/mapped_kernel.h b/include/asm-mips/sn/mapped_kernel.h
index 59edb20f8ec5..c3dd5d0d525f 100644
--- a/include/asm-mips/sn/mapped_kernel.h
+++ b/include/asm-mips/sn/mapped_kernel.h
@@ -20,7 +20,6 @@
  * code. So no jumps can be done before we have switched to using
  * cksseg addresses.
  */
-#include <linux/config.h>
 #include <asm/addrspace.h>
 
 #define REP_BASE	CAC_BASE
diff --git a/include/asm-mips/sn/sn0/addrs.h b/include/asm-mips/sn/sn0/addrs.h
index 398815639fb8..c0905c1ac938 100644
--- a/include/asm-mips/sn/sn0/addrs.h
+++ b/include/asm-mips/sn/sn0/addrs.h
@@ -11,7 +11,6 @@
 #ifndef _ASM_SN_SN0_ADDRS_H
 #define _ASM_SN_SN0_ADDRS_H
 
-#include <linux/config.h>
 
 /*
  * SN0 (on a T5) Address map
diff --git a/include/asm-mips/sn/sn0/arch.h b/include/asm-mips/sn/sn0/arch.h
index fb78773a5efe..7a221666c58e 100644
--- a/include/asm-mips/sn/sn0/arch.h
+++ b/include/asm-mips/sn/sn0/arch.h
@@ -11,7 +11,6 @@
 #ifndef _ASM_SN_SN0_ARCH_H
 #define _ASM_SN_SN0_ARCH_H
 
-#include <linux/config.h>
 
 #ifndef SABLE
 
diff --git a/include/asm-mips/sn/sn0/hubmd.h b/include/asm-mips/sn/sn0/hubmd.h
index a66def4e0ba0..f01000241884 100644
--- a/include/asm-mips/sn/sn0/hubmd.h
+++ b/include/asm-mips/sn/sn0/hubmd.h
@@ -11,7 +11,6 @@
 #ifndef	_ASM_SN_SN0_HUBMD_H
 #define	_ASM_SN_SN0_HUBMD_H
 
-#include <linux/config.h>
 
 /*
  * Hub Memory/Directory interface registers
diff --git a/include/asm-mips/stackframe.h b/include/asm-mips/stackframe.h
index c4856a874965..513aa5133830 100644
--- a/include/asm-mips/stackframe.h
+++ b/include/asm-mips/stackframe.h
@@ -10,7 +10,6 @@
 #ifndef _ASM_STACKFRAME_H
 #define _ASM_STACKFRAME_H
 
-#include <linux/config.h>
 #include <linux/threads.h>
 
 #include <asm/asm.h>
diff --git a/include/asm-mips/string.h b/include/asm-mips/string.h
index 907da600fddd..436e3ad352d9 100644
--- a/include/asm-mips/string.h
+++ b/include/asm-mips/string.h
@@ -10,7 +10,6 @@
 #ifndef _ASM_STRING_H
 #define _ASM_STRING_H
 
-#include <linux/config.h>
 
 /*
  * Most of the inline functions are rather naive implementations so I just
diff --git a/include/asm-mips/system.h b/include/asm-mips/system.h
index 261f71d16a07..130333d7c4ee 100644
--- a/include/asm-mips/system.h
+++ b/include/asm-mips/system.h
@@ -12,7 +12,6 @@
 #ifndef _ASM_SYSTEM_H
 #define _ASM_SYSTEM_H
 
-#include <linux/config.h>
 #include <linux/types.h>
 
 #include <asm/addrspace.h>
diff --git a/include/asm-mips/thread_info.h b/include/asm-mips/thread_info.h
index f8d97dafd2f4..ae8ada5b42a9 100644
--- a/include/asm-mips/thread_info.h
+++ b/include/asm-mips/thread_info.h
@@ -9,7 +9,6 @@
 
 #ifdef __KERNEL__
 
-#include <linux/config.h>
 
 #ifndef __ASSEMBLY__
 
diff --git a/include/asm-mips/tlbflush.h b/include/asm-mips/tlbflush.h
index bb4ae3cdcbf1..276be77c3e85 100644
--- a/include/asm-mips/tlbflush.h
+++ b/include/asm-mips/tlbflush.h
@@ -1,7 +1,6 @@
 #ifndef __ASM_TLBFLUSH_H
 #define __ASM_TLBFLUSH_H
 
-#include <linux/config.h>
 #include <linux/mm.h>
 
 /*
diff --git a/include/asm-mips/tx4927/toshiba_rbtx4927.h b/include/asm-mips/tx4927/toshiba_rbtx4927.h
index 6ce1e9475f99..94bef03d9635 100644
--- a/include/asm-mips/tx4927/toshiba_rbtx4927.h
+++ b/include/asm-mips/tx4927/toshiba_rbtx4927.h
@@ -27,7 +27,6 @@
 #ifndef __ASM_TX4927_TOSHIBA_RBTX4927_H
 #define __ASM_TX4927_TOSHIBA_RBTX4927_H
 
-#include <linux/config.h>
 #include <asm/tx4927/tx4927.h>
 #include <asm/tx4927/tx4927_mips.h>
 #ifdef CONFIG_PCI
diff --git a/include/asm-mips/types.h b/include/asm-mips/types.h
index cd2813d8e136..2b52e180c6f2 100644
--- a/include/asm-mips/types.h
+++ b/include/asm-mips/types.h
@@ -52,7 +52,6 @@ typedef unsigned long long __u64;
 
 #ifndef __ASSEMBLY__
 
-#include <linux/config.h>
 
 typedef __signed char s8;
 typedef unsigned char u8;
diff --git a/include/asm-mips/uaccess.h b/include/asm-mips/uaccess.h
index b96f3e0f3933..1cdd4eeb2f73 100644
--- a/include/asm-mips/uaccess.h
+++ b/include/asm-mips/uaccess.h
@@ -9,7 +9,6 @@
 #ifndef _ASM_UACCESS_H
 #define _ASM_UACCESS_H
 
-#include <linux/config.h>
 #include <linux/kernel.h>
 #include <linux/errno.h>
 #include <linux/thread_info.h>
diff --git a/include/asm-mips/unistd.h b/include/asm-mips/unistd.h
index 1068fe9a0a58..e71f161a4896 100644
--- a/include/asm-mips/unistd.h
+++ b/include/asm-mips/unistd.h
@@ -1170,7 +1170,6 @@ type name (atype a,btype b,ctype c,dtype d,etype e,ftype f) \
 
 #ifdef __KERNEL__
 
-#include <linux/config.h>
 
 #define __ARCH_WANT_IPC_PARSE_VERSION
 #define __ARCH_WANT_OLD_READDIR
diff --git a/include/asm-mips/vr41xx/vrc4173.h b/include/asm-mips/vr41xx/vrc4173.h
index 4d41a9c091d4..96fdcd54cec7 100644
--- a/include/asm-mips/vr41xx/vrc4173.h
+++ b/include/asm-mips/vr41xx/vrc4173.h
@@ -24,7 +24,6 @@
 #ifndef __NEC_VRC4173_H
 #define __NEC_VRC4173_H
 
-#include <linux/config.h>
 #include <asm/io.h>
 
 /*
diff --git a/include/asm-mips/war.h b/include/asm-mips/war.h
index ad374bd3f130..9844f0c2dfee 100644
--- a/include/asm-mips/war.h
+++ b/include/asm-mips/war.h
@@ -8,7 +8,6 @@
 #ifndef _ASM_WAR_H
 #define _ASM_WAR_H
 
-#include <linux/config.h>
 
 /*
  * Another R4600 erratum.  Due to the lack of errata information the exact
diff --git a/include/asm-mips/wbflush.h b/include/asm-mips/wbflush.h
index c3bef50f37a8..eadc0ac47e24 100644
--- a/include/asm-mips/wbflush.h
+++ b/include/asm-mips/wbflush.h
@@ -11,7 +11,6 @@
 #ifndef _ASM_WBFLUSH_H
 #define _ASM_WBFLUSH_H
 
-#include <linux/config.h>
 
 #ifdef CONFIG_CPU_HAS_WB
 
diff --git a/include/asm-parisc/atomic.h b/include/asm-parisc/atomic.h
index 403ea97316cf..48bf9b8ab8ff 100644
--- a/include/asm-parisc/atomic.h
+++ b/include/asm-parisc/atomic.h
@@ -5,7 +5,6 @@
 #ifndef _ASM_PARISC_ATOMIC_H_
 #define _ASM_PARISC_ATOMIC_H_
 
-#include <linux/config.h>
 #include <linux/types.h>
 #include <asm/system.h>
 
diff --git a/include/asm-parisc/cache.h b/include/asm-parisc/cache.h
index c831665473cb..7d22fa206fc4 100644
--- a/include/asm-parisc/cache.h
+++ b/include/asm-parisc/cache.h
@@ -5,7 +5,6 @@
 #ifndef __ARCH_PARISC_CACHE_H
 #define __ARCH_PARISC_CACHE_H
 
-#include <linux/config.h>
 
 /*
  * PA 2.0 processors have 64-byte cachelines; PA 1.1 processors have
diff --git a/include/asm-parisc/cacheflush.h b/include/asm-parisc/cacheflush.h
index 76b6b7d6046a..0b459cdfbd6f 100644
--- a/include/asm-parisc/cacheflush.h
+++ b/include/asm-parisc/cacheflush.h
@@ -1,7 +1,6 @@
 #ifndef _PARISC_CACHEFLUSH_H
 #define _PARISC_CACHEFLUSH_H
 
-#include <linux/config.h>
 #include <linux/mm.h>
 #include <asm/cache.h>	/* for flush_user_dcache_range_asm() proto */
 
diff --git a/include/asm-parisc/dma-mapping.h b/include/asm-parisc/dma-mapping.h
index 74d4ac6f2151..1e387e1dad30 100644
--- a/include/asm-parisc/dma-mapping.h
+++ b/include/asm-parisc/dma-mapping.h
@@ -1,7 +1,6 @@
 #ifndef _PARISC_DMA_MAPPING_H
 #define _PARISC_DMA_MAPPING_H
 
-#include <linux/config.h>
 #include <linux/mm.h>
 #include <asm/cacheflush.h>
 #include <asm/scatterlist.h>
diff --git a/include/asm-parisc/dma.h b/include/asm-parisc/dma.h
index 31fd10df43a7..9979c3cb3745 100644
--- a/include/asm-parisc/dma.h
+++ b/include/asm-parisc/dma.h
@@ -9,7 +9,6 @@
 #ifndef _ASM_DMA_H
 #define _ASM_DMA_H
 
-#include <linux/config.h>
 #include <asm/io.h>		/* need byte IO */
 #include <asm/system.h>	
 
diff --git a/include/asm-parisc/io.h b/include/asm-parisc/io.h
index 244f6b8883f4..b9eb245b8874 100644
--- a/include/asm-parisc/io.h
+++ b/include/asm-parisc/io.h
@@ -1,7 +1,6 @@
 #ifndef _ASM_IO_H
 #define _ASM_IO_H
 
-#include <linux/config.h>
 #include <linux/types.h>
 #include <asm/pgtable.h>
 
diff --git a/include/asm-parisc/irq.h b/include/asm-parisc/irq.h
index b0a30e2c9813..377ba90c7d02 100644
--- a/include/asm-parisc/irq.h
+++ b/include/asm-parisc/irq.h
@@ -7,7 +7,6 @@
 #ifndef _ASM_PARISC_IRQ_H
 #define _ASM_PARISC_IRQ_H
 
-#include <linux/config.h>
 #include <linux/cpumask.h>
 #include <asm/types.h>
 
diff --git a/include/asm-parisc/kmap_types.h b/include/asm-parisc/kmap_types.h
index 6886a0c3fedf..806aae3c5338 100644
--- a/include/asm-parisc/kmap_types.h
+++ b/include/asm-parisc/kmap_types.h
@@ -1,7 +1,6 @@
 #ifndef _ASM_KMAP_TYPES_H
 #define _ASM_KMAP_TYPES_H
 
-#include <linux/config.h>
 
 #ifdef CONFIG_DEBUG_HIGHMEM
 # define D(n) __KM_FENCE_##n ,
diff --git a/include/asm-parisc/page.h b/include/asm-parisc/page.h
index c0dd461fb8f1..0695bc958d56 100644
--- a/include/asm-parisc/page.h
+++ b/include/asm-parisc/page.h
@@ -10,7 +10,6 @@
 
 
 #ifdef __KERNEL__
-#include <linux/config.h>
 
 #if defined(CONFIG_PARISC_PAGE_SIZE_4KB)
 # define PAGE_SHIFT	12	/* 4k */
diff --git a/include/asm-parisc/param.h b/include/asm-parisc/param.h
index f4694d452dd6..07cb9b93cfe2 100644
--- a/include/asm-parisc/param.h
+++ b/include/asm-parisc/param.h
@@ -2,7 +2,6 @@
 #define _ASMPARISC_PARAM_H
 
 #ifdef __KERNEL__
-#include <linux/config.h>
 # ifdef CONFIG_PA20
 #  define HZ		1000		/* Faster machines */
 # else
diff --git a/include/asm-parisc/pci.h b/include/asm-parisc/pci.h
index 77bbafb7f73e..8b631f47eb25 100644
--- a/include/asm-parisc/pci.h
+++ b/include/asm-parisc/pci.h
@@ -1,7 +1,6 @@
 #ifndef __ASM_PARISC_PCI_H
 #define __ASM_PARISC_PCI_H
 
-#include <linux/config.h>
 #include <asm/scatterlist.h>
 
 
diff --git a/include/asm-parisc/pdc.h b/include/asm-parisc/pdc.h
index 0a3face6c480..08364f957e7a 100644
--- a/include/asm-parisc/pdc.h
+++ b/include/asm-parisc/pdc.h
@@ -1,7 +1,6 @@
 #ifndef _PARISC_PDC_H
 #define _PARISC_PDC_H
 
-#include <linux/config.h>
 
 /*
  *	PDC return values ...
diff --git a/include/asm-parisc/pgtable.h b/include/asm-parisc/pgtable.h
index aec089eb8b85..b6bcc672ba80 100644
--- a/include/asm-parisc/pgtable.h
+++ b/include/asm-parisc/pgtable.h
@@ -3,7 +3,6 @@
 
 #include <asm-generic/4level-fixup.h>
 
-#include <linux/config.h>
 #include <asm/fixmap.h>
 
 #ifndef __ASSEMBLY__
diff --git a/include/asm-parisc/processor.h b/include/asm-parisc/processor.h
index 89f2f1c16c12..ca49dc91f4fc 100644
--- a/include/asm-parisc/processor.h
+++ b/include/asm-parisc/processor.h
@@ -9,7 +9,6 @@
 #define __ASM_PARISC_PROCESSOR_H
 
 #ifndef __ASSEMBLY__
-#include <linux/config.h>
 #include <linux/threads.h>
 #include <linux/spinlock_types.h>
 
diff --git a/include/asm-parisc/psw.h b/include/asm-parisc/psw.h
index 4334d6ca2add..5a3e23c9ce63 100644
--- a/include/asm-parisc/psw.h
+++ b/include/asm-parisc/psw.h
@@ -1,6 +1,5 @@
 #ifndef _PARISC_PSW_H
 
-#include <linux/config.h>
 
 #define	PSW_I	0x00000001
 #define	PSW_D	0x00000002
diff --git a/include/asm-parisc/smp.h b/include/asm-parisc/smp.h
index dbdbd2e9fdf9..d4c0e26afcd1 100644
--- a/include/asm-parisc/smp.h
+++ b/include/asm-parisc/smp.h
@@ -1,7 +1,6 @@
 #ifndef __ASM_SMP_H
 #define __ASM_SMP_H
 
-#include <linux/config.h>
 
 #if defined(CONFIG_SMP)
 
diff --git a/include/asm-parisc/system.h b/include/asm-parisc/system.h
index a5a973c0c07f..863876134b2c 100644
--- a/include/asm-parisc/system.h
+++ b/include/asm-parisc/system.h
@@ -1,7 +1,6 @@
 #ifndef __PARISC_SYSTEM_H
 #define __PARISC_SYSTEM_H
 
-#include <linux/config.h>
 #include <asm/psw.h>
 
 /* The program status word as bitfields.  */
diff --git a/include/asm-parisc/tlbflush.h b/include/asm-parisc/tlbflush.h
index 825994a90e2d..f662e837dea1 100644
--- a/include/asm-parisc/tlbflush.h
+++ b/include/asm-parisc/tlbflush.h
@@ -3,7 +3,6 @@
 
 /* TLB flushing routines.... */
 
-#include <linux/config.h>
 #include <linux/mm.h>
 #include <asm/mmu_context.h>
 
diff --git a/include/asm-powerpc/abs_addr.h b/include/asm-powerpc/abs_addr.h
index c5c3259e0f86..4aa220718b19 100644
--- a/include/asm-powerpc/abs_addr.h
+++ b/include/asm-powerpc/abs_addr.h
@@ -2,7 +2,6 @@
 #define _ASM_POWERPC_ABS_ADDR_H
 #ifdef __KERNEL__
 
-#include <linux/config.h>
 
 /*
  * c 2001 PPC 64 Team, IBM Corp
diff --git a/include/asm-powerpc/cache.h b/include/asm-powerpc/cache.h
index 6379c2df5c40..642be62cf393 100644
--- a/include/asm-powerpc/cache.h
+++ b/include/asm-powerpc/cache.h
@@ -3,7 +3,6 @@
 
 #ifdef __KERNEL__
 
-#include <linux/config.h>
 
 /* bytes per L1 cache line */
 #if defined(CONFIG_8xx) || defined(CONFIG_403GCX)
diff --git a/include/asm-powerpc/dma-mapping.h b/include/asm-powerpc/dma-mapping.h
index 2ac63f569592..2ab9baf78bb4 100644
--- a/include/asm-powerpc/dma-mapping.h
+++ b/include/asm-powerpc/dma-mapping.h
@@ -8,7 +8,6 @@
 #define _ASM_DMA_MAPPING_H
 #ifdef __KERNEL__
 
-#include <linux/config.h>
 #include <linux/types.h>
 #include <linux/cache.h>
 /* need struct page definitions */
diff --git a/include/asm-powerpc/dma.h b/include/asm-powerpc/dma.h
index 4bb57fe37097..7a4374bdbef4 100644
--- a/include/asm-powerpc/dma.h
+++ b/include/asm-powerpc/dma.h
@@ -22,7 +22,6 @@
  * with a grain of salt.
  */
 
-#include <linux/config.h>
 #include <asm/io.h>
 #include <linux/spinlock.h>
 #include <asm/system.h>
diff --git a/include/asm-powerpc/eeh.h b/include/asm-powerpc/eeh.h
index 868c7139dbff..e9c86b1eedab 100644
--- a/include/asm-powerpc/eeh.h
+++ b/include/asm-powerpc/eeh.h
@@ -21,7 +21,6 @@
 #define _PPC64_EEH_H
 #ifdef __KERNEL__
 
-#include <linux/config.h>
 #include <linux/init.h>
 #include <linux/list.h>
 #include <linux/string.h>
diff --git a/include/asm-powerpc/floppy.h b/include/asm-powerpc/floppy.h
index 608164c39efb..7e2d169ee856 100644
--- a/include/asm-powerpc/floppy.h
+++ b/include/asm-powerpc/floppy.h
@@ -11,7 +11,6 @@
 #define __ASM_POWERPC_FLOPPY_H
 #ifdef __KERNEL__
 
-#include <linux/config.h>
 #include <asm/machdep.h>
 
 #define fd_inb(port)		inb_p(port)
diff --git a/include/asm-powerpc/hw_irq.h b/include/asm-powerpc/hw_irq.h
index 26b89d859c56..ce0f7db63c16 100644
--- a/include/asm-powerpc/hw_irq.h
+++ b/include/asm-powerpc/hw_irq.h
@@ -6,7 +6,6 @@
 
 #ifdef __KERNEL__
 
-#include <linux/config.h>
 #include <linux/errno.h>
 #include <asm/ptrace.h>
 #include <asm/processor.h>
diff --git a/include/asm-powerpc/ide.h b/include/asm-powerpc/ide.h
index da5f640480cf..b09b42af6a1e 100644
--- a/include/asm-powerpc/ide.h
+++ b/include/asm-powerpc/ide.h
@@ -22,7 +22,6 @@
 #endif
 
 #ifndef  __powerpc64__
-#include <linux/config.h>
 #include <linux/hdreg.h>
 #include <linux/ioport.h>
 #include <asm/io.h>
diff --git a/include/asm-powerpc/iommu.h b/include/asm-powerpc/iommu.h
index 18ca29e9105a..2acf7b29ef06 100644
--- a/include/asm-powerpc/iommu.h
+++ b/include/asm-powerpc/iommu.h
@@ -22,7 +22,6 @@
 #define _ASM_IOMMU_H
 #ifdef __KERNEL__
 
-#include <linux/config.h>
 #include <asm/types.h>
 #include <linux/spinlock.h>
 #include <linux/device.h>
diff --git a/include/asm-powerpc/irq.h b/include/asm-powerpc/irq.h
index 7bc6d73b2823..1e9f25330307 100644
--- a/include/asm-powerpc/irq.h
+++ b/include/asm-powerpc/irq.h
@@ -9,7 +9,6 @@
  * 2 of the License, or (at your option) any later version.
  */
 
-#include <linux/config.h>
 #include <linux/threads.h>
 
 #include <asm/types.h>
diff --git a/include/asm-powerpc/iseries/iseries_io.h b/include/asm-powerpc/iseries/iseries_io.h
index 496aa852b617..f29009bd63c9 100644
--- a/include/asm-powerpc/iseries/iseries_io.h
+++ b/include/asm-powerpc/iseries/iseries_io.h
@@ -1,7 +1,6 @@
 #ifndef _ASM_POWERPC_ISERIES_ISERIES_IO_H
 #define _ASM_POWERPC_ISERIES_ISERIES_IO_H
 
-#include <linux/config.h>
 
 #ifdef CONFIG_PPC_ISERIES
 #include <linux/types.h>
diff --git a/include/asm-powerpc/machdep.h b/include/asm-powerpc/machdep.h
index 0f9254c18914..3e7d37aa4a6d 100644
--- a/include/asm-powerpc/machdep.h
+++ b/include/asm-powerpc/machdep.h
@@ -9,7 +9,6 @@
  * 2 of the License, or (at your option) any later version.
  */
 
-#include <linux/config.h>
 #include <linux/seq_file.h>
 #include <linux/init.h>
 #include <linux/dma-mapping.h>
diff --git a/include/asm-powerpc/mmzone.h b/include/asm-powerpc/mmzone.h
index 88d70bae7769..d484ca94cb7c 100644
--- a/include/asm-powerpc/mmzone.h
+++ b/include/asm-powerpc/mmzone.h
@@ -8,7 +8,6 @@
 #define _ASM_MMZONE_H_
 #ifdef __KERNEL__
 
-#include <linux/config.h>
 
 /*
  * generic non-linear memory support:
diff --git a/include/asm-powerpc/paca.h b/include/asm-powerpc/paca.h
index 706325f99a84..3c6f644d49b4 100644
--- a/include/asm-powerpc/paca.h
+++ b/include/asm-powerpc/paca.h
@@ -16,7 +16,6 @@
 #define _ASM_POWERPC_PACA_H
 #ifdef __KERNEL__
 
-#include	<linux/config.h>
 #include	<asm/types.h>
 #include	<asm/lppaca.h>
 #include	<asm/mmu.h>
diff --git a/include/asm-powerpc/page.h b/include/asm-powerpc/page.h
index 2fbecebe1c92..f0469b961359 100644
--- a/include/asm-powerpc/page.h
+++ b/include/asm-powerpc/page.h
@@ -11,7 +11,6 @@
  */
 
 #ifdef __KERNEL__
-#include <linux/config.h>
 #include <asm/asm-compat.h>
 
 /*
diff --git a/include/asm-powerpc/pgtable.h b/include/asm-powerpc/pgtable.h
index e9f1f4627e6b..964e312a1ffc 100644
--- a/include/asm-powerpc/pgtable.h
+++ b/include/asm-powerpc/pgtable.h
@@ -12,7 +12,6 @@
  */
 
 #ifndef __ASSEMBLY__
-#include <linux/config.h>
 #include <linux/stddef.h>
 #include <asm/processor.h>		/* For TASK_SIZE */
 #include <asm/mmu.h>
diff --git a/include/asm-powerpc/ppc_asm.h b/include/asm-powerpc/ppc_asm.h
index dd1c0a913d5f..a940cfe040da 100644
--- a/include/asm-powerpc/ppc_asm.h
+++ b/include/asm-powerpc/ppc_asm.h
@@ -5,7 +5,6 @@
 #define _ASM_POWERPC_PPC_ASM_H
 
 #include <linux/stringify.h>
-#include <linux/config.h>
 #include <asm/asm-compat.h>
 
 #ifndef __ASSEMBLY__
diff --git a/include/asm-powerpc/prom.h b/include/asm-powerpc/prom.h
index 97ef1cd71a4d..f4e2ca6fd53f 100644
--- a/include/asm-powerpc/prom.h
+++ b/include/asm-powerpc/prom.h
@@ -15,7 +15,6 @@
  * as published by the Free Software Foundation; either version
  * 2 of the License, or (at your option) any later version.
  */
-#include <linux/config.h>
 #include <linux/types.h>
 #include <linux/proc_fs.h>
 #include <asm/atomic.h>
diff --git a/include/asm-powerpc/smp.h b/include/asm-powerpc/smp.h
index 4a716f707cf6..068f119aa298 100644
--- a/include/asm-powerpc/smp.h
+++ b/include/asm-powerpc/smp.h
@@ -17,7 +17,6 @@
 #define _ASM_POWERPC_SMP_H
 #ifdef __KERNEL__
 
-#include <linux/config.h>
 #include <linux/threads.h>
 #include <linux/cpumask.h>
 #include <linux/kernel.h>
diff --git a/include/asm-powerpc/smu.h b/include/asm-powerpc/smu.h
index 2dc93632f210..51e65fc46a03 100644
--- a/include/asm-powerpc/smu.h
+++ b/include/asm-powerpc/smu.h
@@ -5,7 +5,6 @@
  * Definitions for talking to the SMU chip in newer G5 PowerMacs
  */
 #ifdef __KERNEL__
-#include <linux/config.h>
 #include <linux/list.h>
 #endif
 #include <linux/types.h>
diff --git a/include/asm-powerpc/spu.h b/include/asm-powerpc/spu.h
index f431d8b0b651..fb519a1e49bd 100644
--- a/include/asm-powerpc/spu.h
+++ b/include/asm-powerpc/spu.h
@@ -24,7 +24,6 @@
 #define _SPU_H
 #ifdef __KERNEL__
 
-#include <linux/config.h>
 #include <linux/kref.h>
 #include <linux/workqueue.h>
 
diff --git a/include/asm-powerpc/thread_info.h b/include/asm-powerpc/thread_info.h
index 88b553c6b26c..d339e2e88b11 100644
--- a/include/asm-powerpc/thread_info.h
+++ b/include/asm-powerpc/thread_info.h
@@ -21,7 +21,6 @@
 #define THREAD_SIZE		(1 << THREAD_SHIFT)
 
 #ifndef __ASSEMBLY__
-#include <linux/config.h>
 #include <linux/cache.h>
 #include <asm/processor.h>
 #include <asm/page.h>
diff --git a/include/asm-powerpc/time.h b/include/asm-powerpc/time.h
index 912118db13ae..4463148c659f 100644
--- a/include/asm-powerpc/time.h
+++ b/include/asm-powerpc/time.h
@@ -14,7 +14,6 @@
 #define __POWERPC_TIME_H
 
 #ifdef __KERNEL__
-#include <linux/config.h>
 #include <linux/types.h>
 #include <linux/percpu.h>
 
diff --git a/include/asm-powerpc/timex.h b/include/asm-powerpc/timex.h
index c02d15aced91..3b9a8e786806 100644
--- a/include/asm-powerpc/timex.h
+++ b/include/asm-powerpc/timex.h
@@ -7,7 +7,6 @@
  * PowerPC architecture timex specifications
  */
 
-#include <linux/config.h>
 #include <asm/cputable.h>
 
 #define CLOCK_TICK_RATE	1024000 /* Underlying HZ */
diff --git a/include/asm-powerpc/tlb.h b/include/asm-powerpc/tlb.h
index 601a53cf96d5..4e2a834683fb 100644
--- a/include/asm-powerpc/tlb.h
+++ b/include/asm-powerpc/tlb.h
@@ -13,7 +13,6 @@
 #define _ASM_POWERPC_TLB_H
 #ifdef __KERNEL__
 
-#include <linux/config.h>
 #ifndef __powerpc64__
 #include <asm/pgtable.h>
 #endif
diff --git a/include/asm-powerpc/tlbflush.h b/include/asm-powerpc/tlbflush.h
index a2998eee37bb..93c7d0c7230f 100644
--- a/include/asm-powerpc/tlbflush.h
+++ b/include/asm-powerpc/tlbflush.h
@@ -17,7 +17,6 @@
  */
 #ifdef __KERNEL__
 
-#include <linux/config.h>
 
 struct mm_struct;
 
diff --git a/include/asm-powerpc/topology.h b/include/asm-powerpc/topology.h
index 1e19cd00af25..4cf340ccb4cd 100644
--- a/include/asm-powerpc/topology.h
+++ b/include/asm-powerpc/topology.h
@@ -2,7 +2,6 @@
 #define _ASM_POWERPC_TOPOLOGY_H
 #ifdef __KERNEL__
 
-#include <linux/config.h>
 
 #ifdef CONFIG_NUMA
 
diff --git a/include/asm-powerpc/types.h b/include/asm-powerpc/types.h
index baabba96e313..d6fb56b80453 100644
--- a/include/asm-powerpc/types.h
+++ b/include/asm-powerpc/types.h
@@ -64,7 +64,6 @@ typedef struct {
 
 #ifndef __ASSEMBLY__
 
-#include <linux/config.h>
 
 typedef signed char s8;
 typedef unsigned char u8;
diff --git a/include/asm-powerpc/unistd.h b/include/asm-powerpc/unistd.h
index c612f1a62772..d471549e1b81 100644
--- a/include/asm-powerpc/unistd.h
+++ b/include/asm-powerpc/unistd.h
@@ -423,7 +423,6 @@ type name(type1 arg1, type2 arg2, type3 arg3, type4 arg4, type5 arg5, type6 arg6
 
 #ifdef __KERNEL__
 
-#include <linux/config.h>
 #include <linux/types.h>
 #include <linux/compiler.h>
 #include <linux/linkage.h>
diff --git a/include/asm-powerpc/vga.h b/include/asm-powerpc/vga.h
index f8d350aabf1a..eadaf2f3d032 100644
--- a/include/asm-powerpc/vga.h
+++ b/include/asm-powerpc/vga.h
@@ -12,7 +12,6 @@
 
 #include <asm/io.h>
 
-#include <linux/config.h>
 
 #if defined(CONFIG_VGA_CONSOLE) || defined(CONFIG_MDA_CONSOLE)
 
diff --git a/include/asm-powerpc/vio.h b/include/asm-powerpc/vio.h
index 0544ece51761..be14c59846f9 100644
--- a/include/asm-powerpc/vio.h
+++ b/include/asm-powerpc/vio.h
@@ -15,7 +15,6 @@
 #define _ASM_POWERPC_VIO_H
 #ifdef __KERNEL__
 
-#include <linux/config.h>
 #include <linux/init.h>
 #include <linux/errno.h>
 #include <linux/device.h>
diff --git a/include/asm-ppc/amigahw.h b/include/asm-ppc/amigahw.h
index 8c98945e7dc1..90fd1274d727 100644
--- a/include/asm-ppc/amigahw.h
+++ b/include/asm-ppc/amigahw.h
@@ -2,7 +2,6 @@
 #ifndef __ASMPPC_AMIGAHW_H
 #define __ASMPPC_AMIGAHW_H
 
-#include <linux/config.h>
 #include <asm-m68k/amigahw.h>
 
 #undef CHIP_PHYSADDR
diff --git a/include/asm-ppc/bootinfo.h b/include/asm-ppc/bootinfo.h
index 93d955c70d65..2ace4a74f263 100644
--- a/include/asm-ppc/bootinfo.h
+++ b/include/asm-ppc/bootinfo.h
@@ -9,7 +9,6 @@
 #ifndef _PPC_BOOTINFO_H
 #define _PPC_BOOTINFO_H
 
-#include <linux/config.h>
 #include <asm/page.h>
 
 #if defined(CONFIG_APUS) && !defined(__BOOTER__)
diff --git a/include/asm-ppc/commproc.h b/include/asm-ppc/commproc.h
index 973e60908234..64a2623def02 100644
--- a/include/asm-ppc/commproc.h
+++ b/include/asm-ppc/commproc.h
@@ -17,7 +17,6 @@
 #ifndef __CPM_8XX__
 #define __CPM_8XX__
 
-#include <linux/config.h>
 #include <asm/8xx_immap.h>
 #include <asm/ptrace.h>
 
diff --git a/include/asm-ppc/ibm403.h b/include/asm-ppc/ibm403.h
index bf6efa0417ab..c9c5d539cfdb 100644
--- a/include/asm-ppc/ibm403.h
+++ b/include/asm-ppc/ibm403.h
@@ -12,7 +12,6 @@
 #ifndef __ASM_IBM403_H__
 #define __ASM_IBM403_H__
 
-#include <linux/config.h>
 
 #if defined(CONFIG_403GCX)
 
diff --git a/include/asm-ppc/ibm44x.h b/include/asm-ppc/ibm44x.h
index 3acc382cc83f..7818b54b6e37 100644
--- a/include/asm-ppc/ibm44x.h
+++ b/include/asm-ppc/ibm44x.h
@@ -17,7 +17,6 @@
 #ifndef __ASM_IBM44x_H__
 #define __ASM_IBM44x_H__
 
-#include <linux/config.h>
 
 #ifndef NR_BOARD_IRQS
 #define NR_BOARD_IRQS 0
diff --git a/include/asm-ppc/ibm4xx.h b/include/asm-ppc/ibm4xx.h
index 38f99710752b..cf62b69cb69a 100644
--- a/include/asm-ppc/ibm4xx.h
+++ b/include/asm-ppc/ibm4xx.h
@@ -14,7 +14,6 @@
 #ifndef __ASM_IBM4XX_H__
 #define __ASM_IBM4XX_H__
 
-#include <linux/config.h>
 #include <asm/types.h>
 
 #ifdef CONFIG_40x
diff --git a/include/asm-ppc/io.h b/include/asm-ppc/io.h
index b919d8fb7d98..89c6f1bc3aab 100644
--- a/include/asm-ppc/io.h
+++ b/include/asm-ppc/io.h
@@ -2,7 +2,6 @@
 #ifndef _PPC_IO_H
 #define _PPC_IO_H
 
-#include <linux/config.h>
 #include <linux/string.h>
 #include <linux/types.h>
 
diff --git a/include/asm-ppc/machdep.h b/include/asm-ppc/machdep.h
index e1a0a7b213d7..da7746738aee 100644
--- a/include/asm-ppc/machdep.h
+++ b/include/asm-ppc/machdep.h
@@ -2,7 +2,6 @@
 #ifndef _PPC_MACHDEP_H
 #define _PPC_MACHDEP_H
 
-#include <linux/config.h>
 #include <linux/init.h>
 #include <linux/kexec.h>
 
diff --git a/include/asm-ppc/mmu.h b/include/asm-ppc/mmu.h
index 9205db404c7a..0a70b05b3afb 100644
--- a/include/asm-ppc/mmu.h
+++ b/include/asm-ppc/mmu.h
@@ -6,7 +6,6 @@
 #ifndef _PPC_MMU_H_
 #define _PPC_MMU_H_
 
-#include <linux/config.h>
 
 #ifndef __ASSEMBLY__
 
diff --git a/include/asm-ppc/mmu_context.h b/include/asm-ppc/mmu_context.h
index 4f152cca13c1..94f2bf71310d 100644
--- a/include/asm-ppc/mmu_context.h
+++ b/include/asm-ppc/mmu_context.h
@@ -2,7 +2,6 @@
 #ifndef __PPC_MMU_CONTEXT_H
 #define __PPC_MMU_CONTEXT_H
 
-#include <linux/config.h>
 #include <asm/atomic.h>
 #include <asm/bitops.h>
 #include <asm/mmu.h>
diff --git a/include/asm-ppc/mpc8260.h b/include/asm-ppc/mpc8260.h
index 6ba69a86b9dd..4b93481e7679 100644
--- a/include/asm-ppc/mpc8260.h
+++ b/include/asm-ppc/mpc8260.h
@@ -8,7 +8,6 @@
 #ifndef __ASM_PPC_MPC8260_H__
 #define __ASM_PPC_MPC8260_H__
 
-#include <linux/config.h>
 
 #ifdef CONFIG_8260
 
diff --git a/include/asm-ppc/mpc83xx.h b/include/asm-ppc/mpc83xx.h
index 3c23fc43bfbc..02ed2c325714 100644
--- a/include/asm-ppc/mpc83xx.h
+++ b/include/asm-ppc/mpc83xx.h
@@ -17,7 +17,6 @@
 #ifndef __ASM_MPC83xx_H__
 #define __ASM_MPC83xx_H__
 
-#include <linux/config.h>
 #include <asm/mmu.h>
 
 #ifdef CONFIG_83xx
diff --git a/include/asm-ppc/mpc85xx.h b/include/asm-ppc/mpc85xx.h
index f47002a60edf..c25bdd9debf8 100644
--- a/include/asm-ppc/mpc85xx.h
+++ b/include/asm-ppc/mpc85xx.h
@@ -17,7 +17,6 @@
 #ifndef __ASM_MPC85xx_H__
 #define __ASM_MPC85xx_H__
 
-#include <linux/config.h>
 #include <asm/mmu.h>
 
 #ifdef CONFIG_85xx
diff --git a/include/asm-ppc/mpc8xx.h b/include/asm-ppc/mpc8xx.h
index 3515a7fa6c89..adcce33f20ae 100644
--- a/include/asm-ppc/mpc8xx.h
+++ b/include/asm-ppc/mpc8xx.h
@@ -8,7 +8,6 @@
 #ifndef __CONFIG_8xx_DEFS
 #define __CONFIG_8xx_DEFS
 
-#include <linux/config.h>
 
 #ifdef CONFIG_8xx
 
diff --git a/include/asm-ppc/mv64x60.h b/include/asm-ppc/mv64x60.h
index 4f2405b83612..663edbee3e91 100644
--- a/include/asm-ppc/mv64x60.h
+++ b/include/asm-ppc/mv64x60.h
@@ -17,7 +17,6 @@
 #include <linux/init.h>
 #include <linux/pci.h>
 #include <linux/slab.h>
-#include <linux/config.h>
 
 #include <asm/byteorder.h>
 #include <asm/io.h>
diff --git a/include/asm-ppc/ocp.h b/include/asm-ppc/ocp.h
index 983116f59d90..3be5d760ffcd 100644
--- a/include/asm-ppc/ocp.h
+++ b/include/asm-ppc/ocp.h
@@ -26,7 +26,6 @@
 
 #include <linux/init.h>
 #include <linux/list.h>
-#include <linux/config.h>
 #include <linux/devfs_fs_kernel.h>
 #include <linux/device.h>
 
diff --git a/include/asm-ppc/open_pic.h b/include/asm-ppc/open_pic.h
index ec2f46629ca2..a4fe962d9f73 100644
--- a/include/asm-ppc/open_pic.h
+++ b/include/asm-ppc/open_pic.h
@@ -12,7 +12,6 @@
 #ifndef _PPC_KERNEL_OPEN_PIC_H
 #define _PPC_KERNEL_OPEN_PIC_H
 
-#include <linux/config.h>
 #include <linux/irq.h>
 
 #define OPENPIC_SIZE	0x40000
diff --git a/include/asm-ppc/page.h b/include/asm-ppc/page.h
index a70ba2ee552d..352faa4b0d4b 100644
--- a/include/asm-ppc/page.h
+++ b/include/asm-ppc/page.h
@@ -1,7 +1,6 @@
 #ifndef _PPC_PAGE_H
 #define _PPC_PAGE_H
 
-#include <linux/config.h>
 #include <asm/asm-compat.h>
 
 /* PAGE_SHIFT determines the page size */
@@ -15,7 +14,6 @@
 #define PAGE_MASK	(~((1 << PAGE_SHIFT) - 1))
 
 #ifdef __KERNEL__
-#include <linux/config.h>
 
 /* This must match what is in arch/ppc/Makefile */
 #define PAGE_OFFSET	CONFIG_KERNEL_START
diff --git a/include/asm-ppc/pc_serial.h b/include/asm-ppc/pc_serial.h
index 8f994f9f8857..81a2d0fdaf00 100644
--- a/include/asm-ppc/pc_serial.h
+++ b/include/asm-ppc/pc_serial.h
@@ -9,7 +9,6 @@
  * anyone using any of those on a PPC platform.  -- paulus
  */
 
-#include <linux/config.h>
 
 /*
  * This assumes you have a 1.8432 MHz clock for your UART.
diff --git a/include/asm-ppc/pgalloc.h b/include/asm-ppc/pgalloc.h
index bdefd1c4a558..44d88a98e87c 100644
--- a/include/asm-ppc/pgalloc.h
+++ b/include/asm-ppc/pgalloc.h
@@ -2,7 +2,6 @@
 #ifndef _PPC_PGALLOC_H
 #define _PPC_PGALLOC_H
 
-#include <linux/config.h>
 #include <linux/threads.h>
 
 extern void __bad_pte(pmd_t *pmd);
diff --git a/include/asm-ppc/pgtable.h b/include/asm-ppc/pgtable.h
index 570b355162fa..9cb83679836c 100644
--- a/include/asm-ppc/pgtable.h
+++ b/include/asm-ppc/pgtable.h
@@ -4,7 +4,6 @@
 
 #include <asm-generic/4level-fixup.h>
 
-#include <linux/config.h>
 
 #ifndef __ASSEMBLY__
 #include <linux/sched.h>
diff --git a/include/asm-ppc/ppc4xx_dma.h b/include/asm-ppc/ppc4xx_dma.h
index 46a086fff816..935d1e05366b 100644
--- a/include/asm-ppc/ppc4xx_dma.h
+++ b/include/asm-ppc/ppc4xx_dma.h
@@ -24,7 +24,6 @@
 #ifndef __ASMPPC_PPC4xx_DMA_H
 #define __ASMPPC_PPC4xx_DMA_H
 
-#include <linux/config.h>
 #include <linux/types.h>
 #include <asm/mmu.h>
 #include <asm/ibm4xx.h>
diff --git a/include/asm-ppc/ppc4xx_pic.h b/include/asm-ppc/ppc4xx_pic.h
index c16c7f81cfd8..e44261206f8b 100644
--- a/include/asm-ppc/ppc4xx_pic.h
+++ b/include/asm-ppc/ppc4xx_pic.h
@@ -17,7 +17,6 @@
 #ifndef	__PPC4XX_PIC_H__
 #define	__PPC4XX_PIC_H__
 
-#include <linux/config.h>
 #include <linux/types.h>
 #include <linux/irq.h>
 
diff --git a/include/asm-ppc/serial.h b/include/asm-ppc/serial.h
index b74af5461564..8a59f8871f32 100644
--- a/include/asm-ppc/serial.h
+++ b/include/asm-ppc/serial.h
@@ -6,7 +6,6 @@
 #ifndef __ASM_SERIAL_H__
 #define __ASM_SERIAL_H__
 
-#include <linux/config.h>
 
 #if defined(CONFIG_EV64260)
 #include <platforms/ev64260.h>
diff --git a/include/asm-ppc/smp.h b/include/asm-ppc/smp.h
index 30e9268a888c..0b7fa89589df 100644
--- a/include/asm-ppc/smp.h
+++ b/include/asm-ppc/smp.h
@@ -10,7 +10,6 @@
 #ifndef _PPC_SMP_H
 #define _PPC_SMP_H
 
-#include <linux/config.h>
 #include <linux/kernel.h>
 #include <linux/bitops.h>
 #include <linux/errno.h>
diff --git a/include/asm-ppc/time.h b/include/asm-ppc/time.h
index c86112323c9f..f7eadf6ac806 100644
--- a/include/asm-ppc/time.h
+++ b/include/asm-ppc/time.h
@@ -9,7 +9,6 @@
 #ifndef __ASM_TIME_H__
 #define __ASM_TIME_H__
 
-#include <linux/config.h>
 #include <linux/types.h>
 #include <linux/rtc.h>
 #include <linux/threads.h>
diff --git a/include/asm-s390/bitops.h b/include/asm-s390/bitops.h
index ca092ffb7a95..4d2b126ba159 100644
--- a/include/asm-s390/bitops.h
+++ b/include/asm-s390/bitops.h
@@ -12,7 +12,6 @@
  *    Copyright (C) 1992, Linus Torvalds
  *
  */
-#include <linux/config.h>
 #include <linux/compiler.h>
 
 /*
diff --git a/include/asm-s390/debug.h b/include/asm-s390/debug.h
index 23450ed4b571..7f1ef99fd1e1 100644
--- a/include/asm-s390/debug.h
+++ b/include/asm-s390/debug.h
@@ -9,7 +9,6 @@
 #ifndef DEBUG_H
 #define DEBUG_H
 
-#include <linux/config.h>
 #include <linux/fs.h>
 #include <linux/string.h>
 
diff --git a/include/asm-s390/hardirq.h b/include/asm-s390/hardirq.h
index 6792c559a124..e84b7ef54aac 100644
--- a/include/asm-s390/hardirq.h
+++ b/include/asm-s390/hardirq.h
@@ -12,7 +12,6 @@
 #ifndef __ASM_HARDIRQ_H
 #define __ASM_HARDIRQ_H
 
-#include <linux/config.h>
 #include <linux/threads.h>
 #include <linux/sched.h>
 #include <linux/cache.h>
diff --git a/include/asm-s390/idals.h b/include/asm-s390/idals.h
index 8038858b86bb..e82c10efe65a 100644
--- a/include/asm-s390/idals.h
+++ b/include/asm-s390/idals.h
@@ -13,7 +13,6 @@
 #ifndef _S390_IDALS_H
 #define _S390_IDALS_H
 
-#include <linux/config.h>
 #include <linux/errno.h>
 #include <linux/err.h>
 #include <linux/types.h>
diff --git a/include/asm-s390/local.h b/include/asm-s390/local.h
index cf8189009c30..86745a1b29bb 100644
--- a/include/asm-s390/local.h
+++ b/include/asm-s390/local.h
@@ -1,7 +1,6 @@
 #ifndef _ASM_LOCAL_H
 #define _ASM_LOCAL_H
 
-#include <linux/config.h>
 #include <linux/percpu.h>
 #include <asm/atomic.h>
 
diff --git a/include/asm-s390/lowcore.h b/include/asm-s390/lowcore.h
index db0606c1abd4..e17d181b98a9 100644
--- a/include/asm-s390/lowcore.h
+++ b/include/asm-s390/lowcore.h
@@ -124,7 +124,6 @@
 
 #ifndef __ASSEMBLY__
 
-#include <linux/config.h>
 #include <asm/processor.h>
 #include <linux/types.h>
 #include <asm/sigp.h>
diff --git a/include/asm-s390/pgalloc.h b/include/asm-s390/pgalloc.h
index e28aaf28e4a8..3002fda89d33 100644
--- a/include/asm-s390/pgalloc.h
+++ b/include/asm-s390/pgalloc.h
@@ -13,7 +13,6 @@
 #ifndef _S390_PGALLOC_H
 #define _S390_PGALLOC_H
 
-#include <linux/config.h>
 #include <linux/threads.h>
 #include <linux/gfp.h>
 #include <linux/mm.h>
diff --git a/include/asm-s390/ptrace.h b/include/asm-s390/ptrace.h
index a949cc077cc7..a867e94ae484 100644
--- a/include/asm-s390/ptrace.h
+++ b/include/asm-s390/ptrace.h
@@ -181,7 +181,6 @@
 #define PTRACE_OLDSETOPTIONS         21
 
 #ifndef __ASSEMBLY__
-#include <linux/config.h>
 #include <linux/stddef.h>
 #include <linux/types.h>
 #include <asm/setup.h>
diff --git a/include/asm-s390/sfp-machine.h b/include/asm-s390/sfp-machine.h
index 3c79b5384f44..de69dfa46fbb 100644
--- a/include/asm-s390/sfp-machine.h
+++ b/include/asm-s390/sfp-machine.h
@@ -25,7 +25,6 @@
 #ifndef _SFP_MACHINE_H
 #define _SFP_MACHINE_H
    
-#include <linux/config.h>
 
 #define _FP_W_TYPE_SIZE		32
 #define _FP_W_TYPE		unsigned long
diff --git a/include/asm-s390/smp.h b/include/asm-s390/smp.h
index 444dae5912e6..657646054c5e 100644
--- a/include/asm-s390/smp.h
+++ b/include/asm-s390/smp.h
@@ -10,7 +10,6 @@
 #ifndef __ASM_SMP_H
 #define __ASM_SMP_H
 
-#include <linux/config.h>
 #include <linux/threads.h>
 #include <linux/cpumask.h>
 #include <linux/bitops.h>
diff --git a/include/asm-s390/system.h b/include/asm-s390/system.h
index 6a89dbb03c1e..71a0732cd518 100644
--- a/include/asm-s390/system.h
+++ b/include/asm-s390/system.h
@@ -11,7 +11,6 @@
 #ifndef __ASM_SYSTEM_H
 #define __ASM_SYSTEM_H
 
-#include <linux/config.h>
 #include <linux/kernel.h>
 #include <asm/types.h>
 #include <asm/ptrace.h>
diff --git a/include/asm-s390/tlbflush.h b/include/asm-s390/tlbflush.h
index 1bb73b0e61fa..73cd85bebfb2 100644
--- a/include/asm-s390/tlbflush.h
+++ b/include/asm-s390/tlbflush.h
@@ -1,7 +1,6 @@
 #ifndef _S390_TLBFLUSH_H
 #define _S390_TLBFLUSH_H
 
-#include <linux/config.h>
 #include <linux/mm.h>
 #include <asm/processor.h>
 
diff --git a/include/asm-s390/types.h b/include/asm-s390/types.h
index 5738ad63537c..ae2951cc83ac 100644
--- a/include/asm-s390/types.h
+++ b/include/asm-s390/types.h
@@ -58,7 +58,6 @@ typedef __signed__ long saddr_t;
 
 #ifndef __ASSEMBLY__
 
-#include <linux/config.h>
 
 typedef signed char s8;
 typedef unsigned char u8;
diff --git a/include/asm-s390/unistd.h b/include/asm-s390/unistd.h
index 657d582e8149..ac790bf44559 100644
--- a/include/asm-s390/unistd.h
+++ b/include/asm-s390/unistd.h
@@ -571,7 +571,6 @@ type name(type1 arg1, type2 arg2, type3 arg3, type4 arg4,    \
 
 #ifdef __KERNEL_SYSCALLS__
 
-#include <linux/config.h>
 #include <linux/compiler.h>
 #include <linux/types.h>
 #include <asm/ptrace.h>
diff --git a/include/asm-sh/bug.h b/include/asm-sh/bug.h
index 70508a360cd6..1b4fc52a59e8 100644
--- a/include/asm-sh/bug.h
+++ b/include/asm-sh/bug.h
@@ -1,7 +1,6 @@
 #ifndef __ASM_SH_BUG_H
 #define __ASM_SH_BUG_H
 
-#include <linux/config.h>
 
 #ifdef CONFIG_BUG
 /*
diff --git a/include/asm-sh/checksum.h b/include/asm-sh/checksum.h
index 5ebd0f24299e..fa03b30c4269 100644
--- a/include/asm-sh/checksum.h
+++ b/include/asm-sh/checksum.h
@@ -9,7 +9,6 @@
  * Copyright (C) 1999 by Kaz Kojima & Niibe Yutaka
  */
 
-#include <linux/config.h>
 #include <linux/in6.h>
 
 /*
diff --git a/include/asm-sh/dma-mapping.h b/include/asm-sh/dma-mapping.h
index 48f1f42c5d14..124968f9866e 100644
--- a/include/asm-sh/dma-mapping.h
+++ b/include/asm-sh/dma-mapping.h
@@ -1,7 +1,6 @@
 #ifndef __ASM_SH_DMA_MAPPING_H
 #define __ASM_SH_DMA_MAPPING_H
 
-#include <linux/config.h>
 #include <linux/mm.h>
 #include <asm/scatterlist.h>
 #include <asm/cacheflush.h>
diff --git a/include/asm-sh/dma.h b/include/asm-sh/dma.h
index a118a0d43053..e62a6d0ed932 100644
--- a/include/asm-sh/dma.h
+++ b/include/asm-sh/dma.h
@@ -11,7 +11,6 @@
 #define __ASM_SH_DMA_H
 #ifdef __KERNEL__
 
-#include <linux/config.h>
 #include <linux/spinlock.h>
 #include <linux/wait.h>
 #include <linux/sysdev.h>
diff --git a/include/asm-sh/fixmap.h b/include/asm-sh/fixmap.h
index 509224bdba28..412bccaa07e6 100644
--- a/include/asm-sh/fixmap.h
+++ b/include/asm-sh/fixmap.h
@@ -13,7 +13,6 @@
 #ifndef _ASM_FIXMAP_H
 #define _ASM_FIXMAP_H
 
-#include <linux/config.h>
 #include <linux/kernel.h>
 #include <asm/page.h>
 #ifdef CONFIG_HIGHMEM
diff --git a/include/asm-sh/hardirq.h b/include/asm-sh/hardirq.h
index f2fdf0f760e5..715ee237fc77 100644
--- a/include/asm-sh/hardirq.h
+++ b/include/asm-sh/hardirq.h
@@ -1,7 +1,6 @@
 #ifndef __ASM_SH_HARDIRQ_H
 #define __ASM_SH_HARDIRQ_H
 
-#include <linux/config.h>
 #include <linux/threads.h>
 #include <linux/irq.h>
 
diff --git a/include/asm-sh/hd64461/hd64461.h b/include/asm-sh/hd64461/hd64461.h
index c457ca277a42..87f13d24c630 100644
--- a/include/asm-sh/hd64461/hd64461.h
+++ b/include/asm-sh/hd64461/hd64461.h
@@ -5,7 +5,6 @@
  *	Copyright (C) 2000 YAEGASHI Takeshi
  *	Hitachi HD64461 companion chip support
  */
-#include <linux/config.h>
 
 /* Constants for PCMCIA mappings */
 #define HD64461_PCC_WINDOW	0x01000000
diff --git a/include/asm-sh/hd64465/hd64465.h b/include/asm-sh/hd64465/hd64465.h
index c672032b72c9..cfd0e803d2a2 100644
--- a/include/asm-sh/hd64465/hd64465.h
+++ b/include/asm-sh/hd64465/hd64465.h
@@ -11,7 +11,6 @@
  * Derived from <asm/hd64461.h> which bore the message:
  * Copyright (C) 2000 YAEGASHI Takeshi
  */
-#include <linux/config.h>
 #include <asm/io.h>
 #include <asm/irq.h>
 
diff --git a/include/asm-sh/ide.h b/include/asm-sh/ide.h
index 711dad4cb48b..9f8e9142dc33 100644
--- a/include/asm-sh/ide.h
+++ b/include/asm-sh/ide.h
@@ -14,7 +14,6 @@
 
 #ifdef __KERNEL__
 
-#include <linux/config.h>
 
 #define ide_default_io_ctl(base)	(0)
 
diff --git a/include/asm-sh/io.h b/include/asm-sh/io.h
index 2c3afe71323d..894e64b2d5f0 100644
--- a/include/asm-sh/io.h
+++ b/include/asm-sh/io.h
@@ -23,7 +23,6 @@
  *  inb   by default expands to _inb, but the machine specific code may
  *        define it to __inb if it chooses.
  */
-#include <linux/config.h>
 #include <asm/cache.h>
 #include <asm/system.h>
 #include <asm/addrspace.h>
diff --git a/include/asm-sh/irq.h b/include/asm-sh/irq.h
index 42b8394c04ed..611e67cd0627 100644
--- a/include/asm-sh/irq.h
+++ b/include/asm-sh/irq.h
@@ -11,7 +11,6 @@
  *
  */
 
-#include <linux/config.h>
 #include <asm/machvec.h>
 #include <asm/ptrace.h>		/* for pt_regs */
 
diff --git a/include/asm-sh/keyboard.h b/include/asm-sh/keyboard.h
index 1103df003243..31dcc4fa5f28 100644
--- a/include/asm-sh/keyboard.h
+++ b/include/asm-sh/keyboard.h
@@ -5,7 +5,6 @@
  */
 
 #include <linux/kd.h>
-#include <linux/config.h>
 #include <asm/machvec.h>
 
 #ifdef CONFIG_SH_MPC1211
diff --git a/include/asm-sh/kmap_types.h b/include/asm-sh/kmap_types.h
index 2492ba07148f..84d565c696be 100644
--- a/include/asm-sh/kmap_types.h
+++ b/include/asm-sh/kmap_types.h
@@ -3,7 +3,6 @@
 
 /* Dummy header just to define km_type. */
 
-#include <linux/config.h>
 
 #ifdef CONFIG_DEBUG_HIGHMEM
 # define D(n) __KM_FENCE_##n ,
diff --git a/include/asm-sh/machvec.h b/include/asm-sh/machvec.h
index 550c50a7359e..550501fa4fed 100644
--- a/include/asm-sh/machvec.h
+++ b/include/asm-sh/machvec.h
@@ -10,7 +10,6 @@
 #ifndef _ASM_SH_MACHVEC_H
 #define _ASM_SH_MACHVEC_H 1
 
-#include <linux/config.h>
 #include <linux/types.h>
 #include <linux/time.h>
 
diff --git a/include/asm-sh/machvec_init.h b/include/asm-sh/machvec_init.h
index 9e7de808f7f8..e397798ebd94 100644
--- a/include/asm-sh/machvec_init.h
+++ b/include/asm-sh/machvec_init.h
@@ -12,7 +12,6 @@
 #ifndef __SH_MACHVEC_INIT_H
 #define __SH_MACHVEC_INIT_H
 
-#include <linux/config.h>
 
 /*
  * In a GENERIC kernel, we have lots of these vectors floating about,
diff --git a/include/asm-sh/mpc1211/dma.h b/include/asm-sh/mpc1211/dma.h
index 0a2fdab3e454..e506d1aaa0d0 100644
--- a/include/asm-sh/mpc1211/dma.h
+++ b/include/asm-sh/mpc1211/dma.h
@@ -8,7 +8,6 @@
 #ifndef _ASM_MPC1211_DMA_H
 #define _ASM_MPC1211_DMA_H
 
-#include <linux/config.h>
 #include <linux/spinlock.h>	/* And spinlocks */
 #include <asm/io.h>		/* need byte IO */
 #include <linux/delay.h>
diff --git a/include/asm-sh/overdrive/overdrive.h b/include/asm-sh/overdrive/overdrive.h
index aa62ae68c55c..fc746c244f83 100644
--- a/include/asm-sh/overdrive/overdrive.h
+++ b/include/asm-sh/overdrive/overdrive.h
@@ -6,7 +6,6 @@
  *
  */
 
-#include <linux/config.h>
 
 #ifndef __OVERDRIVE_H__
 #define __OVERDRIVE_H__
diff --git a/include/asm-sh/page.h b/include/asm-sh/page.h
index 9c89287c3e56..a5559e38744e 100644
--- a/include/asm-sh/page.h
+++ b/include/asm-sh/page.h
@@ -13,7 +13,6 @@
    [ P4 control   ]		0xE0000000
  */
 
-#include <linux/config.h>
 
 /* PAGE_SHIFT determines the page size */
 #define PAGE_SHIFT	12
diff --git a/include/asm-sh/pgtable.h b/include/asm-sh/pgtable.h
index bb0efb31a8cb..dcd23a03683d 100644
--- a/include/asm-sh/pgtable.h
+++ b/include/asm-sh/pgtable.h
@@ -8,7 +8,6 @@
  * Copyright (C) 2002, 2003, 2004 Paul Mundt
  */
 
-#include <linux/config.h>
 #include <asm/pgtable-2level.h>
 
 /*
diff --git a/include/asm-sh/serial.h b/include/asm-sh/serial.h
index f51e232d5cd9..8734590d27e8 100644
--- a/include/asm-sh/serial.h
+++ b/include/asm-sh/serial.h
@@ -7,7 +7,6 @@
 #ifndef _ASM_SERIAL_H
 #define _ASM_SERIAL_H
 
-#include <linux/config.h>
 #include <linux/kernel.h>
 
 #ifdef CONFIG_SH_EC3104
diff --git a/include/asm-sh/smp.h b/include/asm-sh/smp.h
index f19a8b3b69a6..f57c4fe9692a 100644
--- a/include/asm-sh/smp.h
+++ b/include/asm-sh/smp.h
@@ -10,7 +10,6 @@
 #ifndef __ASM_SH_SMP_H
 #define __ASM_SH_SMP_H
 
-#include <linux/config.h>
 #include <linux/bitops.h>
 #include <linux/cpumask.h>
 
diff --git a/include/asm-sh/system.h b/include/asm-sh/system.h
index bb0330499bdf..b752e5cbb830 100644
--- a/include/asm-sh/system.h
+++ b/include/asm-sh/system.h
@@ -6,7 +6,6 @@
  * Copyright (C) 2002 Paul Mundt
  */
 
-#include <linux/config.h>
 
 /*
  *	switch_to() should switch tasks to task nr n, first
diff --git a/include/asm-sh/types.h b/include/asm-sh/types.h
index 488552f43b2a..3c09dd4ca31c 100644
--- a/include/asm-sh/types.h
+++ b/include/asm-sh/types.h
@@ -35,7 +35,6 @@ typedef unsigned long long __u64;
 
 #ifndef __ASSEMBLY__
 
-#include <linux/config.h>
 
 typedef __signed__ char s8;
 typedef unsigned char u8;
diff --git a/include/asm-sh/watchdog.h b/include/asm-sh/watchdog.h
index f0cf4be21655..09ca41972a11 100644
--- a/include/asm-sh/watchdog.h
+++ b/include/asm-sh/watchdog.h
@@ -13,7 +13,6 @@
 #ifdef __KERNEL__
 
 #include <linux/types.h>
-#include <linux/config.h>
 #include <asm/cpu/watchdog.h>
 #include <asm/io.h>
 
diff --git a/include/asm-sh64/bug.h b/include/asm-sh64/bug.h
index 5d659ec28e10..81f722efeb63 100644
--- a/include/asm-sh64/bug.h
+++ b/include/asm-sh64/bug.h
@@ -1,7 +1,6 @@
 #ifndef __ASM_SH64_BUG_H
 #define __ASM_SH64_BUG_H
 
-#include <linux/config.h>
 
 /*
  * Tell the user there is some problem, then force a segfault (in process
diff --git a/include/asm-sh64/dma-mapping.h b/include/asm-sh64/dma-mapping.h
index cc9a2e86f5b4..a74a49e47922 100644
--- a/include/asm-sh64/dma-mapping.h
+++ b/include/asm-sh64/dma-mapping.h
@@ -1,7 +1,6 @@
 #ifndef __ASM_SH_DMA_MAPPING_H
 #define __ASM_SH_DMA_MAPPING_H
 
-#include <linux/config.h>
 #include <linux/mm.h>
 #include <asm/scatterlist.h>
 #include <asm/io.h>
diff --git a/include/asm-sh64/hardirq.h b/include/asm-sh64/hardirq.h
index ad2330e41fd5..555fd7a35108 100644
--- a/include/asm-sh64/hardirq.h
+++ b/include/asm-sh64/hardirq.h
@@ -1,7 +1,6 @@
 #ifndef __ASM_SH64_HARDIRQ_H
 #define __ASM_SH64_HARDIRQ_H
 
-#include <linux/config.h>
 #include <linux/threads.h>
 #include <linux/irq.h>
 
diff --git a/include/asm-sh64/ide.h b/include/asm-sh64/ide.h
index 852f50afe39c..c9d84d5f772e 100644
--- a/include/asm-sh64/ide.h
+++ b/include/asm-sh64/ide.h
@@ -15,7 +15,6 @@
 
 #ifdef __KERNEL__
 
-#include <linux/config.h>
 
 /* Without this, the initialisation of PCI IDE cards end up calling
  * ide_init_hwif_ports, which won't work. */
diff --git a/include/asm-sh64/irq.h b/include/asm-sh64/irq.h
index f815b43df845..1ca49e29288a 100644
--- a/include/asm-sh64/irq.h
+++ b/include/asm-sh64/irq.h
@@ -12,7 +12,6 @@
  *
  */
 
-#include <linux/config.h>
 
 /*
  * Encoded IRQs are not considered worth to be supported.
diff --git a/include/asm-sh64/mmu_context.h b/include/asm-sh64/mmu_context.h
index 991cfda4cdf6..8c860dab2d0e 100644
--- a/include/asm-sh64/mmu_context.h
+++ b/include/asm-sh64/mmu_context.h
@@ -26,7 +26,6 @@
  */
 extern unsigned long mmu_context_cache;
 
-#include <linux/config.h>
 #include <asm/page.h>
 
 
diff --git a/include/asm-sh64/page.h b/include/asm-sh64/page.h
index e4937cdabebd..34fb34754ae6 100644
--- a/include/asm-sh64/page.h
+++ b/include/asm-sh64/page.h
@@ -17,7 +17,6 @@
  *
  */
 
-#include <linux/config.h>
 
 /* PAGE_SHIFT determines the page size */
 #define PAGE_SHIFT	12
diff --git a/include/asm-sh64/param.h b/include/asm-sh64/param.h
index d18cc87c1a80..f409adb41540 100644
--- a/include/asm-sh64/param.h
+++ b/include/asm-sh64/param.h
@@ -12,7 +12,6 @@
 #ifndef __ASM_SH64_PARAM_H
 #define __ASM_SH64_PARAM_H
 
-#include <linux/config.h>
 
 #ifdef __KERNEL__
 # ifdef CONFIG_SH_WDT
diff --git a/include/asm-sh64/pgtable.h b/include/asm-sh64/pgtable.h
index 57af6b3eb271..54c7821893f5 100644
--- a/include/asm-sh64/pgtable.h
+++ b/include/asm-sh64/pgtable.h
@@ -22,7 +22,6 @@
 #include <asm/processor.h>
 #include <asm/page.h>
 #include <linux/threads.h>
-#include <linux/config.h>
 
 struct vm_area_struct;
 
diff --git a/include/asm-sh64/system.h b/include/asm-sh64/system.h
index 3002e988180c..7606f6e1f01e 100644
--- a/include/asm-sh64/system.h
+++ b/include/asm-sh64/system.h
@@ -14,7 +14,6 @@
  *
  */
 
-#include <linux/config.h>
 #include <asm/registers.h>
 #include <asm/processor.h>
 
diff --git a/include/asm-sparc/asmmacro.h b/include/asm-sparc/asmmacro.h
index 0d4b65bd252b..a619a4d97aae 100644
--- a/include/asm-sparc/asmmacro.h
+++ b/include/asm-sparc/asmmacro.h
@@ -6,7 +6,6 @@
 #ifndef _SPARC_ASMMACRO_H
 #define _SPARC_ASMMACRO_H
 
-#include <linux/config.h>
 #include <asm/btfixup.h>
 #include <asm/asi.h>
 
diff --git a/include/asm-sparc/atomic.h b/include/asm-sparc/atomic.h
index e1033170bd3a..731fa56e0c37 100644
--- a/include/asm-sparc/atomic.h
+++ b/include/asm-sparc/atomic.h
@@ -10,7 +10,6 @@
 #ifndef __ARCH_SPARC_ATOMIC__
 #define __ARCH_SPARC_ATOMIC__
 
-#include <linux/config.h>
 
 typedef struct { volatile int counter; } atomic_t;
 
diff --git a/include/asm-sparc/bugs.h b/include/asm-sparc/bugs.h
index e652f89e0eff..a0f939beeea1 100644
--- a/include/asm-sparc/bugs.h
+++ b/include/asm-sparc/bugs.h
@@ -5,7 +5,6 @@
  */
 
 #include <asm/cpudata.h>
-#include <linux/config.h>
 
 extern unsigned long loops_per_jiffy;
 
diff --git a/include/asm-sparc/cacheflush.h b/include/asm-sparc/cacheflush.h
index 4901217008c0..fc632f811cd8 100644
--- a/include/asm-sparc/cacheflush.h
+++ b/include/asm-sparc/cacheflush.h
@@ -1,7 +1,6 @@
 #ifndef _SPARC_CACHEFLUSH_H
 #define _SPARC_CACHEFLUSH_H
 
-#include <linux/config.h>
 #include <linux/mm.h>		/* Common for other includes */
 // #include <linux/kernel.h> from pgalloc.h
 // #include <linux/sched.h>  from pgalloc.h
diff --git a/include/asm-sparc/delay.h b/include/asm-sparc/delay.h
index 7ec8e9f7ad4f..48aa70eef997 100644
--- a/include/asm-sparc/delay.h
+++ b/include/asm-sparc/delay.h
@@ -7,7 +7,6 @@
 #ifndef __SPARC_DELAY_H
 #define __SPARC_DELAY_H
 
-#include <linux/config.h>
 #include <asm/cpudata.h>
 
 static inline void __delay(unsigned long loops)
diff --git a/include/asm-sparc/dma-mapping.h b/include/asm-sparc/dma-mapping.h
index d7c3b0f0a901..6db83dc93cb7 100644
--- a/include/asm-sparc/dma-mapping.h
+++ b/include/asm-sparc/dma-mapping.h
@@ -1,7 +1,6 @@
 #ifndef _ASM_SPARC_DMA_MAPPING_H
 #define _ASM_SPARC_DMA_MAPPING_H
 
-#include <linux/config.h>
 
 #ifdef CONFIG_PCI
 #include <asm-generic/dma-mapping.h>
diff --git a/include/asm-sparc/dma.h b/include/asm-sparc/dma.h
index 8ec206aa5f2e..407b3614468a 100644
--- a/include/asm-sparc/dma.h
+++ b/include/asm-sparc/dma.h
@@ -7,7 +7,6 @@
 #ifndef _ASM_SPARC_DMA_H
 #define _ASM_SPARC_DMA_H
 
-#include <linux/config.h>
 #include <linux/kernel.h>
 #include <linux/types.h>
 
diff --git a/include/asm-sparc/elf.h b/include/asm-sparc/elf.h
index 4a71d7c1eace..83a3dd15a6ed 100644
--- a/include/asm-sparc/elf.h
+++ b/include/asm-sparc/elf.h
@@ -6,7 +6,6 @@
  * ELF register definitions..
  */
 
-#include <linux/config.h>
 #include <asm/ptrace.h>
 
 #ifdef __KERNEL__
diff --git a/include/asm-sparc/fixmap.h b/include/asm-sparc/fixmap.h
index 9de52b4d2cfb..f18fc0755adf 100644
--- a/include/asm-sparc/fixmap.h
+++ b/include/asm-sparc/fixmap.h
@@ -13,7 +13,6 @@
 #ifndef _ASM_FIXMAP_H
 #define _ASM_FIXMAP_H
 
-#include <linux/config.h>
 #include <linux/kernel.h>
 #include <asm/page.h>
 #ifdef CONFIG_HIGHMEM
diff --git a/include/asm-sparc/hardirq.h b/include/asm-sparc/hardirq.h
index 2a668c479f68..4f63ed8df551 100644
--- a/include/asm-sparc/hardirq.h
+++ b/include/asm-sparc/hardirq.h
@@ -7,7 +7,6 @@
 #ifndef __SPARC_HARDIRQ_H
 #define __SPARC_HARDIRQ_H
 
-#include <linux/config.h>
 #include <linux/threads.h>
 #include <linux/spinlock.h>
 #include <linux/cache.h>
diff --git a/include/asm-sparc/ide.h b/include/asm-sparc/ide.h
index 64d810385ea4..a6d735a1310e 100644
--- a/include/asm-sparc/ide.h
+++ b/include/asm-sparc/ide.h
@@ -11,7 +11,6 @@
 
 #ifdef __KERNEL__
 
-#include <linux/config.h>
 #include <asm/pgtable.h>
 #include <asm/io.h>
 #include <asm/psr.h>
diff --git a/include/asm-sparc/irq.h b/include/asm-sparc/irq.h
index cee356b0dae3..dbc687403208 100644
--- a/include/asm-sparc/irq.h
+++ b/include/asm-sparc/irq.h
@@ -7,7 +7,6 @@
 #ifndef _SPARC_IRQ_H
 #define _SPARC_IRQ_H
 
-#include <linux/config.h>
 #include <linux/linkage.h>
 #include <linux/threads.h>     /* For NR_CPUS */
 #include <linux/interrupt.h>
diff --git a/include/asm-sparc/mostek.h b/include/asm-sparc/mostek.h
index 59b86bc793bf..bd92a78f4937 100644
--- a/include/asm-sparc/mostek.h
+++ b/include/asm-sparc/mostek.h
@@ -9,7 +9,6 @@
 #ifndef _SPARC_MOSTEK_H
 #define _SPARC_MOSTEK_H
 
-#include <linux/config.h>
 #include <asm/idprom.h>
 #include <asm/io.h>
 
diff --git a/include/asm-sparc/page.h b/include/asm-sparc/page.h
index ec3274b7ddf4..5bab8a7c25ce 100644
--- a/include/asm-sparc/page.h
+++ b/include/asm-sparc/page.h
@@ -8,7 +8,6 @@
 #ifndef _SPARC_PAGE_H
 #define _SPARC_PAGE_H
 
-#include <linux/config.h>
 #ifdef CONFIG_SUN4
 #define PAGE_SHIFT   13
 #else
diff --git a/include/asm-sparc/pgalloc.h b/include/asm-sparc/pgalloc.h
index 126800acd10d..a449cd4912d1 100644
--- a/include/asm-sparc/pgalloc.h
+++ b/include/asm-sparc/pgalloc.h
@@ -2,7 +2,6 @@
 #ifndef _SPARC_PGALLOC_H
 #define _SPARC_PGALLOC_H
 
-#include <linux/config.h>
 #include <linux/kernel.h>
 #include <linux/sched.h>
 
diff --git a/include/asm-sparc/pgtable.h b/include/asm-sparc/pgtable.h
index 9eea8f4d41f0..226c6475c9a2 100644
--- a/include/asm-sparc/pgtable.h
+++ b/include/asm-sparc/pgtable.h
@@ -11,7 +11,6 @@
 
 #include <asm-generic/4level-fixup.h>
 
-#include <linux/config.h>
 #include <linux/spinlock.h>
 #include <linux/swap.h>
 #include <asm/types.h>
diff --git a/include/asm-sparc/sfp-machine.h b/include/asm-sparc/sfp-machine.h
index b4ca2d94bf08..ecfc86a4a725 100644
--- a/include/asm-sparc/sfp-machine.h
+++ b/include/asm-sparc/sfp-machine.h
@@ -25,7 +25,6 @@
 #ifndef _SFP_MACHINE_H
 #define _SFP_MACHINE_H
 
-#include <linux/config.h>
    
 #define _FP_W_TYPE_SIZE		32
 #define _FP_W_TYPE		unsigned long
diff --git a/include/asm-sparc/smp.h b/include/asm-sparc/smp.h
index 98c46e3fbe8a..5a1b7e4e7cc9 100644
--- a/include/asm-sparc/smp.h
+++ b/include/asm-sparc/smp.h
@@ -6,7 +6,6 @@
 #ifndef _SPARC_SMP_H
 #define _SPARC_SMP_H
 
-#include <linux/config.h>
 #include <linux/threads.h>
 #include <asm/head.h>
 #include <asm/btfixup.h>
diff --git a/include/asm-sparc/system.h b/include/asm-sparc/system.h
index 58dd162927bb..cb7dda1e5e91 100644
--- a/include/asm-sparc/system.h
+++ b/include/asm-sparc/system.h
@@ -1,10 +1,8 @@
 /* $Id: system.h,v 1.86 2001/10/30 04:57:10 davem Exp $ */
-#include <linux/config.h>
 
 #ifndef __SPARC_SYSTEM_H
 #define __SPARC_SYSTEM_H
 
-#include <linux/config.h>
 #include <linux/kernel.h>
 #include <linux/threads.h>	/* NR_CPUS */
 #include <linux/thread_info.h>
diff --git a/include/asm-sparc/timer.h b/include/asm-sparc/timer.h
index b16eb739dddb..cb1fa1d1f184 100644
--- a/include/asm-sparc/timer.h
+++ b/include/asm-sparc/timer.h
@@ -4,7 +4,6 @@
  * Copyright (C) 1995 David S. Miller (davem@caip.rutgers.edu)
  */
 
-#include <linux/config.h>
 
 #ifndef _SPARC_TIMER_H
 #define _SPARC_TIMER_H
diff --git a/include/asm-sparc/tlbflush.h b/include/asm-sparc/tlbflush.h
index 5643ca31ead9..4a3b66618e75 100644
--- a/include/asm-sparc/tlbflush.h
+++ b/include/asm-sparc/tlbflush.h
@@ -1,7 +1,6 @@
 #ifndef _SPARC_TLBFLUSH_H
 #define _SPARC_TLBFLUSH_H
 
-#include <linux/config.h>
 #include <linux/mm.h>
 // #include <asm/processor.h>
 
diff --git a/include/asm-sparc/vac-ops.h b/include/asm-sparc/vac-ops.h
index 9e0172323042..ab6f53b913ea 100644
--- a/include/asm-sparc/vac-ops.h
+++ b/include/asm-sparc/vac-ops.h
@@ -8,7 +8,6 @@
  * Copyright (C) 1994, David S. Miller (davem@caip.rutgers.edu)
  */
 
-#include <linux/config.h>
 #include <asm/sysen.h>
 #include <asm/contregs.h>
 #include <asm/asi.h>
diff --git a/include/asm-sparc/winmacro.h b/include/asm-sparc/winmacro.h
index 557257eef3f9..096f3d3d90c3 100644
--- a/include/asm-sparc/winmacro.h
+++ b/include/asm-sparc/winmacro.h
@@ -7,7 +7,6 @@
 #ifndef _SPARC_WINMACRO_H
 #define _SPARC_WINMACRO_H
 
-#include <linux/config.h>
 #include <asm/ptrace.h>
 
 /* Store the register window onto the 8-byte aligned area starting
diff --git a/include/asm-sparc64/atomic.h b/include/asm-sparc64/atomic.h
index 468eb48d8142..2f0bec26a695 100644
--- a/include/asm-sparc64/atomic.h
+++ b/include/asm-sparc64/atomic.h
@@ -8,7 +8,6 @@
 #ifndef __ARCH_SPARC64_ATOMIC__
 #define __ARCH_SPARC64_ATOMIC__
 
-#include <linux/config.h>
 #include <linux/types.h>
 
 typedef struct { volatile int counter; } atomic_t;
diff --git a/include/asm-sparc64/bitops.h b/include/asm-sparc64/bitops.h
index 71944b0f09de..3d5e1af84723 100644
--- a/include/asm-sparc64/bitops.h
+++ b/include/asm-sparc64/bitops.h
@@ -7,7 +7,6 @@
 #ifndef _SPARC64_BITOPS_H
 #define _SPARC64_BITOPS_H
 
-#include <linux/config.h>
 #include <linux/compiler.h>
 #include <asm/byteorder.h>
 
diff --git a/include/asm-sparc64/bugs.h b/include/asm-sparc64/bugs.h
index 360dd04ed8e4..120422fdb02f 100644
--- a/include/asm-sparc64/bugs.h
+++ b/include/asm-sparc64/bugs.h
@@ -4,7 +4,6 @@
  *  Copyright (C) 1996 David S. Miller (davem@caip.rutgers.edu)
  */
 
-#include <linux/config.h>
 
 extern unsigned long loops_per_jiffy;
 
diff --git a/include/asm-sparc64/cacheflush.h b/include/asm-sparc64/cacheflush.h
index b3f61659ba81..745d1ab60371 100644
--- a/include/asm-sparc64/cacheflush.h
+++ b/include/asm-sparc64/cacheflush.h
@@ -1,7 +1,6 @@
 #ifndef _SPARC64_CACHEFLUSH_H
 #define _SPARC64_CACHEFLUSH_H
 
-#include <linux/config.h>
 #include <asm/page.h>
 
 #ifndef __ASSEMBLY__
diff --git a/include/asm-sparc64/delay.h b/include/asm-sparc64/delay.h
index 2901ea0c342d..a4aae6f80627 100644
--- a/include/asm-sparc64/delay.h
+++ b/include/asm-sparc64/delay.h
@@ -11,7 +11,6 @@
 #ifndef __SPARC64_DELAY_H
 #define __SPARC64_DELAY_H
 
-#include <linux/config.h>
 #include <linux/param.h>
 #include <asm/cpudata.h>
 
diff --git a/include/asm-sparc64/dma-mapping.h b/include/asm-sparc64/dma-mapping.h
index c7d5804ba76d..c902a96d1d48 100644
--- a/include/asm-sparc64/dma-mapping.h
+++ b/include/asm-sparc64/dma-mapping.h
@@ -1,7 +1,6 @@
 #ifndef _ASM_SPARC64_DMA_MAPPING_H
 #define _ASM_SPARC64_DMA_MAPPING_H
 
-#include <linux/config.h>
 
 #ifdef CONFIG_PCI
 #include <asm-generic/dma-mapping.h>
diff --git a/include/asm-sparc64/dma.h b/include/asm-sparc64/dma.h
index 1aab3c8dce2b..27f65972b3bb 100644
--- a/include/asm-sparc64/dma.h
+++ b/include/asm-sparc64/dma.h
@@ -7,7 +7,6 @@
 #ifndef _ASM_SPARC64_DMA_H
 #define _ASM_SPARC64_DMA_H
 
-#include <linux/config.h>
 #include <linux/kernel.h>
 #include <linux/types.h>
 #include <linux/spinlock.h>
diff --git a/include/asm-sparc64/floppy.h b/include/asm-sparc64/floppy.h
index 6a95d5d0c576..07ccd6f04b52 100644
--- a/include/asm-sparc64/floppy.h
+++ b/include/asm-sparc64/floppy.h
@@ -10,7 +10,6 @@
 #ifndef __ASM_SPARC64_FLOPPY_H
 #define __ASM_SPARC64_FLOPPY_H
 
-#include <linux/config.h>
 #include <linux/init.h>
 
 #include <asm/page.h>
diff --git a/include/asm-sparc64/ide.h b/include/asm-sparc64/ide.h
index c393f815b0be..55149cf933c2 100644
--- a/include/asm-sparc64/ide.h
+++ b/include/asm-sparc64/ide.h
@@ -10,7 +10,6 @@
 
 #ifdef __KERNEL__
 
-#include <linux/config.h>
 #include <asm/pgalloc.h>
 #include <asm/io.h>
 #include <asm/spitfire.h>
diff --git a/include/asm-sparc64/irq.h b/include/asm-sparc64/irq.h
index de33d6e1afb5..fa164d37ee3f 100644
--- a/include/asm-sparc64/irq.h
+++ b/include/asm-sparc64/irq.h
@@ -8,7 +8,6 @@
 #ifndef _SPARC64_IRQ_H
 #define _SPARC64_IRQ_H
 
-#include <linux/config.h>
 #include <linux/linkage.h>
 #include <linux/kernel.h>
 #include <linux/errno.h>
diff --git a/include/asm-sparc64/kprobes.h b/include/asm-sparc64/kprobes.h
index e4efe652b54b..e9bb26f770ed 100644
--- a/include/asm-sparc64/kprobes.h
+++ b/include/asm-sparc64/kprobes.h
@@ -1,7 +1,6 @@
 #ifndef _SPARC64_KPROBES_H
 #define _SPARC64_KPROBES_H
 
-#include <linux/config.h>
 #include <linux/types.h>
 #include <linux/percpu.h>
 
diff --git a/include/asm-sparc64/mc146818rtc.h b/include/asm-sparc64/mc146818rtc.h
index 75bd572b35fe..e9c0fcc25c6f 100644
--- a/include/asm-sparc64/mc146818rtc.h
+++ b/include/asm-sparc64/mc146818rtc.h
@@ -4,7 +4,6 @@
 #ifndef __ASM_SPARC64_MC146818RTC_H
 #define __ASM_SPARC64_MC146818RTC_H
 
-#include <linux/config.h>
 #include <asm/io.h>
 
 #ifndef RTC_PORT
diff --git a/include/asm-sparc64/mmu.h b/include/asm-sparc64/mmu.h
index 2d4f2ea9568a..70af4b6ce136 100644
--- a/include/asm-sparc64/mmu.h
+++ b/include/asm-sparc64/mmu.h
@@ -1,7 +1,6 @@
 #ifndef __MMU_H
 #define __MMU_H
 
-#include <linux/config.h>
 #include <asm/page.h>
 #include <asm/const.h>
 #include <asm/hypervisor.h>
diff --git a/include/asm-sparc64/oplib.h b/include/asm-sparc64/oplib.h
index c754676e13ef..dea3e73f0955 100644
--- a/include/asm-sparc64/oplib.h
+++ b/include/asm-sparc64/oplib.h
@@ -9,7 +9,6 @@
 #ifndef __SPARC64_OPLIB_H
 #define __SPARC64_OPLIB_H
 
-#include <linux/config.h>
 #include <asm/openprom.h>
 
 /* OBP version string. */
diff --git a/include/asm-sparc64/page.h b/include/asm-sparc64/page.h
index aabb21906724..fdf0ceb76028 100644
--- a/include/asm-sparc64/page.h
+++ b/include/asm-sparc64/page.h
@@ -3,7 +3,6 @@
 #ifndef _SPARC64_PAGE_H
 #define _SPARC64_PAGE_H
 
-#include <linux/config.h>
 #include <asm/const.h>
 
 #if defined(CONFIG_SPARC64_PAGE_SIZE_8KB)
diff --git a/include/asm-sparc64/param.h b/include/asm-sparc64/param.h
index a1cd4974630b..f0125cf5a9df 100644
--- a/include/asm-sparc64/param.h
+++ b/include/asm-sparc64/param.h
@@ -1,7 +1,6 @@
 #ifndef _ASMSPARC64_PARAM_H
 #define _ASMSPARC64_PARAM_H
 
-#include <linux/config.h>
 
 #ifdef __KERNEL__
 # define HZ		CONFIG_HZ	/* Internal kernel timer frequency */
diff --git a/include/asm-sparc64/pgalloc.h b/include/asm-sparc64/pgalloc.h
index 12e4a273bd43..010f9cd0a672 100644
--- a/include/asm-sparc64/pgalloc.h
+++ b/include/asm-sparc64/pgalloc.h
@@ -2,7 +2,6 @@
 #ifndef _SPARC64_PGALLOC_H
 #define _SPARC64_PGALLOC_H
 
-#include <linux/config.h>
 #include <linux/kernel.h>
 #include <linux/sched.h>
 #include <linux/mm.h>
diff --git a/include/asm-sparc64/pgtable.h b/include/asm-sparc64/pgtable.h
index c44e7466534e..72f9a524dc67 100644
--- a/include/asm-sparc64/pgtable.h
+++ b/include/asm-sparc64/pgtable.h
@@ -14,7 +14,6 @@
 
 #include <asm-generic/pgtable-nopud.h>
 
-#include <linux/config.h>
 #include <linux/compiler.h>
 #include <asm/types.h>
 #include <asm/spitfire.h>
diff --git a/include/asm-sparc64/processor.h b/include/asm-sparc64/processor.h
index c6896b88283e..66dd2fa0e319 100644
--- a/include/asm-sparc64/processor.h
+++ b/include/asm-sparc64/processor.h
@@ -13,7 +13,6 @@
  */
 #define current_text_addr() ({ void *pc; __asm__("rd %%pc, %0" : "=r" (pc)); pc; })
 
-#include <linux/config.h>
 #include <asm/asi.h>
 #include <asm/a.out.h>
 #include <asm/pstate.h>
diff --git a/include/asm-sparc64/siginfo.h b/include/asm-sparc64/siginfo.h
index df17e47abc1c..c96e6c30f8b0 100644
--- a/include/asm-sparc64/siginfo.h
+++ b/include/asm-sparc64/siginfo.h
@@ -11,7 +11,6 @@
 
 #ifdef __KERNEL__
 
-#include <linux/config.h>
 #include <linux/compat.h>
 
 #ifdef CONFIG_COMPAT
diff --git a/include/asm-sparc64/signal.h b/include/asm-sparc64/signal.h
index e3059bb4a465..fdc42a14d4e6 100644
--- a/include/asm-sparc64/signal.h
+++ b/include/asm-sparc64/signal.h
@@ -6,7 +6,6 @@
 
 #ifdef __KERNEL__
 #ifndef __ASSEMBLY__
-#include <linux/config.h>
 #include <linux/personality.h>
 #include <linux/types.h>
 #include <linux/compat.h>
diff --git a/include/asm-sparc64/smp.h b/include/asm-sparc64/smp.h
index 89d86ecaab24..388249b751c3 100644
--- a/include/asm-sparc64/smp.h
+++ b/include/asm-sparc64/smp.h
@@ -6,7 +6,6 @@
 #ifndef _SPARC64_SMP_H
 #define _SPARC64_SMP_H
 
-#include <linux/config.h>
 #include <linux/threads.h>
 #include <asm/asi.h>
 #include <asm/starfire.h>
diff --git a/include/asm-sparc64/spinlock.h b/include/asm-sparc64/spinlock.h
index 508c416e9d6a..bd5ffc76bc7e 100644
--- a/include/asm-sparc64/spinlock.h
+++ b/include/asm-sparc64/spinlock.h
@@ -6,7 +6,6 @@
 #ifndef __SPARC64_SPINLOCK_H
 #define __SPARC64_SPINLOCK_H
 
-#include <linux/config.h>
 #include <linux/threads.h>	/* For NR_CPUS */
 
 #ifndef __ASSEMBLY__
diff --git a/include/asm-sparc64/system.h b/include/asm-sparc64/system.h
index a18ec87a52c1..4ca68600c670 100644
--- a/include/asm-sparc64/system.h
+++ b/include/asm-sparc64/system.h
@@ -2,7 +2,6 @@
 #ifndef __SPARC64_SYSTEM_H
 #define __SPARC64_SYSTEM_H
 
-#include <linux/config.h>
 #include <asm/ptrace.h>
 #include <asm/processor.h>
 #include <asm/visasm.h>
diff --git a/include/asm-sparc64/timer.h b/include/asm-sparc64/timer.h
index edc8e08c3a39..d435594df786 100644
--- a/include/asm-sparc64/timer.h
+++ b/include/asm-sparc64/timer.h
@@ -9,7 +9,6 @@
 
 #include <linux/types.h>
 
-#include <linux/config.h>
 
 struct sparc64_tick_ops {
 	void (*init_tick)(unsigned long);
diff --git a/include/asm-sparc64/tlb.h b/include/asm-sparc64/tlb.h
index 61c01882b562..7af1e1109c49 100644
--- a/include/asm-sparc64/tlb.h
+++ b/include/asm-sparc64/tlb.h
@@ -1,7 +1,6 @@
 #ifndef _SPARC64_TLB_H
 #define _SPARC64_TLB_H
 
-#include <linux/config.h>
 #include <linux/swap.h>
 #include <asm/pgalloc.h>
 #include <asm/tlbflush.h>
diff --git a/include/asm-sparc64/tlbflush.h b/include/asm-sparc64/tlbflush.h
index 9ad5d9c51d42..0386014ecf27 100644
--- a/include/asm-sparc64/tlbflush.h
+++ b/include/asm-sparc64/tlbflush.h
@@ -1,7 +1,6 @@
 #ifndef _SPARC64_TLBFLUSH_H
 #define _SPARC64_TLBFLUSH_H
 
-#include <linux/config.h>
 #include <linux/mm.h>
 #include <asm/mmu_context.h>
 
diff --git a/include/asm-sparc64/ttable.h b/include/asm-sparc64/ttable.h
index 2d5e3c464df5..f2352606a79f 100644
--- a/include/asm-sparc64/ttable.h
+++ b/include/asm-sparc64/ttable.h
@@ -2,7 +2,6 @@
 #ifndef _SPARC64_TTABLE_H
 #define _SPARC64_TTABLE_H
 
-#include <linux/config.h>
 #include <asm/utrap.h>
 
 #ifdef __ASSEMBLY__
diff --git a/include/asm-um/a.out.h b/include/asm-um/a.out.h
index 7c26265e1d7a..50cee7b296f4 100644
--- a/include/asm-um/a.out.h
+++ b/include/asm-um/a.out.h
@@ -1,7 +1,6 @@
 #ifndef __UM_A_OUT_H
 #define __UM_A_OUT_H
 
-#include "linux/config.h"
 #include "asm/arch/a.out.h"
 #include "choose-mode.h"
 
diff --git a/include/asm-um/cache.h b/include/asm-um/cache.h
index 3d0587075521..19e1bdd67416 100644
--- a/include/asm-um/cache.h
+++ b/include/asm-um/cache.h
@@ -1,7 +1,6 @@
 #ifndef __UM_CACHE_H
 #define __UM_CACHE_H
 
-#include <linux/config.h>
 
 #if defined(CONFIG_UML_X86) && !defined(CONFIG_64BIT)
 # define L1_CACHE_SHIFT		(CONFIG_X86_L1_CACHE_SHIFT)
diff --git a/include/asm-um/elf-ppc.h b/include/asm-um/elf-ppc.h
index 2998cf925042..99711134e477 100644
--- a/include/asm-um/elf-ppc.h
+++ b/include/asm-um/elf-ppc.h
@@ -1,7 +1,6 @@
 #ifndef __UM_ELF_PPC_H
 #define __UM_ELF_PPC_H
 
-#include "linux/config.h"
 
 extern long elf_aux_hwcap;
 #define ELF_HWCAP (elf_aux_hwcap)
diff --git a/include/asm-um/fixmap.h b/include/asm-um/fixmap.h
index ae0ca3932d50..d352a35cfafb 100644
--- a/include/asm-um/fixmap.h
+++ b/include/asm-um/fixmap.h
@@ -1,7 +1,6 @@
 #ifndef __UM_FIXMAP_H
 #define __UM_FIXMAP_H
 
-#include <linux/config.h>
 #include <asm/kmap_types.h>
 #include <asm/archparam.h>
 #include <asm/elf.h>
diff --git a/include/asm-um/hardirq.h b/include/asm-um/hardirq.h
index 1224b2690a23..313ebb8a2566 100644
--- a/include/asm-um/hardirq.h
+++ b/include/asm-um/hardirq.h
@@ -3,7 +3,6 @@
 #ifndef __ASM_UM_HARDIRQ_H
 #define __ASM_UM_HARDIRQ_H
 
-#include <linux/config.h>
 #include <linux/threads.h>
 #include <linux/irq.h>
 
diff --git a/include/asm-um/linkage.h b/include/asm-um/linkage.h
index e3d62dcbd356..78b862472b36 100644
--- a/include/asm-um/linkage.h
+++ b/include/asm-um/linkage.h
@@ -3,7 +3,6 @@
 
 #include "asm/arch/linkage.h"
 
-#include <linux/config.h>
 
 /* <linux/linkage.h> will pick sane defaults */
 #ifdef CONFIG_GPROF
diff --git a/include/asm-um/mmu_context.h b/include/asm-um/mmu_context.h
index 9a0e48eb542e..f709c784bf12 100644
--- a/include/asm-um/mmu_context.h
+++ b/include/asm-um/mmu_context.h
@@ -7,7 +7,6 @@
 #define __UM_MMU_CONTEXT_H
 
 #include "linux/sched.h"
-#include "linux/config.h"
 #include "choose-mode.h"
 #include "um_mmu.h"
 
diff --git a/include/asm-um/page.h b/include/asm-um/page.h
index 41364330aff1..4296d3135aa9 100644
--- a/include/asm-um/page.h
+++ b/include/asm-um/page.h
@@ -9,7 +9,6 @@
 
 struct page;
 
-#include <linux/config.h>
 #include <asm/vm-flags.h>
 
 /* PAGE_SHIFT determines the page size */
diff --git a/include/asm-um/pgalloc.h b/include/asm-um/pgalloc.h
index ea49411236dc..34ab268ef40e 100644
--- a/include/asm-um/pgalloc.h
+++ b/include/asm-um/pgalloc.h
@@ -8,7 +8,6 @@
 #ifndef __UM_PGALLOC_H
 #define __UM_PGALLOC_H
 
-#include "linux/config.h"
 #include "linux/mm.h"
 #include "asm/fixmap.h"
 
diff --git a/include/asm-um/processor-generic.h b/include/asm-um/processor-generic.h
index da07a69ce82a..824c28896382 100644
--- a/include/asm-um/processor-generic.h
+++ b/include/asm-um/processor-generic.h
@@ -10,7 +10,6 @@ struct pt_regs;
 
 struct task_struct;
 
-#include "linux/config.h"
 #include "asm/ptrace.h"
 #include "choose-mode.h"
 #include "registers.h"
diff --git a/include/asm-um/ptrace-generic.h b/include/asm-um/ptrace-generic.h
index 503484305e67..a36f5371b36b 100644
--- a/include/asm-um/ptrace-generic.h
+++ b/include/asm-um/ptrace-generic.h
@@ -8,7 +8,6 @@
 
 #ifndef __ASSEMBLY__
 
-#include "linux/config.h"
 
 #define pt_regs pt_regs_subarch
 #define show_regs show_regs_subarch
diff --git a/include/asm-um/smp.h b/include/asm-um/smp.h
index aeda6657f366..ca552261ed1f 100644
--- a/include/asm-um/smp.h
+++ b/include/asm-um/smp.h
@@ -3,7 +3,6 @@
 
 #ifdef CONFIG_SMP
 
-#include "linux/config.h"
 #include "linux/bitops.h"
 #include "asm/current.h"
 #include "linux/cpumask.h"
diff --git a/include/asm-um/thread_info.h b/include/asm-um/thread_info.h
index f166b9837c6a..261e2f4528f6 100644
--- a/include/asm-um/thread_info.h
+++ b/include/asm-um/thread_info.h
@@ -8,7 +8,6 @@
 
 #ifndef __ASSEMBLY__
 
-#include <linux/config.h>
 #include <asm/processor.h>
 #include <asm/types.h>
 
diff --git a/include/asm-v850/atomic.h b/include/asm-v850/atomic.h
index 166df00457ea..e4e57de08f73 100644
--- a/include/asm-v850/atomic.h
+++ b/include/asm-v850/atomic.h
@@ -14,7 +14,6 @@
 #ifndef __V850_ATOMIC_H__
 #define __V850_ATOMIC_H__
 
-#include <linux/config.h>
 
 #include <asm/system.h>
 
diff --git a/include/asm-v850/bitops.h b/include/asm-v850/bitops.h
index 1f6fd5ab4177..1fa99baf4e25 100644
--- a/include/asm-v850/bitops.h
+++ b/include/asm-v850/bitops.h
@@ -14,7 +14,6 @@
 #define __V850_BITOPS_H__
 
 
-#include <linux/config.h>
 #include <linux/compiler.h>	/* unlikely  */
 #include <asm/byteorder.h>	/* swab32 */
 #include <asm/system.h>		/* interrupt enable/disable */
diff --git a/include/asm-v850/dma-mapping.h b/include/asm-v850/dma-mapping.h
index c63fb50ec9ef..1cc42c603a1b 100644
--- a/include/asm-v850/dma-mapping.h
+++ b/include/asm-v850/dma-mapping.h
@@ -1,7 +1,6 @@
 #ifndef __V850_DMA_MAPPING_H__
 #define __V850_DMA_MAPPING_H__
 
-#include <linux/config.h>
 
 #ifdef CONFIG_PCI
 #include <asm-generic/dma-mapping.h>
diff --git a/include/asm-v850/hardirq.h b/include/asm-v850/hardirq.h
index d98488cd5af1..04e20127c5af 100644
--- a/include/asm-v850/hardirq.h
+++ b/include/asm-v850/hardirq.h
@@ -1,7 +1,6 @@
 #ifndef __V850_HARDIRQ_H__
 #define __V850_HARDIRQ_H__
 
-#include <linux/config.h>
 #include <linux/threads.h>
 #include <linux/cache.h>
 
diff --git a/include/asm-v850/machdep.h b/include/asm-v850/machdep.h
index 98d8bf63970e..f1e3b8b91508 100644
--- a/include/asm-v850/machdep.h
+++ b/include/asm-v850/machdep.h
@@ -14,7 +14,6 @@
 #ifndef __V850_MACHDEP_H__
 #define __V850_MACHDEP_H__
 
-#include <linux/config.h>
 
 /* chips */
 #ifdef CONFIG_V850E_MA1
diff --git a/include/asm-v850/pgtable.h b/include/asm-v850/pgtable.h
index 3cf8775ce85f..1ea2a900f0f8 100644
--- a/include/asm-v850/pgtable.h
+++ b/include/asm-v850/pgtable.h
@@ -3,7 +3,6 @@
 
 #include <asm-generic/4level-fixup.h>
 
-#include <linux/config.h>
 #include <asm/page.h>
 
 
diff --git a/include/asm-v850/processor.h b/include/asm-v850/processor.h
index 2d31308935a0..6965b66ccaed 100644
--- a/include/asm-v850/processor.h
+++ b/include/asm-v850/processor.h
@@ -14,7 +14,6 @@
 #ifndef __V850_PROCESSOR_H__
 #define __V850_PROCESSOR_H__
 
-#include <linux/config.h>
 #ifndef __ASSEMBLY__ /* <linux/thread_info.h> is not asm-safe.  */
 #include <linux/thread_info.h>
 #endif
diff --git a/include/asm-v850/serial.h b/include/asm-v850/serial.h
index 8c2a609ba2b0..36d8f4cbbf39 100644
--- a/include/asm-v850/serial.h
+++ b/include/asm-v850/serial.h
@@ -6,7 +6,6 @@
  * Copyright (C) 1999 by Ralf Baechle
  * Copyright (C) 1999, 2000 Silicon Graphics, Inc.
  */ 
-#include <linux/config.h>
 
 #ifdef CONFIG_RTE_CB_ME2
 
diff --git a/include/asm-v850/v850e_uart.h b/include/asm-v850/v850e_uart.h
index 5930d5990b19..5182fb4cc989 100644
--- a/include/asm-v850/v850e_uart.h
+++ b/include/asm-v850/v850e_uart.h
@@ -19,7 +19,6 @@
 #ifndef __V850_V850E_UART_H__
 #define __V850_V850E_UART_H__
 
-#include <linux/config.h>
 #include <linux/termios.h>
 
 #include <asm/v850e_utils.h>
diff --git a/include/asm-x86_64/apic.h b/include/asm-x86_64/apic.h
index bdbd8935612a..a731be2204d2 100644
--- a/include/asm-x86_64/apic.h
+++ b/include/asm-x86_64/apic.h
@@ -1,7 +1,6 @@
 #ifndef __ASM_APIC_H
 #define __ASM_APIC_H
 
-#include <linux/config.h>
 #include <linux/pm.h>
 #include <asm/fixmap.h>
 #include <asm/apicdef.h>
diff --git a/include/asm-x86_64/atomic.h b/include/asm-x86_64/atomic.h
index cecbf7baa6aa..bd3fa67ed835 100644
--- a/include/asm-x86_64/atomic.h
+++ b/include/asm-x86_64/atomic.h
@@ -1,7 +1,6 @@
 #ifndef __ARCH_X86_64_ATOMIC__
 #define __ARCH_X86_64_ATOMIC__
 
-#include <linux/config.h>
 #include <asm/types.h>
 
 /* atomic_t should be 32 bit signed type */
diff --git a/include/asm-x86_64/bitops.h b/include/asm-x86_64/bitops.h
index 79212128d0f7..e9bf933d25d0 100644
--- a/include/asm-x86_64/bitops.h
+++ b/include/asm-x86_64/bitops.h
@@ -5,7 +5,6 @@
  * Copyright 1992, Linus Torvalds.
  */
 
-#include <linux/config.h>
 
 #ifdef CONFIG_SMP
 #define LOCK_PREFIX "lock ; "
diff --git a/include/asm-x86_64/bugs.h b/include/asm-x86_64/bugs.h
index 59bc68925d0f..d86c5dd689fa 100644
--- a/include/asm-x86_64/bugs.h
+++ b/include/asm-x86_64/bugs.h
@@ -10,7 +10,6 @@
  *	void check_bugs(void);
  */
 
-#include <linux/config.h>
 #include <asm/processor.h>
 #include <asm/i387.h>
 #include <asm/msr.h>
diff --git a/include/asm-x86_64/cache.h b/include/asm-x86_64/cache.h
index f8dff1c67538..ed8a9d25272d 100644
--- a/include/asm-x86_64/cache.h
+++ b/include/asm-x86_64/cache.h
@@ -4,7 +4,6 @@
 #ifndef __ARCH_X8664_CACHE_H
 #define __ARCH_X8664_CACHE_H
 
-#include <linux/config.h>
 
 /* L1 cache line size */
 #define L1_CACHE_SHIFT	(CONFIG_X86_L1_CACHE_SHIFT)
diff --git a/include/asm-x86_64/calling.h b/include/asm-x86_64/calling.h
index fc2c5a6c262a..6f4f63af96e1 100644
--- a/include/asm-x86_64/calling.h
+++ b/include/asm-x86_64/calling.h
@@ -2,7 +2,6 @@
  * Some macros to handle stack frames in assembly.
  */ 
 
-#include <linux/config.h>
 
 #define R15 0
 #define R14 8
diff --git a/include/asm-x86_64/dma-mapping.h b/include/asm-x86_64/dma-mapping.h
index 49a81a66516e..498f66df36b9 100644
--- a/include/asm-x86_64/dma-mapping.h
+++ b/include/asm-x86_64/dma-mapping.h
@@ -6,7 +6,6 @@
  * documentation.
  */
 
-#include <linux/config.h>
 
 #include <asm/scatterlist.h>
 #include <asm/io.h>
diff --git a/include/asm-x86_64/dma.h b/include/asm-x86_64/dma.h
index 6f2a817b6a7c..c556208d3dd7 100644
--- a/include/asm-x86_64/dma.h
+++ b/include/asm-x86_64/dma.h
@@ -8,7 +8,6 @@
 #ifndef _ASM_DMA_H
 #define _ASM_DMA_H
 
-#include <linux/config.h>
 #include <linux/spinlock.h>	/* And spinlocks */
 #include <asm/io.h>		/* need byte IO */
 #include <linux/delay.h>
diff --git a/include/asm-x86_64/dwarf2.h b/include/asm-x86_64/dwarf2.h
index 07654bd155bf..0744db777676 100644
--- a/include/asm-x86_64/dwarf2.h
+++ b/include/asm-x86_64/dwarf2.h
@@ -1,7 +1,6 @@
 #ifndef _DWARF2_H
 #define _DWARF2_H 1
 
-#include <linux/config.h>
 
 #ifndef __ASSEMBLY__
 #warning "asm/dwarf2.h should be only included in pure assembly files"
diff --git a/include/asm-x86_64/fixmap.h b/include/asm-x86_64/fixmap.h
index 7b286bd21d1d..0b4ffbd1a125 100644
--- a/include/asm-x86_64/fixmap.h
+++ b/include/asm-x86_64/fixmap.h
@@ -11,7 +11,6 @@
 #ifndef _ASM_FIXMAP_H
 #define _ASM_FIXMAP_H
 
-#include <linux/config.h>
 #include <linux/kernel.h>
 #include <asm/apicdef.h>
 #include <asm/page.h>
diff --git a/include/asm-x86_64/hardirq.h b/include/asm-x86_64/hardirq.h
index 8689951e3503..64a65ce2f41f 100644
--- a/include/asm-x86_64/hardirq.h
+++ b/include/asm-x86_64/hardirq.h
@@ -1,7 +1,6 @@
 #ifndef __ASM_HARDIRQ_H
 #define __ASM_HARDIRQ_H
 
-#include <linux/config.h>
 #include <linux/threads.h>
 #include <linux/irq.h>
 #include <asm/pda.h>
diff --git a/include/asm-x86_64/hw_irq.h b/include/asm-x86_64/hw_irq.h
index 0df1715dee71..3de96fd86a70 100644
--- a/include/asm-x86_64/hw_irq.h
+++ b/include/asm-x86_64/hw_irq.h
@@ -17,7 +17,6 @@
  */
 
 #ifndef __ASSEMBLY__
-#include <linux/config.h>
 #include <asm/atomic.h>
 #include <asm/irq.h>
 #include <linux/profile.h>
diff --git a/include/asm-x86_64/ia32.h b/include/asm-x86_64/ia32.h
index e6b7f2234e43..0190b7c4e319 100644
--- a/include/asm-x86_64/ia32.h
+++ b/include/asm-x86_64/ia32.h
@@ -1,7 +1,6 @@
 #ifndef _ASM_X86_64_IA32_H
 #define _ASM_X86_64_IA32_H
 
-#include <linux/config.h>
 
 #ifdef CONFIG_IA32_EMULATION
 
diff --git a/include/asm-x86_64/io.h b/include/asm-x86_64/io.h
index a05da8a50bfd..70e91fe76344 100644
--- a/include/asm-x86_64/io.h
+++ b/include/asm-x86_64/io.h
@@ -1,7 +1,6 @@
 #ifndef _ASM_IO_H
 #define _ASM_IO_H
 
-#include <linux/config.h>
 
 /*
  * This file contains the definitions for the x86 IO instructions
diff --git a/include/asm-x86_64/io_apic.h b/include/asm-x86_64/io_apic.h
index ee1bc69aec9c..d71badbd260a 100644
--- a/include/asm-x86_64/io_apic.h
+++ b/include/asm-x86_64/io_apic.h
@@ -1,7 +1,6 @@
 #ifndef __ASM_IO_APIC_H
 #define __ASM_IO_APIC_H
 
-#include <linux/config.h>
 #include <asm/types.h>
 #include <asm/mpspec.h>
 
diff --git a/include/asm-x86_64/mmu_context.h b/include/asm-x86_64/mmu_context.h
index 19f0c83d0792..af03b9f852d6 100644
--- a/include/asm-x86_64/mmu_context.h
+++ b/include/asm-x86_64/mmu_context.h
@@ -1,7 +1,6 @@
 #ifndef __X86_64_MMU_CONTEXT_H
 #define __X86_64_MMU_CONTEXT_H
 
-#include <linux/config.h>
 #include <asm/desc.h>
 #include <asm/atomic.h>
 #include <asm/pgalloc.h>
diff --git a/include/asm-x86_64/mmzone.h b/include/asm-x86_64/mmzone.h
index 6944e7122df5..70bb9969766e 100644
--- a/include/asm-x86_64/mmzone.h
+++ b/include/asm-x86_64/mmzone.h
@@ -4,7 +4,6 @@
 #ifndef _ASM_X86_64_MMZONE_H
 #define _ASM_X86_64_MMZONE_H 1
 
-#include <linux/config.h>
 
 #ifdef CONFIG_NUMA
 
diff --git a/include/asm-x86_64/mtrr.h b/include/asm-x86_64/mtrr.h
index 66ac1c0f27e1..4a0610c185eb 100644
--- a/include/asm-x86_64/mtrr.h
+++ b/include/asm-x86_64/mtrr.h
@@ -23,7 +23,6 @@
 #ifndef _LINUX_MTRR_H
 #define _LINUX_MTRR_H
 
-#include <linux/config.h>
 #include <linux/ioctl.h>
 #include <linux/compat.h>
 
diff --git a/include/asm-x86_64/page.h b/include/asm-x86_64/page.h
index 408185bac351..b59f33e6b838 100644
--- a/include/asm-x86_64/page.h
+++ b/include/asm-x86_64/page.h
@@ -1,7 +1,6 @@
 #ifndef _X86_64_PAGE_H
 #define _X86_64_PAGE_H
 
-#include <linux/config.h>
 
 /* PAGE_SHIFT determines the page size */
 #define PAGE_SHIFT	12
diff --git a/include/asm-x86_64/param.h b/include/asm-x86_64/param.h
index 5956b23b57c2..a728786c3c7c 100644
--- a/include/asm-x86_64/param.h
+++ b/include/asm-x86_64/param.h
@@ -2,7 +2,6 @@
 #define _ASMx86_64_PARAM_H
 
 #ifdef __KERNEL__
-# include <linux/config.h>
 # define HZ            CONFIG_HZ	/* Internal kernel timer frequency */
 # define USER_HZ       100		/* .. some user interfaces are in "ticks */
 #define CLOCKS_PER_SEC        (USER_HZ)       /* like times() */
diff --git a/include/asm-x86_64/pci.h b/include/asm-x86_64/pci.h
index 8a05af264d18..2db0620d5449 100644
--- a/include/asm-x86_64/pci.h
+++ b/include/asm-x86_64/pci.h
@@ -1,7 +1,6 @@
 #ifndef __x8664_PCI_H
 #define __x8664_PCI_H
 
-#include <linux/config.h>
 #include <asm/io.h>
 
 #ifdef __KERNEL__
diff --git a/include/asm-x86_64/processor.h b/include/asm-x86_64/processor.h
index 37a3ec433ee5..3061a38a3b1d 100644
--- a/include/asm-x86_64/processor.h
+++ b/include/asm-x86_64/processor.h
@@ -12,7 +12,6 @@
 #include <asm/types.h>
 #include <asm/sigcontext.h>
 #include <asm/cpufeature.h>
-#include <linux/config.h>
 #include <linux/threads.h>
 #include <asm/msr.h>
 #include <asm/current.h>
diff --git a/include/asm-x86_64/serial.h b/include/asm-x86_64/serial.h
index dc752eafa681..b0496e0d72a6 100644
--- a/include/asm-x86_64/serial.h
+++ b/include/asm-x86_64/serial.h
@@ -2,7 +2,6 @@
  * include/asm-x86_64/serial.h
  */
 
-#include <linux/config.h>
 
 /*
  * This assumes you have a 1.8432 MHz clock for your UART.
diff --git a/include/asm-x86_64/smp.h b/include/asm-x86_64/smp.h
index a4fdaeb5c397..7686b9b25aef 100644
--- a/include/asm-x86_64/smp.h
+++ b/include/asm-x86_64/smp.h
@@ -5,7 +5,6 @@
  * We need the APIC definitions automatically as part of 'smp.h'
  */
 #ifndef __ASSEMBLY__
-#include <linux/config.h>
 #include <linux/threads.h>
 #include <linux/cpumask.h>
 #include <linux/bitops.h>
diff --git a/include/asm-x86_64/spinlock.h b/include/asm-x86_64/spinlock.h
index fe484a699cc3..5d8a5e3589ff 100644
--- a/include/asm-x86_64/spinlock.h
+++ b/include/asm-x86_64/spinlock.h
@@ -4,7 +4,6 @@
 #include <asm/atomic.h>
 #include <asm/rwlock.h>
 #include <asm/page.h>
-#include <linux/config.h>
 
 /*
  * Your basic SMP spinlocks, allowing only a single CPU anywhere
diff --git a/include/asm-x86_64/swiotlb.h b/include/asm-x86_64/swiotlb.h
index 60757efd1353..5f9a01805821 100644
--- a/include/asm-x86_64/swiotlb.h
+++ b/include/asm-x86_64/swiotlb.h
@@ -1,7 +1,6 @@
 #ifndef _ASM_SWIOTLB_H
 #define _ASM_SWTIOLB_H 1
 
-#include <linux/config.h>
 
 #include <asm/dma-mapping.h>
 
diff --git a/include/asm-x86_64/system.h b/include/asm-x86_64/system.h
index 397598980228..f48e0dad8b3d 100644
--- a/include/asm-x86_64/system.h
+++ b/include/asm-x86_64/system.h
@@ -1,7 +1,6 @@
 #ifndef __ASM_SYSTEM_H
 #define __ASM_SYSTEM_H
 
-#include <linux/config.h>
 #include <linux/kernel.h>
 #include <asm/segment.h>
 
diff --git a/include/asm-x86_64/tlbflush.h b/include/asm-x86_64/tlbflush.h
index 4a9c20ea9b10..d16d5b60f419 100644
--- a/include/asm-x86_64/tlbflush.h
+++ b/include/asm-x86_64/tlbflush.h
@@ -1,7 +1,6 @@
 #ifndef _X8664_TLBFLUSH_H
 #define _X8664_TLBFLUSH_H
 
-#include <linux/config.h>
 #include <linux/mm.h>
 #include <asm/processor.h>
 
diff --git a/include/asm-x86_64/topology.h b/include/asm-x86_64/topology.h
index 9db54e9d17bb..80c4e44d011c 100644
--- a/include/asm-x86_64/topology.h
+++ b/include/asm-x86_64/topology.h
@@ -1,7 +1,6 @@
 #ifndef _ASM_X86_64_TOPOLOGY_H
 #define _ASM_X86_64_TOPOLOGY_H
 
-#include <linux/config.h>
 
 #ifdef CONFIG_NUMA
 
diff --git a/include/asm-x86_64/uaccess.h b/include/asm-x86_64/uaccess.h
index bddffcb591b8..1e1fa003daa3 100644
--- a/include/asm-x86_64/uaccess.h
+++ b/include/asm-x86_64/uaccess.h
@@ -4,7 +4,6 @@
 /*
  * User space memory access functions
  */
-#include <linux/config.h>
 #include <linux/compiler.h>
 #include <linux/errno.h>
 #include <linux/sched.h>
diff --git a/include/asm-xtensa/atomic.h b/include/asm-xtensa/atomic.h
index fe105a123924..5c2672021068 100644
--- a/include/asm-xtensa/atomic.h
+++ b/include/asm-xtensa/atomic.h
@@ -13,7 +13,6 @@
 #ifndef _XTENSA_ATOMIC_H
 #define _XTENSA_ATOMIC_H
 
-#include <linux/config.h>
 #include <linux/stringify.h>
 
 typedef struct { volatile int counter; } atomic_t;
diff --git a/include/asm-xtensa/checksum.h b/include/asm-xtensa/checksum.h
index 81a797ae3abe..bdc00ae9be48 100644
--- a/include/asm-xtensa/checksum.h
+++ b/include/asm-xtensa/checksum.h
@@ -11,7 +11,6 @@
 #ifndef _XTENSA_CHECKSUM_H
 #define _XTENSA_CHECKSUM_H
 
-#include <linux/config.h>
 #include <linux/in6.h>
 #include <xtensa/config/core.h>
 
diff --git a/include/asm-xtensa/delay.h b/include/asm-xtensa/delay.h
index 1bc601ec3621..e1d8c9e010c1 100644
--- a/include/asm-xtensa/delay.h
+++ b/include/asm-xtensa/delay.h
@@ -12,7 +12,6 @@
 #ifndef _XTENSA_DELAY_H
 #define _XTENSA_DELAY_H
 
-#include <linux/config.h>
 #include <asm/processor.h>
 #include <asm/param.h>
 
diff --git a/include/asm-xtensa/dma.h b/include/asm-xtensa/dma.h
index 1c22b0234586..db2633f67789 100644
--- a/include/asm-xtensa/dma.h
+++ b/include/asm-xtensa/dma.h
@@ -11,7 +11,6 @@
 #ifndef _XTENSA_DMA_H
 #define _XTENSA_DMA_H
 
-#include <linux/config.h>
 #include <asm/io.h>		/* need byte IO */
 #include <xtensa/config/core.h>
 
diff --git a/include/asm-xtensa/hardirq.h b/include/asm-xtensa/hardirq.h
index aa9c1adf68d7..87cb19d1b10c 100644
--- a/include/asm-xtensa/hardirq.h
+++ b/include/asm-xtensa/hardirq.h
@@ -11,7 +11,6 @@
 #ifndef _XTENSA_HARDIRQ_H
 #define _XTENSA_HARDIRQ_H
 
-#include <linux/config.h>
 #include <linux/cache.h>
 #include <asm/irq.h>
 
diff --git a/include/asm-xtensa/ide.h b/include/asm-xtensa/ide.h
index b523cd4a486e..6b912742a42d 100644
--- a/include/asm-xtensa/ide.h
+++ b/include/asm-xtensa/ide.h
@@ -14,7 +14,6 @@
 
 #ifdef __KERNEL__
 
-#include <linux/config.h>
 
 #ifndef MAX_HWIFS
 # define MAX_HWIFS	1
diff --git a/include/asm-xtensa/io.h b/include/asm-xtensa/io.h
index c5c13985bbe1..556e5eed34f5 100644
--- a/include/asm-xtensa/io.h
+++ b/include/asm-xtensa/io.h
@@ -12,7 +12,6 @@
 #define _XTENSA_IO_H
 
 #ifdef __KERNEL__
-#include <linux/config.h>
 #include <asm/byteorder.h>
 
 #include <linux/types.h>
diff --git a/include/asm-xtensa/irq.h b/include/asm-xtensa/irq.h
index d984e955938f..049fde7e752d 100644
--- a/include/asm-xtensa/irq.h
+++ b/include/asm-xtensa/irq.h
@@ -11,7 +11,6 @@
 #ifndef _XTENSA_IRQ_H
 #define _XTENSA_IRQ_H
 
-#include <linux/config.h>
 #include <asm/platform/hardware.h>
 
 #include <xtensa/config/core.h>
diff --git a/include/asm-xtensa/mmu_context.h b/include/asm-xtensa/mmu_context.h
index 364a7b057bfa..af683a74a4ec 100644
--- a/include/asm-xtensa/mmu_context.h
+++ b/include/asm-xtensa/mmu_context.h
@@ -13,7 +13,6 @@
 #ifndef _XTENSA_MMU_CONTEXT_H
 #define _XTENSA_MMU_CONTEXT_H
 
-#include <linux/config.h>
 #include <linux/stringify.h>
 
 #include <asm/pgtable.h>
diff --git a/include/asm-xtensa/page.h b/include/asm-xtensa/page.h
index 992bac5c1258..40f4c6c3f580 100644
--- a/include/asm-xtensa/page.h
+++ b/include/asm-xtensa/page.h
@@ -14,7 +14,6 @@
 #ifdef __KERNEL__
 
 #include <asm/processor.h>
-#include <linux/config.h>
 
 /*
  * PAGE_SHIFT determines the page size
diff --git a/include/asm-xtensa/pgalloc.h b/include/asm-xtensa/pgalloc.h
index 734a8d060395..d56ddf2055e1 100644
--- a/include/asm-xtensa/pgalloc.h
+++ b/include/asm-xtensa/pgalloc.h
@@ -13,7 +13,6 @@
 
 #ifdef __KERNEL__
 
-#include <linux/config.h>
 #include <linux/threads.h>
 #include <linux/highmem.h>
 #include <asm/processor.h>
diff --git a/include/asm-xtensa/platform.h b/include/asm-xtensa/platform.h
index 36163894bc20..48135a9718b0 100644
--- a/include/asm-xtensa/platform.h
+++ b/include/asm-xtensa/platform.h
@@ -13,7 +13,6 @@
 #ifndef _XTENSA_PLATFORM_H
 #define _XTENSA_PLATFORM_H
 
-#include <linux/config.h>
 #include <linux/types.h>
 #include <linux/pci.h>
 
diff --git a/include/asm-xtensa/system.h b/include/asm-xtensa/system.h
index b29f7ae6a08a..f986170bd2a1 100644
--- a/include/asm-xtensa/system.h
+++ b/include/asm-xtensa/system.h
@@ -11,7 +11,6 @@
 #ifndef _XTENSA_SYSTEM_H
 #define _XTENSA_SYSTEM_H
 
-#include <linux/config.h>
 #include <linux/stringify.h>
 
 #include <asm/processor.h>
diff --git a/include/linux/acct.h b/include/linux/acct.h
index 255b11293a8d..3d54fbcf969e 100644
--- a/include/linux/acct.h
+++ b/include/linux/acct.h
@@ -115,7 +115,6 @@ struct acct_v3
 
 #ifdef __KERNEL__
 
-#include <linux/config.h>
 
 #ifdef CONFIG_BSD_PROCESS_ACCT
 struct vfsmount;
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index d3bc25e6d27d..1cf0b91d05bd 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -25,7 +25,6 @@
 #ifndef _LINUX_ACPI_H
 #define _LINUX_ACPI_H
 
-#include <linux/config.h>
 
 #ifdef	CONFIG_ACPI
 
diff --git a/include/linux/amba/clcd.h b/include/linux/amba/clcd.h
index 9cf64b1b688b..29c0448265cf 100644
--- a/include/linux/amba/clcd.h
+++ b/include/linux/amba/clcd.h
@@ -9,7 +9,6 @@
  * License.  See the file COPYING in the main directory of this archive
  * for more details.
  */
-#include <linux/config.h>
 #include <linux/fb.h>
 
 /*
diff --git a/include/linux/atmdev.h b/include/linux/atmdev.h
index b203ea82a0a8..1eb238affb12 100644
--- a/include/linux/atmdev.h
+++ b/include/linux/atmdev.h
@@ -209,7 +209,6 @@ struct atm_cirange {
 
 #ifdef __KERNEL__
 
-#include <linux/config.h>
 #include <linux/wait.h> /* wait_queue_head_t */
 #include <linux/time.h> /* struct timeval */
 #include <linux/net.h>
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 59e1259b1c40..5d327313a9f7 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1,7 +1,6 @@
 #ifndef _LINUX_BLKDEV_H
 #define _LINUX_BLKDEV_H
 
-#include <linux/config.h>
 #include <linux/major.h>
 #include <linux/genhd.h>
 #include <linux/list.h>
diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h
index b34d3e73d5ea..eb1a867ed245 100644
--- a/include/linux/blktrace_api.h
+++ b/include/linux/blktrace_api.h
@@ -1,7 +1,6 @@
 #ifndef BLKTRACE_H
 #define BLKTRACE_H
 
-#include <linux/config.h>
 #include <linux/blkdev.h>
 #include <linux/relay.h>
 
diff --git a/include/linux/blockgroup_lock.h b/include/linux/blockgroup_lock.h
index 0137ee5dd43c..8607312983bd 100644
--- a/include/linux/blockgroup_lock.h
+++ b/include/linux/blockgroup_lock.h
@@ -6,7 +6,6 @@
  * Simple hashed spinlocking.
  */
 
-#include <linux/config.h>
 #include <linux/spinlock.h>
 #include <linux/cache.h>
 
diff --git a/include/linux/cache.h b/include/linux/cache.h
index cc4b3aafad9a..4552504c0228 100644
--- a/include/linux/cache.h
+++ b/include/linux/cache.h
@@ -2,7 +2,6 @@
 #define __LINUX_CACHE_H
 
 #include <linux/kernel.h>
-#include <linux/config.h>
 #include <asm/cache.h>
 
 #ifndef L1_CACHE_ALIGN
diff --git a/include/linux/coda.h b/include/linux/coda.h
index bbc5afcd7db6..b5cf0780c51a 100644
--- a/include/linux/coda.h
+++ b/include/linux/coda.h
@@ -59,7 +59,6 @@ Mellon the rights to redistribute these changes without encumbrance.
 #ifndef _CODA_HEADER_
 #define _CODA_HEADER_
 
-#include <linux/config.h>
 
 /* Catch new _KERNEL defn for NetBSD and DJGPP/__CYGWIN32__ */
 #if defined(__NetBSD__) || \
diff --git a/include/linux/compat.h b/include/linux/compat.h
index 6d3a654be1ae..dda1697ec753 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -4,7 +4,6 @@
  * These are the type definitions for the architecture specific
  * syscall compatibility layer.
  */
-#include <linux/config.h>
 
 #ifdef CONFIG_COMPAT
 
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index 17866d7e2b71..5aa95011f7e6 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -15,7 +15,6 @@
 #define _LINUX_CPUFREQ_H
 
 #include <linux/mutex.h>
-#include <linux/config.h>
 #include <linux/notifier.h>
 #include <linux/threads.h>
 #include <linux/device.h>
diff --git a/include/linux/crypto.h b/include/linux/crypto.h
index 0ab1bc1152ca..5a0470e36111 100644
--- a/include/linux/crypto.h
+++ b/include/linux/crypto.h
@@ -17,7 +17,6 @@
 #ifndef _LINUX_CRYPTO_H
 #define _LINUX_CRYPTO_H
 
-#include <linux/config.h>
 #include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/types.h>
diff --git a/include/linux/cyclomx.h b/include/linux/cyclomx.h
index 300d704bdb9a..b88f7f428e58 100644
--- a/include/linux/cyclomx.h
+++ b/include/linux/cyclomx.h
@@ -24,7 +24,6 @@
 * 1998/08/08	acme		Version 0.0.1
 */
 
-#include <linux/config.h>
 #include <linux/wanrouter.h>
 #include <linux/spinlock.h>
 
diff --git a/include/linux/dcookies.h b/include/linux/dcookies.h
index 1d68428c925d..0fe7cdf326f7 100644
--- a/include/linux/dcookies.h
+++ b/include/linux/dcookies.h
@@ -9,7 +9,6 @@
 #ifndef DCOOKIES_H
 #define DCOOKIES_H
  
-#include <linux/config.h>
 
 #ifdef CONFIG_PROFILING
  
diff --git a/include/linux/devfs_fs_kernel.h b/include/linux/devfs_fs_kernel.h
index 89810e73d256..0d74a6f22abc 100644
--- a/include/linux/devfs_fs_kernel.h
+++ b/include/linux/devfs_fs_kernel.h
@@ -2,7 +2,6 @@
 #define _LINUX_DEVFS_FS_KERNEL_H
 
 #include <linux/fs.h>
-#include <linux/config.h>
 #include <linux/spinlock.h>
 #include <linux/types.h>
 
diff --git a/include/linux/device.h b/include/linux/device.h
index f6e72a65a3f2..9943f51cd809 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -11,7 +11,6 @@
 #ifndef _DEVICE_H_
 #define _DEVICE_H_
 
-#include <linux/config.h>
 #include <linux/ioport.h>
 #include <linux/kobject.h>
 #include <linux/klist.h>
diff --git a/include/linux/dmi.h b/include/linux/dmi.h
index 64fd6c366604..b2cd2071d432 100644
--- a/include/linux/dmi.h
+++ b/include/linux/dmi.h
@@ -2,7 +2,6 @@
 #define __DMI_H__
 
 #include <linux/list.h>
-#include <linux/config.h>
 
 enum dmi_field {
 	DMI_NONE,
diff --git a/include/linux/dnotify.h b/include/linux/dnotify.h
index f134a01975c7..102a902b4396 100644
--- a/include/linux/dnotify.h
+++ b/include/linux/dnotify.h
@@ -18,7 +18,6 @@ struct dnotify_struct {
 
 #ifdef __KERNEL__
 
-#include <linux/config.h>
 
 #ifdef CONFIG_DNOTIFY
 
diff --git a/include/linux/errqueue.h b/include/linux/errqueue.h
index 174582fedb8b..408118a07763 100644
--- a/include/linux/errqueue.h
+++ b/include/linux/errqueue.h
@@ -21,7 +21,6 @@ struct sock_extended_err
 
 #ifdef __KERNEL__
 
-#include <linux/config.h>
 #include <net/ip.h>
 #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
 #include <linux/ipv6.h>
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 3de2bfb2410f..75a236c268fc 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -6,7 +6,6 @@
  * structures etc.
  */
 
-#include <linux/config.h>
 #include <linux/limits.h>
 #include <linux/ioctl.h>
 
diff --git a/include/linux/ftape.h b/include/linux/ftape.h
index 72faeec9f6e1..7e7038cba86a 100644
--- a/include/linux/ftape.h
+++ b/include/linux/ftape.h
@@ -35,7 +35,6 @@
 #include <linux/mm.h>
 #endif
 #include <linux/types.h>
-#include <linux/config.h>
 #include <linux/mtio.h>
 
 #define FT_SECTOR(x)		(x+1)	/* sector offset into real sector */
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 3ac452945a7d..cc9e60844484 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -4,7 +4,6 @@
 #include <linux/mmzone.h>
 #include <linux/stddef.h>
 #include <linux/linkage.h>
-#include <linux/config.h>
 
 struct vm_area_struct;
 
diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index eab537091f2a..114ae583cca9 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -1,7 +1,6 @@
 #ifndef LINUX_HARDIRQ_H
 #define LINUX_HARDIRQ_H
 
-#include <linux/config.h>
 #include <linux/preempt.h>
 #include <linux/smp_lock.h>
 #include <asm/hardirq.h>
diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index 892c4ea1b425..85ce7ef9a512 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -1,7 +1,6 @@
 #ifndef _LINUX_HIGHMEM_H
 #define _LINUX_HIGHMEM_H
 
-#include <linux/config.h>
 #include <linux/fs.h>
 #include <linux/mm.h>
 
diff --git a/include/linux/highuid.h b/include/linux/highuid.h
index 53ecac3905e8..434e56246f67 100644
--- a/include/linux/highuid.h
+++ b/include/linux/highuid.h
@@ -1,7 +1,6 @@
 #ifndef _LINUX_HIGHUID_H
 #define _LINUX_HIGHUID_H
 
-#include <linux/config.h>
 #include <linux/types.h>
 
 /*
diff --git a/include/linux/ide.h b/include/linux/ide.h
index a8bef1d1371c..77e66d055f5b 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -6,7 +6,6 @@
  *  Copyright (C) 1994-2002  Linus Torvalds & authors
  */
 
-#include <linux/config.h>
 #include <linux/init.h>
 #include <linux/ioport.h>
 #include <linux/hdreg.h>
diff --git a/include/linux/if_frad.h b/include/linux/if_frad.h
index 395f0aad9cbf..f272a80caa3e 100644
--- a/include/linux/if_frad.h
+++ b/include/linux/if_frad.h
@@ -24,7 +24,6 @@
 #ifndef _FRAD_H_
 #define _FRAD_H_
 
-#include <linux/config.h>
 #include <linux/if.h>
 
 #if defined(CONFIG_DLCI) || defined(CONFIG_DLCI_MODULE)
diff --git a/include/linux/if_tr.h b/include/linux/if_tr.h
index 5502f597cf0e..2f94cf2c7abb 100644
--- a/include/linux/if_tr.h
+++ b/include/linux/if_tr.h
@@ -43,7 +43,6 @@ struct trh_hdr {
 };
 
 #ifdef __KERNEL__
-#include <linux/config.h>
 #include <linux/skbuff.h>
 
 static inline struct trh_hdr *tr_hdr(const struct sk_buff *skb)
diff --git a/include/linux/init.h b/include/linux/init.h
index 93dcbe1abb4c..6667785dd1ff 100644
--- a/include/linux/init.h
+++ b/include/linux/init.h
@@ -1,7 +1,6 @@
 #ifndef _LINUX_INIT_H
 #define _LINUX_INIT_H
 
-#include <linux/config.h>
 #include <linux/compiler.h>
 
 /* These macros are used to mark some functions or 
diff --git a/include/linux/inotify.h b/include/linux/inotify.h
index 09e00433c78e..71aa1553ef38 100644
--- a/include/linux/inotify.h
+++ b/include/linux/inotify.h
@@ -67,7 +67,6 @@ struct inotify_event {
 
 #include <linux/dcache.h>
 #include <linux/fs.h>
-#include <linux/config.h>
 
 #ifdef CONFIG_INOTIFY
 
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 2c08fdc2bdf7..9e0fefd7884a 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -2,7 +2,6 @@
 #ifndef _LINUX_INTERRUPT_H
 #define _LINUX_INTERRUPT_H
 
-#include <linux/config.h>
 #include <linux/kernel.h>
 #include <linux/linkage.h>
 #include <linux/bitops.h>
diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 1263d8cb3c18..297853c841b4 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -1,7 +1,6 @@
 #ifndef _IPV6_H
 #define _IPV6_H
 
-#include <linux/config.h>
 #include <linux/in6.h>
 #include <asm/byteorder.h>
 
diff --git a/include/linux/irq.h b/include/linux/irq.h
index ee2a82a572f7..42c9cd562860 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -9,7 +9,6 @@
  * Thanks. --rmk
  */
 
-#include <linux/config.h>
 #include <linux/smp.h>
 
 #if !defined(CONFIG_S390)
diff --git a/include/linux/irq_cpustat.h b/include/linux/irq_cpustat.h
index af93505ec2ec..77e4bac29287 100644
--- a/include/linux/irq_cpustat.h
+++ b/include/linux/irq_cpustat.h
@@ -9,7 +9,6 @@
  * Keith Owens <kaos@ocs.com.au> July 2000.
  */
 
-#include <linux/config.h>
 
 /*
  * Simple wrappers reducing source bloat.  Define all irq_stat fields
diff --git a/include/linux/isapnp.h b/include/linux/isapnp.h
index 26c64c286f42..1e8728a9ee8a 100644
--- a/include/linux/isapnp.h
+++ b/include/linux/isapnp.h
@@ -22,7 +22,6 @@
 #ifndef LINUX_ISAPNP_H
 #define LINUX_ISAPNP_H
 
-#include <linux/config.h>
 #include <linux/errno.h>
 #include <linux/pnp.h>
 
diff --git a/include/linux/isdn.h b/include/linux/isdn.h
index 53eaee96065b..62991148d5a5 100644
--- a/include/linux/isdn.h
+++ b/include/linux/isdn.h
@@ -146,7 +146,6 @@ typedef struct {
 
 #ifdef __KERNEL__
 
-#include <linux/config.h>
 #include <linux/errno.h>
 #include <linux/fs.h>
 #include <linux/major.h>
diff --git a/include/linux/isdn_ppp.h b/include/linux/isdn_ppp.h
index 26b00a76e135..8687a7dc0632 100644
--- a/include/linux/isdn_ppp.h
+++ b/include/linux/isdn_ppp.h
@@ -67,7 +67,6 @@ struct isdn_ppp_comp_data {
 #ifdef __KERNEL__
 
 
-#include <linux/config.h>
 
 #ifdef CONFIG_IPPP_FILTER
 #include <linux/filter.h>
diff --git a/include/linux/isdnif.h b/include/linux/isdnif.h
index 04e10f9f14f8..b9b5a684ed69 100644
--- a/include/linux/isdnif.h
+++ b/include/linux/isdnif.h
@@ -54,7 +54,6 @@
 
 #ifdef __KERNEL__
 
-#include <linux/config.h>
 #include <linux/skbuff.h>
 
 /***************************************************************************/
diff --git a/include/linux/kallsyms.h b/include/linux/kallsyms.h
index 9bbd04092365..54e2549f96ba 100644
--- a/include/linux/kallsyms.h
+++ b/include/linux/kallsyms.h
@@ -5,7 +5,6 @@
 #ifndef _LINUX_KALLSYMS_H
 #define _LINUX_KALLSYMS_H
 
-#include <linux/config.h>
 
 #define KSYM_NAME_LEN 127
 
diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index b46249082cca..43e895f1cabe 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -1,7 +1,6 @@
 #ifndef _LINUX_KERNEL_STAT_H
 #define _LINUX_KERNEL_STAT_H
 
-#include <linux/config.h>
 #include <asm/irq.h>
 #include <linux/smp.h>
 #include <linux/threads.h>
diff --git a/include/linux/kmod.h b/include/linux/kmod.h
index e4a231549407..0db22a1ab474 100644
--- a/include/linux/kmod.h
+++ b/include/linux/kmod.h
@@ -20,7 +20,6 @@
  */
 
 #include <linux/stddef.h>
-#include <linux/config.h>
 #include <linux/errno.h>
 #include <linux/compiler.h>
 
diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index 778adc0fa640..8bf6702da2a0 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -29,7 +29,6 @@
  *		<jkenisto@us.ibm.com>  and Prasanna S Panchamukhi
  *		<prasanna@in.ibm.com> added function-return probes.
  */
-#include <linux/config.h>
 #include <linux/list.h>
 #include <linux/notifier.h>
 #include <linux/smp.h>
diff --git a/include/linux/linkage.h b/include/linux/linkage.h
index c08c9983e840..932021f872d5 100644
--- a/include/linux/linkage.h
+++ b/include/linux/linkage.h
@@ -1,7 +1,6 @@
 #ifndef _LINUX_LINKAGE_H
 #define _LINUX_LINKAGE_H
 
-#include <linux/config.h>
 #include <asm/linkage.h>
 
 #ifdef __cplusplus
diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h
index 995f89dc8c04..a8876bc6513b 100644
--- a/include/linux/lockd/lockd.h
+++ b/include/linux/lockd/lockd.h
@@ -11,7 +11,6 @@
 
 #ifdef __KERNEL__
 
-#include <linux/config.h>
 #include <linux/in.h>
 #include <linux/fs.h>
 #include <linux/kref.h>
diff --git a/include/linux/lockd/nlm.h b/include/linux/lockd/nlm.h
index 869b630cba24..d9d46e442538 100644
--- a/include/linux/lockd/nlm.h
+++ b/include/linux/lockd/nlm.h
@@ -9,7 +9,6 @@
 #ifndef LINUX_LOCKD_NLM_H
 #define LINUX_LOCKD_NLM_H
 
-#include <linux/config.h>
 
 /* Maximum file offset in file_lock.fl_end */
 # define NLM_OFFSET_MAX		((s32) 0x7fffffff)
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 6a7621b2b12b..8dfdd352bccd 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -28,7 +28,6 @@
 
 #ifdef __KERNEL__
 
-#include <linux/config.h>
 #include <linux/mmzone.h>
 #include <linux/slab.h>
 #include <linux/rbtree.h>
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index ff0a64073ebc..6789c4940c9c 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -1,7 +1,6 @@
 #ifndef _LINUX_MIGRATE_H
 #define _LINUX_MIGRATE_H
 
-#include <linux/config.h>
 #include <linux/mm.h>
 
 #ifdef CONFIG_MIGRATION
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 1154684209a4..e2fa375e478e 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -7,7 +7,6 @@
 
 #ifdef __KERNEL__
 
-#include <linux/config.h>
 #include <linux/gfp.h>
 #include <linux/list.h>
 #include <linux/mmzone.h>
diff --git a/include/linux/mman.h b/include/linux/mman.h
index 4ad21c5863fd..87920a0852a3 100644
--- a/include/linux/mman.h
+++ b/include/linux/mman.h
@@ -11,7 +11,6 @@
 #define OVERCOMMIT_NEVER		2
 
 #ifdef __KERNEL__
-#include <linux/config.h>
 #include <linux/mm.h>
 
 #include <asm/atomic.h>
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index b5c21122c299..6be91fb2deb1 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -4,7 +4,6 @@
 #ifdef __KERNEL__
 #ifndef __ASSEMBLY__
 
-#include <linux/config.h>
 #include <linux/spinlock.h>
 #include <linux/list.h>
 #include <linux/wait.h>
diff --git a/include/linux/module.h b/include/linux/module.h
index eaec13ddd667..05e7dd17b7d0 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -6,7 +6,6 @@
  * Rewritten by Richard Henderson <rth@tamu.edu> Dec 1996
  * Rewritten again by Rusty Russell, 2002
  */
-#include <linux/config.h>
 #include <linux/sched.h>
 #include <linux/spinlock.h>
 #include <linux/list.h>
diff --git a/include/linux/mtd/cfi.h b/include/linux/mtd/cfi.h
index 23a568910341..09bfae6938b3 100644
--- a/include/linux/mtd/cfi.h
+++ b/include/linux/mtd/cfi.h
@@ -7,7 +7,6 @@
 #ifndef __MTD_CFI_H__
 #define __MTD_CFI_H__
 
-#include <linux/config.h>
 #include <linux/delay.h>
 #include <linux/types.h>
 #include <linux/interrupt.h>
diff --git a/include/linux/mtd/map.h b/include/linux/mtd/map.h
index 7dfd6e1fcde7..28d461d862bd 100644
--- a/include/linux/mtd/map.h
+++ b/include/linux/mtd/map.h
@@ -5,7 +5,6 @@
 #ifndef __LINUX_MTD_MAP_H__
 #define __LINUX_MTD_MAP_H__
 
-#include <linux/config.h>
 #include <linux/types.h>
 #include <linux/list.h>
 #include <linux/string.h>
diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h
index b6f2fdae65c6..012a47df1960 100644
--- a/include/linux/mtd/mtd.h
+++ b/include/linux/mtd/mtd.h
@@ -13,7 +13,6 @@
 #error This is a kernel header. Perhaps include mtd-user.h instead?
 #endif
 
-#include <linux/config.h>
 #include <linux/types.h>
 #include <linux/module.h>
 #include <linux/uio.h>
diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h
index da5e67b3fc70..4b99d285803f 100644
--- a/include/linux/mtd/nand.h
+++ b/include/linux/mtd/nand.h
@@ -56,7 +56,6 @@
 #ifndef __LINUX_MTD_NAND_H
 #define __LINUX_MTD_NAND_H
 
-#include <linux/config.h>
 #include <linux/wait.h>
 #include <linux/spinlock.h>
 #include <linux/mtd/mtd.h>
diff --git a/include/linux/mtd/physmap.h b/include/linux/mtd/physmap.h
index c7b8bcdef013..bffaade1111e 100644
--- a/include/linux/mtd/physmap.h
+++ b/include/linux/mtd/physmap.h
@@ -16,7 +16,6 @@
 
 #ifndef __LINUX_MTD_PHYSMAP__
 
-#include <linux/config.h>
 
 #if defined(CONFIG_MTD_PHYSMAP)
 
diff --git a/include/linux/mtd/xip.h b/include/linux/mtd/xip.h
index 220d50bb71cd..e9d40bdde48c 100644
--- a/include/linux/mtd/xip.h
+++ b/include/linux/mtd/xip.h
@@ -18,7 +18,6 @@
 #ifndef __LINUX_MTD_XIP_H__
 #define __LINUX_MTD_XIP_H__
 
-#include <linux/config.h>
 
 #ifdef CONFIG_MTD_XIP
 
diff --git a/include/linux/net.h b/include/linux/net.h
index c88d7cf7f6b7..385e68f5bd93 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -18,7 +18,6 @@
 #ifndef _LINUX_NET_H
 #define _LINUX_NET_H
 
-#include <linux/config.h>
 #include <linux/wait.h>
 #include <asm/socket.h>
 
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 40ccf8cc4239..c81aa0f76642 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -34,7 +34,6 @@
 #include <asm/cache.h>
 #include <asm/byteorder.h>
 
-#include <linux/config.h>
 #include <linux/device.h>
 #include <linux/percpu.h>
 
diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index b31a9bca9361..10168e26a846 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -40,7 +40,6 @@
 #endif
 
 #ifdef __KERNEL__
-#include <linux/config.h>
 #ifdef CONFIG_NETFILTER
 
 extern void netfilter_init(void);
diff --git a/include/linux/netfilter_arp.h b/include/linux/netfilter_arp.h
index a3f8977f7f12..92bc6ddcbf73 100644
--- a/include/linux/netfilter_arp.h
+++ b/include/linux/netfilter_arp.h
@@ -5,7 +5,6 @@
  * (C)2002 Rusty Russell IBM -- This code is GPL.
  */
 
-#include <linux/config.h>
 #include <linux/netfilter.h>
 
 /* There is no PF_ARP. */
diff --git a/include/linux/netfilter_bridge.h b/include/linux/netfilter_bridge.h
index a75b84bb9a88..87764022cc67 100644
--- a/include/linux/netfilter_bridge.h
+++ b/include/linux/netfilter_bridge.h
@@ -4,7 +4,6 @@
 /* bridge-specific defines for netfilter. 
  */
 
-#include <linux/config.h>
 #include <linux/netfilter.h>
 #if defined(__KERNEL__) && defined(CONFIG_BRIDGE_NETFILTER)
 #include <asm/atomic.h>
diff --git a/include/linux/netfilter_ipv4.h b/include/linux/netfilter_ipv4.h
index 85301c5e8d24..ce02c984f3ba 100644
--- a/include/linux/netfilter_ipv4.h
+++ b/include/linux/netfilter_ipv4.h
@@ -5,7 +5,6 @@
  * (C)1998 Rusty Russell -- This code is GPL.
  */
 
-#include <linux/config.h>
 #include <linux/netfilter.h>
 
 /* only for userspace compatibility */
diff --git a/include/linux/netfilter_ipv4/ip_conntrack.h b/include/linux/netfilter_ipv4/ip_conntrack.h
index d54d7b278e96..4255bfec0920 100644
--- a/include/linux/netfilter_ipv4/ip_conntrack.h
+++ b/include/linux/netfilter_ipv4/ip_conntrack.h
@@ -4,7 +4,6 @@
 #include <linux/netfilter/nf_conntrack_common.h>
 
 #ifdef __KERNEL__
-#include <linux/config.h>
 #include <linux/netfilter_ipv4/ip_conntrack_tuple.h>
 #include <linux/bitops.h>
 #include <linux/compiler.h>
diff --git a/include/linux/netfilter_ipv4/listhelp.h b/include/linux/netfilter_ipv4/listhelp.h
index 360429f48737..5d92cf044d91 100644
--- a/include/linux/netfilter_ipv4/listhelp.h
+++ b/include/linux/netfilter_ipv4/listhelp.h
@@ -1,6 +1,5 @@
 #ifndef _LISTHELP_H
 #define _LISTHELP_H
-#include <linux/config.h>
 #include <linux/list.h>
 
 /* Header to do more comprehensive job than linux/list.h; assume list
diff --git a/include/linux/nfsd/nfsd.h b/include/linux/nfsd/nfsd.h
index ec7c2e872d72..2dcad295fece 100644
--- a/include/linux/nfsd/nfsd.h
+++ b/include/linux/nfsd/nfsd.h
@@ -10,7 +10,6 @@
 #ifndef LINUX_NFSD_NFSD_H
 #define LINUX_NFSD_NFSD_H
 
-#include <linux/config.h>
 #include <linux/types.h>
 #include <linux/unistd.h>
 #include <linux/dirent.h>
diff --git a/include/linux/nfsd/nfsfh.h b/include/linux/nfsd/nfsfh.h
index 0798b7781a6e..f9edcd2ff3c8 100644
--- a/include/linux/nfsd/nfsfh.h
+++ b/include/linux/nfsd/nfsfh.h
@@ -16,7 +16,6 @@
 
 #include <asm/types.h>
 #ifdef __KERNEL__
-# include <linux/config.h>
 # include <linux/types.h>
 # include <linux/string.h>
 # include <linux/fs.h>
diff --git a/include/linux/nfsd/syscall.h b/include/linux/nfsd/syscall.h
index 781efbf94ed3..dae0faea2807 100644
--- a/include/linux/nfsd/syscall.h
+++ b/include/linux/nfsd/syscall.h
@@ -11,7 +11,6 @@
 
 #include <asm/types.h>
 #ifdef __KERNEL__
-# include <linux/config.h>
 # include <linux/types.h>
 # include <linux/in.h>
 #endif 
diff --git a/include/linux/numa.h b/include/linux/numa.h
index e481feb1bfd8..a31a7301b159 100644
--- a/include/linux/numa.h
+++ b/include/linux/numa.h
@@ -1,7 +1,6 @@
 #ifndef _LINUX_NUMA_H
 #define _LINUX_NUMA_H
 
-#include <linux/config.h>
 
 #ifdef CONFIG_NODES_SHIFT
 #define NODES_SHIFT     CONFIG_NODES_SHIFT
diff --git a/include/linux/parport.h b/include/linux/parport.h
index 008d736a6c9a..d42737eeee06 100644
--- a/include/linux/parport.h
+++ b/include/linux/parport.h
@@ -96,7 +96,6 @@ typedef enum {
 /* The rest is for the kernel only */
 #ifdef __KERNEL__
 
-#include <linux/config.h>
 #include <linux/jiffies.h>
 #include <linux/proc_fs.h>
 #include <linux/spinlock.h>
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 3a6a4e37a482..63609ae10736 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -47,7 +47,6 @@
 #ifdef __KERNEL__
 
 #include <linux/types.h>
-#include <linux/config.h>
 #include <linux/ioport.h>
 #include <linux/list.h>
 #include <linux/errno.h>
diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h
index 682525511c9e..66b5de404f22 100644
--- a/include/linux/percpu_counter.h
+++ b/include/linux/percpu_counter.h
@@ -6,7 +6,6 @@
  * WARNING: these things are HUGE.  4 kbytes per counter on 32-way P4.
  */
 
-#include <linux/config.h>
 #include <linux/spinlock.h>
 #include <linux/smp.h>
 #include <linux/threads.h>
diff --git a/include/linux/pm.h b/include/linux/pm.h
index 66be58902b17..658c1b93d5bb 100644
--- a/include/linux/pm.h
+++ b/include/linux/pm.h
@@ -23,7 +23,6 @@
 
 #ifdef __KERNEL__
 
-#include <linux/config.h>
 #include <linux/list.h>
 #include <asm/atomic.h>
 
diff --git a/include/linux/pm_legacy.h b/include/linux/pm_legacy.h
index 008932d73c35..78027c533b94 100644
--- a/include/linux/pm_legacy.h
+++ b/include/linux/pm_legacy.h
@@ -1,7 +1,6 @@
 #ifndef __LINUX_PM_LEGACY_H__
 #define __LINUX_PM_LEGACY_H__
 
-#include <linux/config.h>
 
 #ifdef CONFIG_PM_LEGACY
 
diff --git a/include/linux/pmu.h b/include/linux/pmu.h
index 217d3daf7336..ecce5912f4d6 100644
--- a/include/linux/pmu.h
+++ b/include/linux/pmu.h
@@ -6,7 +6,6 @@
  * Copyright (C) 1998 Paul Mackerras.
  */
 
-#include <linux/config.h>
 
 #define PMU_DRIVER_VERSION	2
 
diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index 5769d14d1e6a..d0926d63406c 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -6,7 +6,6 @@
  * preempt_count (used for kernel preemption, interrupt count, etc.)
  */
 
-#include <linux/config.h>
 #include <linux/thread_info.h>
 #include <linux/linkage.h>
 
diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
index 4b47a0253425..5810d28fbed9 100644
--- a/include/linux/proc_fs.h
+++ b/include/linux/proc_fs.h
@@ -1,7 +1,6 @@
 #ifndef _LINUX_PROC_FS_H
 #define _LINUX_PROC_FS_H
 
-#include <linux/config.h>
 #include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/spinlock.h>
diff --git a/include/linux/profile.h b/include/linux/profile.h
index 1f2fea6640a4..e633004ae052 100644
--- a/include/linux/profile.h
+++ b/include/linux/profile.h
@@ -4,7 +4,6 @@
 #ifdef __KERNEL__
 
 #include <linux/kernel.h>
-#include <linux/config.h>
 #include <linux/init.h>
 #include <linux/cpumask.h>
 #include <asm/errno.h>
diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h
index 21e5a9124856..5110201a4159 100644
--- a/include/linux/quotaops.h
+++ b/include/linux/quotaops.h
@@ -10,7 +10,6 @@
 #ifndef _LINUX_QUOTAOPS_
 #define _LINUX_QUOTAOPS_
 
-#include <linux/config.h>
 #include <linux/smp_lock.h>
 
 #include <linux/fs.h>
diff --git a/include/linux/reiserfs_xattr.h b/include/linux/reiserfs_xattr.h
index d42603dafc7c..5e961035c725 100644
--- a/include/linux/reiserfs_xattr.h
+++ b/include/linux/reiserfs_xattr.h
@@ -2,7 +2,6 @@
   File: linux/reiserfs_xattr.h
 */
 
-#include <linux/config.h>
 #include <linux/xattr.h>
 
 /* Magic value in header */
diff --git a/include/linux/relay.h b/include/linux/relay.h
index 4bcc1531d6a9..24accb483849 100644
--- a/include/linux/relay.h
+++ b/include/linux/relay.h
@@ -10,7 +10,6 @@
 #ifndef _LINUX_RELAY_H
 #define _LINUX_RELAY_H
 
-#include <linux/config.h>
 #include <linux/types.h>
 #include <linux/sched.h>
 #include <linux/wait.h>
diff --git a/include/linux/rio.h b/include/linux/rio.h
index c7e907faae9c..d93857056cb9 100644
--- a/include/linux/rio.h
+++ b/include/linux/rio.h
@@ -17,7 +17,6 @@
 #ifdef __KERNEL__
 
 #include <linux/types.h>
-#include <linux/config.h>
 #include <linux/ioport.h>
 #include <linux/list.h>
 #include <linux/errno.h>
diff --git a/include/linux/rio_drv.h b/include/linux/rio_drv.h
index f54772d0e7f8..7adb2a1aac92 100644
--- a/include/linux/rio_drv.h
+++ b/include/linux/rio_drv.h
@@ -16,7 +16,6 @@
 #ifdef __KERNEL__
 
 #include <linux/types.h>
-#include <linux/config.h>
 #include <linux/ioport.h>
 #include <linux/list.h>
 #include <linux/errno.h>
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index d6b9bcd1384c..2d4c81a220db 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -4,7 +4,6 @@
  * Declarations for Reverse Mapping functions in mm/rmap.c
  */
 
-#include <linux/config.h>
 #include <linux/list.h>
 #include <linux/slab.h>
 #include <linux/mm.h>
diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index df0cdd41085c..facd9ee37b76 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -909,7 +909,6 @@ struct tcamsg
 
 #ifdef __KERNEL__
 
-#include <linux/config.h>
 #include <linux/mutex.h>
 
 extern size_t rtattr_strlcpy(char *dest, const struct rtattr *rta, size_t size);
diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
index bfb988885002..f99fe90732ab 100644
--- a/include/linux/rwsem.h
+++ b/include/linux/rwsem.h
@@ -13,7 +13,6 @@
 
 #ifdef __KERNEL__
 
-#include <linux/config.h>
 #include <linux/types.h>
 #include <linux/kernel.h>
 #include <asm/system.h>
diff --git a/include/linux/scc.h b/include/linux/scc.h
index 885a4a02b23c..3495bd953cc6 100644
--- a/include/linux/scc.h
+++ b/include/linux/scc.h
@@ -3,7 +3,6 @@
 #ifndef	_SCC_H
 #define	_SCC_H
 
-#include <linux/config.h>
 
 /* selection of hardware types */
 
diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
index cd2773b29a64..3e8b1cf54303 100644
--- a/include/linux/seccomp.h
+++ b/include/linux/seccomp.h
@@ -1,7 +1,6 @@
 #ifndef _LINUX_SECCOMP_H
 #define _LINUX_SECCOMP_H
 
-#include <linux/config.h>
 
 #ifdef CONFIG_SECCOMP
 
diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
index 5a095572881d..7bc5c7c12b54 100644
--- a/include/linux/seqlock.h
+++ b/include/linux/seqlock.h
@@ -26,7 +26,6 @@
  * by Keith Owens and Andrea Arcangeli
  */
 
-#include <linux/config.h>
 #include <linux/spinlock.h>
 #include <linux/preempt.h>
 
diff --git a/include/linux/serialP.h b/include/linux/serialP.h
index 2b9e6b9554d5..e811a615f696 100644
--- a/include/linux/serialP.h
+++ b/include/linux/serialP.h
@@ -19,7 +19,6 @@
  * For definitions of the flags field, see tty.h
  */
 
-#include <linux/config.h>
 #include <linux/termios.h>
 #include <linux/workqueue.h>
 #include <linux/interrupt.h>
diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h
index c32e60e79dea..fcfb783bef41 100644
--- a/include/linux/serial_core.h
+++ b/include/linux/serial_core.h
@@ -132,7 +132,6 @@
 
 #ifdef __KERNEL__
 
-#include <linux/config.h>
 #include <linux/compiler.h>
 #include <linux/interrupt.h>
 #include <linux/circ_buf.h>
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index f8f234708b98..4dc65b55812e 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -14,7 +14,6 @@
 #ifndef _LINUX_SKBUFF_H
 #define _LINUX_SKBUFF_H
 
-#include <linux/config.h>
 #include <linux/kernel.h>
 #include <linux/compiler.h>
 #include <linux/time.h>
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 3af03b19c983..a7d7f131b5da 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -11,7 +11,6 @@
 
 typedef struct kmem_cache kmem_cache_t;
 
-#include	<linux/config.h>	/* kmalloc_sizes.h needs CONFIG_ options */
 #include	<linux/gfp.h>
 #include	<linux/init.h>
 #include	<linux/types.h>
diff --git a/include/linux/smp.h b/include/linux/smp.h
index e2fa3ab4afc5..c93c3fe4308c 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -6,7 +6,6 @@
  *		Alan Cox. <alan@redhat.com>
  */
 
-#include <linux/config.h>
 
 extern void cpu_idle(void);
 
diff --git a/include/linux/smp_lock.h b/include/linux/smp_lock.h
index fa1ff3b165fe..cf715a40d833 100644
--- a/include/linux/smp_lock.h
+++ b/include/linux/smp_lock.h
@@ -1,7 +1,6 @@
 #ifndef __LINUX_SMPLOCK_H
 #define __LINUX_SMPLOCK_H
 
-#include <linux/config.h>
 #ifdef CONFIG_LOCK_KERNEL
 #include <linux/sched.h>
 #include <linux/spinlock.h>
diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h
index 799be6747944..ae23beef9cc9 100644
--- a/include/linux/spinlock.h
+++ b/include/linux/spinlock.h
@@ -46,7 +46,6 @@
  *  linux/spinlock.h:     builds the final spin_*() APIs.
  */
 
-#include <linux/config.h>
 #include <linux/preempt.h>
 #include <linux/linkage.h>
 #include <linux/compiler.h>
diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h
index 151a803ed0ed..5bfc553bdb21 100644
--- a/include/linux/stop_machine.h
+++ b/include/linux/stop_machine.h
@@ -4,7 +4,6 @@
    very heavy lock, which is equivalent to grabbing every spinlock
    (and more).  So the "read" side to such a lock is anything which
    diables preeempt. */
-#include <linux/config.h>
 #include <linux/cpu.h>
 #include <asm/system.h>
 
diff --git a/include/linux/sunrpc/auth.h b/include/linux/sunrpc/auth.h
index be4772ed43c0..a6de332e57d4 100644
--- a/include/linux/sunrpc/auth.h
+++ b/include/linux/sunrpc/auth.h
@@ -11,7 +11,6 @@
 
 #ifdef __KERNEL__
 
-#include <linux/config.h>
 #include <linux/sunrpc/sched.h>
 #include <linux/sunrpc/msg_prot.h>
 #include <linux/sunrpc/xdr.h>
diff --git a/include/linux/sunrpc/debug.h b/include/linux/sunrpc/debug.h
index e0cae8deb465..e4729aa67654 100644
--- a/include/linux/sunrpc/debug.h
+++ b/include/linux/sunrpc/debug.h
@@ -27,7 +27,6 @@
 #define RPCDBG_ALL		0x7fff
 
 #ifdef __KERNEL__
-#include <linux/config.h>
 
 #include <linux/timer.h>
 #include <linux/workqueue.h>
diff --git a/include/linux/sunrpc/stats.h b/include/linux/sunrpc/stats.h
index d93c24b47f3f..5fa0f2084307 100644
--- a/include/linux/sunrpc/stats.h
+++ b/include/linux/sunrpc/stats.h
@@ -9,7 +9,6 @@
 #ifndef _LINUX_SUNRPC_STATS_H
 #define _LINUX_SUNRPC_STATS_H
 
-#include <linux/config.h>
 #include <linux/proc_fs.h>
 
 struct rpc_stat {
diff --git a/include/linux/suspend.h b/include/linux/suspend.h
index 37c1c76fd547..96e31aa64cc7 100644
--- a/include/linux/suspend.h
+++ b/include/linux/suspend.h
@@ -6,7 +6,6 @@
 #endif
 #include <linux/swap.h>
 #include <linux/notifier.h>
-#include <linux/config.h>
 #include <linux/init.h>
 #include <linux/pm.h>
 
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 5b1fdf1cff4f..e24fa9b69cbf 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -1,7 +1,6 @@
 #ifndef _LINUX_SWAP_H
 #define _LINUX_SWAP_H
 
-#include <linux/config.h>
 #include <linux/spinlock.h>
 #include <linux/linkage.h>
 #include <linux/mmzone.h>
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index d3ebc0e68b2b..3bdc1970f8bd 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -53,7 +53,6 @@ struct mq_attr;
 struct compat_stat;
 struct compat_timeval;
 
-#include <linux/config.h>
 #include <linux/types.h>
 #include <linux/aio_abi.h>
 #include <linux/capability.h>
diff --git a/include/linux/sysrq.h b/include/linux/sysrq.h
index ea819b89c235..4812ff60561c 100644
--- a/include/linux/sysrq.h
+++ b/include/linux/sysrq.h
@@ -11,7 +11,6 @@
  *	based upon discusions in irc://irc.openprojects.net/#kernelnewbies
  */
 
-#include <linux/config.h>
 
 struct pt_regs;
 struct tty_struct;
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 542d39596bd8..a8b24eff5b5f 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -159,7 +159,6 @@ struct tcp_info
 
 #ifdef __KERNEL__
 
-#include <linux/config.h>
 #include <linux/skbuff.h>
 #include <net/sock.h>
 #include <net/inet_connection_sock.h>
diff --git a/include/linux/threads.h b/include/linux/threads.h
index e646bcdf2614..38d1a5d6568e 100644
--- a/include/linux/threads.h
+++ b/include/linux/threads.h
@@ -1,7 +1,6 @@
 #ifndef _LINUX_THREADS_H
 #define _LINUX_THREADS_H
 
-#include <linux/config.h>
 
 /*
  * The default limit for the nr of threads is now in
diff --git a/include/linux/timer.h b/include/linux/timer.h
index 0a485beba9f5..c982304dbafd 100644
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -1,7 +1,6 @@
 #ifndef _LINUX_TIMER_H
 #define _LINUX_TIMER_H
 
-#include <linux/config.h>
 #include <linux/list.h>
 #include <linux/spinlock.h>
 #include <linux/stddef.h>
diff --git a/include/linux/timex.h b/include/linux/timex.h
index 03914b7e41b1..34d3ccff7bbb 100644
--- a/include/linux/timex.h
+++ b/include/linux/timex.h
@@ -53,7 +53,6 @@
 #ifndef _LINUX_TIMEX_H
 #define _LINUX_TIMEX_H
 
-#include <linux/config.h>
 #include <linux/compiler.h>
 #include <linux/time.h>
 
diff --git a/include/linux/tty.h b/include/linux/tty.h
index f13f49afe198..e898eeb94166 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -16,7 +16,6 @@
 		   consoles 16 and higher (since it returns a short) */
 
 #ifdef __KERNEL__
-#include <linux/config.h>
 #include <linux/fs.h>
 #include <linux/major.h>
 #include <linux/termios.h>
diff --git a/include/linux/types.h b/include/linux/types.h
index 1046c7ad86d9..a5e46e783ffa 100644
--- a/include/linux/types.h
+++ b/include/linux/types.h
@@ -2,7 +2,6 @@
 #define _LINUX_TYPES_H
 
 #ifdef	__KERNEL__
-#include <linux/config.h>
 
 #define BITS_TO_LONGS(bits) \
 	(((bits)+BITS_PER_LONG-1)/BITS_PER_LONG)
diff --git a/include/linux/udp.h b/include/linux/udp.h
index 85a55658831c..bdd39be09406 100644
--- a/include/linux/udp.h
+++ b/include/linux/udp.h
@@ -35,7 +35,6 @@ struct udphdr {
 #define UDP_ENCAP_ESPINUDP	2 /* draft-ietf-ipsec-udp-encaps-06 */
 
 #ifdef __KERNEL__
-#include <linux/config.h>
 #include <linux/types.h>
 
 #include <net/inet_sock.h>
diff --git a/include/linux/usb.h b/include/linux/usb.h
index e34e5e3dce52..1f492c0c7047 100644
--- a/include/linux/usb.h
+++ b/include/linux/usb.h
@@ -10,7 +10,6 @@
 
 #ifdef __KERNEL__
 
-#include <linux/config.h>
 #include <linux/errno.h>        /* for -ENODEV */
 #include <linux/delay.h>	/* for mdelay() */
 #include <linux/interrupt.h>	/* for in_interrupt() */
diff --git a/include/linux/usb_usual.h b/include/linux/usb_usual.h
index b2d08984a9f7..608487a62c98 100644
--- a/include/linux/usb_usual.h
+++ b/include/linux/usb_usual.h
@@ -9,7 +9,6 @@
 #ifndef __LINUX_USB_USUAL_H
 #define __LINUX_USB_USUAL_H
 
-#include <linux/config.h>
 
 /* We should do this for cleanliness... But other usb_foo.h do not do this. */
 /* #include <linux/usb.h> */
diff --git a/include/linux/vt_buffer.h b/include/linux/vt_buffer.h
index 1f7ba3629053..057db7d2f448 100644
--- a/include/linux/vt_buffer.h
+++ b/include/linux/vt_buffer.h
@@ -13,7 +13,6 @@
 #ifndef _LINUX_VT_BUFFER_H_
 #define _LINUX_VT_BUFFER_H_
 
-#include <linux/config.h>
 
 #if defined(CONFIG_VGA_CONSOLE) || defined(CONFIG_MDA_CONSOLE)
 #include <asm/vga.h>
diff --git a/include/linux/vt_kern.h b/include/linux/vt_kern.h
index 530ae3f4248c..6ef527bb6235 100644
--- a/include/linux/vt_kern.h
+++ b/include/linux/vt_kern.h
@@ -6,7 +6,6 @@
  * with information needed by the vt package
  */
 
-#include <linux/config.h>
 #include <linux/vt.h>
 #include <linux/kd.h>
 #include <linux/tty.h>
diff --git a/include/linux/wait.h b/include/linux/wait.h
index d28518236b62..544e855c7c02 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -19,7 +19,6 @@
 
 #ifdef __KERNEL__
 
-#include <linux/config.h>
 #include <linux/list.h>
 #include <linux/stddef.h>
 #include <linux/spinlock.h>
diff --git a/include/net/addrconf.h b/include/net/addrconf.h
index 750e2508dd90..3d71251b3eca 100644
--- a/include/net/addrconf.h
+++ b/include/net/addrconf.h
@@ -45,7 +45,6 @@ struct prefix_info {
 
 #ifdef __KERNEL__
 
-#include <linux/config.h>
 #include <linux/netdevice.h>
 #include <net/if_inet6.h>
 #include <net/ipv6.h>
diff --git a/include/net/af_unix.h b/include/net/af_unix.h
index 427dac94bc7e..795f81f9ec7f 100644
--- a/include/net/af_unix.h
+++ b/include/net/af_unix.h
@@ -1,7 +1,6 @@
 #ifndef __LINUX_NET_AFUNIX_H
 #define __LINUX_NET_AFUNIX_H
 
-#include <linux/config.h>
 #include <linux/socket.h>
 #include <linux/un.h>
 #include <linux/mutex.h>
diff --git a/include/net/ax25.h b/include/net/ax25.h
index d052b221dbcd..b74945288dfc 100644
--- a/include/net/ax25.h
+++ b/include/net/ax25.h
@@ -6,7 +6,6 @@
 #ifndef _AX25_H
 #define _AX25_H 
 
-#include <linux/config.h>
 #include <linux/ax25.h>
 #include <linux/spinlock.h>
 #include <linux/timer.h>
diff --git a/include/net/compat.h b/include/net/compat.h
index 8662b8f43df5..da680272cf6e 100644
--- a/include/net/compat.h
+++ b/include/net/compat.h
@@ -1,7 +1,6 @@
 #ifndef NET_COMPAT_H
 #define NET_COMPAT_H
 
-#include <linux/config.h>
 
 #if defined(CONFIG_COMPAT)
 
diff --git a/include/net/dst.h b/include/net/dst.h
index 5161e89017f9..36d54fc248b0 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -8,7 +8,6 @@
 #ifndef _NET_DST_H
 #define _NET_DST_H
 
-#include <linux/config.h>
 #include <linux/netdevice.h>
 #include <linux/rtnetlink.h>
 #include <linux/rcupdate.h>
diff --git a/include/net/icmp.h b/include/net/icmp.h
index e7c3f20fbafc..05f8ff7d9316 100644
--- a/include/net/icmp.h
+++ b/include/net/icmp.h
@@ -18,7 +18,6 @@
 #ifndef _ICMP_H
 #define	_ICMP_H
 
-#include <linux/config.h>
 #include <linux/icmp.h>
 
 #include <net/inet_sock.h>
diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h
index 59f0c83d55a2..bc6a71dce984 100644
--- a/include/net/inet6_hashtables.h
+++ b/include/net/inet6_hashtables.h
@@ -14,7 +14,6 @@
 #ifndef _INET6_HASHTABLES_H
 #define _INET6_HASHTABLES_H
 
-#include <linux/config.h>
 
 #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
 #include <linux/in6.h>
diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index 135d80fd658e..98e0bb3014fe 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -14,7 +14,6 @@
 #ifndef _INET_HASHTABLES_H
 #define _INET_HASHTABLES_H
 
-#include <linux/config.h>
 
 #include <linux/interrupt.h>
 #include <linux/ipv6.h>
diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index 883eb529ef8e..1f4a9a60d4cc 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -16,7 +16,6 @@
 #ifndef _INET_SOCK_H
 #define _INET_SOCK_H
 
-#include <linux/config.h>
 
 #include <linux/string.h>
 #include <linux/types.h>
diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h
index 1da294c47522..519d3a077c62 100644
--- a/include/net/inet_timewait_sock.h
+++ b/include/net/inet_timewait_sock.h
@@ -15,7 +15,6 @@
 #ifndef _INET_TIMEWAIT_SOCK_
 #define _INET_TIMEWAIT_SOCK_
 
-#include <linux/config.h>
 
 #include <linux/list.h>
 #include <linux/module.h>
diff --git a/include/net/ip.h b/include/net/ip.h
index 3d2e5ca62a5a..3900fccf60c7 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -22,7 +22,6 @@
 #ifndef _IP_H
 #define _IP_H
 
-#include <linux/config.h>
 #include <linux/types.h>
 #include <linux/ip.h>
 #include <linux/in.h>
diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index e000fa2cd5f6..a095d1dec7a4 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -16,7 +16,6 @@
 #ifndef _NET_IP_FIB_H
 #define _NET_IP_FIB_H
 
-#include <linux/config.h>
 #include <net/flow.h>
 #include <linux/seq_file.h>
 
diff --git a/include/net/ip_mp_alg.h b/include/net/ip_mp_alg.h
index 77225735cbd4..ac747b64734c 100644
--- a/include/net/ip_mp_alg.h
+++ b/include/net/ip_mp_alg.h
@@ -7,7 +7,6 @@
 #ifndef _NET_IP_MP_ALG_H
 #define _NET_IP_MP_ALG_H
 
-#include <linux/config.h>
 #include <linux/ip_mp_alg.h>
 #include <net/flow.h>
 #include <net/route.h>
diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index 7d2674fde19a..3b57b159b653 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -248,7 +248,6 @@ struct ip_vs_daemon_user {
 
 #ifdef __KERNEL__
 
-#include <linux/config.h>
 #include <linux/list.h>                 /* for struct list_head */
 #include <linux/spinlock.h>             /* for struct rwlock_t */
 #include <asm/atomic.h>                 /* for struct atomic_t */
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 4abedb8eaece..a8fdf7970b37 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -104,7 +104,6 @@ struct frag_hdr {
 
 #ifdef __KERNEL__
 
-#include <linux/config.h>
 #include <net/sock.h>
 
 /* sysctls */
diff --git a/include/net/irda/irda.h b/include/net/irda/irda.h
index 1880e46ecc9b..1cb0607fcbb9 100644
--- a/include/net/irda/irda.h
+++ b/include/net/irda/irda.h
@@ -26,7 +26,6 @@
 #ifndef NET_IRDA_H
 #define NET_IRDA_H
 
-#include <linux/config.h>
 #include <linux/skbuff.h>		/* struct sk_buff */
 #include <linux/kernel.h>
 #include <linux/if.h>			/* sa_family_t in <linux/irda.h> */
diff --git a/include/net/irda/irda_device.h b/include/net/irda/irda_device.h
index 92c828029cd8..0575c59a5c96 100644
--- a/include/net/irda/irda_device.h
+++ b/include/net/irda/irda_device.h
@@ -39,7 +39,6 @@
 #ifndef IRDA_DEVICE_H
 #define IRDA_DEVICE_H
 
-#include <linux/config.h>
 #include <linux/tty.h>
 #include <linux/netdevice.h>
 #include <linux/spinlock.h>
diff --git a/include/net/irda/irlap.h b/include/net/irda/irlap.h
index 2127cae1e0a6..e77eb88d9226 100644
--- a/include/net/irda/irlap.h
+++ b/include/net/irda/irlap.h
@@ -27,7 +27,6 @@
 #ifndef IRLAP_H
 #define IRLAP_H
 
-#include <linux/config.h>
 #include <linux/types.h>
 #include <linux/skbuff.h>
 #include <linux/netdevice.h>
diff --git a/include/net/irda/irlmp.h b/include/net/irda/irlmp.h
index 86aefb1fda5e..0d8e9fa416f3 100644
--- a/include/net/irda/irlmp.h
+++ b/include/net/irda/irlmp.h
@@ -29,7 +29,6 @@
 
 #include <asm/param.h>  /* for HZ */
 
-#include <linux/config.h>
 #include <linux/types.h>
 
 #include <net/irda/irda.h>
diff --git a/include/net/irda/irlmp_frame.h b/include/net/irda/irlmp_frame.h
index eb3ad158c023..c463f8bca856 100644
--- a/include/net/irda/irlmp_frame.h
+++ b/include/net/irda/irlmp_frame.h
@@ -26,7 +26,6 @@
 #ifndef IRMLP_FRAME_H
 #define IRMLP_FRAME_H
 
-#include <linux/config.h>
 #include <linux/skbuff.h>
 
 #include <net/irda/discovery.h>
diff --git a/include/net/irda/qos.h b/include/net/irda/qos.h
index 9ae3d6bc2423..cc577dc0a0ef 100644
--- a/include/net/irda/qos.h
+++ b/include/net/irda/qos.h
@@ -31,7 +31,6 @@
 #ifndef IRDA_QOS_H
 #define IRDA_QOS_H
 
-#include <linux/config.h>
 #include <linux/skbuff.h>
 
 #include <net/irda/parameters.h>
diff --git a/include/net/ndisc.h b/include/net/ndisc.h
index 91fa271a0064..d3915dabe6de 100644
--- a/include/net/ndisc.h
+++ b/include/net/ndisc.h
@@ -37,7 +37,6 @@ enum {
 
 #ifdef __KERNEL__
 
-#include <linux/config.h>
 #include <linux/compiler.h>
 #include <linux/icmpv6.h>
 #include <linux/in6.h>
diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
index 916013ca4a5c..fc00aa31e282 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -15,7 +15,6 @@
 #include <linux/netfilter/nf_conntrack_common.h>
 
 #ifdef __KERNEL__
-#include <linux/config.h>
 #include <linux/bitops.h>
 #include <linux/compiler.h>
 #include <asm/atomic.h>
diff --git a/include/net/pkt_act.h b/include/net/pkt_act.h
index b225d8472b7e..cf5e4d2e4c21 100644
--- a/include/net/pkt_act.h
+++ b/include/net/pkt_act.h
@@ -4,7 +4,6 @@
 #include <asm/uaccess.h>
 #include <asm/system.h>
 #include <linux/bitops.h>
-#include <linux/config.h>
 #include <linux/types.h>
 #include <linux/kernel.h>
 #include <linux/sched.h>
diff --git a/include/net/protocol.h b/include/net/protocol.h
index 6dc5970612d7..bcaee39bd2ff 100644
--- a/include/net/protocol.h
+++ b/include/net/protocol.h
@@ -24,7 +24,6 @@
 #ifndef _PROTOCOL_H
 #define _PROTOCOL_H
 
-#include <linux/config.h>
 #include <linux/in6.h>
 #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
 #include <linux/ipv6.h>
diff --git a/include/net/raw.h b/include/net/raw.h
index e67b28a0248c..481b20190b12 100644
--- a/include/net/raw.h
+++ b/include/net/raw.h
@@ -17,7 +17,6 @@
 #ifndef _RAW_H
 #define _RAW_H
 
-#include <linux/config.h>
 
 #include <net/protocol.h>
 
diff --git a/include/net/red.h b/include/net/red.h
index 2ed4358e3295..5ccdbb3d4722 100644
--- a/include/net/red.h
+++ b/include/net/red.h
@@ -1,7 +1,6 @@
 #ifndef __NET_SCHED_RED_H
 #define __NET_SCHED_RED_H
 
-#include <linux/config.h>
 #include <linux/types.h>
 #include <net/pkt_sched.h>
 #include <net/inet_ecn.h>
diff --git a/include/net/route.h b/include/net/route.h
index 98c915abdec8..c4a068692dcc 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -24,7 +24,6 @@
 #ifndef _ROUTE_H
 #define _ROUTE_H
 
-#include <linux/config.h>
 #include <net/dst.h>
 #include <net/inetpeer.h>
 #include <net/flow.h>
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 7b6ec9986715..b0e9108a4e18 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -1,7 +1,6 @@
 #ifndef __NET_SCHED_GENERIC_H
 #define __NET_SCHED_GENERIC_H
 
-#include <linux/config.h>
 #include <linux/netdevice.h>
 #include <linux/types.h>
 #include <linux/rcupdate.h>
diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
index e673b2c984e9..9c30fa55051e 100644
--- a/include/net/sctp/sctp.h
+++ b/include/net/sctp/sctp.h
@@ -63,7 +63,6 @@
  */
 
 
-#include <linux/config.h>
 
 #ifdef TEST_FRAME
 #undef CONFIG_PROC_FS
diff --git a/include/net/sock.h b/include/net/sock.h
index ff8b0dad7b0f..d8a5d87ad145 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -40,7 +40,6 @@
 #ifndef _SOCK_H
 #define _SOCK_H
 
-#include <linux/config.h>
 #include <linux/list.h>
 #include <linux/timer.h>
 #include <linux/cache.h>
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 3c989db8a7aa..9e88dcd5f134 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -21,7 +21,6 @@
 #define TCP_DEBUG 1
 #define FASTRETRANS_DEBUG 1
 
-#include <linux/config.h>
 #include <linux/list.h>
 #include <linux/tcp.h>
 #include <linux/slab.h>
diff --git a/include/pcmcia/ss.h b/include/pcmcia/ss.h
index 5e0a01ab2216..ede639812f8a 100644
--- a/include/pcmcia/ss.h
+++ b/include/pcmcia/ss.h
@@ -15,7 +15,6 @@
 #ifndef _LINUX_SS_H
 #define _LINUX_SS_H
 
-#include <linux/config.h>
 #include <linux/device.h>
 #include <linux/sched.h>	/* task_struct, completion */
 #include <linux/mutex.h>
diff --git a/include/scsi/scsi_transport_fc.h b/include/scsi/scsi_transport_fc.h
index 5626225bd3ae..6d28b0317657 100644
--- a/include/scsi/scsi_transport_fc.h
+++ b/include/scsi/scsi_transport_fc.h
@@ -27,7 +27,6 @@
 #ifndef SCSI_TRANSPORT_FC_H
 #define SCSI_TRANSPORT_FC_H
 
-#include <linux/config.h>
 #include <linux/sched.h>
 #include <scsi/scsi.h>
 
diff --git a/include/scsi/scsi_transport_spi.h b/include/scsi/scsi_transport_spi.h
index 5e1d61913d4e..302680c0c0de 100644
--- a/include/scsi/scsi_transport_spi.h
+++ b/include/scsi/scsi_transport_spi.h
@@ -20,7 +20,6 @@
 #ifndef SCSI_TRANSPORT_SPI_H
 #define SCSI_TRANSPORT_SPI_H
 
-#include <linux/config.h>
 #include <linux/transport_class.h>
 #include <linux/mutex.h>
 
diff --git a/include/sound/driver.h b/include/sound/driver.h
index 89c6a73f3920..3c522e59a33c 100644
--- a/include/sound/driver.h
+++ b/include/sound/driver.h
@@ -26,7 +26,6 @@
 #include "config.h"
 #endif
 
-#include <linux/config.h>
 
 /* number of supported soundcards */
 #ifdef CONFIG_SND_DYNAMIC_MINORS
diff --git a/include/video/edid.h b/include/video/edid.h
index b913f196131d..f6a42d6c2e2d 100644
--- a/include/video/edid.h
+++ b/include/video/edid.h
@@ -3,7 +3,6 @@
 
 #ifdef __KERNEL__
 
-#include <linux/config.h>
 
 #ifdef CONFIG_X86
 struct edid_info {
diff --git a/include/video/vga.h b/include/video/vga.h
index 700d6c8eb736..b49a5120ca2d 100644
--- a/include/video/vga.h
+++ b/include/video/vga.h
@@ -17,7 +17,6 @@
 #ifndef __linux_video_vga_h__
 #define __linux_video_vga_h__
 
-#include <linux/config.h>
 #include <linux/types.h>
 #include <asm/io.h>
 #ifndef CONFIG_AMIGA
-- 
cgit v1.2.3


From f001e47f83db18a9f202f25c0255b4d11ebe468b Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Thu, 27 Apr 2006 00:11:01 +0100
Subject: Sanitise linux/audit.h for userspace consumption, split elf-em.h from
 elf.h

Don't include <linux/sched.h> outside __KERNEL__, and split the EM_xxx
definitions out of elf.h into elf-em.h so that audit.h can include just
that and not pollute the namespace any further than it needs to.

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 include/linux/audit.h  |  4 ++--
 include/linux/elf-em.h | 44 +++++++++++++++++++++++++++++++++++++
 include/linux/elf.h    | 59 +-------------------------------------------------
 3 files changed, 47 insertions(+), 60 deletions(-)
 create mode 100644 include/linux/elf-em.h

(limited to 'include/linux')

diff --git a/include/linux/audit.h b/include/linux/audit.h
index 1c47c59058c1..319975532943 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -24,8 +24,7 @@
 #ifndef _LINUX_AUDIT_H_
 #define _LINUX_AUDIT_H_
 
-#include <linux/sched.h>
-#include <linux/elf.h>
+#include <linux/elf-em.h>
 
 /* The netlink messages for the audit system is divided into blocks:
  * 1000 - 1099 are for commanding the audit system
@@ -267,6 +266,7 @@ struct audit_rule {		/* for AUDIT_LIST, AUDIT_ADD, and AUDIT_DEL */
 };
 
 #ifdef __KERNEL__
+#include <linux/sched.h>
 
 struct audit_sig_info {
 	uid_t		uid;
diff --git a/include/linux/elf-em.h b/include/linux/elf-em.h
new file mode 100644
index 000000000000..114a96d25652
--- /dev/null
+++ b/include/linux/elf-em.h
@@ -0,0 +1,44 @@
+#ifndef _LINUX_ELF_EM_H
+#define _LINUX_ELF_EM_H
+
+/* These constants define the various ELF target machines */
+#define EM_NONE		0
+#define EM_M32		1
+#define EM_SPARC	2
+#define EM_386		3
+#define EM_68K		4
+#define EM_88K		5
+#define EM_486		6	/* Perhaps disused */
+#define EM_860		7
+#define EM_MIPS		8	/* MIPS R3000 (officially, big-endian only) */
+#define EM_MIPS_RS4_BE	10	/* MIPS R4000 big-endian */
+#define EM_PARISC	15	/* HPPA */
+#define EM_SPARC32PLUS	18	/* Sun's "v8plus" */
+#define EM_PPC		20	/* PowerPC */
+#define EM_PPC64	21       /* PowerPC64 */
+#define EM_SH		42	/* SuperH */
+#define EM_SPARCV9	43	/* SPARC v9 64-bit */
+#define EM_IA_64	50	/* HP/Intel IA-64 */
+#define EM_X86_64	62	/* AMD x86-64 */
+#define EM_S390		22	/* IBM S/390 */
+#define EM_CRIS		76	/* Axis Communications 32-bit embedded processor */
+#define EM_V850		87	/* NEC v850 */
+#define EM_M32R		88	/* Renesas M32R */
+#define EM_H8_300	46	/* Renesas H8/300,300H,H8S */
+#define EM_FRV		0x5441	/* Fujitsu FR-V */
+
+/*
+ * This is an interim value that we will use until the committee comes
+ * up with a final number.
+ */
+#define EM_ALPHA	0x9026
+
+/* Bogus old v850 magic number, used by old tools. */
+#define EM_CYGNUS_V850	0x9080
+/* Bogus old m32r magic number, used by old tools. */
+#define EM_CYGNUS_M32R	0x9041
+/* This is the old interim value for S/390 architecture */
+#define EM_S390_OLD	0xA390
+
+
+#endif /* _LINUX_ELF_EM_H */
diff --git a/include/linux/elf.h b/include/linux/elf.h
index d3bfacb24496..b70d1d2c8d28 100644
--- a/include/linux/elf.h
+++ b/include/linux/elf.h
@@ -3,6 +3,7 @@
 
 #include <linux/types.h>
 #include <linux/auxvec.h>
+#include <linux/elf-em.h>
 #include <asm/elf.h>
 
 #ifndef elf_read_implies_exec
@@ -55,64 +56,6 @@ typedef __s64	Elf64_Sxword;
 #define ET_LOPROC 0xff00
 #define ET_HIPROC 0xffff
 
-/* These constants define the various ELF target machines */
-#define EM_NONE  0
-#define EM_M32   1
-#define EM_SPARC 2
-#define EM_386   3
-#define EM_68K   4
-#define EM_88K   5
-#define EM_486   6   /* Perhaps disused */
-#define EM_860   7
-
-#define EM_MIPS		8	/* MIPS R3000 (officially, big-endian only) */
-
-#define EM_MIPS_RS4_BE 10	/* MIPS R4000 big-endian */
-
-#define EM_PARISC      15	/* HPPA */
-
-#define EM_SPARC32PLUS 18	/* Sun's "v8plus" */
-
-#define EM_PPC	       20	/* PowerPC */
-#define EM_PPC64       21       /* PowerPC64 */
-
-#define EM_SH	       42	/* SuperH */
-
-#define EM_SPARCV9     43	/* SPARC v9 64-bit */
-
-#define EM_IA_64	50	/* HP/Intel IA-64 */
-
-#define EM_X86_64	62	/* AMD x86-64 */
-
-#define EM_S390		22	/* IBM S/390 */
-
-#define EM_CRIS         76      /* Axis Communications 32-bit embedded processor */
-
-#define EM_V850		87	/* NEC v850 */
-
-#define EM_M32R		88	/* Renesas M32R */
-
-#define EM_H8_300       46      /* Renesas H8/300,300H,H8S */
-
-/*
- * This is an interim value that we will use until the committee comes
- * up with a final number.
- */
-#define EM_ALPHA	0x9026
-
-/* Bogus old v850 magic number, used by old tools.  */
-#define EM_CYGNUS_V850	0x9080
-
-/* Bogus old m32r magic number, used by old tools.  */
-#define EM_CYGNUS_M32R	0x9041
-
-/*
- * This is the old interim value for S/390 architecture
- */
-#define EM_S390_OLD     0xA390
-
-#define EM_FRV		0x5441		/* Fujitsu FR-V */
-
 /* This is the info that is needed to parse the dynamic section of the file */
 #define DT_NULL		0
 #define DT_NEEDED	1
-- 
cgit v1.2.3


From b7b3c76a0a21c5a98124e90c47c488f7e4166f87 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Thu, 27 Apr 2006 00:12:56 +0100
Subject: Sanitise linux/sched.h for userspace consumption

There was a whole load of crap exposed which should have been inside the
existing #ifdef __KERNEL__ part. Also hide struct sched_param for now,
since glibc has its own and doesn't like being given ours (yet).

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 include/linux/sched.h | 76 +++++++++++++++++++++++++--------------------------
 1 file changed, 37 insertions(+), 39 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 2e05e402df4f..701b8cbceb05 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1,7 +1,44 @@
 #ifndef _LINUX_SCHED_H
 #define _LINUX_SCHED_H
 
+#include <linux/auxvec.h>	/* For AT_VECTOR_SIZE */
+
+/*
+ * cloning flags:
+ */
+#define CSIGNAL		0x000000ff	/* signal mask to be sent at exit */
+#define CLONE_VM	0x00000100	/* set if VM shared between processes */
+#define CLONE_FS	0x00000200	/* set if fs info shared between processes */
+#define CLONE_FILES	0x00000400	/* set if open files shared between processes */
+#define CLONE_SIGHAND	0x00000800	/* set if signal handlers and blocked signals shared */
+#define CLONE_PTRACE	0x00002000	/* set if we want to let tracing continue on the child too */
+#define CLONE_VFORK	0x00004000	/* set if the parent wants the child to wake it up on mm_release */
+#define CLONE_PARENT	0x00008000	/* set if we want to have the same parent as the cloner */
+#define CLONE_THREAD	0x00010000	/* Same thread group? */
+#define CLONE_NEWNS	0x00020000	/* New namespace group? */
+#define CLONE_SYSVSEM	0x00040000	/* share system V SEM_UNDO semantics */
+#define CLONE_SETTLS	0x00080000	/* create a new TLS for the child */
+#define CLONE_PARENT_SETTID	0x00100000	/* set the TID in the parent */
+#define CLONE_CHILD_CLEARTID	0x00200000	/* clear the TID in the child */
+#define CLONE_DETACHED		0x00400000	/* Unused, ignored */
+#define CLONE_UNTRACED		0x00800000	/* set if the tracing process can't force CLONE_PTRACE on this clone */
+#define CLONE_CHILD_SETTID	0x01000000	/* set the TID in the child */
+#define CLONE_STOPPED		0x02000000	/* Start in stopped state */
+
+/*
+ * Scheduling policies
+ */
+#define SCHED_NORMAL		0
+#define SCHED_FIFO		1
+#define SCHED_RR		2
+#define SCHED_BATCH		3
+
 #ifdef __KERNEL__
+
+struct sched_param {
+	int sched_priority;
+};
+
 #include <asm/param.h>	/* for HZ */
 
 #include <linux/capability.h>
@@ -44,34 +81,9 @@
 #include <linux/hrtimer.h>
 
 #include <asm/processor.h>
-#endif
-
-#include <linux/auxvec.h>	/* For AT_VECTOR_SIZE */
 
 struct exec_domain;
 
-/*
- * cloning flags:
- */
-#define CSIGNAL		0x000000ff	/* signal mask to be sent at exit */
-#define CLONE_VM	0x00000100	/* set if VM shared between processes */
-#define CLONE_FS	0x00000200	/* set if fs info shared between processes */
-#define CLONE_FILES	0x00000400	/* set if open files shared between processes */
-#define CLONE_SIGHAND	0x00000800	/* set if signal handlers and blocked signals shared */
-#define CLONE_PTRACE	0x00002000	/* set if we want to let tracing continue on the child too */
-#define CLONE_VFORK	0x00004000	/* set if the parent wants the child to wake it up on mm_release */
-#define CLONE_PARENT	0x00008000	/* set if we want to have the same parent as the cloner */
-#define CLONE_THREAD	0x00010000	/* Same thread group? */
-#define CLONE_NEWNS	0x00020000	/* New namespace group? */
-#define CLONE_SYSVSEM	0x00040000	/* share system V SEM_UNDO semantics */
-#define CLONE_SETTLS	0x00080000	/* create a new TLS for the child */
-#define CLONE_PARENT_SETTID	0x00100000	/* set the TID in the parent */
-#define CLONE_CHILD_CLEARTID	0x00200000	/* clear the TID in the child */
-#define CLONE_DETACHED		0x00400000	/* Unused, ignored */
-#define CLONE_UNTRACED		0x00800000	/* set if the tracing process can't force CLONE_PTRACE on this clone */
-#define CLONE_CHILD_SETTID	0x01000000	/* set the TID in the child */
-#define CLONE_STOPPED		0x02000000	/* Start in stopped state */
-
 /*
  * List of flags we want to share for kernel threads,
  * if only because they are not used by them anyway.
@@ -158,20 +170,6 @@ extern unsigned long nr_iowait(void);
 /* Task command name length */
 #define TASK_COMM_LEN 16
 
-/*
- * Scheduling policies
- */
-#define SCHED_NORMAL		0
-#define SCHED_FIFO		1
-#define SCHED_RR		2
-#define SCHED_BATCH		3
-
-struct sched_param {
-	int sched_priority;
-};
-
-#ifdef __KERNEL__
-
 #include <linux/spinlock.h>
 
 /*
-- 
cgit v1.2.3


From acc429a517bd11fdcac9bea97d082d26231beb92 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Thu, 27 Apr 2006 16:46:56 +0100
Subject: linux/blkpg.h needs <linux/compiler.h> for __user

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 include/linux/blkpg.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/blkpg.h b/include/linux/blkpg.h
index be5d0f4ad24c..faf8a45af210 100644
--- a/include/linux/blkpg.h
+++ b/include/linux/blkpg.h
@@ -24,6 +24,7 @@
  *
  * For today, only the partition stuff - aeb, 990515
  */
+#include <linux/compiler.h>
 #include <linux/ioctl.h>
 
 #define BLKPG      _IO(0x12,105)
-- 
cgit v1.2.3


From 778382e08cce51b6268ca49449e5bd70c8413799 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@shinybook.infradead.org>
Date: Sat, 29 Apr 2006 01:46:02 +0100
Subject: Don't include <linux/mod_devicetable.h> in public part of linux/pci.h

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 include/linux/pci.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pci.h b/include/linux/pci.h
index 63609ae10736..fee8275df6d8 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -17,8 +17,6 @@
 #ifndef LINUX_PCI_H
 #define LINUX_PCI_H
 
-#include <linux/mod_devicetable.h>
-
 /* Include the pci register defines */
 #include <linux/pci_regs.h>
 
@@ -46,6 +44,8 @@
 
 #ifdef __KERNEL__
 
+#include <linux/mod_devicetable.h>
+
 #include <linux/types.h>
 #include <linux/ioport.h>
 #include <linux/list.h>
-- 
cgit v1.2.3


From c7afb48eb5147be9eb9789b4161462d246451ac2 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@shinybook.infradead.org>
Date: Sat, 29 Apr 2006 01:48:16 +0100
Subject: Remove struct input_device_id from public view in linux/input.h

It uses kernel_ulong_t but can't be wrapped in __KERNEL__ because it's
used from scripts/mod/file2alias.c -- but we _can_ hide it inside
header manually too (and it doesn't generally exist for userspace).

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 include/linux/input.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/input.h b/include/linux/input.h
index b0e612dda0cf..f7ac97d834f6 100644
--- a/include/linux/input.h
+++ b/include/linux/input.h
@@ -805,6 +805,9 @@ struct ff_effect {
 
 #define FF_MAX		0x7f
 
+#ifdef LINUX_MOD_DEVICETABLE_H
+/* We only want this if mod_devicetable.h has been included -- that's
+   either in kernel space, or in scripts/mod/file2alias.c */
 struct input_device_id {
 
 	kernel_ulong_t flags;
@@ -823,6 +826,7 @@ struct input_device_id {
 
 	kernel_ulong_t driver_info;
 };
+#endif 
 
 /*
  * Structure for hotplug & device<->driver matching.
-- 
cgit v1.2.3


From 34c278d3913a15b64943e8c40a16b4f732cc7c59 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@shinybook.infradead.org>
Date: Sat, 29 Apr 2006 01:49:06 +0100
Subject: Remove 'extern int errno;' from public view in linux/unistd.h

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 include/linux/unistd.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/unistd.h b/include/linux/unistd.h
index 10ed9834b822..c18c60f3254e 100644
--- a/include/linux/unistd.h
+++ b/include/linux/unistd.h
@@ -1,7 +1,9 @@
 #ifndef _LINUX_UNISTD_H_
 #define _LINUX_UNISTD_H_
 
+#ifdef __KERNEL__
 extern int errno;
+#endif
 
 /*
  * Include machine specific syscallX macros
-- 
cgit v1.2.3


From c3ce7e203af5d8eab7c3390fc991a1fcb152f741 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@shinybook.infradead.org>
Date: Sat, 29 Apr 2006 01:53:47 +0100
Subject: Sanitise ethtool.h and mii.h for userspace.

They shouldn't be using 'u32' et al in structures which are used for
communication with userspace. Switch to the proper types (__u32 etc).

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 include/linux/ethtool.h | 169 ++++++++++++++++++++++++------------------------
 include/linux/mii.h     |  30 ++++-----
 2 files changed, 101 insertions(+), 98 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index 93535f093216..cf2abeca92a0 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -15,24 +15,24 @@
 
 /* This should work for both 32 and 64 bit userland. */
 struct ethtool_cmd {
-	u32	cmd;
-	u32	supported;	/* Features this interface supports */
-	u32	advertising;	/* Features this interface advertises */
-	u16	speed;		/* The forced speed, 10Mb, 100Mb, gigabit */
-	u8	duplex;		/* Duplex, half or full */
-	u8	port;		/* Which connector port */
-	u8	phy_address;
-	u8	transceiver;	/* Which transceiver to use */
-	u8	autoneg;	/* Enable or disable autonegotiation */
-	u32	maxtxpkt;	/* Tx pkts before generating tx int */
-	u32	maxrxpkt;	/* Rx pkts before generating rx int */
-	u32	reserved[4];
+	__u32	cmd;
+	__u32	supported;	/* Features this interface supports */
+	__u32	advertising;	/* Features this interface advertises */
+	__u16	speed;		/* The forced speed, 10Mb, 100Mb, gigabit */
+	__u8	duplex;		/* Duplex, half or full */
+	__u8	port;		/* Which connector port */
+	__u8	phy_address;
+	__u8	transceiver;	/* Which transceiver to use */
+	__u8	autoneg;	/* Enable or disable autonegotiation */
+	__u32	maxtxpkt;	/* Tx pkts before generating tx int */
+	__u32	maxrxpkt;	/* Rx pkts before generating rx int */
+	__u32	reserved[4];
 };
 
 #define ETHTOOL_BUSINFO_LEN	32
 /* these strings are set to whatever the driver author decides... */
 struct ethtool_drvinfo {
-	u32	cmd;
+	__u32	cmd;
 	char	driver[32];	/* driver short name, "tulip", "eepro100" */
 	char	version[32];	/* driver version string */
 	char	fw_version[32];	/* firmware version string, if applicable */
@@ -40,53 +40,53 @@ struct ethtool_drvinfo {
 				/* For PCI devices, use pci_name(pci_dev). */
 	char	reserved1[32];
 	char	reserved2[16];
-	u32	n_stats;	/* number of u64's from ETHTOOL_GSTATS */
-	u32	testinfo_len;
-	u32	eedump_len;	/* Size of data from ETHTOOL_GEEPROM (bytes) */
-	u32	regdump_len;	/* Size of data from ETHTOOL_GREGS (bytes) */
+	__u32	n_stats;	/* number of u64's from ETHTOOL_GSTATS */
+	__u32	testinfo_len;
+	__u32	eedump_len;	/* Size of data from ETHTOOL_GEEPROM (bytes) */
+	__u32	regdump_len;	/* Size of data from ETHTOOL_GREGS (bytes) */
 };
 
 #define SOPASS_MAX	6
 /* wake-on-lan settings */
 struct ethtool_wolinfo {
-	u32	cmd;
-	u32	supported;
-	u32	wolopts;
-	u8	sopass[SOPASS_MAX]; /* SecureOn(tm) password */
+	__u32	cmd;
+	__u32	supported;
+	__u32	wolopts;
+	__u8	sopass[SOPASS_MAX]; /* SecureOn(tm) password */
 };
 
 /* for passing single values */
 struct ethtool_value {
-	u32	cmd;
-	u32	data;
+	__u32	cmd;
+	__u32	data;
 };
 
 /* for passing big chunks of data */
 struct ethtool_regs {
-	u32	cmd;
-	u32	version; /* driver-specific, indicates different chips/revs */
-	u32	len; /* bytes */
-	u8	data[0];
+	__u32	cmd;
+	__u32	version; /* driver-specific, indicates different chips/revs */
+	__u32	len; /* bytes */
+	__u8	data[0];
 };
 
 /* for passing EEPROM chunks */
 struct ethtool_eeprom {
-	u32	cmd;
-	u32	magic;
-	u32	offset; /* in bytes */
-	u32	len; /* in bytes */
-	u8	data[0];
+	__u32	cmd;
+	__u32	magic;
+	__u32	offset; /* in bytes */
+	__u32	len; /* in bytes */
+	__u8	data[0];
 };
 
 /* for configuring coalescing parameters of chip */
 struct ethtool_coalesce {
-	u32	cmd;	/* ETHTOOL_{G,S}COALESCE */
+	__u32	cmd;	/* ETHTOOL_{G,S}COALESCE */
 
 	/* How many usecs to delay an RX interrupt after
 	 * a packet arrives.  If 0, only rx_max_coalesced_frames
 	 * is used.
 	 */
-	u32	rx_coalesce_usecs;
+	__u32	rx_coalesce_usecs;
 
 	/* How many packets to delay an RX interrupt after
 	 * a packet arrives.  If 0, only rx_coalesce_usecs is
@@ -94,21 +94,21 @@ struct ethtool_coalesce {
 	 * to zero as this would cause RX interrupts to never be
 	 * generated.
 	 */
-	u32	rx_max_coalesced_frames;
+	__u32	rx_max_coalesced_frames;
 
 	/* Same as above two parameters, except that these values
 	 * apply while an IRQ is being serviced by the host.  Not
 	 * all cards support this feature and the values are ignored
 	 * in that case.
 	 */
-	u32	rx_coalesce_usecs_irq;
-	u32	rx_max_coalesced_frames_irq;
+	__u32	rx_coalesce_usecs_irq;
+	__u32	rx_max_coalesced_frames_irq;
 
 	/* How many usecs to delay a TX interrupt after
 	 * a packet is sent.  If 0, only tx_max_coalesced_frames
 	 * is used.
 	 */
-	u32	tx_coalesce_usecs;
+	__u32	tx_coalesce_usecs;
 
 	/* How many packets to delay a TX interrupt after
 	 * a packet is sent.  If 0, only tx_coalesce_usecs is
@@ -116,22 +116,22 @@ struct ethtool_coalesce {
 	 * to zero as this would cause TX interrupts to never be
 	 * generated.
 	 */
-	u32	tx_max_coalesced_frames;
+	__u32	tx_max_coalesced_frames;
 
 	/* Same as above two parameters, except that these values
 	 * apply while an IRQ is being serviced by the host.  Not
 	 * all cards support this feature and the values are ignored
 	 * in that case.
 	 */
-	u32	tx_coalesce_usecs_irq;
-	u32	tx_max_coalesced_frames_irq;
+	__u32	tx_coalesce_usecs_irq;
+	__u32	tx_max_coalesced_frames_irq;
 
 	/* How many usecs to delay in-memory statistics
 	 * block updates.  Some drivers do not have an in-memory
 	 * statistic block, and in such cases this value is ignored.
 	 * This value must not be zero.
 	 */
-	u32	stats_block_coalesce_usecs;
+	__u32	stats_block_coalesce_usecs;
 
 	/* Adaptive RX/TX coalescing is an algorithm implemented by
 	 * some drivers to improve latency under low packet rates and
@@ -140,18 +140,18 @@ struct ethtool_coalesce {
 	 * not implemented by the driver causes these values to be
 	 * silently ignored.
 	 */
-	u32	use_adaptive_rx_coalesce;
-	u32	use_adaptive_tx_coalesce;
+	__u32	use_adaptive_rx_coalesce;
+	__u32	use_adaptive_tx_coalesce;
 
 	/* When the packet rate (measured in packets per second)
 	 * is below pkt_rate_low, the {rx,tx}_*_low parameters are
 	 * used.
 	 */
-	u32	pkt_rate_low;
-	u32	rx_coalesce_usecs_low;
-	u32	rx_max_coalesced_frames_low;
-	u32	tx_coalesce_usecs_low;
-	u32	tx_max_coalesced_frames_low;
+	__u32	pkt_rate_low;
+	__u32	rx_coalesce_usecs_low;
+	__u32	rx_max_coalesced_frames_low;
+	__u32	tx_coalesce_usecs_low;
+	__u32	tx_max_coalesced_frames_low;
 
 	/* When the packet rate is below pkt_rate_high but above
 	 * pkt_rate_low (both measured in packets per second) the
@@ -162,43 +162,43 @@ struct ethtool_coalesce {
 	 * is above pkt_rate_high, the {rx,tx}_*_high parameters are
 	 * used.
 	 */
-	u32	pkt_rate_high;
-	u32	rx_coalesce_usecs_high;
-	u32	rx_max_coalesced_frames_high;
-	u32	tx_coalesce_usecs_high;
-	u32	tx_max_coalesced_frames_high;
+	__u32	pkt_rate_high;
+	__u32	rx_coalesce_usecs_high;
+	__u32	rx_max_coalesced_frames_high;
+	__u32	tx_coalesce_usecs_high;
+	__u32	tx_max_coalesced_frames_high;
 
 	/* How often to do adaptive coalescing packet rate sampling,
 	 * measured in seconds.  Must not be zero.
 	 */
-	u32	rate_sample_interval;
+	__u32	rate_sample_interval;
 };
 
 /* for configuring RX/TX ring parameters */
 struct ethtool_ringparam {
-	u32	cmd;	/* ETHTOOL_{G,S}RINGPARAM */
+	__u32	cmd;	/* ETHTOOL_{G,S}RINGPARAM */
 
 	/* Read only attributes.  These indicate the maximum number
 	 * of pending RX/TX ring entries the driver will allow the
 	 * user to set.
 	 */
-	u32	rx_max_pending;
-	u32	rx_mini_max_pending;
-	u32	rx_jumbo_max_pending;
-	u32	tx_max_pending;
+	__u32	rx_max_pending;
+	__u32	rx_mini_max_pending;
+	__u32	rx_jumbo_max_pending;
+	__u32	tx_max_pending;
 
 	/* Values changeable by the user.  The valid values are
 	 * in the range 1 to the "*_max_pending" counterpart above.
 	 */
-	u32	rx_pending;
-	u32	rx_mini_pending;
-	u32	rx_jumbo_pending;
-	u32	tx_pending;
+	__u32	rx_pending;
+	__u32	rx_mini_pending;
+	__u32	rx_jumbo_pending;
+	__u32	tx_pending;
 };
 
 /* for configuring link flow control parameters */
 struct ethtool_pauseparam {
-	u32	cmd;	/* ETHTOOL_{G,S}PAUSEPARAM */
+	__u32	cmd;	/* ETHTOOL_{G,S}PAUSEPARAM */
 
 	/* If the link is being auto-negotiated (via ethtool_cmd.autoneg
 	 * being true) the user may set 'autonet' here non-zero to have the
@@ -210,9 +210,9 @@ struct ethtool_pauseparam {
 	 * then {rx,tx}_pause force the driver to use/not-use pause
 	 * flow control.
 	 */
-	u32	autoneg;
-	u32	rx_pause;
-	u32	tx_pause;
+	__u32	autoneg;
+	__u32	rx_pause;
+	__u32	tx_pause;
 };
 
 #define ETH_GSTRING_LEN		32
@@ -223,10 +223,10 @@ enum ethtool_stringset {
 
 /* for passing string sets for data tagging */
 struct ethtool_gstrings {
-	u32	cmd;		/* ETHTOOL_GSTRINGS */
-	u32	string_set;	/* string set id e.c. ETH_SS_TEST, etc*/
-	u32	len;		/* number of strings in the string set */
-	u8	data[0];
+	__u32	cmd;		/* ETHTOOL_GSTRINGS */
+	__u32	string_set;	/* string set id e.c. ETH_SS_TEST, etc*/
+	__u32	len;		/* number of strings in the string set */
+	__u8	data[0];
 };
 
 enum ethtool_test_flags {
@@ -236,26 +236,28 @@ enum ethtool_test_flags {
 
 /* for requesting NIC test and getting results*/
 struct ethtool_test {
-	u32	cmd;		/* ETHTOOL_TEST */
-	u32	flags;		/* ETH_TEST_FL_xxx */
-	u32	reserved;
-	u32	len;		/* result length, in number of u64 elements */
-	u64	data[0];
+	__u32	cmd;		/* ETHTOOL_TEST */
+	__u32	flags;		/* ETH_TEST_FL_xxx */
+	__u32	reserved;
+	__u32	len;		/* result length, in number of u64 elements */
+	__u64	data[0];
 };
 
 /* for dumping NIC-specific statistics */
 struct ethtool_stats {
-	u32	cmd;		/* ETHTOOL_GSTATS */
-	u32	n_stats;	/* number of u64's being returned */
-	u64	data[0];
+	__u32	cmd;		/* ETHTOOL_GSTATS */
+	__u32	n_stats;	/* number of u64's being returned */
+	__u64	data[0];
 };
 
 struct ethtool_perm_addr {
-	u32	cmd;		/* ETHTOOL_GPERMADDR */
-	u32	size;
-	u8	data[0];
+	__u32	cmd;		/* ETHTOOL_GPERMADDR */
+	__u32	size;
+	__u8	data[0];
 };
 
+#ifdef __KERNEL__
+
 struct net_device;
 
 /* Some generic methods drivers may use in their ethtool_ops */
@@ -371,6 +373,7 @@ struct ethtool_ops {
 	u32     (*get_ufo)(struct net_device *);
 	int     (*set_ufo)(struct net_device *, u32);
 };
+#endif /* __KERNEL__ */
 
 /* CMDs currently supported */
 #define ETHTOOL_GSET		0x00000001 /* Get settings. */
diff --git a/include/linux/mii.h b/include/linux/mii.h
index 68f5a0f392dd..beddc6d3b0f6 100644
--- a/include/linux/mii.h
+++ b/include/linux/mii.h
@@ -9,7 +9,6 @@
 #define __LINUX_MII_H__
 
 #include <linux/types.h>
-#include <linux/if.h>
 
 /* Generic MII registers. */
 
@@ -136,6 +135,20 @@
 #define LPA_1000FULL            0x0800  /* Link partner 1000BASE-T full duplex */
 #define LPA_1000HALF            0x0400  /* Link partner 1000BASE-T half duplex */
 
+/* This structure is used in all SIOCxMIIxxx ioctl calls */
+struct mii_ioctl_data {
+	__u16		phy_id;
+	__u16		reg_num;
+	__u16		val_in;
+	__u16		val_out;
+};
+
+#ifdef __KERNEL__ 
+
+#include <linux/if.h>
+
+struct ethtool_cmd;
+
 struct mii_if_info {
 	int phy_id;
 	int advertising;
@@ -151,9 +164,6 @@ struct mii_if_info {
 	void (*mdio_write) (struct net_device *dev, int phy_id, int location, int val);
 };
 
-struct ethtool_cmd;
-struct mii_ioctl_data;
-
 extern int mii_link_ok (struct mii_if_info *mii);
 extern int mii_nway_restart (struct mii_if_info *mii);
 extern int mii_ethtool_gset(struct mii_if_info *mii, struct ethtool_cmd *ecmd);
@@ -168,16 +178,6 @@ extern int generic_mii_ioctl(struct mii_if_info *mii_if,
 			     unsigned int *duplex_changed);
 
 
-
-/* This structure is used in all SIOCxMIIxxx ioctl calls */
-struct mii_ioctl_data {
-	u16		phy_id;
-	u16		reg_num;
-	u16		val_in;
-	u16		val_out;
-};
-
-
 static inline struct mii_ioctl_data *if_mii(struct ifreq *rq)
 {
 	return (struct mii_ioctl_data *) &rq->ifr_ifru;
@@ -235,5 +235,5 @@ static inline unsigned int mii_duplex (unsigned int duplex_lock,
 	return 0;
 }
 
-
+#endif /* __KERNEL__ */
 #endif /* __LINUX_MII_H__ */
-- 
cgit v1.2.3


From 4f79c3ffc6e04623711e86cf9a0e09e4aad8cb36 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Tue, 2 May 2006 10:41:25 +0100
Subject: Guard some of linux/compiler.h with #ifdef __KERNEL__

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 include/linux/compiler.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index f23d3c6fc2c0..1234be9024a2 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -78,6 +78,7 @@ extern void __chk_io_ptr(void __iomem *);
 
 #endif /* __ASSEMBLY__ */
 
+#ifdef __KERNEL__
 /*
  * Allow us to mark functions as 'deprecated' and have gcc emit a nice
  * warning for each use, in hopes of speeding the functions removal.
@@ -153,4 +154,5 @@ extern void __chk_io_ptr(void __iomem *);
 #define __always_inline inline
 #endif
 
+#endif /* __KERNEL__ */
 #endif /* __LINUX_COMPILER_H */
-- 
cgit v1.2.3


From cbb9a56177b16294ed347ba7fcb1c66c8adb5dc4 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Wed, 3 May 2006 13:07:27 +0100
Subject: Move jffs2_fs_i.h and jffs2_fs_sb.h from include/linux/ to fs/jffs2/

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 fs/jffs2/compr.h            |   4 +-
 fs/jffs2/dir.c              |   4 +-
 fs/jffs2/jffs2_fs_i.h       |  50 ++++++++++++++++++
 fs/jffs2/jffs2_fs_sb.h      | 122 ++++++++++++++++++++++++++++++++++++++++++++
 fs/jffs2/nodelist.h         |   4 +-
 include/linux/jffs2_fs_i.h  |  50 ------------------
 include/linux/jffs2_fs_sb.h | 122 --------------------------------------------
 7 files changed, 178 insertions(+), 178 deletions(-)
 create mode 100644 fs/jffs2/jffs2_fs_i.h
 create mode 100644 fs/jffs2/jffs2_fs_sb.h
 delete mode 100644 include/linux/jffs2_fs_i.h
 delete mode 100644 include/linux/jffs2_fs_sb.h

(limited to 'include/linux')

diff --git a/fs/jffs2/compr.h b/fs/jffs2/compr.h
index a77e830d85c5..509b8b1c0811 100644
--- a/fs/jffs2/compr.h
+++ b/fs/jffs2/compr.h
@@ -23,8 +23,8 @@
 #include <linux/errno.h>
 #include <linux/fs.h>
 #include <linux/jffs2.h>
-#include <linux/jffs2_fs_i.h>
-#include <linux/jffs2_fs_sb.h>
+#include "jffs2_fs_i.h"
+#include "jffs2_fs_sb.h"
 #include "nodelist.h"
 
 #define JFFS2_RUBINMIPS_PRIORITY 10
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index f92840a3a52f..1c8e8c0f6cea 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -17,8 +17,8 @@
 #include <linux/fs.h>
 #include <linux/crc32.h>
 #include <linux/jffs2.h>
-#include <linux/jffs2_fs_i.h>
-#include <linux/jffs2_fs_sb.h>
+#include "jffs2_fs_i.h"
+#include "jffs2_fs_sb.h"
 #include <linux/time.h>
 #include "nodelist.h"
 
diff --git a/fs/jffs2/jffs2_fs_i.h b/fs/jffs2/jffs2_fs_i.h
new file mode 100644
index 000000000000..ad565bf9dcc1
--- /dev/null
+++ b/fs/jffs2/jffs2_fs_i.h
@@ -0,0 +1,50 @@
+/* $Id: jffs2_fs_i.h,v 1.19 2005/11/07 11:14:52 gleixner Exp $ */
+
+#ifndef _JFFS2_FS_I
+#define _JFFS2_FS_I
+
+#include <linux/version.h>
+#include <linux/rbtree.h>
+#include <asm/semaphore.h>
+
+struct jffs2_inode_info {
+	/* We need an internal mutex similar to inode->i_mutex.
+	   Unfortunately, we can't used the existing one, because
+	   either the GC would deadlock, or we'd have to release it
+	   before letting GC proceed. Or we'd have to put ugliness
+	   into the GC code so it didn't attempt to obtain the i_mutex
+	   for the inode(s) which are already locked */
+	struct semaphore sem;
+
+	/* The highest (datanode) version number used for this ino */
+	uint32_t highest_version;
+
+	/* List of data fragments which make up the file */
+	struct rb_root fragtree;
+
+	/* There may be one datanode which isn't referenced by any of the
+	   above fragments, if it contains a metadata update but no actual
+	   data - or if this is a directory inode */
+	/* This also holds the _only_ dnode for symlinks/device nodes,
+	   etc. */
+	struct jffs2_full_dnode *metadata;
+
+	/* Directory entries */
+	struct jffs2_full_dirent *dents;
+
+	/* The target path if this is the inode of a symlink */
+	unsigned char *target;
+
+	/* Some stuff we just have to keep in-core at all times, for each inode. */
+	struct jffs2_inode_cache *inocache;
+
+	uint16_t flags;
+	uint8_t usercompr;
+#if !defined (__ECOS)
+#if LINUX_VERSION_CODE > KERNEL_VERSION(2,5,2)
+	struct inode vfs_inode;
+#endif
+#endif
+};
+
+#endif /* _JFFS2_FS_I */
diff --git a/fs/jffs2/jffs2_fs_sb.h b/fs/jffs2/jffs2_fs_sb.h
new file mode 100644
index 000000000000..4bcfb5570221
--- /dev/null
+++ b/fs/jffs2/jffs2_fs_sb.h
@@ -0,0 +1,122 @@
+/* $Id: jffs2_fs_sb.h,v 1.54 2005/09/21 13:37:34 dedekind Exp $ */
+
+#ifndef _JFFS2_FS_SB
+#define _JFFS2_FS_SB
+
+#include <linux/types.h>
+#include <linux/spinlock.h>
+#include <linux/workqueue.h>
+#include <linux/completion.h>
+#include <asm/semaphore.h>
+#include <linux/timer.h>
+#include <linux/wait.h>
+#include <linux/list.h>
+#include <linux/rwsem.h>
+
+#define JFFS2_SB_FLAG_RO 1
+#define JFFS2_SB_FLAG_SCANNING 2 /* Flash scanning is in progress */
+#define JFFS2_SB_FLAG_BUILDING 4 /* File system building is in progress */
+
+struct jffs2_inodirty;
+
+/* A struct for the overall file system control.  Pointers to
+   jffs2_sb_info structs are named `c' in the source code.
+   Nee jffs_control
+*/
+struct jffs2_sb_info {
+	struct mtd_info *mtd;
+
+	uint32_t highest_ino;
+	uint32_t checked_ino;
+
+	unsigned int flags;
+
+	struct task_struct *gc_task;	/* GC task struct */
+	struct completion gc_thread_start; /* GC thread start completion */
+	struct completion gc_thread_exit; /* GC thread exit completion port */
+
+	struct semaphore alloc_sem;	/* Used to protect all the following
+					   fields, and also to protect against
+					   out-of-order writing of nodes. And GC. */
+	uint32_t cleanmarker_size;	/* Size of an _inline_ CLEANMARKER
+					 (i.e. zero for OOB CLEANMARKER */
+
+	uint32_t flash_size;
+	uint32_t used_size;
+	uint32_t dirty_size;
+	uint32_t wasted_size;
+	uint32_t free_size;
+	uint32_t erasing_size;
+	uint32_t bad_size;
+	uint32_t sector_size;
+	uint32_t unchecked_size;
+
+	uint32_t nr_free_blocks;
+	uint32_t nr_erasing_blocks;
+
+	/* Number of free blocks there must be before we... */
+	uint8_t resv_blocks_write;	/* ... allow a normal filesystem write */
+	uint8_t resv_blocks_deletion;	/* ... allow a normal filesystem deletion */
+	uint8_t resv_blocks_gctrigger;	/* ... wake up the GC thread */
+	uint8_t resv_blocks_gcbad;	/* ... pick a block from the bad_list to GC */
+	uint8_t resv_blocks_gcmerge;	/* ... merge pages when garbage collecting */
+
+	uint32_t nospc_dirty_size;
+
+	uint32_t nr_blocks;
+	struct jffs2_eraseblock *blocks;	/* The whole array of blocks. Used for getting blocks
+						 * from the offset (blocks[ofs / sector_size]) */
+	struct jffs2_eraseblock *nextblock;	/* The block we're currently filling */
+
+	struct jffs2_eraseblock *gcblock;	/* The block we're currently garbage-collecting */
+
+	struct list_head clean_list;		/* Blocks 100% full of clean data */
+	struct list_head very_dirty_list;	/* Blocks with lots of dirty space */
+	struct list_head dirty_list;		/* Blocks with some dirty space */
+	struct list_head erasable_list;		/* Blocks which are completely dirty, and need erasing */
+	struct list_head erasable_pending_wbuf_list;	/* Blocks which need erasing but only after the current wbuf is flushed */
+	struct list_head erasing_list;		/* Blocks which are currently erasing */
+	struct list_head erase_pending_list;	/* Blocks which need erasing now */
+	struct list_head erase_complete_list;	/* Blocks which are erased and need the clean marker written to them */
+	struct list_head free_list;		/* Blocks which are free and ready to be used */
+	struct list_head bad_list;		/* Bad blocks. */
+	struct list_head bad_used_list;		/* Bad blocks with valid data in. */
+
+	spinlock_t erase_completion_lock;	/* Protect free_list and erasing_list
+						   against erase completion handler */
+	wait_queue_head_t erase_wait;		/* For waiting for erases to complete */
+
+	wait_queue_head_t inocache_wq;
+	struct jffs2_inode_cache **inocache_list;
+	spinlock_t inocache_lock;
+
+	/* Sem to allow jffs2_garbage_collect_deletion_dirent to
+	   drop the erase_completion_lock while it's holding a pointer
+	   to an obsoleted node. I don't like this. Alternatives welcomed. */
+	struct semaphore erase_free_sem;
+
+	uint32_t wbuf_pagesize; /* 0 for NOR and other flashes with no wbuf */
+
+#ifdef CONFIG_JFFS2_FS_WRITEBUFFER
+	/* Write-behind buffer for NAND flash */
+	unsigned char *wbuf;
+	uint32_t wbuf_ofs;
+	uint32_t wbuf_len;
+	struct jffs2_inodirty *wbuf_inodes;
+
+	struct rw_semaphore wbuf_sem;	/* Protects the write buffer */
+
+	/* Information about out-of-band area usage... */
+	struct nand_oobinfo *oobinfo;
+	uint32_t badblock_pos;
+	uint32_t fsdata_pos;
+	uint32_t fsdata_len;
+#endif
+
+	struct jffs2_summary *summary;		/* Summary information */
+
+	/* OS-private pointer for getting back to master superblock info */
+	void *os_priv;
+};
+
+#endif /* _JFFS2_FB_SB */
diff --git a/fs/jffs2/nodelist.h b/fs/jffs2/nodelist.h
index 23a67bb3052f..f6645afe88e4 100644
--- a/fs/jffs2/nodelist.h
+++ b/fs/jffs2/nodelist.h
@@ -18,8 +18,8 @@
 #include <linux/fs.h>
 #include <linux/types.h>
 #include <linux/jffs2.h>
-#include <linux/jffs2_fs_sb.h>
-#include <linux/jffs2_fs_i.h>
+#include "jffs2_fs_sb.h"
+#include "jffs2_fs_i.h"
 #include "summary.h"
 
 #ifdef __ECOS
diff --git a/include/linux/jffs2_fs_i.h b/include/linux/jffs2_fs_i.h
deleted file mode 100644
index ad565bf9dcc1..000000000000
--- a/include/linux/jffs2_fs_i.h
+++ /dev/null
@@ -1,50 +0,0 @@
-/* $Id: jffs2_fs_i.h,v 1.19 2005/11/07 11:14:52 gleixner Exp $ */
-
-#ifndef _JFFS2_FS_I
-#define _JFFS2_FS_I
-
-#include <linux/version.h>
-#include <linux/rbtree.h>
-#include <asm/semaphore.h>
-
-struct jffs2_inode_info {
-	/* We need an internal mutex similar to inode->i_mutex.
-	   Unfortunately, we can't used the existing one, because
-	   either the GC would deadlock, or we'd have to release it
-	   before letting GC proceed. Or we'd have to put ugliness
-	   into the GC code so it didn't attempt to obtain the i_mutex
-	   for the inode(s) which are already locked */
-	struct semaphore sem;
-
-	/* The highest (datanode) version number used for this ino */
-	uint32_t highest_version;
-
-	/* List of data fragments which make up the file */
-	struct rb_root fragtree;
-
-	/* There may be one datanode which isn't referenced by any of the
-	   above fragments, if it contains a metadata update but no actual
-	   data - or if this is a directory inode */
-	/* This also holds the _only_ dnode for symlinks/device nodes,
-	   etc. */
-	struct jffs2_full_dnode *metadata;
-
-	/* Directory entries */
-	struct jffs2_full_dirent *dents;
-
-	/* The target path if this is the inode of a symlink */
-	unsigned char *target;
-
-	/* Some stuff we just have to keep in-core at all times, for each inode. */
-	struct jffs2_inode_cache *inocache;
-
-	uint16_t flags;
-	uint8_t usercompr;
-#if !defined (__ECOS)
-#if LINUX_VERSION_CODE > KERNEL_VERSION(2,5,2)
-	struct inode vfs_inode;
-#endif
-#endif
-};
-
-#endif /* _JFFS2_FS_I */
diff --git a/include/linux/jffs2_fs_sb.h b/include/linux/jffs2_fs_sb.h
deleted file mode 100644
index 4bcfb5570221..000000000000
--- a/include/linux/jffs2_fs_sb.h
+++ /dev/null
@@ -1,122 +0,0 @@
-/* $Id: jffs2_fs_sb.h,v 1.54 2005/09/21 13:37:34 dedekind Exp $ */
-
-#ifndef _JFFS2_FS_SB
-#define _JFFS2_FS_SB
-
-#include <linux/types.h>
-#include <linux/spinlock.h>
-#include <linux/workqueue.h>
-#include <linux/completion.h>
-#include <asm/semaphore.h>
-#include <linux/timer.h>
-#include <linux/wait.h>
-#include <linux/list.h>
-#include <linux/rwsem.h>
-
-#define JFFS2_SB_FLAG_RO 1
-#define JFFS2_SB_FLAG_SCANNING 2 /* Flash scanning is in progress */
-#define JFFS2_SB_FLAG_BUILDING 4 /* File system building is in progress */
-
-struct jffs2_inodirty;
-
-/* A struct for the overall file system control.  Pointers to
-   jffs2_sb_info structs are named `c' in the source code.
-   Nee jffs_control
-*/
-struct jffs2_sb_info {
-	struct mtd_info *mtd;
-
-	uint32_t highest_ino;
-	uint32_t checked_ino;
-
-	unsigned int flags;
-
-	struct task_struct *gc_task;	/* GC task struct */
-	struct completion gc_thread_start; /* GC thread start completion */
-	struct completion gc_thread_exit; /* GC thread exit completion port */
-
-	struct semaphore alloc_sem;	/* Used to protect all the following
-					   fields, and also to protect against
-					   out-of-order writing of nodes. And GC. */
-	uint32_t cleanmarker_size;	/* Size of an _inline_ CLEANMARKER
-					 (i.e. zero for OOB CLEANMARKER */
-
-	uint32_t flash_size;
-	uint32_t used_size;
-	uint32_t dirty_size;
-	uint32_t wasted_size;
-	uint32_t free_size;
-	uint32_t erasing_size;
-	uint32_t bad_size;
-	uint32_t sector_size;
-	uint32_t unchecked_size;
-
-	uint32_t nr_free_blocks;
-	uint32_t nr_erasing_blocks;
-
-	/* Number of free blocks there must be before we... */
-	uint8_t resv_blocks_write;	/* ... allow a normal filesystem write */
-	uint8_t resv_blocks_deletion;	/* ... allow a normal filesystem deletion */
-	uint8_t resv_blocks_gctrigger;	/* ... wake up the GC thread */
-	uint8_t resv_blocks_gcbad;	/* ... pick a block from the bad_list to GC */
-	uint8_t resv_blocks_gcmerge;	/* ... merge pages when garbage collecting */
-
-	uint32_t nospc_dirty_size;
-
-	uint32_t nr_blocks;
-	struct jffs2_eraseblock *blocks;	/* The whole array of blocks. Used for getting blocks
-						 * from the offset (blocks[ofs / sector_size]) */
-	struct jffs2_eraseblock *nextblock;	/* The block we're currently filling */
-
-	struct jffs2_eraseblock *gcblock;	/* The block we're currently garbage-collecting */
-
-	struct list_head clean_list;		/* Blocks 100% full of clean data */
-	struct list_head very_dirty_list;	/* Blocks with lots of dirty space */
-	struct list_head dirty_list;		/* Blocks with some dirty space */
-	struct list_head erasable_list;		/* Blocks which are completely dirty, and need erasing */
-	struct list_head erasable_pending_wbuf_list;	/* Blocks which need erasing but only after the current wbuf is flushed */
-	struct list_head erasing_list;		/* Blocks which are currently erasing */
-	struct list_head erase_pending_list;	/* Blocks which need erasing now */
-	struct list_head erase_complete_list;	/* Blocks which are erased and need the clean marker written to them */
-	struct list_head free_list;		/* Blocks which are free and ready to be used */
-	struct list_head bad_list;		/* Bad blocks. */
-	struct list_head bad_used_list;		/* Bad blocks with valid data in. */
-
-	spinlock_t erase_completion_lock;	/* Protect free_list and erasing_list
-						   against erase completion handler */
-	wait_queue_head_t erase_wait;		/* For waiting for erases to complete */
-
-	wait_queue_head_t inocache_wq;
-	struct jffs2_inode_cache **inocache_list;
-	spinlock_t inocache_lock;
-
-	/* Sem to allow jffs2_garbage_collect_deletion_dirent to
-	   drop the erase_completion_lock while it's holding a pointer
-	   to an obsoleted node. I don't like this. Alternatives welcomed. */
-	struct semaphore erase_free_sem;
-
-	uint32_t wbuf_pagesize; /* 0 for NOR and other flashes with no wbuf */
-
-#ifdef CONFIG_JFFS2_FS_WRITEBUFFER
-	/* Write-behind buffer for NAND flash */
-	unsigned char *wbuf;
-	uint32_t wbuf_ofs;
-	uint32_t wbuf_len;
-	struct jffs2_inodirty *wbuf_inodes;
-
-	struct rw_semaphore wbuf_sem;	/* Protects the write buffer */
-
-	/* Information about out-of-band area usage... */
-	struct nand_oobinfo *oobinfo;
-	uint32_t badblock_pos;
-	uint32_t fsdata_pos;
-	uint32_t fsdata_len;
-#endif
-
-	struct jffs2_summary *summary;		/* Summary information */
-
-	/* OS-private pointer for getting back to master superblock info */
-	void *os_priv;
-};
-
-#endif /* _JFFS2_FB_SB */
-- 
cgit v1.2.3


From 423bc7b22bdeb73efeabfcf91d8a459ac33088f1 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Thu, 4 May 2006 00:41:02 +0100
Subject: Restore __attribute_const__ to user-visibility in
 linux/compiler.h...for now

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 include/linux/compiler.h | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index 1234be9024a2..9b4f11094937 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -128,6 +128,16 @@ extern void __chk_io_ptr(void __iomem *);
 # define __attribute_pure__	/* unimplemented */
 #endif
 
+#ifndef noinline
+#define noinline
+#endif
+
+#ifndef __always_inline
+#define __always_inline inline
+#endif
+
+#endif /* __KERNEL__ */
+
 /*
  * From the GCC manual:
  *
@@ -146,13 +156,4 @@ extern void __chk_io_ptr(void __iomem *);
 # define __attribute_const__	/* unimplemented */
 #endif
 
-#ifndef noinline
-#define noinline
-#endif
-
-#ifndef __always_inline
-#define __always_inline inline
-#endif
-
-#endif /* __KERNEL__ */
 #endif /* __LINUX_COMPILER_H */
-- 
cgit v1.2.3


From ac12c0fc8c08a14bfa263c3a478ee82ad3e346d2 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Thu, 4 May 2006 00:59:14 +0100
Subject: Remove unneeded inclusion of <linux/time.h> from <linux/ufs_fs.h>

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 include/linux/ufs_fs.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/ufs_fs.h b/include/linux/ufs_fs.h
index 843aeaaa79d4..86b5b4271b5a 100644
--- a/include/linux/ufs_fs.h
+++ b/include/linux/ufs_fs.h
@@ -32,7 +32,6 @@
 
 #include <linux/types.h>
 #include <linux/kernel.h>
-#include <linux/time.h>
 #include <linux/stat.h>
 #include <linux/fs.h>
 
-- 
cgit v1.2.3


From 8e1515df578e4665b77d1e0eec3c8b041d159b23 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Thu, 4 May 2006 01:42:36 +0100
Subject: Don't use 'u32' in user-visible struct ip_conntrack_old_tuple.

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 include/linux/netfilter/xt_conntrack.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/xt_conntrack.h b/include/linux/netfilter/xt_conntrack.h
index 34f63cf2e293..4c2d9945ca54 100644
--- a/include/linux/netfilter/xt_conntrack.h
+++ b/include/linux/netfilter/xt_conntrack.h
@@ -42,7 +42,7 @@ struct ip_conntrack_old_tuple
 		} u;
 
 		/* The protocol. */
-		u16 protonum;
+		__u16 protonum;
 	} dst;
 };
 
-- 
cgit v1.2.3


From 90abbae2d35b3dc55fd39f8ab04acaf3da5cdc0a Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Thu, 4 May 2006 02:55:50 +0100
Subject: Use __uXX types in user-visible structures in <linux/nbd.h>

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 include/linux/nbd.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/nbd.h b/include/linux/nbd.h
index a6ce409ec6fc..1d7cdd20b553 100644
--- a/include/linux/nbd.h
+++ b/include/linux/nbd.h
@@ -77,11 +77,11 @@ struct nbd_device {
  * server. All data are in network byte order.
  */
 struct nbd_request {
-	u32 magic;
-	u32 type;	/* == READ || == WRITE 	*/
+	__u32 magic;
+	__u32 type;	/* == READ || == WRITE 	*/
 	char handle[8];
-	u64 from;
-	u32 len;
+	__u64 from;
+	__u32 len;
 }
 #ifdef __GNUC__
 	__attribute__ ((packed))
@@ -93,8 +93,8 @@ struct nbd_request {
  * it has completed an I/O request (or an error occurs).
  */
 struct nbd_reply {
-	u32 magic;
-	u32 error;		/* 0 = ok, else error	*/
+	__u32 magic;
+	__u32 error;		/* 0 = ok, else error	*/
 	char handle[8];		/* handle you got from request	*/
 };
 #endif
-- 
cgit v1.2.3


From 2c88f4a8bc4276013f7eee7824056d9cecccadb1 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Thu, 4 May 2006 12:07:37 +0100
Subject: Remove PPP_FCS from user view in <linux/ppp_defs.h>, remove __P mess
 entirely

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 include/linux/ppp_defs.h | 14 ++++----------
 1 file changed, 4 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ppp_defs.h b/include/linux/ppp_defs.h
index 402056cd049d..c6b13ff85028 100644
--- a/include/linux/ppp_defs.h
+++ b/include/linux/ppp_defs.h
@@ -42,8 +42,6 @@
 #ifndef _PPP_DEFS_H_
 #define _PPP_DEFS_H_
 
-#include <linux/crc-ccitt.h>
-
 /*
  * The basic PPP frame.
  */
@@ -97,7 +95,11 @@
 
 #define PPP_INITFCS	0xffff	/* Initial FCS value */
 #define PPP_GOODFCS	0xf0b8	/* Good final FCS value */
+
+#ifdef __KERNEL__
+#include <linux/crc-ccitt.h>
 #define PPP_FCS(fcs, c) crc_ccitt_byte(fcs, c)
+#endif
 
 /*
  * Extended asyncmap - allows any character to be escaped.
@@ -179,12 +181,4 @@ struct ppp_idle {
     time_t recv_idle;		/* time since last NP packet received */
 };
 
-#ifndef __P
-#ifdef __STDC__
-#define __P(x)	x
-#else
-#define __P(x)	()
-#endif
-#endif
-
 #endif /* _PPP_DEFS_H_ */
-- 
cgit v1.2.3


From 5da0458900bb5f56eb5e7a7c5ed275b5eaf51762 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Thu, 4 May 2006 15:07:59 +0100
Subject: Use __uXX types in <linux/divert.h> for struct divert_blk et al.

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 include/linux/divert.h | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/divert.h b/include/linux/divert.h
index 6919b09133d4..8fb4e9de6843 100644
--- a/include/linux/divert.h
+++ b/include/linux/divert.h
@@ -27,10 +27,10 @@ struct divert_blk
 {
 	int		divert;  /* are we active */
 	unsigned int protos;	/* protocols */
-	u16		tcp_dst[MAX_DIVERT_PORTS]; /* specific tcp dst ports to divert */
-	u16		tcp_src[MAX_DIVERT_PORTS]; /* specific tcp src ports to divert */
-	u16		udp_dst[MAX_DIVERT_PORTS]; /* specific udp dst ports to divert */
-	u16		udp_src[MAX_DIVERT_PORTS]; /* specific udp src ports to divert */
+	__u16		tcp_dst[MAX_DIVERT_PORTS]; /* specific tcp dst ports to divert */
+	__u16		tcp_src[MAX_DIVERT_PORTS]; /* specific tcp src ports to divert */
+	__u16		udp_dst[MAX_DIVERT_PORTS]; /* specific udp dst ports to divert */
+	__u16		udp_src[MAX_DIVERT_PORTS]; /* specific udp src ports to divert */
 };
 
 /*
@@ -40,12 +40,12 @@ struct divert_blk
 
 typedef union _divert_cf_arg
 {
-	s16		int16;
-	u16		uint16;
-	s32		int32;
-	u32		uint32;
-	s64		int64;
-	u64		uint64;
+	__s16		int16;
+	__u16		uint16;
+	__s32		int32;
+	__u32		uint32;
+	__s64		int64;
+	__u64		uint64;
 	void	__user *ptr;
 } divert_cf_arg;
 
-- 
cgit v1.2.3


From 7ee7d0e3186e2ad2a872436b5a272a814ea5cb0f Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Thu, 4 May 2006 15:49:24 +0100
Subject: Include <linux/types.h> and use __uXX types in
 <linux/affs_hardblocks.h>

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 include/linux/affs_hardblocks.h | 72 +++++++++++++++++++++--------------------
 1 file changed, 37 insertions(+), 35 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/affs_hardblocks.h b/include/linux/affs_hardblocks.h
index 3fb869939d82..f1b948c1f592 100644
--- a/include/linux/affs_hardblocks.h
+++ b/include/linux/affs_hardblocks.h
@@ -1,45 +1,47 @@
 #ifndef	AFFS_HARDBLOCKS_H
 #define	AFFS_HARDBLOCKS_H
 
+#include <linux/types.h>
+
 /* Just the needed definitions for the RDB of an Amiga HD. */
 
 struct RigidDiskBlock {
-	u32	rdb_ID;
+	__u32	rdb_ID;
 	__be32	rdb_SummedLongs;
-	s32	rdb_ChkSum;
-	u32	rdb_HostID;
+	__s32	rdb_ChkSum;
+	__u32	rdb_HostID;
 	__be32	rdb_BlockBytes;
-	u32	rdb_Flags;
-	u32	rdb_BadBlockList;
+	__u32	rdb_Flags;
+	__u32	rdb_BadBlockList;
 	__be32	rdb_PartitionList;
-	u32	rdb_FileSysHeaderList;
-	u32	rdb_DriveInit;
-	u32	rdb_Reserved1[6];
-	u32	rdb_Cylinders;
-	u32	rdb_Sectors;
-	u32	rdb_Heads;
-	u32	rdb_Interleave;
-	u32	rdb_Park;
-	u32	rdb_Reserved2[3];
-	u32	rdb_WritePreComp;
-	u32	rdb_ReducedWrite;
-	u32	rdb_StepRate;
-	u32	rdb_Reserved3[5];
-	u32	rdb_RDBBlocksLo;
-	u32	rdb_RDBBlocksHi;
-	u32	rdb_LoCylinder;
-	u32	rdb_HiCylinder;
-	u32	rdb_CylBlocks;
-	u32	rdb_AutoParkSeconds;
-	u32	rdb_HighRDSKBlock;
-	u32	rdb_Reserved4;
+	__u32	rdb_FileSysHeaderList;
+	__u32	rdb_DriveInit;
+	__u32	rdb_Reserved1[6];
+	__u32	rdb_Cylinders;
+	__u32	rdb_Sectors;
+	__u32	rdb_Heads;
+	__u32	rdb_Interleave;
+	__u32	rdb_Park;
+	__u32	rdb_Reserved2[3];
+	__u32	rdb_WritePreComp;
+	__u32	rdb_ReducedWrite;
+	__u32	rdb_StepRate;
+	__u32	rdb_Reserved3[5];
+	__u32	rdb_RDBBlocksLo;
+	__u32	rdb_RDBBlocksHi;
+	__u32	rdb_LoCylinder;
+	__u32	rdb_HiCylinder;
+	__u32	rdb_CylBlocks;
+	__u32	rdb_AutoParkSeconds;
+	__u32	rdb_HighRDSKBlock;
+	__u32	rdb_Reserved4;
 	char	rdb_DiskVendor[8];
 	char	rdb_DiskProduct[16];
 	char	rdb_DiskRevision[4];
 	char	rdb_ControllerVendor[8];
 	char	rdb_ControllerProduct[16];
 	char	rdb_ControllerRevision[4];
-	u32	rdb_Reserved5[10];
+	__u32	rdb_Reserved5[10];
 };
 
 #define	IDNAME_RIGIDDISK	0x5244534B	/* "RDSK" */
@@ -47,16 +49,16 @@ struct RigidDiskBlock {
 struct PartitionBlock {
 	__be32	pb_ID;
 	__be32	pb_SummedLongs;
-	s32	pb_ChkSum;
-	u32	pb_HostID;
+	__s32	pb_ChkSum;
+	__u32	pb_HostID;
 	__be32	pb_Next;
-	u32	pb_Flags;
-	u32	pb_Reserved1[2];
-	u32	pb_DevFlags;
-	u8	pb_DriveName[32];
-	u32	pb_Reserved2[15];
+	__u32	pb_Flags;
+	__u32	pb_Reserved1[2];
+	__u32	pb_DevFlags;
+	__u8	pb_DriveName[32];
+	__u32	pb_Reserved2[15];
 	__be32	pb_Environment[17];
-	u32	pb_EReserved[15];
+	__u32	pb_EReserved[15];
 };
 
 #define	IDNAME_PARTITION	0x50415254	/* "PART" */
-- 
cgit v1.2.3


From de654c97861c314fd0fc0b6a6dd1bc4202e00e42 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Thu, 4 May 2006 17:28:26 +0100
Subject: Remove private struct dx_hash_info from public view in
 <linux/ext3_fs.h>

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 include/linux/ext3_fs.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/ext3_fs.h b/include/linux/ext3_fs.h
index f327a3b5dfbe..757d54d8f1a5 100644
--- a/include/linux/ext3_fs.h
+++ b/include/linux/ext3_fs.h
@@ -661,6 +661,8 @@ struct ext3_dir_entry_2 {
 #define DX_HASH_HALF_MD4	1
 #define DX_HASH_TEA		2
 
+#ifdef __KERNEL__
+
 /* hash info structure used by the directory hash */
 struct dx_hash_info
 {
@@ -672,7 +674,6 @@ struct dx_hash_info
 
 #define EXT3_HTREE_EOF	0x7fffffff
 
-#ifdef __KERNEL__
 /*
  * Control parameters used by ext3_htree_next_block
  */
-- 
cgit v1.2.3


From cb8c1fdc0cf703e3297499dcd1a4b20b27570a7a Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Thu, 4 May 2006 17:32:44 +0100
Subject: Use __uXX types in <linux/i2o_dev.h>, include <linux/ioctl.h> too

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 include/linux/i2o-dev.h | 167 +++++++++++++++++++++++-------------------------
 1 file changed, 80 insertions(+), 87 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/i2o-dev.h b/include/linux/i2o-dev.h
index 36fd18cdad28..c2519df1b6dc 100644
--- a/include/linux/i2o-dev.h
+++ b/include/linux/i2o-dev.h
@@ -13,7 +13,7 @@
  * This header file defines the I2O APIs that are available to both
  * the kernel and user level applications.  Kernel specific structures
  * are defined in i2o_osm. OSMs should include _only_ i2o_osm.h which
- * automatically includs this file.
+ * automatically includes this file.
  *
  */
 
@@ -23,14 +23,7 @@
 /* How many controllers are we allowing */
 #define MAX_I2O_CONTROLLERS	32
 
-//#include <linux/ioctl.h>
-#ifndef __KERNEL__
-
-typedef unsigned char u8;
-typedef unsigned short u16;
-typedef unsigned int u32;
-
-#endif				/* __KERNEL__ */
+#include <linux/ioctl.h>
 
 /*
  * I2O Control IOCTLs and structures
@@ -53,7 +46,7 @@ typedef unsigned int u32;
 
 struct i2o_cmd_passthru32 {
 	unsigned int iop;	/* IOP unit number */
-	u32 msg;		/* message */
+	__u32 msg;		/* message */
 };
 
 struct i2o_cmd_passthru {
@@ -138,53 +131,53 @@ typedef struct i2o_sg_io_hdr {
 #define I2O_BUS_UNKNOWN 0x80
 
 typedef struct _i2o_pci_bus {
-	u8 PciFunctionNumber;
-	u8 PciDeviceNumber;
-	u8 PciBusNumber;
-	u8 reserved;
-	u16 PciVendorID;
-	u16 PciDeviceID;
+	__u8 PciFunctionNumber;
+	__u8 PciDeviceNumber;
+	__u8 PciBusNumber;
+	__u8 reserved;
+	__u16 PciVendorID;
+	__u16 PciDeviceID;
 } i2o_pci_bus;
 
 typedef struct _i2o_local_bus {
-	u16 LbBaseIOPort;
-	u16 reserved;
-	u32 LbBaseMemoryAddress;
+	__u16 LbBaseIOPort;
+	__u16 reserved;
+	__u32 LbBaseMemoryAddress;
 } i2o_local_bus;
 
 typedef struct _i2o_isa_bus {
-	u16 IsaBaseIOPort;
-	u8 CSN;
-	u8 reserved;
-	u32 IsaBaseMemoryAddress;
+	__u16 IsaBaseIOPort;
+	__u8 CSN;
+	__u8 reserved;
+	__u32 IsaBaseMemoryAddress;
 } i2o_isa_bus;
 
 typedef struct _i2o_eisa_bus_info {
-	u16 EisaBaseIOPort;
-	u8 reserved;
-	u8 EisaSlotNumber;
-	u32 EisaBaseMemoryAddress;
+	__u16 EisaBaseIOPort;
+	__u8 reserved;
+	__u8 EisaSlotNumber;
+	__u32 EisaBaseMemoryAddress;
 } i2o_eisa_bus;
 
 typedef struct _i2o_mca_bus {
-	u16 McaBaseIOPort;
-	u8 reserved;
-	u8 McaSlotNumber;
-	u32 McaBaseMemoryAddress;
+	__u16 McaBaseIOPort;
+	__u8 reserved;
+	__u8 McaSlotNumber;
+	__u32 McaBaseMemoryAddress;
 } i2o_mca_bus;
 
 typedef struct _i2o_other_bus {
-	u16 BaseIOPort;
-	u16 reserved;
-	u32 BaseMemoryAddress;
+	__u16 BaseIOPort;
+	__u16 reserved;
+	__u32 BaseMemoryAddress;
 } i2o_other_bus;
 
 typedef struct _i2o_hrt_entry {
-	u32 adapter_id;
-	u32 parent_tid:12;
-	u32 state:4;
-	u32 bus_num:8;
-	u32 bus_type:8;
+	__u32 adapter_id;
+	__u32 parent_tid:12;
+	__u32 state:4;
+	__u32 bus_num:8;
+	__u32 bus_type:8;
 	union {
 		i2o_pci_bus pci_bus;
 		i2o_local_bus local_bus;
@@ -196,66 +189,66 @@ typedef struct _i2o_hrt_entry {
 } i2o_hrt_entry;
 
 typedef struct _i2o_hrt {
-	u16 num_entries;
-	u8 entry_len;
-	u8 hrt_version;
-	u32 change_ind;
+	__u16 num_entries;
+	__u8 entry_len;
+	__u8 hrt_version;
+	__u32 change_ind;
 	i2o_hrt_entry hrt_entry[1];
 } i2o_hrt;
 
 typedef struct _i2o_lct_entry {
-	u32 entry_size:16;
-	u32 tid:12;
-	u32 reserved:4;
-	u32 change_ind;
-	u32 device_flags;
-	u32 class_id:12;
-	u32 version:4;
-	u32 vendor_id:16;
-	u32 sub_class;
-	u32 user_tid:12;
-	u32 parent_tid:12;
-	u32 bios_info:8;
-	u8 identity_tag[8];
-	u32 event_capabilities;
+	__u32 entry_size:16;
+	__u32 tid:12;
+	__u32 reserved:4;
+	__u32 change_ind;
+	__u32 device_flags;
+	__u32 class_id:12;
+	__u32 version:4;
+	__u32 vendor_id:16;
+	__u32 sub_class;
+	__u32 user_tid:12;
+	__u32 parent_tid:12;
+	__u32 bios_info:8;
+	__u8 identity_tag[8];
+	__u32 event_capabilities;
 } i2o_lct_entry;
 
 typedef struct _i2o_lct {
-	u32 table_size:16;
-	u32 boot_tid:12;
-	u32 lct_ver:4;
-	u32 iop_flags;
-	u32 change_ind;
+	__u32 table_size:16;
+	__u32 boot_tid:12;
+	__u32 lct_ver:4;
+	__u32 iop_flags;
+	__u32 change_ind;
 	i2o_lct_entry lct_entry[1];
 } i2o_lct;
 
 typedef struct _i2o_status_block {
-	u16 org_id;
-	u16 reserved;
-	u16 iop_id:12;
-	u16 reserved1:4;
-	u16 host_unit_id;
-	u16 segment_number:12;
-	u16 i2o_version:4;
-	u8 iop_state;
-	u8 msg_type;
-	u16 inbound_frame_size;
-	u8 init_code;
-	u8 reserved2;
-	u32 max_inbound_frames;
-	u32 cur_inbound_frames;
-	u32 max_outbound_frames;
+	__u16 org_id;
+	__u16 reserved;
+	__u16 iop_id:12;
+	__u16 reserved1:4;
+	__u16 host_unit_id;
+	__u16 segment_number:12;
+	__u16 i2o_version:4;
+	__u8 iop_state;
+	__u8 msg_type;
+	__u16 inbound_frame_size;
+	__u8 init_code;
+	__u8 reserved2;
+	__u32 max_inbound_frames;
+	__u32 cur_inbound_frames;
+	__u32 max_outbound_frames;
 	char product_id[24];
-	u32 expected_lct_size;
-	u32 iop_capabilities;
-	u32 desired_mem_size;
-	u32 current_mem_size;
-	u32 current_mem_base;
-	u32 desired_io_size;
-	u32 current_io_size;
-	u32 current_io_base;
-	u32 reserved3:24;
-	u32 cmd_status:8;
+	__u32 expected_lct_size;
+	__u32 iop_capabilities;
+	__u32 desired_mem_size;
+	__u32 current_mem_size;
+	__u32 current_mem_base;
+	__u32 desired_io_size;
+	__u32 current_io_size;
+	__u32 current_io_base;
+	__u32 reserved3:24;
+	__u32 cmd_status:8;
 } i2o_status_block;
 
 /* Event indicator mask flags */
-- 
cgit v1.2.3


From 05f75fd3bb92d9f5eaa8620195ff77a89431f2d2 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Thu, 4 May 2006 17:50:04 +0100
Subject: Include <linux/types.h> and use __uXX types in <linux/cramfs_fs.h>

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 include/linux/cramfs_fs.h | 34 ++++++++++++++--------------------
 1 file changed, 14 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cramfs_fs.h b/include/linux/cramfs_fs.h
index a8948f34b776..a41f38428c37 100644
--- a/include/linux/cramfs_fs.h
+++ b/include/linux/cramfs_fs.h
@@ -1,13 +1,7 @@
 #ifndef __CRAMFS_H
 #define __CRAMFS_H
 
-#ifndef __KERNEL__
-
-typedef unsigned char u8;
-typedef unsigned short u16;
-typedef unsigned int u32;
-
-#endif
+#include <linux/types.h>
 
 #define CRAMFS_MAGIC		0x28cd3d45	/* some random number */
 #define CRAMFS_SIGNATURE	"Compressed ROMFS"
@@ -33,9 +27,9 @@ typedef unsigned int u32;
  * Reasonably terse representation of the inode data.
  */
 struct cramfs_inode {
-	u32 mode:CRAMFS_MODE_WIDTH, uid:CRAMFS_UID_WIDTH;
+	__u32 mode:CRAMFS_MODE_WIDTH, uid:CRAMFS_UID_WIDTH;
 	/* SIZE for device files is i_rdev */
-	u32 size:CRAMFS_SIZE_WIDTH, gid:CRAMFS_GID_WIDTH;
+	__u32 size:CRAMFS_SIZE_WIDTH, gid:CRAMFS_GID_WIDTH;
 	/* NAMELEN is the length of the file name, divided by 4 and
            rounded up.  (cramfs doesn't support hard links.) */
 	/* OFFSET: For symlinks and non-empty regular files, this
@@ -44,27 +38,27 @@ struct cramfs_inode {
 	   see README).  For non-empty directories it is the offset
 	   (divided by 4) of the inode of the first file in that
 	   directory.  For anything else, offset is zero. */
-	u32 namelen:CRAMFS_NAMELEN_WIDTH, offset:CRAMFS_OFFSET_WIDTH;
+	__u32 namelen:CRAMFS_NAMELEN_WIDTH, offset:CRAMFS_OFFSET_WIDTH;
 };
 
 struct cramfs_info {
-	u32 crc;
-	u32 edition;
-	u32 blocks;
-	u32 files;
+	__u32 crc;
+	__u32 edition;
+	__u32 blocks;
+	__u32 files;
 };
 
 /*
  * Superblock information at the beginning of the FS.
  */
 struct cramfs_super {
-	u32 magic;			/* 0x28cd3d45 - random number */
-	u32 size;			/* length in bytes */
-	u32 flags;			/* feature flags */
-	u32 future;			/* reserved for future use */
-	u8 signature[16];		/* "Compressed ROMFS" */
+	__u32 magic;			/* 0x28cd3d45 - random number */
+	__u32 size;			/* length in bytes */
+	__u32 flags;			/* feature flags */
+	__u32 future;			/* reserved for future use */
+	__u8 signature[16];		/* "Compressed ROMFS" */
 	struct cramfs_info fsid;	/* unique filesystem info */
-	u8 name[16];			/* user-defined name */
+	__u8 name[16];			/* user-defined name */
 	struct cramfs_inode root;	/* root inode data */
 };
 
-- 
cgit v1.2.3


From 73566edf9b91dd085ddb12033d0ea7288979dd10 Mon Sep 17 00:00:00 2001
From: Lennert Buytenhek <buytenh@wantstofly.org>
Date: Sun, 7 May 2006 17:16:36 +0100
Subject: [MTD] Convert physmap to platform driver

After dwmw2 let me know it ought to be done, I rewrote the physmap map
driver to be a platform driver.  I know zilch about the driver model,
so I probably botched it in some way, but I've done some tests on an
ixp23xx board which uses physmap, and it all seems to work.

In order to not break existing physmap users, I've added some compat
code that will instantiate a platform device iff CONFIG_MTD_PHYSMAP_LEN
is defined and != 0.  Also, I've changed the default value for
CONFIG_MTD_PHYSMAP_LEN to zero, so that people who inadvertently
compile in physmap (or new, platform-style, users of physmap) don't get
burned.

This works pretty well -- the new physmap driver is a drop-in replacement
for the old one, and works on said ixp23xx board without any code changes
needed.  (This should hold as long as users don't touch 'physmap_map'
directly.)

Once all physmap users have been converted to instantiate their own
platform devices, the compat code can go.  (Or we decide that we can
change all the in-tree users at the same time, and never merge the
compat code.)

Signed-off-by: Lennert Buytenhek <buytenh@wantstofly.org>
Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 drivers/mtd/maps/Kconfig    |   2 +-
 drivers/mtd/maps/physmap.c  | 256 ++++++++++++++++++++++++++++++++------------
 include/linux/mtd/physmap.h |  21 ++--
 3 files changed, 196 insertions(+), 83 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/maps/Kconfig b/drivers/mtd/maps/Kconfig
index 80d6810e88ed..b2becd7d4337 100644
--- a/drivers/mtd/maps/Kconfig
+++ b/drivers/mtd/maps/Kconfig
@@ -37,7 +37,7 @@ config MTD_PHYSMAP_START
 config MTD_PHYSMAP_LEN
 	hex "Physical length of flash mapping"
 	depends on MTD_PHYSMAP
-	default "0x4000000"
+	default "0"
 	help
 	  This is the total length of the mapping of the flash chips on
 	  your particular board. If there is space, or aliases, in the
diff --git a/drivers/mtd/maps/physmap.c b/drivers/mtd/maps/physmap.c
index f49ebc3c4606..76ce9bd943aa 100644
--- a/drivers/mtd/maps/physmap.c
+++ b/drivers/mtd/maps/physmap.c
@@ -14,112 +14,230 @@
 #include <linux/kernel.h>
 #include <linux/init.h>
 #include <linux/slab.h>
-#include <asm/io.h>
+#include <linux/device.h>
+#include <linux/platform_device.h>
 #include <linux/mtd/mtd.h>
 #include <linux/mtd/map.h>
 #include <linux/config.h>
 #include <linux/mtd/partitions.h>
 #include <linux/mtd/physmap.h>
+#include <asm/io.h>
+#include <asm/mach/flash.h>
 
-static struct mtd_info *mymtd;
-
-struct map_info physmap_map = {
-	.name = "phys_mapped_flash",
-	.phys = CONFIG_MTD_PHYSMAP_START,
-	.size = CONFIG_MTD_PHYSMAP_LEN,
-	.bankwidth = CONFIG_MTD_PHYSMAP_BANKWIDTH,
+struct physmap_flash_info {
+	struct mtd_info		*mtd;
+	struct map_info		map;
+	struct resource		*res;
+#ifdef CONFIG_MTD_PARTITIONS
+	int			nr_parts;
+	struct mtd_partition	*parts;
+#endif
 };
 
+
+static int physmap_flash_remove(struct platform_device *dev)
+{
+	struct physmap_flash_info *info;
+	struct physmap_flash_data *physmap_data;
+
+	info = platform_get_drvdata(dev);
+	if (info == NULL)
+		return 0;
+	platform_set_drvdata(dev, NULL);
+
+	physmap_data = dev->dev.platform_data;
+
+	if (info->mtd != NULL) {
 #ifdef CONFIG_MTD_PARTITIONS
-static struct mtd_partition *mtd_parts;
-static int                   mtd_parts_nb;
+		if (info->nr_parts) {
+			del_mtd_partitions(info->mtd);
+			kfree(info->parts);
+		} else if (physmap_data->nr_parts) {
+			del_mtd_partitions(info->mtd);
+		} else {
+			del_mtd_device(info->mtd);
+		}
+#else
+		del_mtd_device(info->mtd);
+#endif
+		map_destroy(info->mtd);
+	}
 
-static int num_physmap_partitions;
-static struct mtd_partition *physmap_partitions;
+	if (info->map.virt != NULL)
+		iounmap((void *)info->map.virt);
 
-static const char *part_probes[] __initdata = {"cmdlinepart", "RedBoot", NULL};
+	if (info->res != NULL) {
+		release_resource(info->res);
+		kfree(info->res);
+	}
 
-void physmap_set_partitions(struct mtd_partition *parts, int num_parts)
-{
-	physmap_partitions=parts;
-	num_physmap_partitions=num_parts;
+	return 0;
 }
-#endif /* CONFIG_MTD_PARTITIONS */
 
-static int __init init_physmap(void)
+static const char *rom_probe_types[] = { "cfi_probe", "jedec_probe", "map_rom", NULL };
+#ifdef CONFIG_MTD_PARTITIONS
+static const char *part_probe_types[] = { "cmdlinepart", "RedBoot", NULL };
+#endif
+
+static int physmap_flash_probe(struct platform_device *dev)
 {
-	static const char *rom_probe_types[] = { "cfi_probe", "jedec_probe", "map_rom", NULL };
-	const char **type;
+	struct physmap_flash_data *physmap_data;
+	struct physmap_flash_info *info;
+	const char **probe_type;
+	int err;
+
+	physmap_data = dev->dev.platform_data;
+	if (physmap_data == NULL)
+		return -ENODEV;
+
+       	printk(KERN_NOTICE "physmap platform flash device: %.8lx at %.8lx\n",
+		dev->resource->end - dev->resource->start + 1,
+		dev->resource->start);
+
+	info = kmalloc(sizeof(struct physmap_flash_info), GFP_KERNEL);
+	if (info == NULL) {
+		err = -ENOMEM;
+		goto err_out;
+	}
+	memset(info, 0, sizeof(*info));
 
-       	printk(KERN_NOTICE "physmap flash device: %lx at %lx\n", physmap_map.size, physmap_map.phys);
-	physmap_map.virt = ioremap(physmap_map.phys, physmap_map.size);
+	platform_set_drvdata(dev, info);
 
-	if (!physmap_map.virt) {
-		printk("Failed to ioremap\n");
-		return -EIO;
+	info->res = request_mem_region(dev->resource->start,
+			dev->resource->end - dev->resource->start + 1,
+			dev->dev.bus_id);
+	if (info->res == NULL) {
+		dev_err(&dev->dev, "Could not reserve memory region\n");
+		err = -ENOMEM;
+		goto err_out;
 	}
 
-	simple_map_init(&physmap_map);
+	info->map.name = dev->dev.bus_id;
+	info->map.phys = dev->resource->start;
+	info->map.size = dev->resource->end - dev->resource->start + 1;
+	info->map.bankwidth = physmap_data->width;
+	info->map.set_vpp = physmap_data->set_vpp;
+
+	info->map.virt = ioremap(info->map.phys, info->map.size);
+	if (info->map.virt == NULL) {
+		dev_err(&dev->dev, "Failed to ioremap flash region\n");
+		err = EIO;
+		goto err_out;
+	}
 
-	mymtd = NULL;
-	type = rom_probe_types;
-	for(; !mymtd && *type; type++) {
-		mymtd = do_map_probe(*type, &physmap_map);
+	simple_map_init(&info->map);
+
+	probe_type = rom_probe_types;
+	for (; info->mtd == NULL && *probe_type != NULL; probe_type++)
+		info->mtd = do_map_probe(*probe_type, &info->map);
+	if (info->mtd == NULL) {
+		dev_err(&dev->dev, "map_probe failed\n");
+		err = -ENXIO;
+		goto err_out;
 	}
-	if (mymtd) {
-		mymtd->owner = THIS_MODULE;
+	info->mtd->owner = THIS_MODULE;
 
 #ifdef CONFIG_MTD_PARTITIONS
-		mtd_parts_nb = parse_mtd_partitions(mymtd, part_probes,
-						    &mtd_parts, 0);
+	err = parse_mtd_partitions(info->mtd, part_probe_types, &info->parts, 0);
+	if (err > 0) {
+		add_mtd_partitions(info->mtd, info->parts, err);
+		return 0;
+	}
 
-		if (mtd_parts_nb > 0)
-		{
-			add_mtd_partitions (mymtd, mtd_parts, mtd_parts_nb);
-			return 0;
-		}
+	if (physmap_data->nr_parts) {
+		printk(KERN_NOTICE "Using physmap partition information\n");
+		add_mtd_partitions(info->mtd, physmap_data->parts,
+						physmap_data->nr_parts);
+		return 0;
+	}
+#endif
+
+	add_mtd_device(info->mtd);
+	return 0;
+
+err_out:
+	physmap_flash_remove(dev);
+	return err;
+}
+
+static struct platform_driver physmap_flash_driver = {
+	.probe		= physmap_flash_probe,
+	.remove		= physmap_flash_remove,
+	.driver		= {
+		.name	= "physmap-flash",
+	},
+};
 
-		if (num_physmap_partitions != 0)
-		{
-			printk(KERN_NOTICE
-			       "Using physmap partition definition\n");
-			add_mtd_partitions (mymtd, physmap_partitions, num_physmap_partitions);
-			return 0;
-		}
 
+#ifdef CONFIG_MTD_PHYSMAP_LEN
+#if CONFIG_MTD_PHYSMAP_LEN != 0
+#warning using PHYSMAP compat code
+#define PHYSMAP_COMPAT
+#endif
 #endif
-		add_mtd_device(mymtd);
 
-		return 0;
-	}
+#ifdef PHYSMAP_COMPAT
+static struct physmap_flash_data physmap_flash_data = {
+	.width		= CONFIG_MTD_PHYSMAP_BANKWIDTH,
+};
 
-	iounmap(physmap_map.virt);
-	return -ENXIO;
-}
+static struct resource physmap_flash_resource = {
+	.start		= CONFIG_MTD_PHYSMAP_START,
+	.end		= CONFIG_MTD_PHYSMAP_START + CONFIG_MTD_PHYSMAP_LEN,
+	.flags		= IORESOURCE_MEM,
+};
 
-static void __exit cleanup_physmap(void)
+static struct platform_device physmap_flash = {
+	.name		= "physmap-flash",
+	.id		= 0,
+	.dev		= {
+		.platform_data	= &physmap_flash_data,
+	},
+	.num_resources	= 1,
+	.resource	= &physmap_flash_resource,
+};
+
+void physmap_configure(unsigned long addr, unsigned long size,
+		int bankwidth, void (*set_vpp)(struct map_info *, int))
 {
+	physmap_flash_resource.start = addr;
+	physmap_flash_resource.end = addr + size - 1;
+	physmap_flash_data.width = bankwidth;
+	physmap_flash_data.set_vpp = set_vpp;
+}
+
 #ifdef CONFIG_MTD_PARTITIONS
-	if (mtd_parts_nb) {
-		del_mtd_partitions(mymtd);
-		kfree(mtd_parts);
-	} else if (num_physmap_partitions) {
-		del_mtd_partitions(mymtd);
-	} else {
-		del_mtd_device(mymtd);
-	}
-#else
-	del_mtd_device(mymtd);
+void physmap_set_partitions(struct mtd_partition *parts, int num_parts)
+{
+	physmap_flash_data.nr_parts = num_parts;
+	physmap_flash_data.parts = parts;
+}
+#endif
 #endif
-	map_destroy(mymtd);
 
-	iounmap(physmap_map.virt);
-	physmap_map.virt = NULL;
+static int __init physmap_init(void)
+{
+	int err;
+
+	err = platform_driver_register(&physmap_flash_driver);
+#ifdef PHYSMAP_COMPAT
+	if (err == 0)
+		platform_device_register(&physmap_flash);
+#endif
+
+	return err;
 }
 
-module_init(init_physmap);
-module_exit(cleanup_physmap);
+static void __exit physmap_exit(void)
+{
+#ifdef PHYSMAP_COMPAT
+	platform_device_unregister(&physmap_flash);
+#endif
+	platform_driver_unregister(&physmap_flash_driver);
+}
 
+module_init(physmap_init);
+module_exit(physmap_exit);
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("David Woodhouse <dwmw2@infradead.org>");
diff --git a/include/linux/mtd/physmap.h b/include/linux/mtd/physmap.h
index c7b8bcdef013..50f954461aa8 100644
--- a/include/linux/mtd/physmap.h
+++ b/include/linux/mtd/physmap.h
@@ -24,22 +24,18 @@
 #include <linux/mtd/map.h>
 #include <linux/mtd/partitions.h>
 
-/*
- * The map_info for physmap.  Board can override size, buswidth, phys,
- * (*set_vpp)(), etc in their initial setup routine.
- */
-extern struct map_info physmap_map;
+struct physmap_flash_data {
+	unsigned int		width;
+	void			(*set_vpp)(struct map_info *, int);
+	unsigned int		nr_parts;
+	struct mtd_partition	*parts;
+};
 
 /*
  * Board needs to specify the exact mapping during their setup time.
  */
-static inline void physmap_configure(unsigned long addr, unsigned long size, int bankwidth, void (*set_vpp)(struct map_info *, int) )
-{
-	physmap_map.phys = addr;
-	physmap_map.size = size;
-	physmap_map.bankwidth = bankwidth;
-	physmap_map.set_vpp = set_vpp;
-}
+void physmap_configure(unsigned long addr, unsigned long size,
+		int bankwidth, void (*set_vpp)(struct map_info *, int) );
 
 #if defined(CONFIG_MTD_PARTITIONS)
 
@@ -58,4 +54,3 @@ void physmap_set_partitions(struct mtd_partition *parts, int num_parts);
 #endif /* defined(CONFIG_MTD) */
 
 #endif /* __LINUX_MTD_PHYSMAP__ */
-
-- 
cgit v1.2.3


From 6f18a022fb311f07f3b32f2c0e1b5c9477dc4439 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Mon, 8 May 2006 22:40:05 +0100
Subject: Finally remove the obnoxious inter_module_xxx()

This was already a bad plan when I argued against adding it in the first
place. Good riddance.

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 include/linux/module.h |   9 ---
 init/Kconfig           |   3 -
 kernel/Makefile        |   1 -
 kernel/intermodule.c   | 184 -------------------------------------------------
 4 files changed, 197 deletions(-)
 delete mode 100644 kernel/intermodule.c

(limited to 'include/linux')

diff --git a/include/linux/module.h b/include/linux/module.h
index eaec13ddd667..b0d44134f3c4 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -557,13 +557,4 @@ static inline void module_remove_driver(struct device_driver *driver)
 
 #define __MODULE_STRING(x) __stringify(x)
 
-/* Use symbol_get and symbol_put instead.  You'll thank me. */
-#define HAVE_INTER_MODULE
-extern void __deprecated inter_module_register(const char *,
-		struct module *, const void *);
-extern void __deprecated inter_module_unregister(const char *);
-extern const void * __deprecated inter_module_get_request(const char *,
-		const char *);
-extern void __deprecated inter_module_put(const char *);
-
 #endif /* _LINUX_MODULE_H */
diff --git a/init/Kconfig b/init/Kconfig
index 3b36a1d53656..a7697787946a 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -389,9 +389,6 @@ config SLOB
 	default !SLAB
 	bool
 
-config OBSOLETE_INTERMODULE
-	tristate
-
 menu "Loadable module support"
 
 config MODULES
diff --git a/kernel/Makefile b/kernel/Makefile
index 58908f9d156a..f6ef00f4f90f 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -20,7 +20,6 @@ obj-$(CONFIG_SMP) += cpu.o spinlock.o
 obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
 obj-$(CONFIG_UID16) += uid16.o
 obj-$(CONFIG_MODULES) += module.o
-obj-$(CONFIG_OBSOLETE_INTERMODULE) += intermodule.o
 obj-$(CONFIG_KALLSYMS) += kallsyms.o
 obj-$(CONFIG_PM) += power/
 obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
diff --git a/kernel/intermodule.c b/kernel/intermodule.c
deleted file mode 100644
index 55b1e5b85db9..000000000000
--- a/kernel/intermodule.c
+++ /dev/null
@@ -1,184 +0,0 @@
-/* Deprecated, do not use.  Moved from module.c to here. --RR */
-
-/* Written by Keith Owens <kaos@ocs.com.au> Oct 2000 */
-#include <linux/module.h>
-#include <linux/kmod.h>
-#include <linux/spinlock.h>
-#include <linux/list.h>
-#include <linux/slab.h>
-
-/* inter_module functions are always available, even when the kernel is
- * compiled without modules.  Consumers of inter_module_xxx routines
- * will always work, even when both are built into the kernel, this
- * approach removes lots of #ifdefs in mainline code.
- */
-
-static struct list_head ime_list = LIST_HEAD_INIT(ime_list);
-static DEFINE_SPINLOCK(ime_lock);
-static int kmalloc_failed;
-
-struct inter_module_entry {
-	struct list_head list;
-	const char *im_name;
-	struct module *owner;
-	const void *userdata;
-};
-
-/**
- * inter_module_register - register a new set of inter module data.
- * @im_name: an arbitrary string to identify the data, must be unique
- * @owner: module that is registering the data, always use THIS_MODULE
- * @userdata: pointer to arbitrary userdata to be registered
- *
- * Description: Check that the im_name has not already been registered,
- * complain if it has.  For new data, add it to the inter_module_entry
- * list.
- */
-void inter_module_register(const char *im_name, struct module *owner, const void *userdata)
-{
-	struct list_head *tmp;
-	struct inter_module_entry *ime, *ime_new;
-
-	if (!(ime_new = kzalloc(sizeof(*ime), GFP_KERNEL))) {
-		/* Overloaded kernel, not fatal */
-		printk(KERN_ERR
-			"Aiee, inter_module_register: cannot kmalloc entry for '%s'\n",
-			im_name);
-		kmalloc_failed = 1;
-		return;
-	}
-	ime_new->im_name = im_name;
-	ime_new->owner = owner;
-	ime_new->userdata = userdata;
-
-	spin_lock(&ime_lock);
-	list_for_each(tmp, &ime_list) {
-		ime = list_entry(tmp, struct inter_module_entry, list);
-		if (strcmp(ime->im_name, im_name) == 0) {
-			spin_unlock(&ime_lock);
-			kfree(ime_new);
-			/* Program logic error, fatal */
-			printk(KERN_ERR "inter_module_register: duplicate im_name '%s'", im_name);
-			BUG();
-		}
-	}
-	list_add(&(ime_new->list), &ime_list);
-	spin_unlock(&ime_lock);
-}
-
-/**
- * inter_module_unregister - unregister a set of inter module data.
- * @im_name: an arbitrary string to identify the data, must be unique
- *
- * Description: Check that the im_name has been registered, complain if
- * it has not.  For existing data, remove it from the
- * inter_module_entry list.
- */
-void inter_module_unregister(const char *im_name)
-{
-	struct list_head *tmp;
-	struct inter_module_entry *ime;
-
-	spin_lock(&ime_lock);
-	list_for_each(tmp, &ime_list) {
-		ime = list_entry(tmp, struct inter_module_entry, list);
-		if (strcmp(ime->im_name, im_name) == 0) {
-			list_del(&(ime->list));
-			spin_unlock(&ime_lock);
-			kfree(ime);
-			return;
-		}
-	}
-	spin_unlock(&ime_lock);
-	if (kmalloc_failed) {
-		printk(KERN_ERR
-			"inter_module_unregister: no entry for '%s', "
-			"probably caused by previous kmalloc failure\n",
-			im_name);
-		return;
-	}
-	else {
-		/* Program logic error, fatal */
-		printk(KERN_ERR "inter_module_unregister: no entry for '%s'", im_name);
-		BUG();
-	}
-}
-
-/**
- * inter_module_get - return arbitrary userdata from another module.
- * @im_name: an arbitrary string to identify the data, must be unique
- *
- * Description: If the im_name has not been registered, return NULL.
- * Try to increment the use count on the owning module, if that fails
- * then return NULL.  Otherwise return the userdata.
- */
-static const void *inter_module_get(const char *im_name)
-{
-	struct list_head *tmp;
-	struct inter_module_entry *ime;
-	const void *result = NULL;
-
-	spin_lock(&ime_lock);
-	list_for_each(tmp, &ime_list) {
-		ime = list_entry(tmp, struct inter_module_entry, list);
-		if (strcmp(ime->im_name, im_name) == 0) {
-			if (try_module_get(ime->owner))
-				result = ime->userdata;
-			break;
-		}
-	}
-	spin_unlock(&ime_lock);
-	return(result);
-}
-
-/**
- * inter_module_get_request - im get with automatic request_module.
- * @im_name: an arbitrary string to identify the data, must be unique
- * @modname: module that is expected to register im_name
- *
- * Description: If inter_module_get fails, do request_module then retry.
- */
-const void *inter_module_get_request(const char *im_name, const char *modname)
-{
-	const void *result = inter_module_get(im_name);
-	if (!result) {
-		request_module("%s", modname);
-		result = inter_module_get(im_name);
-	}
-	return(result);
-}
-
-/**
- * inter_module_put - release use of data from another module.
- * @im_name: an arbitrary string to identify the data, must be unique
- *
- * Description: If the im_name has not been registered, complain,
- * otherwise decrement the use count on the owning module.
- */
-void inter_module_put(const char *im_name)
-{
-	struct list_head *tmp;
-	struct inter_module_entry *ime;
-
-	spin_lock(&ime_lock);
-	list_for_each(tmp, &ime_list) {
-		ime = list_entry(tmp, struct inter_module_entry, list);
-		if (strcmp(ime->im_name, im_name) == 0) {
-			if (ime->owner)
-				module_put(ime->owner);
-			spin_unlock(&ime_lock);
-			return;
-		}
-	}
-	spin_unlock(&ime_lock);
-	printk(KERN_ERR "inter_module_put: no entry for '%s'", im_name);
-	BUG();
-}
-
-EXPORT_SYMBOL(inter_module_register);
-EXPORT_SYMBOL(inter_module_unregister);
-EXPORT_SYMBOL(inter_module_get_request);
-EXPORT_SYMBOL(inter_module_put);
-
-MODULE_LICENSE("GPL");
-
-- 
cgit v1.2.3


From 32e62c636a728cb39c0b3bd191286f2ca65d4028 Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas <bjorn.helgaas@hp.com>
Date: Fri, 5 May 2006 17:19:50 -0600
Subject: [IA64] rework memory attribute aliasing

This closes a couple holes in our attribute aliasing avoidance scheme:

  - The current kernel fails mmaps of some /dev/mem MMIO regions because
    they don't appear in the EFI memory map.  This keeps X from working
    on the Intel Tiger box.

  - The current kernel allows UC mmap of the 0-1MB region of
    /sys/.../legacy_mem even when the chipset doesn't support UC
    access.  This causes an MCA when starting X on HP rx7620 and rx8620
    boxes in the default configuration.

There's more detail in the Documentation/ia64/aliasing.txt file this
adds, but the general idea is that if a region might be covered by
a granule-sized kernel identity mapping, any access via /dev/mem or
mmap must use the same attribute as the identity mapping.

Otherwise, we fall back to using an attribute that is supported
according to the EFI memory map, or to using UC if the EFI memory
map doesn't mention the region.

Signed-off-by: Bjorn Helgaas <bjorn.helgaas@hp.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 Documentation/ia64/aliasing.txt | 208 ++++++++++++++++++++++++++++++++++++++++
 arch/ia64/kernel/efi.c          | 156 +++++++++++++++++++-----------
 arch/ia64/mm/ioremap.c          |  27 +++++-
 arch/ia64/pci/pci.c             |  17 +++-
 include/asm-ia64/io.h           |   1 +
 include/asm-ia64/pgtable.h      |  22 ++---
 include/linux/efi.h             |   1 +
 7 files changed, 359 insertions(+), 73 deletions(-)
 create mode 100644 Documentation/ia64/aliasing.txt

(limited to 'include/linux')

diff --git a/Documentation/ia64/aliasing.txt b/Documentation/ia64/aliasing.txt
new file mode 100644
index 000000000000..38f9a52d1820
--- /dev/null
+++ b/Documentation/ia64/aliasing.txt
@@ -0,0 +1,208 @@
+	         MEMORY ATTRIBUTE ALIASING ON IA-64
+
+			   Bjorn Helgaas
+		       <bjorn.helgaas@hp.com>
+			    May 4, 2006
+
+
+MEMORY ATTRIBUTES
+
+    Itanium supports several attributes for virtual memory references.
+    The attribute is part of the virtual translation, i.e., it is
+    contained in the TLB entry.  The ones of most interest to the Linux
+    kernel are:
+
+	WB		Write-back (cacheable)
+	UC		Uncacheable
+	WC		Write-coalescing
+
+    System memory typically uses the WB attribute.  The UC attribute is
+    used for memory-mapped I/O devices.  The WC attribute is uncacheable
+    like UC is, but writes may be delayed and combined to increase
+    performance for things like frame buffers.
+
+    The Itanium architecture requires that we avoid accessing the same
+    page with both a cacheable mapping and an uncacheable mapping[1].
+
+    The design of the chipset determines which attributes are supported
+    on which regions of the address space.  For example, some chipsets
+    support either WB or UC access to main memory, while others support
+    only WB access.
+
+MEMORY MAP
+
+    Platform firmware describes the physical memory map and the
+    supported attributes for each region.  At boot-time, the kernel uses
+    the EFI GetMemoryMap() interface.  ACPI can also describe memory
+    devices and the attributes they support, but Linux/ia64 currently
+    doesn't use this information.
+
+    The kernel uses the efi_memmap table returned from GetMemoryMap() to
+    learn the attributes supported by each region of physical address
+    space.  Unfortunately, this table does not completely describe the
+    address space because some machines omit some or all of the MMIO
+    regions from the map.
+
+    The kernel maintains another table, kern_memmap, which describes the
+    memory Linux is actually using and the attribute for each region.
+    This contains only system memory; it does not contain MMIO space.
+
+    The kern_memmap table typically contains only a subset of the system
+    memory described by the efi_memmap.  Linux/ia64 can't use all memory
+    in the system because of constraints imposed by the identity mapping
+    scheme.
+
+    The efi_memmap table is preserved unmodified because the original
+    boot-time information is required for kexec.
+
+KERNEL IDENTITY MAPPINGS
+
+    Linux/ia64 identity mappings are done with large pages, currently
+    either 16MB or 64MB, referred to as "granules."  Cacheable mappings
+    are speculative[2], so the processor can read any location in the
+    page at any time, independent of the programmer's intentions.  This
+    means that to avoid attribute aliasing, Linux can create a cacheable
+    identity mapping only when the entire granule supports cacheable
+    access.
+
+    Therefore, kern_memmap contains only full granule-sized regions that
+    can referenced safely by an identity mapping.
+
+    Uncacheable mappings are not speculative, so the processor will
+    generate UC accesses only to locations explicitly referenced by
+    software.  This allows UC identity mappings to cover granules that
+    are only partially populated, or populated with a combination of UC
+    and WB regions.
+
+USER MAPPINGS
+
+    User mappings are typically done with 16K or 64K pages.  The smaller
+    page size allows more flexibility because only 16K or 64K has to be
+    homogeneous with respect to memory attributes.
+
+POTENTIAL ATTRIBUTE ALIASING CASES
+
+    There are several ways the kernel creates new mappings:
+
+    mmap of /dev/mem
+
+	This uses remap_pfn_range(), which creates user mappings.  These
+	mappings may be either WB or UC.  If the region being mapped
+	happens to be in kern_memmap, meaning that it may also be mapped
+	by a kernel identity mapping, the user mapping must use the same
+	attribute as the kernel mapping.
+
+	If the region is not in kern_memmap, the user mapping should use
+	an attribute reported as being supported in the EFI memory map.
+
+	Since the EFI memory map does not describe MMIO on some
+	machines, this should use an uncacheable mapping as a fallback.
+
+    mmap of /sys/class/pci_bus/.../legacy_mem
+
+	This is very similar to mmap of /dev/mem, except that legacy_mem
+	only allows mmap of the one megabyte "legacy MMIO" area for a
+	specific PCI bus.  Typically this is the first megabyte of
+	physical address space, but it may be different on machines with
+	several VGA devices.
+
+	"X" uses this to access VGA frame buffers.  Using legacy_mem
+	rather than /dev/mem allows multiple instances of X to talk to
+	different VGA cards.
+
+	The /dev/mem mmap constraints apply.
+
+	However, since this is for mapping legacy MMIO space, WB access
+	does not make sense.  This matters on machines without legacy
+	VGA support: these machines may have WB memory for the entire
+	first megabyte (or even the entire first granule).
+
+	On these machines, we could mmap legacy_mem as WB, which would
+	be safe in terms of attribute aliasing, but X has no way of
+	knowing that it is accessing regular memory, not a frame buffer,
+	so the kernel should fail the mmap rather than doing it with WB.
+
+    read/write of /dev/mem
+
+	This uses copy_from_user(), which implicitly uses a kernel
+	identity mapping.  This is obviously safe for things in
+	kern_memmap.
+
+	There may be corner cases of things that are not in kern_memmap,
+	but could be accessed this way.  For example, registers in MMIO
+	space are not in kern_memmap, but could be accessed with a UC
+	mapping.  This would not cause attribute aliasing.  But
+	registers typically can be accessed only with four-byte or
+	eight-byte accesses, and the copy_from_user() path doesn't allow
+	any control over the access size, so this would be dangerous.
+
+    ioremap()
+
+	This returns a kernel identity mapping for use inside the
+	kernel.
+
+	If the region is in kern_memmap, we should use the attribute
+	specified there.  Otherwise, if the EFI memory map reports that
+	the entire granule supports WB, we should use that (granules
+	that are partially reserved or occupied by firmware do not appear
+	in kern_memmap).  Otherwise, we should use a UC mapping.
+
+PAST PROBLEM CASES
+
+    mmap of various MMIO regions from /dev/mem by "X" on Intel platforms
+
+      The EFI memory map may not report these MMIO regions.
+
+      These must be allowed so that X will work.  This means that
+      when the EFI memory map is incomplete, every /dev/mem mmap must
+      succeed.  It may create either WB or UC user mappings, depending
+      on whether the region is in kern_memmap or the EFI memory map.
+
+    mmap of 0x0-0xA0000 /dev/mem by "hwinfo" on HP sx1000 with VGA enabled
+
+      See https://bugzilla.novell.com/show_bug.cgi?id=140858.
+
+      The EFI memory map reports the following attributes:
+        0x00000-0x9FFFF WB only
+        0xA0000-0xBFFFF UC only (VGA frame buffer)
+        0xC0000-0xFFFFF WB only
+
+      This mmap is done with user pages, not kernel identity mappings,
+      so it is safe to use WB mappings.
+
+      The kernel VGA driver may ioremap the VGA frame buffer at 0xA0000,
+      which will use a granule-sized UC mapping covering 0-0xFFFFF.  This
+      granule covers some WB-only memory, but since UC is non-speculative,
+      the processor will never generate an uncacheable reference to the
+      WB-only areas unless the driver explicitly touches them.
+
+    mmap of 0x0-0xFFFFF legacy_mem by "X"
+
+      If the EFI memory map reports this entire range as WB, there
+      is no VGA MMIO hole, and the mmap should fail or be done with
+      a WB mapping.
+
+      There's no easy way for X to determine whether the 0xA0000-0xBFFFF
+      region is a frame buffer or just memory, so I think it's best to
+      just fail this mmap request rather than using a WB mapping.  As
+      far as I know, there's no need to map legacy_mem with WB
+      mappings.
+
+      Otherwise, a UC mapping of the entire region is probably safe.
+      The VGA hole means the region will not be in kern_memmap.  The
+      HP sx1000 chipset doesn't support UC access to the memory surrounding
+      the VGA hole, but X doesn't need that area anyway and should not
+      reference it.
+
+    mmap of 0xA0000-0xBFFFF legacy_mem by "X" on HP sx1000 with VGA disabled
+
+      The EFI memory map reports the following attributes:
+        0x00000-0xFFFFF WB only (no VGA MMIO hole)
+
+      This is a special case of the previous case, and the mmap should
+      fail for the same reason as above.
+
+NOTES
+
+    [1] SDM rev 2.2, vol 2, sec 4.4.1.
+    [2] SDM rev 2.2, vol 2, sec 4.4.6.
diff --git a/arch/ia64/kernel/efi.c b/arch/ia64/kernel/efi.c
index 12cfedce73b1..c33d0ba7e300 100644
--- a/arch/ia64/kernel/efi.c
+++ b/arch/ia64/kernel/efi.c
@@ -8,6 +8,8 @@
  * Copyright (C) 1999-2003 Hewlett-Packard Co.
  *	David Mosberger-Tang <davidm@hpl.hp.com>
  *	Stephane Eranian <eranian@hpl.hp.com>
+ * (c) Copyright 2006 Hewlett-Packard Development Company, L.P.
+ *	Bjorn Helgaas <bjorn.helgaas@hp.com>
  *
  * All EFI Runtime Services are not implemented yet as EFI only
  * supports physical mode addressing on SoftSDV. This is to be fixed
@@ -622,28 +624,20 @@ efi_get_iobase (void)
 	return 0;
 }
 
-static efi_memory_desc_t *
-efi_memory_descriptor (unsigned long phys_addr)
+static struct kern_memdesc *
+kern_memory_descriptor (unsigned long phys_addr)
 {
-	void *efi_map_start, *efi_map_end, *p;
-	efi_memory_desc_t *md;
-	u64 efi_desc_size;
-
-	efi_map_start = __va(ia64_boot_param->efi_memmap);
-	efi_map_end   = efi_map_start + ia64_boot_param->efi_memmap_size;
-	efi_desc_size = ia64_boot_param->efi_memdesc_size;
+	struct kern_memdesc *md;
 
-	for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) {
-		md = p;
-
-		if (phys_addr - md->phys_addr < (md->num_pages << EFI_PAGE_SHIFT))
+	for (md = kern_memmap; md->start != ~0UL; md++) {
+		if (phys_addr - md->start < (md->num_pages << EFI_PAGE_SHIFT))
 			 return md;
 	}
 	return 0;
 }
 
-static int
-efi_memmap_has_mmio (void)
+static efi_memory_desc_t *
+efi_memory_descriptor (unsigned long phys_addr)
 {
 	void *efi_map_start, *efi_map_end, *p;
 	efi_memory_desc_t *md;
@@ -656,8 +650,8 @@ efi_memmap_has_mmio (void)
 	for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) {
 		md = p;
 
-		if (md->type == EFI_MEMORY_MAPPED_IO)
-			return 1;
+		if (phys_addr - md->phys_addr < (md->num_pages << EFI_PAGE_SHIFT))
+			 return md;
 	}
 	return 0;
 }
@@ -683,71 +677,125 @@ efi_mem_attributes (unsigned long phys_addr)
 }
 EXPORT_SYMBOL(efi_mem_attributes);
 
-/*
- * Determines whether the memory at phys_addr supports the desired
- * attribute (WB, UC, etc).  If this returns 1, the caller can safely
- * access size bytes at phys_addr with the specified attribute.
- */
-int
-efi_mem_attribute_range (unsigned long phys_addr, unsigned long size, u64 attr)
+u64
+efi_mem_attribute (unsigned long phys_addr, unsigned long size)
 {
 	unsigned long end = phys_addr + size;
 	efi_memory_desc_t *md = efi_memory_descriptor(phys_addr);
+	u64 attr;
+
+	if (!md)
+		return 0;
+
+	/*
+	 * EFI_MEMORY_RUNTIME is not a memory attribute; it just tells
+	 * the kernel that firmware needs this region mapped.
+	 */
+	attr = md->attribute & ~EFI_MEMORY_RUNTIME;
+	do {
+		unsigned long md_end = efi_md_end(md);
+
+		if (end <= md_end)
+			return attr;
+
+		md = efi_memory_descriptor(md_end);
+		if (!md || (md->attribute & ~EFI_MEMORY_RUNTIME) != attr)
+			return 0;
+	} while (md);
+	return 0;
+}
+
+u64
+kern_mem_attribute (unsigned long phys_addr, unsigned long size)
+{
+	unsigned long end = phys_addr + size;
+	struct kern_memdesc *md;
+	u64 attr;
 
 	/*
-	 * Some firmware doesn't report MMIO regions in the EFI memory
-	 * map.  The Intel BigSur (a.k.a. HP i2000) has this problem.
-	 * On those platforms, we have to assume UC is valid everywhere.
+	 * This is a hack for ioremap calls before we set up kern_memmap.
+	 * Maybe we should do efi_memmap_init() earlier instead.
 	 */
-	if (!md || (md->attribute & attr) != attr) {
-		if (attr == EFI_MEMORY_UC && !efi_memmap_has_mmio())
-			return 1;
+	if (!kern_memmap) {
+		attr = efi_mem_attribute(phys_addr, size);
+		if (attr & EFI_MEMORY_WB)
+			return EFI_MEMORY_WB;
 		return 0;
 	}
 
+	md = kern_memory_descriptor(phys_addr);
+	if (!md)
+		return 0;
+
+	attr = md->attribute;
 	do {
-		unsigned long md_end = efi_md_end(md);
+		unsigned long md_end = kmd_end(md);
 
 		if (end <= md_end)
-			return 1;
+			return attr;
 
-		md = efi_memory_descriptor(md_end);
-		if (!md || (md->attribute & attr) != attr)
+		md = kern_memory_descriptor(md_end);
+		if (!md || md->attribute != attr)
 			return 0;
 	} while (md);
 	return 0;
 }
+EXPORT_SYMBOL(kern_mem_attribute);
 
-/*
- * For /dev/mem, we only allow read & write system calls to access
- * write-back memory, because read & write don't allow the user to
- * control access size.
- */
 int
 valid_phys_addr_range (unsigned long phys_addr, unsigned long size)
 {
-	return efi_mem_attribute_range(phys_addr, size, EFI_MEMORY_WB);
+	u64 attr;
+
+	/*
+	 * /dev/mem reads and writes use copy_to_user(), which implicitly
+	 * uses a granule-sized kernel identity mapping.  It's really
+	 * only safe to do this for regions in kern_memmap.  For more
+	 * details, see Documentation/ia64/aliasing.txt.
+	 */
+	attr = kern_mem_attribute(phys_addr, size);
+	if (attr & EFI_MEMORY_WB || attr & EFI_MEMORY_UC)
+		return 1;
+	return 0;
 }
 
-/*
- * We allow mmap of anything in the EFI memory map that supports
- * either write-back or uncacheable access.  For uncacheable regions,
- * the supported access sizes are system-dependent, and the user is
- * responsible for using the correct size.
- *
- * Note that this doesn't currently allow access to hot-added memory,
- * because that doesn't appear in the boot-time EFI memory map.
- */
 int
 valid_mmap_phys_addr_range (unsigned long phys_addr, unsigned long size)
 {
-	if (efi_mem_attribute_range(phys_addr, size, EFI_MEMORY_WB))
-		return 1;
+	/*
+	 * MMIO regions are often missing from the EFI memory map.
+	 * We must allow mmap of them for programs like X, so we
+	 * currently can't do any useful validation.
+	 */
+	return 1;
+}
 
-	if (efi_mem_attribute_range(phys_addr, size, EFI_MEMORY_UC))
-		return 1;
+pgprot_t
+phys_mem_access_prot(struct file *file, unsigned long pfn, unsigned long size,
+		     pgprot_t vma_prot)
+{
+	unsigned long phys_addr = pfn << PAGE_SHIFT;
+	u64 attr;
 
-	return 0;
+	/*
+	 * For /dev/mem mmap, we use user mappings, but if the region is
+	 * in kern_memmap (and hence may be covered by a kernel mapping),
+	 * we must use the same attribute as the kernel mapping.
+	 */
+	attr = kern_mem_attribute(phys_addr, size);
+	if (attr & EFI_MEMORY_WB)
+		return pgprot_cacheable(vma_prot);
+	else if (attr & EFI_MEMORY_UC)
+		return pgprot_noncached(vma_prot);
+
+	/*
+	 * Some chipsets don't support UC access to memory.  If
+	 * WB is supported, we prefer that.
+	 */
+	if (efi_mem_attribute(phys_addr, size) & EFI_MEMORY_WB)
+		return pgprot_cacheable(vma_prot);
+
+	return pgprot_noncached(vma_prot);
 }
 
 int __init
diff --git a/arch/ia64/mm/ioremap.c b/arch/ia64/mm/ioremap.c
index 643ccc6960ce..07bd02b6c372 100644
--- a/arch/ia64/mm/ioremap.c
+++ b/arch/ia64/mm/ioremap.c
@@ -11,6 +11,7 @@
 #include <linux/module.h>
 #include <linux/efi.h>
 #include <asm/io.h>
+#include <asm/meminit.h>
 
 static inline void __iomem *
 __ioremap (unsigned long offset, unsigned long size)
@@ -21,16 +22,29 @@ __ioremap (unsigned long offset, unsigned long size)
 void __iomem *
 ioremap (unsigned long offset, unsigned long size)
 {
-	if (efi_mem_attribute_range(offset, size, EFI_MEMORY_WB))
-		return phys_to_virt(offset);
+	u64 attr;
+	unsigned long gran_base, gran_size;
 
-	if (efi_mem_attribute_range(offset, size, EFI_MEMORY_UC))
+	/*
+	 * For things in kern_memmap, we must use the same attribute
+	 * as the rest of the kernel.  For more details, see
+	 * Documentation/ia64/aliasing.txt.
+	 */
+	attr = kern_mem_attribute(offset, size);
+	if (attr & EFI_MEMORY_WB)
+		return phys_to_virt(offset);
+	else if (attr & EFI_MEMORY_UC)
 		return __ioremap(offset, size);
 
 	/*
-	 * Someday this should check ACPI resources so we
-	 * can do the right thing for hot-plugged regions.
+	 * Some chipsets don't support UC access to memory.  If
+	 * WB is supported for the whole granule, we prefer that.
 	 */
+	gran_base = GRANULEROUNDDOWN(offset);
+	gran_size = GRANULEROUNDUP(offset + size) - gran_base;
+	if (efi_mem_attribute(gran_base, gran_size) & EFI_MEMORY_WB)
+		return phys_to_virt(offset);
+
 	return __ioremap(offset, size);
 }
 EXPORT_SYMBOL(ioremap);
@@ -38,6 +52,9 @@ EXPORT_SYMBOL(ioremap);
 void __iomem *
 ioremap_nocache (unsigned long offset, unsigned long size)
 {
+	if (kern_mem_attribute(offset, size) & EFI_MEMORY_WB)
+		return 0;
+
 	return __ioremap(offset, size);
 }
 EXPORT_SYMBOL(ioremap_nocache);
diff --git a/arch/ia64/pci/pci.c b/arch/ia64/pci/pci.c
index ab829a22f8a4..30d148f34042 100644
--- a/arch/ia64/pci/pci.c
+++ b/arch/ia64/pci/pci.c
@@ -645,18 +645,31 @@ char *ia64_pci_get_legacy_mem(struct pci_bus *bus)
 int
 pci_mmap_legacy_page_range(struct pci_bus *bus, struct vm_area_struct *vma)
 {
+	unsigned long size = vma->vm_end - vma->vm_start;
+	pgprot_t prot;
 	char *addr;
 
+	/*
+	 * Avoid attribute aliasing.  See Documentation/ia64/aliasing.txt
+	 * for more details.
+	 */
+	if (!valid_mmap_phys_addr_range(vma->vm_pgoff << PAGE_SHIFT, size))
+		return -EINVAL;
+	prot = phys_mem_access_prot(NULL, vma->vm_pgoff, size,
+				    vma->vm_page_prot);
+	if (pgprot_val(prot) != pgprot_val(pgprot_noncached(vma->vm_page_prot)))
+		return -EINVAL;
+
 	addr = pci_get_legacy_mem(bus);
 	if (IS_ERR(addr))
 		return PTR_ERR(addr);
 
 	vma->vm_pgoff += (unsigned long)addr >> PAGE_SHIFT;
-	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+	vma->vm_page_prot = prot;
 	vma->vm_flags |= (VM_SHM | VM_RESERVED | VM_IO);
 
 	if (remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
-			    vma->vm_end - vma->vm_start, vma->vm_page_prot))
+			    size, vma->vm_page_prot))
 		return -EAGAIN;
 
 	return 0;
diff --git a/include/asm-ia64/io.h b/include/asm-ia64/io.h
index c2e3742108bb..781ee2c7e8c3 100644
--- a/include/asm-ia64/io.h
+++ b/include/asm-ia64/io.h
@@ -88,6 +88,7 @@ phys_to_virt (unsigned long address)
 }
 
 #define ARCH_HAS_VALID_PHYS_ADDR_RANGE
+extern u64 kern_mem_attribute (unsigned long phys_addr, unsigned long size);
 extern int valid_phys_addr_range (unsigned long addr, size_t count); /* efi.c */
 extern int valid_mmap_phys_addr_range (unsigned long addr, size_t count);
 
diff --git a/include/asm-ia64/pgtable.h b/include/asm-ia64/pgtable.h
index c0f8144f2349..90f3a2329232 100644
--- a/include/asm-ia64/pgtable.h
+++ b/include/asm-ia64/pgtable.h
@@ -317,22 +317,20 @@ ia64_phys_addr_valid (unsigned long addr)
 #define pte_mkhuge(pte)		(__pte(pte_val(pte)))
 
 /*
- * Macro to a page protection value as "uncacheable".  Note that "protection" is really a
- * misnomer here as the protection value contains the memory attribute bits, dirty bits,
- * and various other bits as well.
+ * Make page protection values cacheable, uncacheable, or write-
+ * combining.  Note that "protection" is really a misnomer here as the
+ * protection value contains the memory attribute bits, dirty bits, and
+ * various other bits as well.
  */
+#define pgprot_cacheable(prot)		__pgprot((pgprot_val(prot) & ~_PAGE_MA_MASK) | _PAGE_MA_WB)
 #define pgprot_noncached(prot)		__pgprot((pgprot_val(prot) & ~_PAGE_MA_MASK) | _PAGE_MA_UC)
-
-/*
- * Macro to make mark a page protection value as "write-combining".
- * Note that "protection" is really a misnomer here as the protection
- * value contains the memory attribute bits, dirty bits, and various
- * other bits as well.  Accesses through a write-combining translation
- * works bypasses the caches, but does allow for consecutive writes to
- * be combined into single (but larger) write transactions.
- */
 #define pgprot_writecombine(prot)	__pgprot((pgprot_val(prot) & ~_PAGE_MA_MASK) | _PAGE_MA_WC)
 
+struct file;
+extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
+				     unsigned long size, pgprot_t vma_prot);
+#define __HAVE_PHYS_MEM_ACCESS_PROT
+
 static inline unsigned long
 pgd_index (unsigned long address)
 {
diff --git a/include/linux/efi.h b/include/linux/efi.h
index e203613d3aec..66d621dbcb6c 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -294,6 +294,7 @@ extern void efi_enter_virtual_mode (void);	/* switch EFI to virtual mode, if pos
 extern u64 efi_get_iobase (void);
 extern u32 efi_mem_type (unsigned long phys_addr);
 extern u64 efi_mem_attributes (unsigned long phys_addr);
+extern u64 efi_mem_attribute (unsigned long phys_addr, unsigned long size);
 extern int efi_mem_attribute_range (unsigned long phys_addr, unsigned long size,
 				    u64 attr);
 extern int __init efi_uart_console_only (void);
-- 
cgit v1.2.3


From 9c01f87db183403a4f603fe5180c57b82b54b4a1 Mon Sep 17 00:00:00 2001
From: Kyungmin Park <kyungmin.park@samsung.com>
Date: Fri, 12 May 2006 17:02:31 +0300
Subject: OneNAND: handle byte access on BufferRAM

Signed-off-by: Kyungmin Park <kyungmin.park@samsung.com>
---
 drivers/mtd/onenand/onenand_base.c | 38 ++++++++++++++++++++++++++++++++++++++
 include/linux/mtd/onenand.h        |  3 +++
 2 files changed, 41 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/mtd/onenand/onenand_base.c b/drivers/mtd/onenand/onenand_base.c
index d6c13f7ae5a1..1439c9fa1d23 100644
--- a/drivers/mtd/onenand/onenand_base.c
+++ b/drivers/mtd/onenand/onenand_base.c
@@ -373,6 +373,17 @@ static int onenand_read_bufferram(struct mtd_info *mtd, int area,
 
 	bufferram += onenand_bufferram_offset(mtd, area);
 
+	if (ONENAND_CHECK_BYTE_ACCESS(count)) {
+		unsigned short word;
+
+		/* Align with word(16-bit) size */
+		count--;
+
+		/* Read word and save byte */
+		word = this->read_word(bufferram + offset + count);
+		buffer[count] = (word & 0xff);
+	}
+
 	memcpy(buffer, bufferram + offset, count);
 
 	return 0;
@@ -400,6 +411,17 @@ static int onenand_sync_read_bufferram(struct mtd_info *mtd, int area,
 
 	this->mmcontrol(mtd, ONENAND_SYS_CFG1_SYNC_READ);
 
+	if (ONENAND_CHECK_BYTE_ACCESS(count)) {
+		unsigned short word;
+
+		/* Align with word(16-bit) size */
+		count--;
+
+		/* Read word and save byte */
+		word = this->read_word(bufferram + offset + count);
+		buffer[count] = (word & 0xff);
+	}
+
 	memcpy(buffer, bufferram + offset, count);
 
 	this->mmcontrol(mtd, 0);
@@ -427,6 +449,22 @@ static int onenand_write_bufferram(struct mtd_info *mtd, int area,
 
 	bufferram += onenand_bufferram_offset(mtd, area);
 
+	if (ONENAND_CHECK_BYTE_ACCESS(count)) {
+		unsigned short word;
+		int byte_offset;
+
+		/* Align with word(16-bit) size */
+		count--;
+
+		/* Calculate byte access offset */
+		byte_offset = offset + count;
+
+		/* Read word and save byte */
+		word = this->read_word(bufferram + byte_offset);
+		word = (word & ~0xff) | buffer[count];
+		this->write_word(word, bufferram + byte_offset);
+	}
+
 	memcpy(bufferram + offset, buffer, count);
 
 	return 0;
diff --git a/include/linux/mtd/onenand.h b/include/linux/mtd/onenand.h
index 7419b5fab133..22322c8a7729 100644
--- a/include/linux/mtd/onenand.h
+++ b/include/linux/mtd/onenand.h
@@ -130,6 +130,9 @@ struct onenand_chip {
 #define ONENAND_SET_SYS_CFG1(v, this)					\
 	(this->write_word(v, this->base + ONENAND_REG_SYS_CFG1))
 
+/* Check byte access in OneNAND */
+#define ONENAND_CHECK_BYTE_ACCESS(addr)		(addr & 0x1)
+
 /*
  * Options bits
  */
-- 
cgit v1.2.3


From 493c646077ef0b8668ed71b8057f81cb7454af87 Mon Sep 17 00:00:00 2001
From: Kyungmin Park <kyungmin.park@samsung.com>
Date: Fri, 12 May 2006 17:03:07 +0300
Subject: OneNAND: One-Time Programmable (OTP) support

One Block of the NAND Flash Array memory is reserved as
a One-Time Programmable Block memory area.
Also, 1st Block of NAND Flash Array can be used as OTP.

The OTP block can be read, programmed and locked using the same
operations as any other NAND Flash Array memory block.
OTP block cannot be erased.

OTP block is fully-guaranteed to be a valid block.

Signed-off-by: Kyungmin Park <kyungmin.park@samsung.com>
---
 drivers/mtd/mtdchar.c              |   2 +-
 drivers/mtd/onenand/Kconfig        |  14 ++
 drivers/mtd/onenand/onenand_base.c | 313 ++++++++++++++++++++++++++++++++++++-
 include/linux/mtd/onenand.h        |   2 +
 include/linux/mtd/onenand_regs.h   |   8 +
 5 files changed, 335 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/mtdchar.c b/drivers/mtd/mtdchar.c
index 6b83aee8abb8..7a7df851c993 100644
--- a/drivers/mtd/mtdchar.c
+++ b/drivers/mtd/mtdchar.c
@@ -552,7 +552,7 @@ static int mtd_ioctl(struct inode *inode, struct file *file,
 		break;
 	}
 
-#ifdef CONFIG_MTD_OTP
+#if defined(CONFIG_MTD_OTP) || defined(CONFIG_MTD_ONENAND_OTP)
 	case OTPSELECT:
 	{
 		int mode;
diff --git a/drivers/mtd/onenand/Kconfig b/drivers/mtd/onenand/Kconfig
index 126ff6bf63d5..5930a03736d7 100644
--- a/drivers/mtd/onenand/Kconfig
+++ b/drivers/mtd/onenand/Kconfig
@@ -29,6 +29,20 @@ config MTD_ONENAND_GENERIC
 	help
 	  Support for OneNAND flash via platform device driver.
 
+config MTD_ONENAND_OTP
+	bool "OneNAND OTP Support"
+	depends on MTD_ONENAND
+	help
+	  One Block of the NAND Flash Array memory is reserved as
+	  a One-Time Programmable Block memory area.
+	  Also, 1st Block of NAND Flash Array can be used as OTP.
+
+	  The OTP block can be read, programmed and locked using the same
+	  operations as any other NAND Flash Array memory block.
+	  OTP block cannot be erased.
+
+	  OTP block is fully-guaranteed to be a valid block.
+
 config MTD_ONENAND_SYNC_READ
 	bool "OneNAND Sync. Burst Read Support"
 	depends on ARCH_OMAP
diff --git a/drivers/mtd/onenand/onenand_base.c b/drivers/mtd/onenand/onenand_base.c
index 7c7dc0ae5a19..163c81135447 100644
--- a/drivers/mtd/onenand/onenand_base.c
+++ b/drivers/mtd/onenand/onenand_base.c
@@ -191,7 +191,7 @@ static int onenand_buffer_address(int dataram1, int sectors, int count)
 static int onenand_command(struct mtd_info *mtd, int cmd, loff_t addr, size_t len)
 {
 	struct onenand_chip *this = mtd->priv;
-	int value, readcmd = 0;
+	int value, readcmd = 0, block_cmd = 0;
 	int block, page;
 	/* Now we use page size operation */
 	int sectors = 4, count = 4;
@@ -207,6 +207,8 @@ static int onenand_command(struct mtd_info *mtd, int cmd, loff_t addr, size_t le
 
 	case ONENAND_CMD_ERASE:
 	case ONENAND_CMD_BUFFERRAM:
+	case ONENAND_CMD_OTP_ACCESS:
+		block_cmd = 1;
 		block = (int) (addr >> this->erase_shift);
 		page = -1;
 		break;
@@ -235,7 +237,7 @@ static int onenand_command(struct mtd_info *mtd, int cmd, loff_t addr, size_t le
 		value = onenand_block_address(this, block);
 		this->write_word(value, this->base + ONENAND_REG_START_ADDRESS1);
 
-		if (cmd == ONENAND_CMD_ERASE) {
+		if (cmd == block_cmd) {
 			/* Select DataRAM for DDP */
 			value = onenand_bufferram_address(this, block);
 			this->write_word(value, this->base + ONENAND_REG_START_ADDRESS2);
@@ -1412,6 +1414,304 @@ static int onenand_unlock(struct mtd_info *mtd, loff_t ofs, size_t len)
 	return 0;
 }
 
+#ifdef CONFIG_MTD_ONENAND_OTP
+
+/* Interal OTP operation */
+typedef int (*otp_op_t)(struct mtd_info *mtd, loff_t form, size_t len,
+		size_t *retlen, u_char *buf);
+
+/**
+ * do_otp_read - [DEFAULT] Read OTP block area
+ * @param mtd		MTD device structure
+ * @param from		The offset to read
+ * @param len		number of bytes to read
+ * @param retlen	pointer to variable to store the number of readbytes
+ * @param buf		the databuffer to put/get data
+ *
+ * Read OTP block area.
+ */
+static int do_otp_read(struct mtd_info *mtd, loff_t from, size_t len,
+		size_t *retlen, u_char *buf)
+{
+	struct onenand_chip *this = mtd->priv;
+	int ret;
+
+	/* Enter OTP access mode */
+	this->command(mtd, ONENAND_CMD_OTP_ACCESS, 0, 0);
+	this->wait(mtd, FL_OTPING);
+
+	ret = mtd->read(mtd, from, len, retlen, buf);
+
+	/* Exit OTP access mode */
+	this->command(mtd, ONENAND_CMD_RESET, 0, 0);
+	this->wait(mtd, FL_RESETING);
+
+	return ret;
+}
+
+/**
+ * do_otp_write - [DEFAULT] Write OTP block area
+ * @param mtd		MTD device structure
+ * @param from		The offset to write
+ * @param len		number of bytes to write
+ * @param retlen	pointer to variable to store the number of write bytes
+ * @param buf		the databuffer to put/get data
+ *
+ * Write OTP block area.
+ */
+static int do_otp_write(struct mtd_info *mtd, loff_t from, size_t len,
+		size_t *retlen, u_char *buf)
+{
+	struct onenand_chip *this = mtd->priv;
+	unsigned char *pbuf = buf;
+	int ret;
+
+	/* Force buffer page aligned */
+	if (len < mtd->oobblock) {
+		memcpy(this->page_buf, buf, len);
+		memset(this->page_buf + len, 0xff, mtd->oobblock - len);
+		pbuf = this->page_buf;
+		len = mtd->oobblock;
+	}
+
+	/* Enter OTP access mode */
+	this->command(mtd, ONENAND_CMD_OTP_ACCESS, 0, 0);
+	this->wait(mtd, FL_OTPING);
+
+	ret = mtd->write(mtd, from, len, retlen, pbuf);
+
+	/* Exit OTP access mode */
+	this->command(mtd, ONENAND_CMD_RESET, 0, 0);
+	this->wait(mtd, FL_RESETING);
+
+	return ret;
+}
+
+/**
+ * do_otp_lock - [DEFAULT] Lock OTP block area
+ * @param mtd		MTD device structure
+ * @param from		The offset to lock
+ * @param len		number of bytes to lock
+ * @param retlen	pointer to variable to store the number of lock bytes
+ * @param buf		the databuffer to put/get data
+ *
+ * Lock OTP block area.
+ */
+static int do_otp_lock(struct mtd_info *mtd, loff_t from, size_t len,
+		size_t *retlen, u_char *buf)
+{
+	struct onenand_chip *this = mtd->priv;
+	int ret;
+
+	/* Enter OTP access mode */
+	this->command(mtd, ONENAND_CMD_OTP_ACCESS, 0, 0);
+	this->wait(mtd, FL_OTPING);
+
+	ret = mtd->write_oob(mtd, from, len, retlen, buf);
+
+	/* Exit OTP access mode */
+	this->command(mtd, ONENAND_CMD_RESET, 0, 0);
+	this->wait(mtd, FL_RESETING);
+
+	return ret;
+}
+
+/**
+ * onenand_otp_walk - [DEFAULT] Handle OTP operation
+ * @param mtd		MTD device structure
+ * @param from		The offset to read/write
+ * @param len		number of bytes to read/write
+ * @param retlen	pointer to variable to store the number of read bytes
+ * @param buf		the databuffer to put/get data
+ * @param action	do given action
+ * @param mode		specify user and factory
+ *
+ * Handle OTP operation.
+ */
+static int onenand_otp_walk(struct mtd_info *mtd, loff_t from, size_t len,
+			size_t *retlen, u_char *buf,
+			otp_op_t action, int mode)
+{
+	struct onenand_chip *this = mtd->priv;
+	int otp_pages;
+	int density;
+	int ret = 0;
+
+	*retlen = 0;
+
+	density = this->device_id >> ONENAND_DEVICE_DENSITY_SHIFT;
+	if (density < ONENAND_DEVICE_DENSITY_512Mb)
+		otp_pages = 20;
+	else
+		otp_pages = 10;
+
+	if (mode == MTD_OTP_FACTORY) {
+		from += mtd->oobblock * otp_pages;
+		otp_pages = 64 - otp_pages;
+	}
+
+	/* Check User/Factory boundary */
+	if (((mtd->oobblock * otp_pages) - (from + len)) < 0)
+		return 0;
+
+	while (len > 0 && otp_pages > 0) {
+		if (!action) {	/* OTP Info functions */
+			struct otp_info *otpinfo;
+
+			len -= sizeof(struct otp_info);
+			if (len <= 0)
+				return -ENOSPC;
+
+			otpinfo = (struct otp_info *) buf;
+			otpinfo->start = from;
+			otpinfo->length = mtd->oobblock;
+			otpinfo->locked = 0;
+
+			from += mtd->oobblock;
+			buf += sizeof(struct otp_info);
+			*retlen += sizeof(struct otp_info);
+		} else {
+			size_t tmp_retlen;
+			int size = len;
+
+			ret = action(mtd, from, len, &tmp_retlen, buf);
+
+			buf += size;
+			len -= size;
+			*retlen += size;
+
+			if (ret < 0)
+				return ret;
+		}
+		otp_pages--;
+	}
+
+	return 0;
+}
+
+/**
+ * onenand_get_fact_prot_info - [MTD Interface] Read factory OTP info
+ * @param mtd		MTD device structure
+ * @param buf		the databuffer to put/get data
+ * @param len		number of bytes to read
+ *
+ * Read factory OTP info.
+ */
+static int onenand_get_fact_prot_info(struct mtd_info *mtd,
+			struct otp_info *buf, size_t len)
+{
+	size_t retlen;
+	int ret;
+
+	ret = onenand_otp_walk(mtd, 0, len, &retlen, (u_char *) buf, NULL, MTD_OTP_FACTORY);
+
+	return ret ? : retlen;
+}
+
+/**
+ * onenand_read_fact_prot_reg - [MTD Interface] Read factory OTP area
+ * @param mtd		MTD device structure
+ * @param from		The offset to read
+ * @param len		number of bytes to read
+ * @param retlen	pointer to variable to store the number of read bytes
+ * @param buf		the databuffer to put/get data
+ *
+ * Read factory OTP area.
+ */
+static int onenand_read_fact_prot_reg(struct mtd_info *mtd, loff_t from,
+			size_t len, size_t *retlen, u_char *buf)
+{
+	return onenand_otp_walk(mtd, from, len, retlen, buf, do_otp_read, MTD_OTP_FACTORY);
+}
+
+/**
+ * onenand_get_user_prot_info - [MTD Interface] Read user OTP info
+ * @param mtd		MTD device structure
+ * @param buf		the databuffer to put/get data
+ * @param len		number of bytes to read
+ *
+ * Read user OTP info.
+ */
+static int onenand_get_user_prot_info(struct mtd_info *mtd,
+			struct otp_info *buf, size_t len)
+{
+	size_t retlen;
+	int ret;
+
+	ret = onenand_otp_walk(mtd, 0, len, &retlen, (u_char *) buf, NULL, MTD_OTP_USER);
+
+	return ret ? : retlen;
+}
+
+/**
+ * onenand_read_user_prot_reg - [MTD Interface] Read user OTP area
+ * @param mtd		MTD device structure
+ * @param from		The offset to read
+ * @param len		number of bytes to read
+ * @param retlen	pointer to variable to store the number of read bytes
+ * @param buf		the databuffer to put/get data
+ *
+ * Read user OTP area.
+ */
+static int onenand_read_user_prot_reg(struct mtd_info *mtd, loff_t from,
+			size_t len, size_t *retlen, u_char *buf)
+{
+	return onenand_otp_walk(mtd, from, len, retlen, buf, do_otp_read, MTD_OTP_USER);
+}
+
+/**
+ * onenand_write_user_prot_reg - [MTD Interface] Write user OTP area
+ * @param mtd		MTD device structure
+ * @param from		The offset to write
+ * @param len		number of bytes to write
+ * @param retlen	pointer to variable to store the number of write bytes
+ * @param buf		the databuffer to put/get data
+ *
+ * Write user OTP area.
+ */
+static int onenand_write_user_prot_reg(struct mtd_info *mtd, loff_t from,
+			size_t len, size_t *retlen, u_char *buf)
+{
+	return onenand_otp_walk(mtd, from, len, retlen, buf, do_otp_write, MTD_OTP_USER);
+}
+
+/**
+ * onenand_lock_user_prot_reg - [MTD Interface] Lock user OTP area
+ * @param mtd		MTD device structure
+ * @param from		The offset to lock
+ * @param len		number of bytes to unlock
+ *
+ * Write lock mark on spare area in page 0 in OTP block
+ */
+static int onenand_lock_user_prot_reg(struct mtd_info *mtd, loff_t from,
+			size_t len)
+{
+	unsigned char oob_buf[64];
+	size_t retlen;
+	int ret;
+
+	memset(oob_buf, 0xff, mtd->oobsize);
+	/*
+	 * Note: OTP lock operation
+	 *       OTP block : 0xXXFC
+	 *       1st block : 0xXXF3 (If chip support)
+	 *       Both      : 0xXXF0 (If chip support)
+	 */
+	oob_buf[ONENAND_OTP_LOCK_OFFSET] = 0xFC;
+
+	/*
+	 * Write lock mark to 8th word of sector0 of page0 of the spare0.
+	 * We write 16 bytes spare area instead of 2 bytes.
+	 */
+	from = 0;
+	len = 16;
+
+	ret = onenand_otp_walk(mtd, from, len, &retlen, oob_buf, do_otp_lock, MTD_OTP_USER);
+
+	return ret ? : retlen;
+}
+#endif	/* CONFIG_MTD_ONENAND_OTP */
+
 /**
  * onenand_print_device_info - Print device ID
  * @param device        device ID
@@ -1563,7 +1863,6 @@ static void onenand_resume(struct mtd_info *mtd)
 				"in suspended state\n");
 }
 
-
 /**
  * onenand_scan - [OneNAND Interface] Scan for the OneNAND device
  * @param mtd		MTD device structure
@@ -1655,6 +1954,14 @@ int onenand_scan(struct mtd_info *mtd, int maxchips)
 	mtd->write_ecc = onenand_write_ecc;
 	mtd->read_oob = onenand_read_oob;
 	mtd->write_oob = onenand_write_oob;
+#ifdef CONFIG_MTD_ONENAND_OTP
+	mtd->get_fact_prot_info = onenand_get_fact_prot_info;
+	mtd->read_fact_prot_reg = onenand_read_fact_prot_reg;
+	mtd->get_user_prot_info = onenand_get_user_prot_info;
+	mtd->read_user_prot_reg = onenand_read_user_prot_reg;
+	mtd->write_user_prot_reg = onenand_write_user_prot_reg;
+	mtd->lock_user_prot_reg = onenand_lock_user_prot_reg;
+#endif
 	mtd->readv = NULL;
 	mtd->readv_ecc = NULL;
 	mtd->writev = onenand_writev;
diff --git a/include/linux/mtd/onenand.h b/include/linux/mtd/onenand.h
index 22322c8a7729..3f5919f2e9da 100644
--- a/include/linux/mtd/onenand.h
+++ b/include/linux/mtd/onenand.h
@@ -35,6 +35,8 @@ typedef enum {
 	FL_SYNCING,
 	FL_UNLOCKING,
 	FL_LOCKING,
+	FL_RESETING,
+	FL_OTPING,
 	FL_PM_SUSPENDED,
 } onenand_state_t;
 
diff --git a/include/linux/mtd/onenand_regs.h b/include/linux/mtd/onenand_regs.h
index d7832ef8ed63..4a72818d2545 100644
--- a/include/linux/mtd/onenand_regs.h
+++ b/include/linux/mtd/onenand_regs.h
@@ -112,6 +112,7 @@
 #define ONENAND_CMD_LOCK_TIGHT		(0x2C)
 #define ONENAND_CMD_ERASE		(0x94)
 #define ONENAND_CMD_RESET		(0xF0)
+#define ONENAND_CMD_OTP_ACCESS		(0x65)
 #define ONENAND_CMD_READID		(0x90)
 
 /* NOTE: Those are not *REAL* commands */
@@ -152,6 +153,8 @@
 #define ONENAND_CTRL_ERASE		(1 << 11)
 #define ONENAND_CTRL_ERROR		(1 << 10)
 #define ONENAND_CTRL_RSTB		(1 << 7)
+#define ONENAND_CTRL_OTP_L		(1 << 6)
+#define ONENAND_CTRL_OTP_BL		(1 << 5)
 
 /*
  * Interrupt Status Register F241h (R)
@@ -177,4 +180,9 @@
 #define ONENAND_ECC_2BIT		(1 << 1)
 #define ONENAND_ECC_2BIT_ALL		(0xAAAA)
 
+/*
+ * One-Time Programmable (OTP)
+ */
+#define ONENAND_OTP_LOCK_OFFSET		(14)
+
 #endif	/* __ONENAND_REG_H */
-- 
cgit v1.2.3


From aa98d7cf59b5b0764d3502662053489585faf2fe Mon Sep 17 00:00:00 2001
From: KaiGai Kohei <kaigai@ak.jp.nec.com>
Date: Sat, 13 May 2006 15:09:47 +0900
Subject: [JFFS2][XATTR] XATTR support on JFFS2 (version. 5)

This attached patches provide xattr support including POSIX-ACL and
SELinux support on JFFS2 (version.5).

There are some significant differences from previous version posted
at last December.
The biggest change is addition of EBS(Erase Block Summary) support.
Currently, both kernel and usermode utility (sumtool) can recognize
xattr nodes which have JFFS2_NODETYPE_XATTR/_XREF nodetype.

In addition, some bugs are fixed.
- A potential race condition was fixed.
- Unexpected fail when updating a xattr by same name/value pair was fixed.
- A bug when removing xattr name/value pair was fixed.

The fundamental structures (such as using two new nodetypes and exclusion
mechanism by rwsem) are unchanged. But most of implementation were reviewed
and updated if necessary.
Espacially, we had to change several internal implementations related to
load_xattr_datum() to avoid a potential race condition.

[1/2] xattr_on_jffs2.kernel.version-5.patch
[2/2] xattr_on_jffs2.utils.version-5.patch

Signed-off-by: KaiGai Kohei <kaigai@ak.jp.nec.com>
Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 fs/Kconfig               |   38 ++
 fs/jffs2/Makefile        |    3 +
 fs/jffs2/acl.c           |  483 ++++++++++++++++++
 fs/jffs2/acl.h           |   46 ++
 fs/jffs2/build.c         |    2 +
 fs/jffs2/debug.h         |    6 +
 fs/jffs2/dir.c           |   62 ++-
 fs/jffs2/file.c          |    7 +-
 fs/jffs2/fs.c            |   11 +-
 fs/jffs2/gc.c            |   16 +-
 fs/jffs2/jffs2_fs_i.h    |    5 +
 fs/jffs2/jffs2_fs_sb.h   |   10 +
 fs/jffs2/malloc.c        |   68 ++-
 fs/jffs2/nodelist.c      |    1 +
 fs/jffs2/nodelist.h      |   21 +-
 fs/jffs2/os-linux.h      |    4 +
 fs/jffs2/readinode.c     |    1 +
 fs/jffs2/scan.c          |  168 ++++++
 fs/jffs2/security.c      |   82 +++
 fs/jffs2/summary.c       |  191 +++++++
 fs/jffs2/summary.h       |   42 ++
 fs/jffs2/super.c         |    6 +-
 fs/jffs2/symlink.c       |    7 +-
 fs/jffs2/write.c         |    2 +-
 fs/jffs2/xattr.c         | 1271 ++++++++++++++++++++++++++++++++++++++++++++++
 fs/jffs2/xattr.h         |  120 +++++
 fs/jffs2/xattr_trusted.c |   51 ++
 fs/jffs2/xattr_user.c    |   51 ++
 include/linux/jffs2.h    |   40 ++
 29 files changed, 2800 insertions(+), 15 deletions(-)
 create mode 100644 fs/jffs2/acl.c
 create mode 100644 fs/jffs2/acl.h
 create mode 100644 fs/jffs2/security.c
 create mode 100644 fs/jffs2/xattr.c
 create mode 100644 fs/jffs2/xattr.h
 create mode 100644 fs/jffs2/xattr_trusted.c
 create mode 100644 fs/jffs2/xattr_user.c

(limited to 'include/linux')

diff --git a/fs/Kconfig b/fs/Kconfig
index f9b5842c8d2d..2496ccbe2604 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -1075,6 +1075,44 @@ config JFFS2_FS_DEBUG
 	  If reporting bugs, please try to have available a full dump of the
 	  messages at debug level 1 while the misbehaviour was occurring.
 
+config JFFS2_FS_XATTR
+	bool "JFFS2 XATTR support"
+	depends on JFFS2_FS
+	default n
+	help
+	  Extended attributes are name:value pairs associated with inodes by
+	  the kernel or by users (see the attr(5) manual page, or visit
+	  <http://acl.bestbits.at/> for details).
+	  
+	  If unsure, say N.
+
+config JFFS2_FS_POSIX_ACL
+	bool "JFFS2 POSIX Access Control Lists"
+	depends on JFFS2_FS_XATTR
+	default y
+	select FS_POSIX_ACL
+	help
+	  Posix Access Control Lists (ACLs) support permissions for users and
+	  groups beyond the owner/group/world scheme.
+	  
+	  To learn more about Access Control Lists, visit the Posix ACLs for
+	  Linux website <http://acl.bestbits.at/>.
+	  
+	  If you don't know what Access Control Lists are, say N
+
+config JFFS2_FS_SECURITY
+	bool "JFFS2 Security Labels"
+	depends on JFFS2_FS_XATTR
+	default y
+	help
+	  Security labels support alternative access control models
+	  implemented by security modules like SELinux.  This option
+	  enables an extended attribute handler for file security
+	  labels in the jffs2 filesystem.
+	  
+	  If you are not using a security module that requires using
+	  extended attributes for file security labels, say N.
+
 config JFFS2_FS_WRITEBUFFER
 	bool "JFFS2 write-buffering support"
 	depends on JFFS2_FS
diff --git a/fs/jffs2/Makefile b/fs/jffs2/Makefile
index 77dc5561a04e..7f28ee0bd132 100644
--- a/fs/jffs2/Makefile
+++ b/fs/jffs2/Makefile
@@ -12,6 +12,9 @@ jffs2-y	+= symlink.o build.o erase.o background.o fs.o writev.o
 jffs2-y	+= super.o debug.o
 
 jffs2-$(CONFIG_JFFS2_FS_WRITEBUFFER)	+= wbuf.o
+jffs2-$(CONFIG_JFFS2_FS_XATTR)		+= xattr.o xattr_trusted.o xattr_user.o
+jffs2-$(CONFIG_JFFS2_FS_SECURITY)	+= security.o
+jffs2-$(CONFIG_JFFS2_FS_POSIX_ACL)	+= acl.o
 jffs2-$(CONFIG_JFFS2_RUBIN)	+= compr_rubin.o
 jffs2-$(CONFIG_JFFS2_RTIME)	+= compr_rtime.o
 jffs2-$(CONFIG_JFFS2_ZLIB)	+= compr_zlib.o
diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c
new file mode 100644
index 000000000000..080bb51e4b65
--- /dev/null
+++ b/fs/jffs2/acl.c
@@ -0,0 +1,483 @@
+/*-------------------------------------------------------------------------*
+ *  File: fs/jffs2/acl.c
+ *  POSIX ACL support on JFFS2 FileSystem
+ *
+ *  Implemented by KaiGai Kohei <kaigai@ak.jp.nec.com>
+ *  Copyright (C) 2006 NEC Corporation
+ *
+ *  For licensing information, see the file 'LICENCE' in the jffs2 directory.
+ *-------------------------------------------------------------------------*/
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/time.h>
+#include <linux/crc32.h>
+#include <linux/jffs2.h>
+#include <linux/xattr.h>
+#include <linux/posix_acl_xattr.h>
+#include <linux/mtd/mtd.h>
+#include "nodelist.h"
+
+static size_t jffs2_acl_size(int count)
+{
+	if (count <= 4) {
+		return sizeof(jffs2_acl_header)
+		       + count * sizeof(jffs2_acl_entry_short);
+	} else {
+		return sizeof(jffs2_acl_header)
+		       + 4 * sizeof(jffs2_acl_entry_short)
+		       + (count - 4) * sizeof(jffs2_acl_entry);
+	}
+}
+
+static int jffs2_acl_count(size_t size)
+{
+	size_t s;
+
+	size -= sizeof(jffs2_acl_header);
+	s = size - 4 * sizeof(jffs2_acl_entry_short);
+	if (s < 0) {
+		if (size % sizeof(jffs2_acl_entry_short))
+			return -1;
+		return size / sizeof(jffs2_acl_entry_short);
+	} else {
+		if (s % sizeof(jffs2_acl_entry))
+			return -1;
+		return s / sizeof(jffs2_acl_entry) + 4;
+	}
+}
+
+static struct posix_acl *jffs2_acl_from_medium(const void *value, size_t size)
+{
+	const char *end = (char *)value + size;
+	struct posix_acl *acl;
+	uint32_t ver;
+	int i, count;
+
+	if (!value)
+		return NULL;
+	if (size < sizeof(jffs2_acl_header))
+		return ERR_PTR(-EINVAL);
+	ver = je32_to_cpu(((jffs2_acl_header *)value)->a_version);
+	if (ver != JFFS2_ACL_VERSION) {
+		JFFS2_WARNING("Invalid ACL version. (=%u)\n", ver);
+		return ERR_PTR(-EINVAL);
+	}
+
+	value = (char *)value + sizeof(jffs2_acl_header);
+	count = jffs2_acl_count(size);
+	if (count < 0)
+		return ERR_PTR(-EINVAL);
+	if (count == 0)
+		return NULL;
+
+	acl = posix_acl_alloc(count, GFP_KERNEL);
+	if (!acl)
+		return ERR_PTR(-ENOMEM);
+
+	for (i=0; i < count; i++) {
+		jffs2_acl_entry *entry = (jffs2_acl_entry *)value;
+		if ((char *)value + sizeof(jffs2_acl_entry_short) > end)
+			goto fail;
+		acl->a_entries[i].e_tag = je16_to_cpu(entry->e_tag);
+		acl->a_entries[i].e_perm = je16_to_cpu(entry->e_perm);
+		switch (acl->a_entries[i].e_tag) {
+			case ACL_USER_OBJ:
+			case ACL_GROUP_OBJ:
+			case ACL_MASK:
+			case ACL_OTHER:
+				value = (char *)value + sizeof(jffs2_acl_entry_short);
+				acl->a_entries[i].e_id = ACL_UNDEFINED_ID;
+				break;
+
+			case ACL_USER:
+			case ACL_GROUP:
+				value = (char *)value + sizeof(jffs2_acl_entry);
+				if ((char *)value > end)
+					goto fail;
+				acl->a_entries[i].e_id = je32_to_cpu(entry->e_id);
+				break;
+
+			default:
+				goto fail;
+		}
+	}
+	if (value != end)
+		goto fail;
+	return acl;
+ fail:
+	posix_acl_release(acl);
+	return ERR_PTR(-EINVAL);
+}
+
+static void *jffs2_acl_to_medium(const struct posix_acl *acl, size_t *size)
+{
+	jffs2_acl_header *jffs2_acl;
+	char *e;
+	size_t i;
+
+	*size = jffs2_acl_size(acl->a_count);
+	jffs2_acl = (jffs2_acl_header *)kmalloc(sizeof(jffs2_acl_header)
+						+ acl->a_count * sizeof(jffs2_acl_entry),
+						GFP_KERNEL);
+	if (!jffs2_acl)
+		return ERR_PTR(-ENOMEM);
+	jffs2_acl->a_version = cpu_to_je32(JFFS2_ACL_VERSION);
+	e = (char *)jffs2_acl + sizeof(jffs2_acl_header);
+	for (i=0; i < acl->a_count; i++) {
+		jffs2_acl_entry *entry = (jffs2_acl_entry *)e;
+		entry->e_tag = cpu_to_je16(acl->a_entries[i].e_tag);
+		entry->e_perm = cpu_to_je16(acl->a_entries[i].e_perm);
+		switch(acl->a_entries[i].e_tag) {
+			case ACL_USER:
+			case ACL_GROUP:
+				entry->e_id = cpu_to_je32(acl->a_entries[i].e_id);
+				e += sizeof(jffs2_acl_entry);
+				break;
+
+			case ACL_USER_OBJ:
+			case ACL_GROUP_OBJ:
+			case ACL_MASK:
+			case ACL_OTHER:
+				e += sizeof(jffs2_acl_entry_short);
+				break;
+
+			default:
+				goto fail;
+		}
+	}
+	return (char *)jffs2_acl;
+ fail:
+	kfree(jffs2_acl);
+	return ERR_PTR(-EINVAL);
+}
+
+static struct posix_acl *jffs2_iget_acl(struct inode *inode, struct posix_acl **i_acl)
+{
+	struct posix_acl *acl = JFFS2_ACL_NOT_CACHED;
+
+	spin_lock(&inode->i_lock);
+	if (*i_acl != JFFS2_ACL_NOT_CACHED)
+		acl = posix_acl_dup(*i_acl);
+	spin_unlock(&inode->i_lock);
+	return acl;
+}
+
+static void jffs2_iset_acl(struct inode *inode, struct posix_acl **i_acl, struct posix_acl *acl)
+{
+	spin_lock(&inode->i_lock);
+	if (*i_acl != JFFS2_ACL_NOT_CACHED)
+		posix_acl_release(*i_acl);
+	*i_acl = posix_acl_dup(acl);
+	spin_unlock(&inode->i_lock);
+}
+
+static struct posix_acl *jffs2_get_acl(struct inode *inode, int type)
+{
+	struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode);
+	struct posix_acl *acl;
+	char *value = NULL;
+	int rc, xprefix;
+
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		acl = jffs2_iget_acl(inode, &f->i_acl_access);
+		if (acl != JFFS2_ACL_NOT_CACHED)
+			return acl;
+		xprefix = JFFS2_XPREFIX_ACL_ACCESS;
+		break;
+	case ACL_TYPE_DEFAULT:
+		acl = jffs2_iget_acl(inode, &f->i_acl_default);
+		if (acl != JFFS2_ACL_NOT_CACHED)
+			return acl;
+		xprefix = JFFS2_XPREFIX_ACL_DEFAULT;
+		break;
+	default:
+		return ERR_PTR(-EINVAL);
+	}
+	rc = do_jffs2_getxattr(inode, xprefix, "", NULL, 0);
+	if (rc > 0) {
+		value = kmalloc(rc, GFP_KERNEL);
+		if (!value)
+			return ERR_PTR(-ENOMEM);
+		rc = do_jffs2_getxattr(inode, xprefix, "", value, rc);
+	}
+	if (rc > 0) {
+		acl = jffs2_acl_from_medium(value, rc);
+	} else if (rc == -ENODATA || rc == -ENOSYS) {
+		acl = NULL;
+	} else {
+		acl = ERR_PTR(rc);
+	}
+	if (value)
+		kfree(value);
+	if (!IS_ERR(acl)) {
+		switch (type) {
+		case ACL_TYPE_ACCESS:
+			jffs2_iset_acl(inode, &f->i_acl_access, acl);
+			break;
+		case ACL_TYPE_DEFAULT:
+			jffs2_iset_acl(inode, &f->i_acl_default, acl);
+			break;
+		}
+	}
+	return acl;
+}
+
+static int jffs2_set_acl(struct inode *inode, int type, struct posix_acl *acl)
+{
+	struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode);
+	size_t size = 0;
+	char *value = NULL;
+	int rc, xprefix;
+
+	if (S_ISLNK(inode->i_mode))
+		return -EOPNOTSUPP;
+
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		xprefix = JFFS2_XPREFIX_ACL_ACCESS;
+		if (acl) {
+			mode_t mode = inode->i_mode;
+			rc = posix_acl_equiv_mode(acl, &mode);
+			if (rc < 0)
+				return rc;
+			if (inode->i_mode != mode) {
+				inode->i_mode = mode;
+				jffs2_dirty_inode(inode);
+			}
+			if (rc == 0)
+				acl = NULL;
+		}
+		break;
+	case ACL_TYPE_DEFAULT:
+		xprefix = JFFS2_XPREFIX_ACL_DEFAULT;
+		if (!S_ISDIR(inode->i_mode))
+			return acl ? -EACCES : 0;
+		break;
+	default:
+		return -EINVAL;
+	}
+	if (acl) {
+		value = jffs2_acl_to_medium(acl, &size);
+		if (IS_ERR(value))
+			return PTR_ERR(value);
+	}
+
+	rc = do_jffs2_setxattr(inode, xprefix, "", value, size, 0);
+	if (value)
+		kfree(value);
+	if (!rc) {
+		switch(type) {
+		case ACL_TYPE_ACCESS:
+			jffs2_iset_acl(inode, &f->i_acl_access, acl);
+			break;
+		case ACL_TYPE_DEFAULT:
+			jffs2_iset_acl(inode, &f->i_acl_default, acl);
+			break;
+		}
+	}
+	return rc;
+}
+
+static int jffs2_check_acl(struct inode *inode, int mask)
+{
+	struct posix_acl *acl;
+	int rc;
+
+	acl = jffs2_get_acl(inode, ACL_TYPE_ACCESS);
+	if (IS_ERR(acl))
+		return PTR_ERR(acl);
+	if (acl) {
+		rc = posix_acl_permission(inode, acl, mask);
+		posix_acl_release(acl);
+		return rc;
+	}
+	return -EAGAIN;
+}
+
+int jffs2_permission(struct inode *inode, int mask, struct nameidata *nd)
+{
+	return generic_permission(inode, mask, jffs2_check_acl);
+}
+
+int jffs2_init_acl(struct inode *inode, struct inode *dir)
+{
+	struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode);
+	struct posix_acl *acl = NULL, *clone;
+	mode_t mode;
+	int rc = 0;
+
+	f->i_acl_access = JFFS2_ACL_NOT_CACHED;
+	f->i_acl_default = JFFS2_ACL_NOT_CACHED;
+	if (!S_ISLNK(inode->i_mode)) {
+		acl = jffs2_get_acl(dir, ACL_TYPE_DEFAULT);
+		if (IS_ERR(acl))
+			return PTR_ERR(acl);
+		if (!acl)
+			inode->i_mode &= ~current->fs->umask;
+	}
+	if (acl) {
+		if (S_ISDIR(inode->i_mode)) {
+			rc = jffs2_set_acl(inode, ACL_TYPE_DEFAULT, acl);
+			if (rc)
+				goto cleanup;
+		}
+		clone = posix_acl_clone(acl, GFP_KERNEL);
+		rc = -ENOMEM;
+		if (!clone)
+			goto cleanup;
+		mode = inode->i_mode;
+		rc = posix_acl_create_masq(clone, &mode);
+		if (rc >= 0) {
+			inode->i_mode = mode;
+			if (rc > 0)
+				rc = jffs2_set_acl(inode, ACL_TYPE_ACCESS, clone);
+		}
+		posix_acl_release(clone);
+	}
+ cleanup:
+	posix_acl_release(acl);
+	return rc;
+}
+
+void jffs2_clear_acl(struct inode *inode)
+{
+	struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode);
+
+	if (f->i_acl_access && f->i_acl_access != JFFS2_ACL_NOT_CACHED) {
+		posix_acl_release(f->i_acl_access);
+		f->i_acl_access = JFFS2_ACL_NOT_CACHED;
+	}
+	if (f->i_acl_default && f->i_acl_default != JFFS2_ACL_NOT_CACHED) {
+		posix_acl_release(f->i_acl_default);
+		f->i_acl_default = JFFS2_ACL_NOT_CACHED;
+	}
+}
+
+int jffs2_acl_chmod(struct inode *inode)
+{
+	struct posix_acl *acl, *clone;
+	int rc;
+
+	if (S_ISLNK(inode->i_mode))
+		return -EOPNOTSUPP;
+	acl = jffs2_get_acl(inode, ACL_TYPE_ACCESS);
+	if (IS_ERR(acl) || !acl)
+		return PTR_ERR(acl);
+	clone = posix_acl_clone(acl, GFP_KERNEL);
+	posix_acl_release(acl);
+	if (!clone)
+		return -ENOMEM;
+	rc = posix_acl_chmod_masq(clone, inode->i_mode);
+	if (!rc)
+		rc = jffs2_set_acl(inode, ACL_TYPE_ACCESS, clone);
+	posix_acl_release(clone);
+	return rc;
+}
+
+static size_t jffs2_acl_access_listxattr(struct inode *inode, char *list, size_t list_size,
+					 const char *name, size_t name_len)
+{
+	const int retlen = sizeof(POSIX_ACL_XATTR_ACCESS);
+
+	if (list && retlen <= list_size)
+		strcpy(list, POSIX_ACL_XATTR_ACCESS);
+	return retlen;
+}
+
+static size_t jffs2_acl_default_listxattr(struct inode *inode, char *list, size_t list_size,
+					  const char *name, size_t name_len)
+{
+	const int retlen = sizeof(POSIX_ACL_XATTR_DEFAULT);
+
+	if (list && retlen <= list_size)
+		strcpy(list, POSIX_ACL_XATTR_DEFAULT);
+	return retlen;
+}
+
+static int jffs2_acl_getxattr(struct inode *inode, int type, void *buffer, size_t size)
+{
+	struct posix_acl *acl;
+	int rc;
+
+	acl = jffs2_get_acl(inode, type);
+	if (IS_ERR(acl))
+		return PTR_ERR(acl);
+	if (!acl)
+		return -ENODATA;
+	rc = posix_acl_to_xattr(acl, buffer, size);
+	posix_acl_release(acl);
+
+	return rc;
+}
+
+static int jffs2_acl_access_getxattr(struct inode *inode, const char *name, void *buffer, size_t size)
+{
+	if (name[0] != '\0')
+		return -EINVAL;
+	return jffs2_acl_getxattr(inode, ACL_TYPE_ACCESS, buffer, size);
+}
+
+static int jffs2_acl_default_getxattr(struct inode *inode, const char *name, void *buffer, size_t size)
+{
+	if (name[0] != '\0')
+		return -EINVAL;
+	return jffs2_acl_getxattr(inode, ACL_TYPE_DEFAULT, buffer, size);
+}
+
+static int jffs2_acl_setxattr(struct inode *inode, int type, const void *value, size_t size)
+{
+	struct posix_acl *acl;
+	int rc;
+
+	if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER))
+		return -EPERM;
+
+	if (value) {
+		acl = posix_acl_from_xattr(value, size);
+		if (IS_ERR(acl))
+			return PTR_ERR(acl);
+		if (acl) {
+			rc = posix_acl_valid(acl);
+			if (rc)
+				goto out;
+		}
+	} else {
+		acl = NULL;
+	}
+	rc = jffs2_set_acl(inode, type, acl);
+ out:
+	posix_acl_release(acl);
+	return rc;
+}
+
+static int jffs2_acl_access_setxattr(struct inode *inode, const char *name,
+				     const void *buffer, size_t size, int flags)
+{
+	if (name[0] != '\0')
+		return -EINVAL;
+	return jffs2_acl_setxattr(inode, ACL_TYPE_ACCESS, buffer, size);
+}
+
+static int jffs2_acl_default_setxattr(struct inode *inode, const char *name,
+				      const void *buffer, size_t size, int flags)
+{
+	if (name[0] != '\0')
+		return -EINVAL;
+	return jffs2_acl_setxattr(inode, ACL_TYPE_DEFAULT, buffer, size);
+}
+
+struct xattr_handler jffs2_acl_access_xattr_handler = {
+	.prefix	= POSIX_ACL_XATTR_ACCESS,
+	.list	= jffs2_acl_access_listxattr,
+	.get	= jffs2_acl_access_getxattr,
+	.set	= jffs2_acl_access_setxattr,
+};
+
+struct xattr_handler jffs2_acl_default_xattr_handler = {
+	.prefix	= POSIX_ACL_XATTR_DEFAULT,
+	.list	= jffs2_acl_default_listxattr,
+	.get	= jffs2_acl_default_getxattr,
+	.set	= jffs2_acl_default_setxattr,
+};
diff --git a/fs/jffs2/acl.h b/fs/jffs2/acl.h
new file mode 100644
index 000000000000..c98610b4e81c
--- /dev/null
+++ b/fs/jffs2/acl.h
@@ -0,0 +1,46 @@
+/*-------------------------------------------------------------------------*
+ *  File: fs/jffs2/acl.h
+ *  POSIX ACL support on JFFS2 FileSystem
+ *
+ *  Implemented by KaiGai Kohei <kaigai@ak.jp.nec.com>
+ *  Copyright (C) 2006 NEC Corporation
+ *
+ *  For licensing information, see the file 'LICENCE' in the jffs2 directory.
+ *-------------------------------------------------------------------------*/
+typedef struct {
+	jint16_t	e_tag;
+	jint16_t	e_perm;
+	jint32_t	e_id;
+} jffs2_acl_entry;
+
+typedef struct {
+	jint16_t	e_tag;
+	jint16_t	e_perm;
+} jffs2_acl_entry_short;
+
+typedef struct {
+	jint32_t	a_version;
+} jffs2_acl_header;
+
+#ifdef __KERNEL__
+#ifdef CONFIG_JFFS2_FS_POSIX_ACL
+
+#define JFFS2_ACL_NOT_CACHED ((void *)-1)
+
+extern int jffs2_permission(struct inode *, int, struct nameidata *);
+extern int jffs2_acl_chmod(struct inode *);
+extern int jffs2_init_acl(struct inode *, struct inode *);
+extern void jffs2_clear_acl(struct inode *);
+
+extern struct xattr_handler jffs2_acl_access_xattr_handler;
+extern struct xattr_handler jffs2_acl_default_xattr_handler;
+
+#else
+
+#define jffs2_permission NULL
+#define jffs2_acl_chmod(inode)		(0)
+#define jffs2_init_acl(inode,dir)	(0)
+#define jffs2_clear_acl(inode)
+
+#endif	/* CONFIG_JFFS2_FS_POSIX_ACL */
+#endif	/* __KERNEL__ */
diff --git a/fs/jffs2/build.c b/fs/jffs2/build.c
index 70f7a896c04a..02826967ab58 100644
--- a/fs/jffs2/build.c
+++ b/fs/jffs2/build.c
@@ -160,6 +160,7 @@ static int jffs2_build_filesystem(struct jffs2_sb_info *c)
 		ic->scan_dents = NULL;
 		cond_resched();
 	}
+	jffs2_build_xattr_subsystem(c);
 	c->flags &= ~JFFS2_SB_FLAG_BUILDING;
 
 	dbg_fsbuild("FS build complete\n");
@@ -178,6 +179,7 @@ exit:
 				jffs2_free_full_dirent(fd);
 			}
 		}
+		jffs2_clear_xattr_subsystem(c);
 	}
 
 	return ret;
diff --git a/fs/jffs2/debug.h b/fs/jffs2/debug.h
index 162af6dfe292..5fa494a792b2 100644
--- a/fs/jffs2/debug.h
+++ b/fs/jffs2/debug.h
@@ -171,6 +171,12 @@
 #define dbg_memalloc(fmt, ...)
 #endif
 
+/* Watch the XATTR subsystem */
+#ifdef JFFS2_DBG_XATTR_MESSAGES
+#define dbg_xattr(fmt, ...)  JFFS2_DEBUG(fmt, ##__VA_ARGS__)
+#else
+#define dbg_xattr(fmt, ...)
+#endif 
 
 /* "Sanity" checks */
 void
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index 1c8e8c0f6cea..f1b18b99a3cd 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -57,7 +57,12 @@ struct inode_operations jffs2_dir_inode_operations =
 	.rmdir =	jffs2_rmdir,
 	.mknod =	jffs2_mknod,
 	.rename =	jffs2_rename,
+	.permission =	jffs2_permission,
 	.setattr =	jffs2_setattr,
+	.setxattr =	jffs2_setxattr,
+	.getxattr =	jffs2_getxattr,
+	.listxattr =	jffs2_listxattr,
+	.removexattr =	jffs2_removexattr
 };
 
 /***********************************************************************/
@@ -209,12 +214,15 @@ static int jffs2_create(struct inode *dir_i, struct dentry *dentry, int mode,
 	ret = jffs2_do_create(c, dir_f, f, ri,
 			      dentry->d_name.name, dentry->d_name.len);
 
-	if (ret) {
-		make_bad_inode(inode);
-		iput(inode);
-		jffs2_free_raw_inode(ri);
-		return ret;
-	}
+	if (ret)
+		goto fail;
+
+	ret = jffs2_init_security(inode, dir_i);
+	if (ret)
+		goto fail;
+	ret = jffs2_init_acl(inode, dir_i);
+	if (ret)
+		goto fail;
 
 	dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(ri->ctime));
 
@@ -224,6 +232,12 @@ static int jffs2_create(struct inode *dir_i, struct dentry *dentry, int mode,
 	D1(printk(KERN_DEBUG "jffs2_create: Created ino #%lu with mode %o, nlink %d(%d). nrpages %ld\n",
 		  inode->i_ino, inode->i_mode, inode->i_nlink, f->inocache->nlink, inode->i_mapping->nrpages));
 	return 0;
+
+ fail:
+	make_bad_inode(inode);
+	iput(inode);
+	jffs2_free_raw_inode(ri);
+	return ret;
 }
 
 /***********************************************************************/
@@ -374,6 +388,18 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char
 	up(&f->sem);
 
 	jffs2_complete_reservation(c);
+
+	ret = jffs2_init_security(inode, dir_i);
+	if (ret) {
+		jffs2_clear_inode(inode);
+		return ret;
+	}
+	ret = jffs2_init_acl(inode, dir_i);
+	if (ret) {
+		jffs2_clear_inode(inode);
+		return ret;
+	}
+
 	ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &phys_ofs, &alloclen,
 				ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen));
 	if (ret) {
@@ -504,6 +530,18 @@ static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, int mode)
 	up(&f->sem);
 
 	jffs2_complete_reservation(c);
+
+	ret = jffs2_init_security(inode, dir_i);
+	if (ret) {
+		jffs2_clear_inode(inode);
+		return ret;
+	}
+	ret = jffs2_init_acl(inode, dir_i);
+	if (ret) {
+		jffs2_clear_inode(inode);
+		return ret;
+	}
+
 	ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &phys_ofs, &alloclen,
 				ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen));
 	if (ret) {
@@ -660,6 +698,18 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, int mode, de
 	up(&f->sem);
 
 	jffs2_complete_reservation(c);
+
+	ret = jffs2_init_security(inode, dir_i);
+	if (ret) {
+		jffs2_clear_inode(inode);
+		return ret;
+	}
+	ret = jffs2_init_acl(inode, dir_i);
+	if (ret) {
+		jffs2_clear_inode(inode);
+		return ret;
+	}
+
 	ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &phys_ofs, &alloclen,
 				ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen));
 	if (ret) {
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index 9f4171213e58..e92187f34d5f 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -54,7 +54,12 @@ const struct file_operations jffs2_file_operations =
 
 struct inode_operations jffs2_file_inode_operations =
 {
-	.setattr =	jffs2_setattr
+	.permission =	jffs2_permission,
+	.setattr =	jffs2_setattr,
+	.setxattr =	jffs2_setxattr,
+	.getxattr =	jffs2_getxattr,
+	.listxattr =	jffs2_listxattr,
+	.removexattr =	jffs2_removexattr
 };
 
 struct address_space_operations jffs2_file_address_operations =
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index ea1f37d4fc58..4607cdc4c46d 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -185,7 +185,12 @@ static int jffs2_do_setattr (struct inode *inode, struct iattr *iattr)
 
 int jffs2_setattr(struct dentry *dentry, struct iattr *iattr)
 {
-	return jffs2_do_setattr(dentry->d_inode, iattr);
+	int rc;
+
+	rc = jffs2_do_setattr(dentry->d_inode, iattr);
+	if (!rc && (iattr->ia_valid & ATTR_MODE))
+		rc = jffs2_acl_chmod(dentry->d_inode);
+	return rc;
 }
 
 int jffs2_statfs(struct super_block *sb, struct kstatfs *buf)
@@ -224,6 +229,7 @@ void jffs2_clear_inode (struct inode *inode)
 
 	D1(printk(KERN_DEBUG "jffs2_clear_inode(): ino #%lu mode %o\n", inode->i_ino, inode->i_mode));
 
+	jffs2_xattr_delete_inode(c, f->inocache);
 	jffs2_do_clear_inode(c, f);
 }
 
@@ -497,6 +503,8 @@ int jffs2_do_fill_super(struct super_block *sb, void *data, int silent)
 	}
 	memset(c->inocache_list, 0, INOCACHE_HASHSIZE * sizeof(struct jffs2_inode_cache *));
 
+	jffs2_init_xattr_subsystem(c);
+
 	if ((ret = jffs2_do_mount_fs(c)))
 		goto out_inohash;
 
@@ -531,6 +539,7 @@ int jffs2_do_fill_super(struct super_block *sb, void *data, int silent)
 	else
 		kfree(c->blocks);
  out_inohash:
+	jffs2_clear_xattr_subsystem(c);
 	kfree(c->inocache_list);
  out_wbuf:
 	jffs2_flash_cleanup(c);
diff --git a/fs/jffs2/gc.c b/fs/jffs2/gc.c
index 967fb2cf8e21..4ea1b7f0ae78 100644
--- a/fs/jffs2/gc.c
+++ b/fs/jffs2/gc.c
@@ -125,6 +125,7 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c)
 	struct jffs2_eraseblock *jeb;
 	struct jffs2_raw_node_ref *raw;
 	int ret = 0, inum, nlink;
+	int xattr = 0;
 
 	if (down_interruptible(&c->alloc_sem))
 		return -EINTR;
@@ -138,7 +139,7 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c)
 		   the node CRCs etc. Do it now. */
 
 		/* checked_ino is protected by the alloc_sem */
-		if (c->checked_ino > c->highest_ino) {
+		if (c->checked_ino > c->highest_ino && xattr) {
 			printk(KERN_CRIT "Checked all inodes but still 0x%x bytes of unchecked space?\n",
 			       c->unchecked_size);
 			jffs2_dbg_dump_block_lists_nolock(c);
@@ -148,6 +149,9 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c)
 
 		spin_unlock(&c->erase_completion_lock);
 
+		if (!xattr)
+			xattr = jffs2_verify_xattr(c);
+
 		spin_lock(&c->inocache_lock);
 
 		ic = jffs2_get_ino_cache(c, c->checked_ino++);
@@ -262,6 +266,16 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c)
 
 	ic = jffs2_raw_ref_to_ic(raw);
 
+	/* When 'ic' refers xattr_datum/xattr_ref, this node is GCed as xattr.
+	   We can decide whether this node is inode or xattr by ic->class.
+	   ret = 0 : ic is xattr_datum/xattr_ref, and GC was SUCCESSED.
+	   ret < 0 : ic is xattr_datum/xattr_ref, but GC was FAILED.
+	   ret > 0 : ic is NOT xattr_datum/xattr_ref.
+	*/
+	ret = jffs2_garbage_collect_xattr(c, ic);
+	if (ret <= 0)
+		goto release_sem;
+
 	/* We need to hold the inocache. Either the erase_completion_lock or
 	   the inocache_lock are sufficient; we trade down since the inocache_lock
 	   causes less contention. */
diff --git a/fs/jffs2/jffs2_fs_i.h b/fs/jffs2/jffs2_fs_i.h
index ad565bf9dcc1..2e0cc8e00b85 100644
--- a/fs/jffs2/jffs2_fs_i.h
+++ b/fs/jffs2/jffs2_fs_i.h
@@ -5,6 +5,7 @@
 
 #include <linux/version.h>
 #include <linux/rbtree.h>
+#include <linux/posix_acl.h>
 #include <asm/semaphore.h>
 
 struct jffs2_inode_info {
@@ -45,6 +46,10 @@ struct jffs2_inode_info {
 	struct inode vfs_inode;
 #endif
 #endif
+#ifdef CONFIG_JFFS2_FS_POSIX_ACL
+	struct posix_acl *i_acl_access;
+	struct posix_acl *i_acl_default;
+#endif
 };
 
 #endif /* _JFFS2_FS_I */
diff --git a/fs/jffs2/jffs2_fs_sb.h b/fs/jffs2/jffs2_fs_sb.h
index 4bcfb5570221..3b4e0edd6dbb 100644
--- a/fs/jffs2/jffs2_fs_sb.h
+++ b/fs/jffs2/jffs2_fs_sb.h
@@ -115,6 +115,16 @@ struct jffs2_sb_info {
 
 	struct jffs2_summary *summary;		/* Summary information */
 
+#ifdef CONFIG_JFFS2_FS_XATTR
+#define XATTRINDEX_HASHSIZE	(57)
+	uint32_t highest_xid;
+	struct list_head xattrindex[XATTRINDEX_HASHSIZE];
+	struct list_head xattr_temp;
+	struct list_head xattr_unchecked;
+	struct rw_semaphore xattr_sem;
+	uint32_t xdatum_mem_usage;
+	uint32_t xdatum_mem_threshold;
+#endif
 	/* OS-private pointer for getting back to master superblock info */
 	void *os_priv;
 };
diff --git a/fs/jffs2/malloc.c b/fs/jffs2/malloc.c
index 036cbd11c004..3d5b7ecfbf8d 100644
--- a/fs/jffs2/malloc.c
+++ b/fs/jffs2/malloc.c
@@ -26,6 +26,10 @@ static kmem_cache_t *tmp_dnode_info_slab;
 static kmem_cache_t *raw_node_ref_slab;
 static kmem_cache_t *node_frag_slab;
 static kmem_cache_t *inode_cache_slab;
+#ifdef CONFIG_JFFS2_FS_XATTR
+static kmem_cache_t *xattr_datum_cache;
+static kmem_cache_t *xattr_ref_cache;
+#endif
 
 int __init jffs2_create_slab_caches(void)
 {
@@ -68,8 +72,24 @@ int __init jffs2_create_slab_caches(void)
 	inode_cache_slab = kmem_cache_create("jffs2_inode_cache",
 					     sizeof(struct jffs2_inode_cache),
 					     0, 0, NULL, NULL);
-	if (inode_cache_slab)
-		return 0;
+	if (!inode_cache_slab)
+		goto err;
+
+#ifdef CONFIG_JFFS2_FS_XATTR
+	xattr_datum_cache = kmem_cache_create("jffs2_xattr_datum",
+					     sizeof(struct jffs2_xattr_datum),
+					     0, 0, NULL, NULL);
+	if (!xattr_datum_cache)
+		goto err;
+
+	xattr_ref_cache = kmem_cache_create("jffs2_xattr_ref",
+					   sizeof(struct jffs2_xattr_ref),
+					   0, 0, NULL, NULL);
+	if (!xattr_ref_cache)
+		goto err;
+#endif
+
+	return 0;
  err:
 	jffs2_destroy_slab_caches();
 	return -ENOMEM;
@@ -91,6 +111,12 @@ void jffs2_destroy_slab_caches(void)
 		kmem_cache_destroy(node_frag_slab);
 	if(inode_cache_slab)
 		kmem_cache_destroy(inode_cache_slab);
+#ifdef CONFIG_JFFS2_FS_XATTR
+	if (xattr_datum_cache)
+		kmem_cache_destroy(xattr_datum_cache);
+	if (xattr_ref_cache)
+		kmem_cache_destroy(xattr_ref_cache);
+#endif
 }
 
 struct jffs2_full_dirent *jffs2_alloc_full_dirent(int namesize)
@@ -205,3 +231,41 @@ void jffs2_free_inode_cache(struct jffs2_inode_cache *x)
 	dbg_memalloc("%p\n", x);
 	kmem_cache_free(inode_cache_slab, x);
 }
+
+#ifdef CONFIG_JFFS2_FS_XATTR
+struct jffs2_xattr_datum *jffs2_alloc_xattr_datum(void)
+{
+	struct jffs2_xattr_datum *xd;
+	xd = kmem_cache_alloc(xattr_datum_cache, GFP_KERNEL);
+	dbg_memalloc("%p\n", xd);
+
+	memset(xd, 0, sizeof(struct jffs2_xattr_datum));
+	xd->class = RAWNODE_CLASS_XATTR_DATUM;
+	INIT_LIST_HEAD(&xd->xindex);
+	return xd;
+}
+
+void jffs2_free_xattr_datum(struct jffs2_xattr_datum *xd)
+{
+	dbg_memalloc("%p\n", xd);
+	kmem_cache_free(xattr_datum_cache, xd);
+}
+
+struct jffs2_xattr_ref *jffs2_alloc_xattr_ref(void)
+{
+	struct jffs2_xattr_ref *ref;
+	ref = kmem_cache_alloc(xattr_ref_cache, GFP_KERNEL);
+	dbg_memalloc("%p\n", ref);
+
+	memset(ref, 0, sizeof(struct jffs2_xattr_ref));
+	ref->class = RAWNODE_CLASS_XATTR_REF;
+	INIT_LIST_HEAD(&ref->ilist);
+	return ref;
+}
+
+void jffs2_free_xattr_ref(struct jffs2_xattr_ref *ref)
+{
+	dbg_memalloc("%p\n", ref);
+	kmem_cache_free(xattr_ref_cache, ref);
+}
+#endif
diff --git a/fs/jffs2/nodelist.c b/fs/jffs2/nodelist.c
index d4d0c41490cd..9c575733659b 100644
--- a/fs/jffs2/nodelist.c
+++ b/fs/jffs2/nodelist.c
@@ -938,6 +938,7 @@ void jffs2_free_ino_caches(struct jffs2_sb_info *c)
 		this = c->inocache_list[i];
 		while (this) {
 			next = this->next;
+			jffs2_xattr_free_inode(c, this);
 			jffs2_free_inode_cache(this);
 			this = next;
 		}
diff --git a/fs/jffs2/nodelist.h b/fs/jffs2/nodelist.h
index f6645afe88e4..6f6279cf4909 100644
--- a/fs/jffs2/nodelist.h
+++ b/fs/jffs2/nodelist.h
@@ -20,6 +20,8 @@
 #include <linux/jffs2.h>
 #include "jffs2_fs_sb.h"
 #include "jffs2_fs_i.h"
+#include "xattr.h"
+#include "acl.h"
 #include "summary.h"
 
 #ifdef __ECOS
@@ -107,11 +109,16 @@ struct jffs2_inode_cache {
 		temporary lists of dirents, and later must be set to
 		NULL to mark the end of the raw_node_ref->next_in_ino
 		chain. */
+	u8 class;	/* It's used for identification */
+	u8 flags;
+	uint16_t state;
 	struct jffs2_inode_cache *next;
 	struct jffs2_raw_node_ref *nodes;
 	uint32_t ino;
 	int nlink;
-	int state;
+#ifdef CONFIG_JFFS2_FS_XATTR
+	struct list_head ilist;
+#endif
 };
 
 /* Inode states for 'state' above. We need the 'GC' state to prevent
@@ -125,6 +132,12 @@ struct jffs2_inode_cache {
 #define INO_STATE_READING	5	/* In read_inode() */
 #define INO_STATE_CLEARING	6	/* In clear_inode() */
 
+#define INO_FLAGS_XATTR_CHECKED	0x01	/* has no duplicate xattr_ref */
+
+#define RAWNODE_CLASS_INODE_CACHE	0
+#define RAWNODE_CLASS_XATTR_DATUM	1
+#define RAWNODE_CLASS_XATTR_REF		2
+
 #define INOCACHE_HASHSIZE 128
 
 /*
@@ -374,6 +387,12 @@ struct jffs2_node_frag *jffs2_alloc_node_frag(void);
 void jffs2_free_node_frag(struct jffs2_node_frag *);
 struct jffs2_inode_cache *jffs2_alloc_inode_cache(void);
 void jffs2_free_inode_cache(struct jffs2_inode_cache *);
+#ifdef CONFIG_JFFS2_FS_XATTR
+struct jffs2_xattr_datum *jffs2_alloc_xattr_datum(void);
+void jffs2_free_xattr_datum(struct jffs2_xattr_datum *);
+struct jffs2_xattr_ref *jffs2_alloc_xattr_ref(void);
+void jffs2_free_xattr_ref(struct jffs2_xattr_ref *);
+#endif
 
 /* gc.c */
 int jffs2_garbage_collect_pass(struct jffs2_sb_info *c);
diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h
index d307cf548625..9936ae23f8dc 100644
--- a/fs/jffs2/os-linux.h
+++ b/fs/jffs2/os-linux.h
@@ -60,6 +60,10 @@ static inline void jffs2_init_inode_info(struct jffs2_inode_info *f)
 	f->target = NULL;
 	f->flags = 0;
 	f->usercompr = 0;
+#ifdef CONFIG_JFFS2_FS_POSIX_ACL
+	f->i_acl_access = JFFS2_ACL_NOT_CACHED;
+	f->i_acl_default = JFFS2_ACL_NOT_CACHED;
+#endif
 }
 
 
diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c
index e1acce8fb2bf..61ccdf4f1042 100644
--- a/fs/jffs2/readinode.c
+++ b/fs/jffs2/readinode.c
@@ -902,6 +902,7 @@ int jffs2_do_read_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
 		f->inocache->ino = f->inocache->nlink = 1;
 		f->inocache->nodes = (struct jffs2_raw_node_ref *)f->inocache;
 		f->inocache->state = INO_STATE_READING;
+		init_xattr_inode_cache(f->inocache);
 		jffs2_add_ino_cache(c, f->inocache);
 	}
 	if (!f->inocache) {
diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c
index cf55b221fc2b..f09689e320fe 100644
--- a/fs/jffs2/scan.c
+++ b/fs/jffs2/scan.c
@@ -306,6 +306,136 @@ int jffs2_scan_classify_jeb(struct jffs2_sb_info *c, struct jffs2_eraseblock *je
 		return BLK_STATE_ALLDIRTY;
 }
 
+#ifdef CONFIG_JFFS2_FS_XATTR
+static int jffs2_scan_xattr_node(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb,
+				 struct jffs2_raw_xattr *rx, uint32_t ofs,
+				 struct jffs2_summary *s)
+{
+	struct jffs2_xattr_datum *xd;
+	struct jffs2_raw_node_ref *raw;
+	uint32_t totlen, crc;
+
+	crc = crc32(0, rx, sizeof(struct jffs2_raw_xattr) - 4);
+	if (crc != je32_to_cpu(rx->node_crc)) {
+		if (je32_to_cpu(rx->node_crc) != 0xffffffff)
+			JFFS2_WARNING("node CRC failed at %#08x, read=%#08x, calc=%#08x\n",
+				      ofs, je32_to_cpu(rx->node_crc), crc);
+		DIRTY_SPACE(je32_to_cpu(rx->totlen));
+		return 0;
+	}
+
+	totlen = PAD(sizeof(*rx) + rx->name_len + 1 + je16_to_cpu(rx->value_len));
+	if (totlen != je32_to_cpu(rx->totlen)) {
+		JFFS2_WARNING("node length mismatch at %#08x, read=%u, calc=%u\n",
+			      ofs, je32_to_cpu(rx->totlen), totlen);
+		DIRTY_SPACE(je32_to_cpu(rx->totlen));
+		return 0;
+	}
+
+	raw =  jffs2_alloc_raw_node_ref();
+	if (!raw)
+		return -ENOMEM;
+
+	xd = jffs2_setup_xattr_datum(c, je32_to_cpu(rx->xid), je32_to_cpu(rx->version));
+	if (IS_ERR(xd)) {
+		jffs2_free_raw_node_ref(raw);
+		if (PTR_ERR(xd) == -EEXIST) {
+			DIRTY_SPACE(PAD(je32_to_cpu(rx->totlen)));
+			return 0;
+		}
+		return PTR_ERR(xd);
+	}
+	xd->xprefix = rx->xprefix;
+	xd->name_len = rx->name_len;
+	xd->value_len = je16_to_cpu(rx->value_len);
+	xd->data_crc = je32_to_cpu(rx->data_crc);
+	xd->node = raw;
+
+	raw->__totlen = totlen;
+	raw->flash_offset = ofs | REF_PRISTINE;
+	raw->next_phys = NULL;
+	raw->next_in_ino = (void *)xd;
+	if (!jeb->first_node)
+		jeb->first_node = raw;
+	if (jeb->last_node)
+		jeb->last_node->next_phys = raw;
+	jeb->last_node = raw;
+
+	USED_SPACE(PAD(je32_to_cpu(rx->totlen)));
+	if (jffs2_sum_active())
+		jffs2_sum_add_xattr_mem(s, rx, ofs - jeb->offset);
+	dbg_xattr("scaning xdatum at %#08x (xid=%u, version=%u)\n",
+		  ofs, xd->xid, xd->version);
+	return 0;
+}
+
+static int jffs2_scan_xref_node(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb,
+				struct jffs2_raw_xref *rr, uint32_t ofs,
+				struct jffs2_summary *s)
+{
+	struct jffs2_xattr_ref *ref;
+	struct jffs2_raw_node_ref *raw;
+	uint32_t crc;
+
+	crc = crc32(0, rr, sizeof(*rr) - 4);
+	if (crc != je32_to_cpu(rr->node_crc)) {
+		if (je32_to_cpu(rr->node_crc) != 0xffffffff)
+			JFFS2_WARNING("node CRC failed at %#08x, read=%#08x, calc=%#08x\n",
+				      ofs, je32_to_cpu(rr->node_crc), crc);
+		DIRTY_SPACE(PAD(je32_to_cpu(rr->totlen)));
+		return 0;
+	}
+
+	if (PAD(sizeof(struct jffs2_raw_xref)) != je32_to_cpu(rr->totlen)) {
+		JFFS2_WARNING("node length mismatch at %#08x, read=%u, calc=%u\n",
+			      ofs, je32_to_cpu(rr->totlen),
+			      PAD(sizeof(struct jffs2_raw_xref)));
+		DIRTY_SPACE(je32_to_cpu(rr->totlen));
+		return 0;
+	}
+
+	ref = jffs2_alloc_xattr_ref();
+	if (!ref)
+		return -ENOMEM;
+
+	raw =  jffs2_alloc_raw_node_ref();
+	if (!raw) {
+		jffs2_free_xattr_ref(ref);
+		return -ENOMEM;
+	}
+
+	/* BEFORE jffs2_build_xattr_subsystem() called, 
+	 * ref->xid is used to store 32bit xid, xd is not used
+	 * ref->ino is used to store 32bit inode-number, ic is not used
+	 * Thoes variables are declared as union, thus using those
+	 * are exclusive. In a similar way, ref->ilist is temporarily
+	 * used to chain all xattr_ref object. It's re-chained to
+	 * jffs2_inode_cache in jffs2_build_xattr_subsystem() correctly.
+	 */
+	ref->node = raw;
+	ref->ino = je32_to_cpu(rr->ino);
+	ref->xid = je32_to_cpu(rr->xid);
+	list_add_tail(&ref->ilist, &c->xattr_temp);
+
+	raw->__totlen = PAD(je32_to_cpu(rr->totlen));
+	raw->flash_offset = ofs | REF_PRISTINE;
+	raw->next_phys = NULL;
+	raw->next_in_ino = (void *)ref;
+	if (!jeb->first_node)
+		jeb->first_node = raw;
+	if (jeb->last_node)
+		jeb->last_node->next_phys = raw;
+	jeb->last_node = raw;
+
+	USED_SPACE(PAD(je32_to_cpu(rr->totlen)));	
+	if (jffs2_sum_active())
+		jffs2_sum_add_xref_mem(s, rr, ofs - jeb->offset);
+	dbg_xattr("scan xref at %#08x (xid=%u, ino=%u)\n",
+		  ofs, ref->xid, ref->ino);
+	return 0;
+}
+#endif
+
 static int jffs2_scan_eraseblock (struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb,
 				unsigned char *buf, uint32_t buf_size, struct jffs2_summary *s) {
 	struct jffs2_unknown_node *node;
@@ -614,6 +744,43 @@ scan_more:
 			ofs += PAD(je32_to_cpu(node->totlen));
 			break;
 
+#ifdef CONFIG_JFFS2_FS_XATTR
+		case JFFS2_NODETYPE_XATTR:
+			if (buf_ofs + buf_len < ofs + je32_to_cpu(node->totlen)) {
+				buf_len = min_t(uint32_t, buf_size, jeb->offset + c->sector_size - ofs);
+				D1(printk(KERN_DEBUG "Fewer than %d bytes (xattr node)"
+					  " left to end of buf. Reading 0x%x at 0x%08x\n",
+					  je32_to_cpu(node->totlen), buf_len, ofs));
+				err = jffs2_fill_scan_buf(c, buf, ofs, buf_len);
+				if (err)
+					return err;
+				buf_ofs = ofs;
+				node = (void *)buf;
+			}
+			err = jffs2_scan_xattr_node(c, jeb, (void *)node, ofs, s);
+			if (err)
+				return err;
+			ofs += PAD(je32_to_cpu(node->totlen));
+			break;
+		case JFFS2_NODETYPE_XREF:
+			if (buf_ofs + buf_len < ofs + je32_to_cpu(node->totlen)) {
+				buf_len = min_t(uint32_t, buf_size, jeb->offset + c->sector_size - ofs);
+				D1(printk(KERN_DEBUG "Fewer than %d bytes (xref node)"
+					  " left to end of buf. Reading 0x%x at 0x%08x\n",
+					  je32_to_cpu(node->totlen), buf_len, ofs));
+				err = jffs2_fill_scan_buf(c, buf, ofs, buf_len);
+				if (err)
+					return err;
+				buf_ofs = ofs;
+				node = (void *)buf;
+			}
+			err = jffs2_scan_xref_node(c, jeb, (void *)node, ofs, s);
+			if (err)
+				return err;
+			ofs += PAD(je32_to_cpu(node->totlen));
+			break;
+#endif	/* CONFIG_JFFS2_FS_XATTR */
+
 		case JFFS2_NODETYPE_CLEANMARKER:
 			D1(printk(KERN_DEBUG "CLEANMARKER node found at 0x%08x\n", ofs));
 			if (je32_to_cpu(node->totlen) != c->cleanmarker_size) {
@@ -721,6 +888,7 @@ struct jffs2_inode_cache *jffs2_scan_make_ino_cache(struct jffs2_sb_info *c, uin
 
 	ic->ino = ino;
 	ic->nodes = (void *)ic;
+	init_xattr_inode_cache(ic);
 	jffs2_add_ino_cache(c, ic);
 	if (ino == 1)
 		ic->nlink = 1;
diff --git a/fs/jffs2/security.c b/fs/jffs2/security.c
new file mode 100644
index 000000000000..4b6c3b22524f
--- /dev/null
+++ b/fs/jffs2/security.c
@@ -0,0 +1,82 @@
+/*-------------------------------------------------------------------------*
+ *  File: fs/jffs2/security.c
+ *  Security Labels support on JFFS2 FileSystem
+ *
+ *  Implemented by KaiGai Kohei <kaigai@ak.jp.nec.com>
+ *  Copyright (C) 2006 NEC Corporation
+ *
+ *  For licensing information, see the file 'LICENCE' in the jffs2 directory.
+ *-------------------------------------------------------------------------*/
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/time.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+#include <linux/crc32.h>
+#include <linux/jffs2.h>
+#include <linux/xattr.h>
+#include <linux/mtd/mtd.h>
+#include <linux/security.h>
+#include "nodelist.h"
+
+/* ---- Initial Security Label Attachment -------------- */
+int jffs2_init_security(struct inode *inode, struct inode *dir)
+{
+	int rc;
+	size_t len;
+	void *value;
+	char *name;
+
+	rc = security_inode_init_security(inode, dir, &name, &value, &len);
+	if (rc) {
+		if (rc == -EOPNOTSUPP)
+			return 0;
+		return rc;
+	}
+	rc = do_jffs2_setxattr(inode, JFFS2_XPREFIX_SECURITY, name, value, len, 0);
+
+        kfree(name);
+        kfree(value);
+        return rc;
+}
+
+/* ---- XATTR Handler for "security.*" ----------------- */
+static int jffs2_security_getxattr(struct inode *inode, const char *name,
+				   void *buffer, size_t size)
+{
+	if (!strcmp(name, ""))
+		return -EINVAL;
+
+	return do_jffs2_getxattr(inode, JFFS2_XPREFIX_SECURITY, name, buffer, size);
+}
+
+static int jffs2_security_setxattr(struct inode *inode, const char *name, const void *buffer,
+				   size_t size, int flags)
+{
+	if (!strcmp(name, ""))
+		return -EINVAL;
+
+	return do_jffs2_setxattr(inode, JFFS2_XPREFIX_SECURITY, name, buffer, size, flags);
+}
+
+static size_t jffs2_security_listxattr(struct inode *inode, char *list, size_t list_size,
+				       const char *name, size_t name_len)
+{
+	size_t retlen = XATTR_SECURITY_PREFIX_LEN + name_len + 1;
+
+	if (list && retlen <= list_size) {
+		strcpy(list, XATTR_SECURITY_PREFIX);
+		strcpy(list + XATTR_SECURITY_PREFIX_LEN, name);
+	}
+
+	return retlen;
+}
+
+struct xattr_handler jffs2_security_xattr_handler = {
+	.prefix = XATTR_SECURITY_PREFIX,
+	.list = jffs2_security_listxattr,
+	.set = jffs2_security_setxattr,
+	.get = jffs2_security_getxattr
+};
diff --git a/fs/jffs2/summary.c b/fs/jffs2/summary.c
index 7b0ed77a4c35..5d9ec8e36528 100644
--- a/fs/jffs2/summary.c
+++ b/fs/jffs2/summary.c
@@ -5,6 +5,7 @@
  *                     Zoltan Sogor <weth@inf.u-szeged.hu>,
  *                     Patrik Kluba <pajko@halom.u-szeged.hu>,
  *                     University of Szeged, Hungary
+ *               2005  KaiGai Kohei <kaigai@ak.jp.nec.com>
  *
  * For licensing information, see the file 'LICENCE' in this directory.
  *
@@ -81,6 +82,19 @@ static int jffs2_sum_add_mem(struct jffs2_summary *s, union jffs2_sum_mem *item)
 			dbg_summary("dirent (%u) added to summary\n",
 						je32_to_cpu(item->d.ino));
 			break;
+#ifdef CONFIG_JFFS2_FS_XATTR
+		case JFFS2_NODETYPE_XATTR:
+			s->sum_size += JFFS2_SUMMARY_XATTR_SIZE;
+			s->sum_num++;
+			dbg_summary("xattr (xid=%u, version=%u) added to summary\n",
+				    je32_to_cpu(item->x.xid), je32_to_cpu(item->x.version));
+			break;
+		case JFFS2_NODETYPE_XREF:
+			s->sum_size += JFFS2_SUMMARY_XREF_SIZE;
+			s->sum_num++;
+			dbg_summary("xref added to summary\n");
+			break;
+#endif
 		default:
 			JFFS2_WARNING("UNKNOWN node type %u\n",
 					    je16_to_cpu(item->u.nodetype));
@@ -141,6 +155,40 @@ int jffs2_sum_add_dirent_mem(struct jffs2_summary *s, struct jffs2_raw_dirent *r
 	return jffs2_sum_add_mem(s, (union jffs2_sum_mem *)temp);
 }
 
+#ifdef CONFIG_JFFS2_FS_XATTR
+int jffs2_sum_add_xattr_mem(struct jffs2_summary *s, struct jffs2_raw_xattr *rx, uint32_t ofs)
+{
+	struct jffs2_sum_xattr_mem *temp;
+
+	temp = kmalloc(sizeof(struct jffs2_sum_xattr_mem), GFP_KERNEL);
+	if (!temp)
+		return -ENOMEM;
+
+	temp->nodetype = rx->nodetype;
+	temp->xid = rx->xid;
+	temp->version = rx->version;
+	temp->offset = cpu_to_je32(ofs);
+	temp->totlen = rx->totlen;
+	temp->next = NULL;
+
+	return jffs2_sum_add_mem(s, (union jffs2_sum_mem *)temp);
+}
+
+int jffs2_sum_add_xref_mem(struct jffs2_summary *s, struct jffs2_raw_xref *rr, uint32_t ofs)
+{
+	struct jffs2_sum_xref_mem *temp;
+
+	temp = kmalloc(sizeof(struct jffs2_sum_xref_mem), GFP_KERNEL);
+	if (!temp)
+		return -ENOMEM;
+
+	temp->nodetype = rr->nodetype;
+	temp->offset = cpu_to_je32(ofs);
+	temp->next = NULL;
+
+	return jffs2_sum_add_mem(s, (union jffs2_sum_mem *)temp);
+}
+#endif
 /* Cleanup every collected summary information */
 
 static void jffs2_sum_clean_collected(struct jffs2_summary *s)
@@ -259,7 +307,40 @@ int jffs2_sum_add_kvec(struct jffs2_sb_info *c, const struct kvec *invecs,
 
 			return jffs2_sum_add_mem(c->summary, (union jffs2_sum_mem *)temp);
 		}
+#ifdef CONFIG_JFFS2_FS_XATTR
+		case JFFS2_NODETYPE_XATTR: {
+			struct jffs2_sum_xattr_mem *temp;
+			if (je32_to_cpu(node->x.version) == 0xffffffff)
+				return 0;
+			temp = kmalloc(sizeof(struct jffs2_sum_xattr_mem), GFP_KERNEL);
+			if (!temp)
+				goto no_mem;
 
+			temp->nodetype = node->x.nodetype;
+			temp->xid = node->x.xid;
+			temp->version = node->x.version;
+			temp->totlen = node->x.totlen;
+			temp->offset = cpu_to_je32(ofs);
+			temp->next = NULL;
+
+			return jffs2_sum_add_mem(c->summary, (union jffs2_sum_mem *)temp);
+		}
+		case JFFS2_NODETYPE_XREF: {
+			struct jffs2_sum_xref_mem *temp;
+
+			if (je32_to_cpu(node->r.ino) == 0xffffffff
+			    && je32_to_cpu(node->r.xid) == 0xffffffff)
+				return 0;
+			temp = kmalloc(sizeof(struct jffs2_sum_xref_mem), GFP_KERNEL);
+			if (!temp)
+				goto no_mem;
+			temp->nodetype = node->r.nodetype;
+			temp->offset = cpu_to_je32(ofs);
+			temp->next = NULL;
+
+			return jffs2_sum_add_mem(c->summary, (union jffs2_sum_mem *)temp);
+		}
+#endif
 		case JFFS2_NODETYPE_PADDING:
 			dbg_summary("node PADDING\n");
 			c->summary->sum_padded += je32_to_cpu(node->u.totlen);
@@ -408,8 +489,94 @@ static int jffs2_sum_process_sum_data(struct jffs2_sb_info *c, struct jffs2_eras
 
 				break;
 			}
+#ifdef CONFIG_JFFS2_FS_XATTR
+			case JFFS2_NODETYPE_XATTR: {
+				struct jffs2_xattr_datum *xd;
+				struct jffs2_sum_xattr_flash *spx;
+				uint32_t ofs;
+
+				spx = (struct jffs2_sum_xattr_flash *)sp;
+				ofs = jeb->offset + je32_to_cpu(spx->offset);
+				dbg_summary("xattr at %#08x (xid=%u, version=%u)\n", ofs,
+					    je32_to_cpu(spx->xid), je32_to_cpu(spx->version));
+				raw = jffs2_alloc_raw_node_ref();
+				if (!raw) {
+					JFFS2_NOTICE("allocation of node reference failed\n");
+					kfree(summary);
+					return -ENOMEM;
+				}
+				xd = jffs2_setup_xattr_datum(c, je32_to_cpu(spx->xid),
+								je32_to_cpu(spx->version));
+				if (IS_ERR(xd)) {
+					JFFS2_NOTICE("allocation of xattr_datum failed\n");
+					jffs2_free_raw_node_ref(raw);
+					kfree(summary);
+					return PTR_ERR(xd);
+				}
+				xd->node = raw;
 
+				raw->flash_offset = ofs | REF_UNCHECKED;
+				raw->__totlen = PAD(je32_to_cpu(spx->totlen));
+				raw->next_phys = NULL;
+				raw->next_in_ino = (void *)xd;
+				if (!jeb->first_node)
+					jeb->first_node = raw;
+				if (jeb->last_node)
+					jeb->last_node->next_phys = raw;
+				jeb->last_node = raw;
+
+				*pseudo_random += je32_to_cpu(spx->xid);
+				UNCHECKED_SPACE(je32_to_cpu(spx->totlen));
+				sp += JFFS2_SUMMARY_XATTR_SIZE;
+
+				break;
+			}
+			case JFFS2_NODETYPE_XREF: {
+				struct jffs2_xattr_ref *ref;
+				struct jffs2_sum_xref_flash *spr;
+				uint32_t ofs;
+
+				spr = (struct jffs2_sum_xref_flash *)sp;
+				ofs = jeb->offset + je32_to_cpu(spr->offset);
+				dbg_summary("xref at %#08x (xid=%u, ino=%u)\n", ofs,
+					    je32_to_cpu(spr->xid), je32_to_cpu(spr->ino));
+				raw = jffs2_alloc_raw_node_ref();
+				if (!raw) {
+					JFFS2_NOTICE("allocation of node reference failed\n");
+					kfree(summary);
+					return -ENOMEM;
+				}
+				ref = jffs2_alloc_xattr_ref();
+				if (!ref) {
+					JFFS2_NOTICE("allocation of xattr_datum failed\n");
+					jffs2_free_raw_node_ref(raw);
+					kfree(summary);
+					return -ENOMEM;
+				}
+				ref->ino = 0xfffffffe;
+				ref->xid = 0xfffffffd;
+				ref->node = raw;
+				list_add_tail(&ref->ilist, &c->xattr_temp);
+
+				raw->__totlen = PAD(sizeof(struct jffs2_raw_xref));
+				raw->flash_offset = ofs | REF_UNCHECKED;
+				raw->next_phys = NULL;
+				raw->next_in_ino = (void *)ref;
+				if (!jeb->first_node)
+					jeb->first_node = raw;
+				if (jeb->last_node)
+					jeb->last_node->next_phys = raw;
+				jeb->last_node = raw;
+
+				UNCHECKED_SPACE(PAD(sizeof(struct jffs2_raw_xref)));
+				*pseudo_random += ofs;
+				sp += JFFS2_SUMMARY_XREF_SIZE;
+
+				break;
+			}
+#endif
 			default : {
+printk("nodetype = %#04x\n",je16_to_cpu(((struct jffs2_sum_unknown_flash *)sp)->nodetype));
 				JFFS2_WARNING("Unsupported node type found in summary! Exiting...");
 				kfree(summary);
 				return -EIO;
@@ -617,7 +784,31 @@ static int jffs2_sum_write_data(struct jffs2_sb_info *c, struct jffs2_eraseblock
 
 				break;
 			}
+#ifdef CONFIG_JFFS2_FS_XATTR
+			case JFFS2_NODETYPE_XATTR: {
+				struct jffs2_sum_xattr_flash *sxattr_ptr = wpage;
+
+				temp = c->summary->sum_list_head;
+				sxattr_ptr->nodetype = temp->x.nodetype;
+				sxattr_ptr->xid = temp->x.xid;
+				sxattr_ptr->version = temp->x.version;
+				sxattr_ptr->offset = temp->x.offset;
+				sxattr_ptr->totlen = temp->x.totlen;
+
+				wpage += JFFS2_SUMMARY_XATTR_SIZE;
+				break;
+			}
+			case JFFS2_NODETYPE_XREF: {
+				struct jffs2_sum_xref_flash *sxref_ptr = wpage;
+
+				temp = c->summary->sum_list_head;
+				sxref_ptr->nodetype = temp->r.nodetype;
+				sxref_ptr->offset = temp->r.offset;
 
+				wpage += JFFS2_SUMMARY_XREF_SIZE;
+				break;
+			}
+#endif
 			default : {
 				BUG();	/* unknown node in summary information */
 			}
diff --git a/fs/jffs2/summary.h b/fs/jffs2/summary.h
index b7a678be1709..a3b66c18aae9 100644
--- a/fs/jffs2/summary.h
+++ b/fs/jffs2/summary.h
@@ -45,6 +45,8 @@
 #define JFFS2_SUMMARY_NOSUM_SIZE 0xffffffff
 #define JFFS2_SUMMARY_INODE_SIZE (sizeof(struct jffs2_sum_inode_flash))
 #define JFFS2_SUMMARY_DIRENT_SIZE(x) (sizeof(struct jffs2_sum_dirent_flash) + (x))
+#define JFFS2_SUMMARY_XATTR_SIZE (sizeof(struct jffs2_sum_xattr_flash))
+#define JFFS2_SUMMARY_XREF_SIZE (sizeof(struct jffs2_sum_xref_flash))
 
 /* Summary structures used on flash */
 
@@ -75,11 +77,28 @@ struct jffs2_sum_dirent_flash
 	uint8_t name[0];	/* dirent name */
 } __attribute__((packed));
 
+struct jffs2_sum_xattr_flash
+{
+	jint16_t nodetype;	/* == JFFS2_NODETYPE_XATR */
+	jint32_t xid;		/* xattr identifier */
+	jint32_t version;	/* version number */
+	jint32_t offset;	/* offset on jeb */
+	jint32_t totlen;	/* node length */
+} __attribute__((packed));
+
+struct jffs2_sum_xref_flash
+{
+	jint16_t nodetype;	/* == JFFS2_NODETYPE_XREF */
+	jint32_t offset;	/* offset on jeb */
+} __attribute__((packed));
+
 union jffs2_sum_flash
 {
 	struct jffs2_sum_unknown_flash u;
 	struct jffs2_sum_inode_flash i;
 	struct jffs2_sum_dirent_flash d;
+	struct jffs2_sum_xattr_flash x;
+	struct jffs2_sum_xref_flash r;
 };
 
 /* Summary structures used in the memory */
@@ -114,11 +133,30 @@ struct jffs2_sum_dirent_mem
 	uint8_t name[0];	/* dirent name */
 } __attribute__((packed));
 
+struct jffs2_sum_xattr_mem
+{
+	union jffs2_sum_mem *next;
+	jint16_t nodetype;
+	jint32_t xid;
+	jint32_t version;
+	jint32_t offset;
+	jint32_t totlen;
+} __attribute__((packed));
+
+struct jffs2_sum_xref_mem
+{
+	union jffs2_sum_mem *next;
+	jint16_t nodetype;
+	jint32_t offset;
+} __attribute__((packed));
+
 union jffs2_sum_mem
 {
 	struct jffs2_sum_unknown_mem u;
 	struct jffs2_sum_inode_mem i;
 	struct jffs2_sum_dirent_mem d;
+	struct jffs2_sum_xattr_mem x;
+	struct jffs2_sum_xref_mem r;
 };
 
 /* Summary related information stored in superblock */
@@ -159,6 +197,8 @@ int jffs2_sum_write_sumnode(struct jffs2_sb_info *c);
 int jffs2_sum_add_padding_mem(struct jffs2_summary *s, uint32_t size);
 int jffs2_sum_add_inode_mem(struct jffs2_summary *s, struct jffs2_raw_inode *ri, uint32_t ofs);
 int jffs2_sum_add_dirent_mem(struct jffs2_summary *s, struct jffs2_raw_dirent *rd, uint32_t ofs);
+int jffs2_sum_add_xattr_mem(struct jffs2_summary *s, struct jffs2_raw_xattr *rx, uint32_t ofs);
+int jffs2_sum_add_xref_mem(struct jffs2_summary *s, struct jffs2_raw_xref *rr, uint32_t ofs);
 int jffs2_sum_scan_sumnode(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb,
 			uint32_t ofs, uint32_t *pseudo_random);
 
@@ -176,6 +216,8 @@ int jffs2_sum_scan_sumnode(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb
 #define jffs2_sum_add_padding_mem(a,b)
 #define jffs2_sum_add_inode_mem(a,b,c)
 #define jffs2_sum_add_dirent_mem(a,b,c)
+#define jffs2_sum_add_xattr_mem(a,b,c)
+#define jffs2_sum_add_xref_mem(a,b,c)
 #define jffs2_sum_scan_sumnode(a,b,c,d) (0)
 
 #endif /* CONFIG_JFFS2_SUMMARY */
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index ffd8e84b22cc..c8b539ee7d80 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -151,7 +151,10 @@ static struct super_block *jffs2_get_sb_mtd(struct file_system_type *fs_type,
 
 	sb->s_op = &jffs2_super_operations;
 	sb->s_flags = flags | MS_NOATIME;
-
+	sb->s_xattr = jffs2_xattr_handlers;
+#ifdef CONFIG_JFFS2_FS_POSIX_ACL
+	sb->s_flags |= MS_POSIXACL;
+#endif
 	ret = jffs2_do_fill_super(sb, data, flags & MS_SILENT ? 1 : 0);
 
 	if (ret) {
@@ -293,6 +296,7 @@ static void jffs2_put_super (struct super_block *sb)
 		kfree(c->blocks);
 	jffs2_flash_cleanup(c);
 	kfree(c->inocache_list);
+	jffs2_clear_xattr_subsystem(c);
 	if (c->mtd->sync)
 		c->mtd->sync(c->mtd);
 
diff --git a/fs/jffs2/symlink.c b/fs/jffs2/symlink.c
index d55754fe8925..fc211b6e9b03 100644
--- a/fs/jffs2/symlink.c
+++ b/fs/jffs2/symlink.c
@@ -24,7 +24,12 @@ struct inode_operations jffs2_symlink_inode_operations =
 {
 	.readlink =	generic_readlink,
 	.follow_link =	jffs2_follow_link,
-	.setattr =	jffs2_setattr
+	.permission =	jffs2_permission,
+	.setattr =	jffs2_setattr,
+	.setxattr =	jffs2_setxattr,
+	.getxattr =	jffs2_getxattr,
+	.listxattr =	jffs2_listxattr,
+	.removexattr =	jffs2_removexattr
 };
 
 static void *jffs2_follow_link(struct dentry *dentry, struct nameidata *nd)
diff --git a/fs/jffs2/write.c b/fs/jffs2/write.c
index 1342f0158e9b..d5c78195f3b8 100644
--- a/fs/jffs2/write.c
+++ b/fs/jffs2/write.c
@@ -36,7 +36,7 @@ int jffs2_do_new_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f, uint
 	f->inocache->nlink = 1;
 	f->inocache->nodes = (struct jffs2_raw_node_ref *)f->inocache;
 	f->inocache->state = INO_STATE_PRESENT;
-
+	init_xattr_inode_cache(f->inocache);
 
 	jffs2_add_ino_cache(c, f->inocache);
 	D1(printk(KERN_DEBUG "jffs2_do_new_inode(): Assigned ino# %d\n", f->inocache->ino));
diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c
new file mode 100644
index 000000000000..c9a185c54ce7
--- /dev/null
+++ b/fs/jffs2/xattr.c
@@ -0,0 +1,1271 @@
+/* -------------------------------------------------------------------------
+ *  File: fs/jffs2/xattr.c
+ *  XATTR support on JFFS2 FileSystem
+ *
+ *  Implemented by KaiGai Kohei <kaigai@ak.jp.nec.com>
+ *  Copyright (C) 2006 NEC Corporation
+ *
+ *  For licensing information, see the file 'LICENCE' in the jffs2 directory.
+ * ------------------------------------------------------------------------- */
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/time.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+#include <linux/crc32.h>
+#include <linux/jffs2.h>
+#include <linux/xattr.h>
+#include <linux/mtd/mtd.h>
+#include "nodelist.h"
+/* -------- xdatum related functions ----------------
+ * xattr_datum_hashkey(xprefix, xname, xvalue, xsize)
+ *   is used to calcurate xdatum hashkey. The reminder of hashkey into XATTRINDEX_HASHSIZE is
+ *   the index of the xattr name/value pair cache (c->xattrindex).
+ * unload_xattr_datum(c, xd)
+ *   is used to release xattr name/value pair and detach from c->xattrindex.
+ * reclaim_xattr_datum(c)
+ *   is used to reclaim xattr name/value pairs on the xattr name/value pair cache when
+ *   memory usage by cache is over c->xdatum_mem_threshold. Currentry, this threshold 
+ *   is hard coded as 32KiB.
+ * delete_xattr_datum_node(c, xd)
+ *   is used to delete a jffs2 node is dominated by xdatum. When EBS(Erase Block Summary) is
+ *   enabled, it overwrites the obsolete node by myself.
+ * delete_xattr_datum(c, xd)
+ *   is used to delete jffs2_xattr_datum object. It must be called with 0-value of reference
+ *   counter. (It means how many jffs2_xattr_ref object refers this xdatum.)
+ * do_verify_xattr_datum(c, xd)
+ *   is used to load the xdatum informations without name/value pair from the medium.
+ *   It's necessary once, because those informations are not collected during mounting
+ *   process when EBS is enabled.
+ *   0 will be returned, if success. An negative return value means recoverable error, and
+ *   positive return value means unrecoverable error. Thus, caller must remove this xdatum
+ *   and xref when it returned positive value.
+ * do_load_xattr_datum(c, xd)
+ *   is used to load name/value pair from the medium.
+ *   The meanings of return value is same as do_verify_xattr_datum().
+ * load_xattr_datum(c, xd)
+ *   is used to be as a wrapper of do_verify_xattr_datum() and do_load_xattr_datum().
+ *   If xd need to call do_verify_xattr_datum() at first, it's called before calling
+ *   do_load_xattr_datum(). The meanings of return value is same as do_verify_xattr_datum().
+ * save_xattr_datum(c, xd, phys_ofs)
+ *   is used to write xdatum to medium. xd->version will be incremented.
+ * create_xattr_datum(c, xprefix, xname, xvalue, xsize, phys_ofs)
+ *   is used to create new xdatum and write to medium.
+ * -------------------------------------------------- */
+
+static uint32_t xattr_datum_hashkey(int xprefix, const char *xname, const char *xvalue, int xsize)
+{
+	int name_len = strlen(xname);
+
+	return crc32(xprefix, xname, name_len) ^ crc32(xprefix, xvalue, xsize);
+}
+
+static void unload_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd)
+{
+	/* must be called under down_write(xattr_sem) */
+	D1(dbg_xattr("%s: xid=%u, version=%u\n", __FUNCTION__, xd->xid, xd->version));
+	if (xd->xname) {
+		c->xdatum_mem_usage -= (xd->name_len + 1 + xd->value_len);
+		kfree(xd->xname);
+	}
+
+	list_del_init(&xd->xindex);
+	xd->hashkey = 0;
+	xd->xname = NULL;
+	xd->xvalue = NULL;
+}
+
+static void reclaim_xattr_datum(struct jffs2_sb_info *c)
+{
+	/* must be called under down_write(xattr_sem) */
+	struct jffs2_xattr_datum *xd, *_xd;
+	uint32_t target, before;
+	static int index = 0;
+	int count;
+
+	if (c->xdatum_mem_threshold > c->xdatum_mem_usage)
+		return;
+
+	before = c->xdatum_mem_usage;
+	target = c->xdatum_mem_usage * 4 / 5; /* 20% reduction */
+	for (count = 0; count < XATTRINDEX_HASHSIZE; count++) {
+		list_for_each_entry_safe(xd, _xd, &c->xattrindex[index], xindex) {
+			if (xd->flags & JFFS2_XFLAGS_HOT) {
+				xd->flags &= ~JFFS2_XFLAGS_HOT;
+			} else if (!(xd->flags & JFFS2_XFLAGS_BIND)) {
+				unload_xattr_datum(c, xd);
+			}
+			if (c->xdatum_mem_usage <= target)
+				goto out;
+		}
+		index = (index+1) % XATTRINDEX_HASHSIZE;
+	}
+ out:
+	JFFS2_NOTICE("xdatum_mem_usage from %u byte to %u byte (%u byte reclaimed)\n",
+		     before, c->xdatum_mem_usage, before - c->xdatum_mem_usage);
+}
+
+static void delete_xattr_datum_node(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd)
+{
+	/* must be called under down_write(xattr_sem) */
+	struct jffs2_raw_xattr rx;
+	uint32_t length;
+	int rc;
+
+	if (!xd->node) {
+		JFFS2_WARNING("xdatum (xid=%u) is removed twice.\n", xd->xid);
+		return;
+	}
+	if (jffs2_sum_active()) {
+		memset(&rx, 0xff, sizeof(struct jffs2_raw_xattr));
+		rc = jffs2_flash_read(c, ref_offset(xd->node),
+				      sizeof(struct jffs2_unknown_node),
+				      &length, (char *)&rx);
+		if (rc || length != sizeof(struct jffs2_unknown_node)) {
+			JFFS2_ERROR("jffs2_flash_read()=%d, req=%u, read=%u at %#08x\n",
+				    rc, sizeof(struct jffs2_unknown_node),
+				    length, ref_offset(xd->node));
+		}
+		rc = jffs2_flash_write(c, ref_offset(xd->node), sizeof(rx),
+				       &length, (char *)&rx);
+		if (rc || length != sizeof(struct jffs2_raw_xattr)) {
+			JFFS2_ERROR("jffs2_flash_write()=%d, req=%u, wrote=%u ar %#08x\n",
+				    rc, sizeof(rx), length, ref_offset(xd->node));
+		}
+	}
+	spin_lock(&c->erase_completion_lock);
+	xd->node->next_in_ino = NULL;
+	spin_unlock(&c->erase_completion_lock);
+	jffs2_mark_node_obsolete(c, xd->node);
+	xd->node = NULL;
+}
+
+static void delete_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd)
+{
+	/* must be called under down_write(xattr_sem) */
+	BUG_ON(xd->refcnt);
+
+	unload_xattr_datum(c, xd);
+	if (xd->node) {
+		delete_xattr_datum_node(c, xd);
+		xd->node = NULL;
+	}
+	jffs2_free_xattr_datum(xd);
+}
+
+static int do_verify_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd)
+{
+	/* must be called under down_write(xattr_sem) */
+	struct jffs2_eraseblock *jeb;
+	struct jffs2_raw_xattr rx;
+	size_t readlen;
+	uint32_t crc, totlen;
+	int rc;
+
+	BUG_ON(!xd->node);
+	BUG_ON(ref_flags(xd->node) != REF_UNCHECKED);
+
+	rc = jffs2_flash_read(c, ref_offset(xd->node), sizeof(rx), &readlen, (char *)&rx);
+	if (rc || readlen != sizeof(rx)) {
+		JFFS2_WARNING("jffs2_flash_read()=%d, req=%u, read=%u at %#08x\n",
+			      rc, sizeof(rx), readlen, ref_offset(xd->node));
+		return rc ? rc : -EIO;
+	}
+	crc = crc32(0, &rx, sizeof(rx) - 4);
+	if (crc != je32_to_cpu(rx.node_crc)) {
+		if (je32_to_cpu(rx.node_crc) != 0xffffffff)
+			JFFS2_ERROR("node CRC failed at %#08x, read=%#08x, calc=%#08x\n",
+				    ref_offset(xd->node), je32_to_cpu(rx.hdr_crc), crc);
+		return EIO;
+	}
+	totlen = PAD(sizeof(rx) + rx.name_len + 1 + je16_to_cpu(rx.value_len));
+	if (je16_to_cpu(rx.magic) != JFFS2_MAGIC_BITMASK
+	    || je16_to_cpu(rx.nodetype) != JFFS2_NODETYPE_XATTR
+	    || je32_to_cpu(rx.totlen) != totlen
+	    || je32_to_cpu(rx.xid) != xd->xid
+	    || je32_to_cpu(rx.version) != xd->version) {
+		JFFS2_ERROR("inconsistent xdatum at %#08x, magic=%#04x/%#04x, "
+			    "nodetype=%#04x/%#04x, totlen=%u/%u, xid=%u/%u, version=%u/%u\n",
+			    ref_offset(xd->node), je16_to_cpu(rx.magic), JFFS2_MAGIC_BITMASK,
+			    je16_to_cpu(rx.nodetype), JFFS2_NODETYPE_XATTR,
+			    je32_to_cpu(rx.totlen), totlen,
+			    je32_to_cpu(rx.xid), xd->xid,
+			    je32_to_cpu(rx.version), xd->version);
+		return EIO;
+	}
+	xd->xprefix = rx.xprefix;
+	xd->name_len = rx.name_len;
+	xd->value_len = je16_to_cpu(rx.value_len);
+	xd->data_crc = je32_to_cpu(rx.data_crc);
+
+	/* This JFFS2_NODETYPE_XATTR node is checked */
+	jeb = &c->blocks[ref_offset(xd->node) / c->sector_size];
+	totlen = PAD(je32_to_cpu(rx.totlen));
+
+	spin_lock(&c->erase_completion_lock);
+	c->unchecked_size -= totlen; c->used_size += totlen;
+	jeb->unchecked_size -= totlen; jeb->used_size += totlen;
+	xd->node->flash_offset = ref_offset(xd->node) | REF_PRISTINE;
+	spin_unlock(&c->erase_completion_lock);
+
+	/* unchecked xdatum is chained with c->xattr_unchecked */
+	list_del_init(&xd->xindex);
+
+	dbg_xattr("success on verfying xdatum (xid=%u, version=%u)\n",
+		  xd->xid, xd->version);
+
+	return 0;
+}
+
+static int do_load_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd)
+{
+	/* must be called under down_write(xattr_sem) */
+	char *data;
+	size_t readlen;
+	uint32_t crc, length;
+	int i, ret, retry = 0;
+
+	BUG_ON(!xd->node);
+	BUG_ON(ref_flags(xd->node) != REF_PRISTINE);
+	BUG_ON(!list_empty(&xd->xindex));
+ retry:
+	length = xd->name_len + 1 + xd->value_len;
+	data = kmalloc(length, GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+
+	ret = jffs2_flash_read(c, ref_offset(xd->node)+sizeof(struct jffs2_raw_xattr),
+			       length, &readlen, data);
+
+	if (ret || length!=readlen) {
+		JFFS2_WARNING("jffs2_flash_read() returned %d, request=%d, readlen=%d, at %#08x\n",
+			      ret, length, readlen, ref_offset(xd->node));
+		kfree(data);
+		return ret ? ret : -EIO;
+	}
+
+	data[xd->name_len] = '\0';
+	crc = crc32(0, data, length);
+	if (crc != xd->data_crc) {
+		JFFS2_WARNING("node CRC failed (JFFS2_NODETYPE_XREF)"
+			      " at %#08x, read: 0x%08x calculated: 0x%08x\n",
+			      ref_offset(xd->node), xd->data_crc, crc);
+		kfree(data);
+		return EIO;
+	}
+
+	xd->flags |= JFFS2_XFLAGS_HOT;
+	xd->xname = data;
+	xd->xvalue = data + xd->name_len+1;
+
+	c->xdatum_mem_usage += length;
+
+	xd->hashkey = xattr_datum_hashkey(xd->xprefix, xd->xname, xd->xvalue, xd->value_len);
+	i = xd->hashkey % XATTRINDEX_HASHSIZE;
+	list_add(&xd->xindex, &c->xattrindex[i]);
+	if (!retry) {
+		retry = 1;
+		reclaim_xattr_datum(c);
+		if (!xd->xname)
+			goto retry;
+	}
+
+	dbg_xattr("success on loading xdatum (xid=%u, xprefix=%u, xname='%s')\n",
+		  xd->xid, xd->xprefix, xd->xname);
+
+	return 0;
+}
+
+static int load_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd)
+{
+	/* must be called under down_write(xattr_sem);
+	 * rc < 0 : recoverable error, try again
+	 * rc = 0 : success
+	 * rc > 0 : Unrecoverable error, this node should be deleted.
+	 */
+	int rc = 0;
+	BUG_ON(xd->xname);
+	if (!xd->node)
+		return EIO;
+	if (unlikely(ref_flags(xd->node) != REF_PRISTINE)) {
+		rc = do_verify_xattr_datum(c, xd);
+		if (rc > 0) {
+			list_del_init(&xd->xindex);
+			delete_xattr_datum_node(c, xd);
+		}
+	}
+	if (!rc)
+		rc = do_load_xattr_datum(c, xd);
+	return rc;
+}
+
+static int save_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd, uint32_t phys_ofs)
+{
+	/* must be called under down_write(xattr_sem) */
+	struct jffs2_raw_xattr rx;
+	struct jffs2_raw_node_ref *raw;
+	struct kvec vecs[2];
+	uint32_t length;
+	int rc, totlen;
+
+	BUG_ON(!xd->xname);
+
+	vecs[0].iov_base = &rx;
+	vecs[0].iov_len = PAD(sizeof(rx));
+	vecs[1].iov_base = xd->xname;
+	vecs[1].iov_len = xd->name_len + 1 + xd->value_len;
+	totlen = vecs[0].iov_len + vecs[1].iov_len;
+
+	raw = jffs2_alloc_raw_node_ref();
+	if (!raw)
+		return -ENOMEM;
+	raw->flash_offset = phys_ofs;
+	raw->__totlen = PAD(totlen);
+	raw->next_phys = NULL;
+	raw->next_in_ino = (void *)xd;
+
+	/* Setup raw-xattr */
+	rx.magic = cpu_to_je16(JFFS2_MAGIC_BITMASK);
+	rx.nodetype = cpu_to_je16(JFFS2_NODETYPE_XATTR);
+	rx.totlen = cpu_to_je32(PAD(totlen));
+	rx.hdr_crc = cpu_to_je32(crc32(0, &rx, sizeof(struct jffs2_unknown_node) - 4));
+
+	rx.xid = cpu_to_je32(xd->xid);
+	rx.version = cpu_to_je32(++xd->version);
+	rx.xprefix = xd->xprefix;
+	rx.name_len = xd->name_len;
+	rx.value_len = cpu_to_je16(xd->value_len);
+	rx.data_crc = cpu_to_je32(crc32(0, vecs[1].iov_base, vecs[1].iov_len));
+	rx.node_crc = cpu_to_je32(crc32(0, &rx, sizeof(struct jffs2_raw_xattr) - 4));
+
+	rc = jffs2_flash_writev(c, vecs, 2, phys_ofs, &length, 0);
+	if (rc || totlen != length) {
+		JFFS2_WARNING("jffs2_flash_writev()=%d, req=%u, wrote=%u, at %#08x\n",
+			      rc, totlen, length, phys_ofs);
+		rc = rc ? rc : -EIO;
+		if (length) {
+			raw->flash_offset |= REF_OBSOLETE;
+			raw->next_in_ino = NULL;
+			jffs2_add_physical_node_ref(c, raw);
+			jffs2_mark_node_obsolete(c, raw);
+		} else {
+			jffs2_free_raw_node_ref(raw);
+		}
+		return rc;
+	}
+	BUG_ON(raw->__totlen < sizeof(struct jffs2_raw_xattr));
+	/* success */
+	raw->flash_offset |= REF_PRISTINE;
+	jffs2_add_physical_node_ref(c, raw);
+	if (xd->node)
+		delete_xattr_datum_node(c, xd);
+	xd->node = raw;
+
+	dbg_xattr("success on saving xdatum (xid=%u, version=%u, xprefix=%u, xname='%s')\n",
+		  xd->xid, xd->version, xd->xprefix, xd->xname);
+
+	return 0;
+}
+
+static struct jffs2_xattr_datum *create_xattr_datum(struct jffs2_sb_info *c,
+						    int xprefix, const char *xname,
+						    const char *xvalue, int xsize,
+						    uint32_t phys_ofs)
+{
+	/* must be called under down_write(xattr_sem) */
+	struct jffs2_xattr_datum *xd;
+	uint32_t hashkey, name_len;
+	char *data;
+	int i, rc;
+
+	/* Search xattr_datum has same xname/xvalue by index */
+	hashkey = xattr_datum_hashkey(xprefix, xname, xvalue, xsize);
+	i = hashkey % XATTRINDEX_HASHSIZE;
+	list_for_each_entry(xd, &c->xattrindex[i], xindex) {
+		if (xd->hashkey==hashkey
+		    && xd->xprefix==xprefix
+		    && xd->value_len==xsize
+		    && !strcmp(xd->xname, xname)
+		    && !memcmp(xd->xvalue, xvalue, xsize)) {
+			xd->refcnt++;
+			return xd;
+		}
+	}
+
+	/* Not found, Create NEW XATTR-Cache */
+	name_len = strlen(xname);
+
+	xd = jffs2_alloc_xattr_datum();
+	if (!xd)
+		return ERR_PTR(-ENOMEM);
+
+	data = kmalloc(name_len + 1 + xsize, GFP_KERNEL);
+	if (!data) {
+		jffs2_free_xattr_datum(xd);
+		return ERR_PTR(-ENOMEM);
+	}
+	strcpy(data, xname);
+	memcpy(data + name_len + 1, xvalue, xsize);
+
+	xd->refcnt = 1;
+	xd->xid = ++c->highest_xid;
+	xd->flags |= JFFS2_XFLAGS_HOT;
+	xd->xprefix = xprefix;
+
+	xd->hashkey = hashkey;
+	xd->xname = data;
+	xd->xvalue = data + name_len + 1;
+	xd->name_len = name_len;
+	xd->value_len = xsize;
+	xd->data_crc = crc32(0, data, xd->name_len + 1 + xd->value_len);
+
+	rc = save_xattr_datum(c, xd, phys_ofs);
+	if (rc) {
+		kfree(xd->xname);
+		jffs2_free_xattr_datum(xd);
+		return ERR_PTR(rc);
+	}
+
+	/* Insert Hash Index */
+	i = hashkey % XATTRINDEX_HASHSIZE;
+	list_add(&xd->xindex, &c->xattrindex[i]);
+
+	c->xdatum_mem_usage += (xd->name_len + 1 + xd->value_len);
+	reclaim_xattr_datum(c);
+
+	return xd;
+}
+
+/* -------- xdatum related functions ----------------
+ * verify_xattr_ref(c, ref)
+ *   is used to load xref information from medium. Because summary data does not
+ *   contain xid/ino, it's necessary to verify once while mounting process.
+ * delete_xattr_ref_node(c, ref)
+ *   is used to delete a jffs2 node is dominated by xref. When EBS is enabled,
+ *   it overwrites the obsolete node by myself. 
+ * delete_xattr_ref(c, ref)
+ *   is used to delete jffs2_xattr_ref object. If the reference counter of xdatum
+ *   is refered by this xref become 0, delete_xattr_datum() is called later.
+ * save_xattr_ref(c, ref, phys_ofs)
+ *   is used to write xref to medium.
+ * create_xattr_ref(c, ic, xd, phys_ofs)
+ *   is used to create a new xref and write to medium.
+ * jffs2_xattr_delete_inode(c, ic)
+ *   is called to remove xrefs related to obsolete inode when inode is unlinked.
+ * jffs2_xattr_free_inode(c, ic)
+ *   is called to release xattr related objects when unmounting. 
+ * check_xattr_ref_ilist(c, ic)
+ *   is used to confirm inode does not have duplicate xattr name/value pair.
+ * -------------------------------------------------- */
+static int verify_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref)
+{
+	struct jffs2_eraseblock *jeb;
+	struct jffs2_raw_xref rr;
+	size_t readlen;
+	uint32_t crc, totlen;
+	int rc;
+
+	BUG_ON(ref_flags(ref->node) != REF_UNCHECKED);
+
+	rc = jffs2_flash_read(c, ref_offset(ref->node), sizeof(rr), &readlen, (char *)&rr);
+	if (rc || sizeof(rr) != readlen) {
+		JFFS2_WARNING("jffs2_flash_read()=%d, req=%u, read=%u, at %#08x\n",
+			      rc, sizeof(rr), readlen, ref_offset(ref->node));
+		return rc ? rc : -EIO;
+	}
+	/* obsolete node */
+	crc = crc32(0, &rr, sizeof(rr) - 4);
+	if (crc != je32_to_cpu(rr.node_crc)) {
+		if (je32_to_cpu(rr.node_crc) != 0xffffffff)
+			JFFS2_ERROR("node CRC failed at %#08x, read=%#08x, calc=%#08x\n",
+				    ref_offset(ref->node), je32_to_cpu(rr.node_crc), crc);
+		return EIO;
+	}
+	if (je16_to_cpu(rr.magic) != JFFS2_MAGIC_BITMASK
+	    || je16_to_cpu(rr.nodetype) != JFFS2_NODETYPE_XREF
+	    || je32_to_cpu(rr.totlen) != PAD(sizeof(rr))) {
+		JFFS2_ERROR("inconsistent xref at %#08x, magic=%#04x/%#04x, "
+			    "nodetype=%#04x/%#04x, totlen=%u/%u\n",
+			    ref_offset(ref->node), je16_to_cpu(rr.magic), JFFS2_MAGIC_BITMASK,
+			    je16_to_cpu(rr.nodetype), JFFS2_NODETYPE_XREF,
+			    je32_to_cpu(rr.totlen), PAD(sizeof(rr)));
+		return EIO;
+	}
+	ref->ino = je32_to_cpu(rr.ino);
+	ref->xid = je32_to_cpu(rr.xid);
+
+	/* fixup superblock/eraseblock info */
+	jeb = &c->blocks[ref_offset(ref->node) / c->sector_size];
+	totlen = PAD(sizeof(rr));
+
+	spin_lock(&c->erase_completion_lock);
+	c->unchecked_size -= totlen; c->used_size += totlen;
+	jeb->unchecked_size -= totlen; jeb->used_size += totlen;
+	ref->node->flash_offset = ref_offset(ref->node) | REF_PRISTINE;
+	spin_unlock(&c->erase_completion_lock);
+
+	dbg_xattr("success on verifying xref (ino=%u, xid=%u) at %#08x\n",
+		  ref->ino, ref->xid, ref_offset(ref->node));
+	return 0;
+}
+
+static void delete_xattr_ref_node(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref)
+{
+	struct jffs2_raw_xref rr;
+	uint32_t length;
+	int rc;
+
+	if (jffs2_sum_active()) {
+		memset(&rr, 0xff, sizeof(rr));
+		rc = jffs2_flash_read(c, ref_offset(ref->node),
+				      sizeof(struct jffs2_unknown_node),
+				      &length, (char *)&rr);
+		if (rc || length != sizeof(struct jffs2_unknown_node)) {
+			JFFS2_ERROR("jffs2_flash_read()=%d, req=%u, read=%u at %#08x\n",
+				    rc, sizeof(struct jffs2_unknown_node),
+				    length, ref_offset(ref->node));
+		}
+		rc = jffs2_flash_write(c, ref_offset(ref->node), sizeof(rr),
+				       &length, (char *)&rr);
+		if (rc || length != sizeof(struct jffs2_raw_xref)) {
+			JFFS2_ERROR("jffs2_flash_write()=%d, req=%u, wrote=%u at %#08x\n",
+				    rc, sizeof(rr), length, ref_offset(ref->node));
+		}
+	}
+	spin_lock(&c->erase_completion_lock);
+	ref->node->next_in_ino = NULL;
+	spin_unlock(&c->erase_completion_lock);
+	jffs2_mark_node_obsolete(c, ref->node);
+	ref->node = NULL;
+}
+
+static void delete_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref)
+{
+	/* must be called under down_write(xattr_sem) */
+	struct jffs2_xattr_datum *xd;
+
+	BUG_ON(!ref->node);
+	delete_xattr_ref_node(c, ref);
+
+	list_del(&ref->ilist);
+	xd = ref->xd;
+	xd->refcnt--;
+	if (!xd->refcnt)
+		delete_xattr_datum(c, xd);
+	jffs2_free_xattr_ref(ref);
+}
+
+static int save_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref, uint32_t phys_ofs)
+{
+	/* must be called under down_write(xattr_sem) */
+	struct jffs2_raw_node_ref *raw;
+	struct jffs2_raw_xref rr;
+	uint32_t length;
+	int ret;
+
+	raw = jffs2_alloc_raw_node_ref();
+	if (!raw)
+		return -ENOMEM;
+	raw->flash_offset = phys_ofs;
+	raw->__totlen = PAD(sizeof(rr));
+	raw->next_phys = NULL;
+	raw->next_in_ino = (void *)ref;
+
+	rr.magic = cpu_to_je16(JFFS2_MAGIC_BITMASK);
+	rr.nodetype = cpu_to_je16(JFFS2_NODETYPE_XREF);
+	rr.totlen = cpu_to_je32(PAD(sizeof(rr)));
+	rr.hdr_crc = cpu_to_je32(crc32(0, &rr, sizeof(struct jffs2_unknown_node) - 4));
+
+	rr.ino = cpu_to_je32(ref->ic->ino);
+	rr.xid = cpu_to_je32(ref->xd->xid);
+	rr.node_crc = cpu_to_je32(crc32(0, &rr, sizeof(rr) - 4));
+
+	ret = jffs2_flash_write(c, phys_ofs, sizeof(rr), &length, (char *)&rr);
+	if (ret || sizeof(rr) != length) {
+		JFFS2_WARNING("jffs2_flash_write() returned %d, request=%u, retlen=%u, at %#08x\n",
+			      ret, sizeof(rr), length, phys_ofs);
+		ret = ret ? ret : -EIO;
+		if (length) {
+			raw->flash_offset |= REF_OBSOLETE;
+			raw->next_in_ino = NULL;
+			jffs2_add_physical_node_ref(c, raw);
+			jffs2_mark_node_obsolete(c, raw);
+		} else {
+			jffs2_free_raw_node_ref(raw);
+		}
+		return ret;
+	}
+	raw->flash_offset |= REF_PRISTINE;
+
+	jffs2_add_physical_node_ref(c, raw);
+	if (ref->node)
+		delete_xattr_ref_node(c, ref);
+	ref->node = raw;
+
+	dbg_xattr("success on saving xref (ino=%u, xid=%u)\n", ref->ic->ino, ref->xd->xid);
+
+	return 0;
+}
+
+static struct jffs2_xattr_ref *create_xattr_ref(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic,
+						struct jffs2_xattr_datum *xd, uint32_t phys_ofs)
+{
+	/* must be called under down_write(xattr_sem) */
+	struct jffs2_xattr_ref *ref;
+	int ret;
+
+	ref = jffs2_alloc_xattr_ref();
+	if (!ref)
+		return ERR_PTR(-ENOMEM);
+	ref->ic = ic;
+	ref->xd = xd;
+
+	ret = save_xattr_ref(c, ref, phys_ofs);
+	if (ret) {
+		jffs2_free_xattr_ref(ref);
+		return ERR_PTR(ret);
+	}
+
+	/* Chain to inode */
+	list_add(&ref->ilist, &ic->ilist);
+
+	return ref; /* success */
+}
+
+void jffs2_xattr_delete_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic)
+{
+	/* It's called from jffs2_clear_inode() on inode removing.
+	   When an inode with XATTR is removed, those XATTRs must be removed. */
+	struct jffs2_xattr_ref *ref, *_ref;
+
+	if (!ic || ic->nlink > 0)
+		return;
+
+	down_write(&c->xattr_sem);
+	list_for_each_entry_safe(ref, _ref, &ic->ilist, ilist)
+		delete_xattr_ref(c, ref);
+	up_write(&c->xattr_sem);
+}
+
+void jffs2_xattr_free_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic)
+{
+	/* It's called from jffs2_free_ino_caches() until unmounting FS. */
+	struct jffs2_xattr_datum *xd;
+	struct jffs2_xattr_ref *ref, *_ref;
+
+	down_write(&c->xattr_sem);
+	list_for_each_entry_safe(ref, _ref, &ic->ilist, ilist) {
+		list_del(&ref->ilist);
+		xd = ref->xd;
+		xd->refcnt--;
+		if (!xd->refcnt) {
+			unload_xattr_datum(c, xd);
+			jffs2_free_xattr_datum(xd);
+		}
+		jffs2_free_xattr_ref(ref);
+	}
+	up_write(&c->xattr_sem);
+}
+
+static int check_xattr_ref_ilist(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic)
+{
+	/* success of check_xattr_ref_ilist() means taht inode (ic) dose not have
+	 * duplicate name/value pairs. If duplicate name/value pair would be found,
+	 * one will be removed.
+	 */
+	struct jffs2_xattr_ref *ref, *cmp;
+	int rc = 0;
+
+	if (likely(ic->flags & INO_FLAGS_XATTR_CHECKED))
+		return 0;
+	down_write(&c->xattr_sem);
+ retry:
+	rc = 0;
+	list_for_each_entry(ref, &ic->ilist, ilist) {
+		if (!ref->xd->xname) {
+			rc = load_xattr_datum(c, ref->xd);
+			if (unlikely(rc > 0)) {
+				delete_xattr_ref(c, ref);
+				goto retry;
+			} else if (unlikely(rc < 0))
+				goto out;
+		}
+		cmp = ref;
+		list_for_each_entry_continue(cmp, &ic->ilist, ilist) {
+			if (!cmp->xd->xname) {
+				ref->xd->flags |= JFFS2_XFLAGS_BIND;
+				rc = load_xattr_datum(c, cmp->xd);
+				ref->xd->flags &= ~JFFS2_XFLAGS_BIND;
+				if (unlikely(rc > 0)) {
+					delete_xattr_ref(c, cmp);
+					goto retry;
+				} else if (unlikely(rc < 0))
+					goto out;
+			}
+			if (ref->xd->xprefix == cmp->xd->xprefix
+			    && !strcmp(ref->xd->xname, cmp->xd->xname)) {
+				delete_xattr_ref(c, cmp);
+				goto retry;
+			}
+		}
+	}
+	ic->flags |= INO_FLAGS_XATTR_CHECKED;
+ out:
+	up_write(&c->xattr_sem);
+
+	return rc;
+}
+
+/* -------- xattr subsystem functions ---------------
+ * jffs2_init_xattr_subsystem(c)
+ *   is used to initialize semaphore and list_head, and some variables.
+ * jffs2_find_xattr_datum(c, xid)
+ *   is used to lookup xdatum while scanning process.
+ * jffs2_clear_xattr_subsystem(c)
+ *   is used to release any xattr related objects.
+ * jffs2_build_xattr_subsystem(c)
+ *   is used to associate xdatum and xref while super block building process.
+ * jffs2_setup_xattr_datum(c, xid, version)
+ *   is used to insert xdatum while scanning process.
+ * -------------------------------------------------- */
+void jffs2_init_xattr_subsystem(struct jffs2_sb_info *c)
+{
+	int i;
+
+	for (i=0; i < XATTRINDEX_HASHSIZE; i++)
+		INIT_LIST_HEAD(&c->xattrindex[i]);
+	INIT_LIST_HEAD(&c->xattr_temp);
+	INIT_LIST_HEAD(&c->xattr_unchecked);
+
+	init_rwsem(&c->xattr_sem);
+	c->xdatum_mem_usage = 0;
+	c->xdatum_mem_threshold = 32 * 1024;	/* Default 32KB */
+}
+
+static struct jffs2_xattr_datum *jffs2_find_xattr_datum(struct jffs2_sb_info *c, uint32_t xid)
+{
+	struct jffs2_xattr_datum *xd;
+	int i = xid % XATTRINDEX_HASHSIZE;
+
+	/* It's only used in scanning/building process. */
+	BUG_ON(!(c->flags & (JFFS2_SB_FLAG_SCANNING|JFFS2_SB_FLAG_BUILDING)));
+
+	list_for_each_entry(xd, &c->xattrindex[i], xindex) {
+		if (xd->xid==xid)
+			return xd;
+	}
+	return NULL;
+}
+
+void jffs2_clear_xattr_subsystem(struct jffs2_sb_info *c)
+{
+	struct jffs2_xattr_datum *xd, *_xd;
+	struct jffs2_xattr_ref *ref, *_ref;
+	int i;
+
+	list_for_each_entry_safe(ref, _ref, &c->xattr_temp, ilist)
+		jffs2_free_xattr_ref(ref);
+
+	for (i=0; i < XATTRINDEX_HASHSIZE; i++) {
+		list_for_each_entry_safe(xd, _xd, &c->xattrindex[i], xindex) {
+			list_del(&xd->xindex);
+			if (xd->xname)
+				kfree(xd->xname);
+			jffs2_free_xattr_datum(xd);
+		}
+	}
+}
+
+void jffs2_build_xattr_subsystem(struct jffs2_sb_info *c)
+{
+	struct jffs2_xattr_ref *ref, *_ref;
+	struct jffs2_xattr_datum *xd, *_xd;
+	struct jffs2_inode_cache *ic;
+	int i, xdatum_count =0, xdatum_unchecked_count = 0, xref_count = 0;
+
+	BUG_ON(!(c->flags & JFFS2_SB_FLAG_BUILDING));
+
+	/* Phase.1 */
+	list_for_each_entry_safe(ref, _ref, &c->xattr_temp, ilist) {
+		list_del_init(&ref->ilist);
+		/* checking REF_UNCHECKED nodes */
+		if (ref_flags(ref->node) != REF_PRISTINE) {
+			if (verify_xattr_ref(c, ref)) {
+				delete_xattr_ref_node(c, ref);
+				jffs2_free_xattr_ref(ref);
+				continue;
+			}
+		}
+		/* At this point, ref->xid and ref->ino contain XID and inode number.
+		   ref->xd and ref->ic are not valid yet. */
+		xd = jffs2_find_xattr_datum(c, ref->xid);
+		ic = jffs2_get_ino_cache(c, ref->ino);
+		if (!xd || !ic) {
+			if (ref_flags(ref->node) != REF_UNCHECKED)
+				JFFS2_WARNING("xref(ino=%u, xid=%u) is orphan. \n",
+					      ref->ino, ref->xid);
+			delete_xattr_ref_node(c, ref);
+			jffs2_free_xattr_ref(ref);
+			continue;
+		}
+		ref->xd = xd;
+		ref->ic = ic;
+		xd->refcnt++;
+		list_add_tail(&ref->ilist, &ic->ilist);
+		xref_count++;
+	}
+	/* After this, ref->xid/ino are NEVER used. */
+
+	/* Phase.2 */
+	for (i=0; i < XATTRINDEX_HASHSIZE; i++) {
+		list_for_each_entry_safe(xd, _xd, &c->xattrindex[i], xindex) {
+			list_del_init(&xd->xindex);
+			if (!xd->refcnt) {
+				if (ref_flags(xd->node) != REF_UNCHECKED)
+					JFFS2_WARNING("orphan xdatum(xid=%u, version=%u) at %#08x\n",
+						      xd->xid, xd->version, ref_offset(xd->node));
+				delete_xattr_datum(c, xd);
+				continue;
+			}
+			if (ref_flags(xd->node) != REF_PRISTINE) {
+				dbg_xattr("unchecked xdatum(xid=%u) at %#08x\n",
+					  xd->xid, ref_offset(xd->node));
+				list_add(&xd->xindex, &c->xattr_unchecked);
+				xdatum_unchecked_count++;
+			}
+			xdatum_count++;
+		}
+	}
+	/* build complete */
+	JFFS2_NOTICE("complete building xattr subsystem, %u of xdatum (%u unchecked) and "
+		     "%u of xref found.\n", xdatum_count, xdatum_unchecked_count, xref_count);
+}
+
+struct jffs2_xattr_datum *jffs2_setup_xattr_datum(struct jffs2_sb_info *c,
+						  uint32_t xid, uint32_t version)
+{
+	struct jffs2_xattr_datum *xd, *_xd;
+
+	_xd = jffs2_find_xattr_datum(c, xid);
+	if (_xd) {
+		dbg_xattr("duplicate xdatum (xid=%u, version=%u/%u) at %#08x\n",
+			  xid, version, _xd->version, ref_offset(_xd->node));
+		if (version < _xd->version)
+			return ERR_PTR(-EEXIST);
+	}
+	xd = jffs2_alloc_xattr_datum();
+	if (!xd)
+		return ERR_PTR(-ENOMEM);
+	xd->xid = xid;
+	xd->version = version;
+	if (xd->xid > c->highest_xid)
+		c->highest_xid = xd->xid;
+	list_add_tail(&xd->xindex, &c->xattrindex[xid % XATTRINDEX_HASHSIZE]);
+
+	if (_xd) {
+		list_del_init(&_xd->xindex);
+		delete_xattr_datum_node(c, _xd);
+		jffs2_free_xattr_datum(_xd);
+	}
+	return xd;
+}
+
+/* -------- xattr subsystem functions ---------------
+ * xprefix_to_handler(xprefix)
+ *   is used to translate xprefix into xattr_handler.
+ * jffs2_listxattr(dentry, buffer, size)
+ *   is an implementation of listxattr handler on jffs2.
+ * do_jffs2_getxattr(inode, xprefix, xname, buffer, size)
+ *   is an implementation of getxattr handler on jffs2.
+ * do_jffs2_setxattr(inode, xprefix, xname, buffer, size, flags)
+ *   is an implementation of setxattr handler on jffs2.
+ * -------------------------------------------------- */
+struct xattr_handler *jffs2_xattr_handlers[] = {
+	&jffs2_user_xattr_handler,
+#ifdef CONFIG_JFFS2_FS_SECURITY
+	&jffs2_security_xattr_handler,
+#endif
+#ifdef CONFIG_JFFS2_FS_POSIX_ACL
+	&jffs2_acl_access_xattr_handler,
+	&jffs2_acl_default_xattr_handler,
+#endif
+	&jffs2_trusted_xattr_handler,
+	NULL
+};
+
+static struct xattr_handler *xprefix_to_handler(int xprefix) {
+	struct xattr_handler *ret;
+
+	switch (xprefix) {
+	case JFFS2_XPREFIX_USER:
+		ret = &jffs2_user_xattr_handler;
+		break;
+#ifdef CONFIG_JFFS2_FS_SECURITY
+	case JFFS2_XPREFIX_SECURITY:
+		ret = &jffs2_security_xattr_handler;
+		break;
+#endif
+#ifdef CONFIG_JFFS2_FS_POSIX_ACL
+	case JFFS2_XPREFIX_ACL_ACCESS:
+		ret = &jffs2_acl_access_xattr_handler;
+		break;
+	case JFFS2_XPREFIX_ACL_DEFAULT:
+		ret = &jffs2_acl_default_xattr_handler;
+		break;
+#endif
+	case JFFS2_XPREFIX_TRUSTED:
+		ret = &jffs2_trusted_xattr_handler;
+		break;
+	default:
+		ret = NULL;
+		break;
+	}
+	return ret;
+}
+
+ssize_t jffs2_listxattr(struct dentry *dentry, char *buffer, size_t size)
+{
+	struct inode *inode = dentry->d_inode;
+	struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode);
+	struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb);
+	struct jffs2_inode_cache *ic = f->inocache;
+	struct jffs2_xattr_ref *ref;
+	struct jffs2_xattr_datum *xd;
+	struct xattr_handler *xhandle;
+	ssize_t len, rc;
+	int retry = 0;
+
+	rc = check_xattr_ref_ilist(c, ic);
+	if (unlikely(rc))
+		return rc;
+
+	down_read(&c->xattr_sem);
+ retry:
+	len = 0;
+	list_for_each_entry(ref, &ic->ilist, ilist) {
+		BUG_ON(ref->ic != ic);
+		xd = ref->xd;
+		if (!xd->xname) {
+			/* xdatum is unchached */
+			if (!retry) {
+				retry = 1;
+				up_read(&c->xattr_sem);
+				down_write(&c->xattr_sem);
+				goto retry;
+			} else {
+				rc = load_xattr_datum(c, xd);
+				if (unlikely(rc > 0)) {
+					delete_xattr_ref(c, ref);
+					goto retry;
+				} else if (unlikely(rc < 0))
+					goto out;
+			}
+		}
+		xhandle = xprefix_to_handler(xd->xprefix);
+		if (!xhandle)
+			continue;
+		if (buffer) {
+			rc = xhandle->list(inode, buffer+len, size-len, xd->xname, xd->name_len);
+		} else {
+			rc = xhandle->list(inode, NULL, 0, xd->xname, xd->name_len);
+		}
+		if (rc < 0)
+			goto out;
+		len += rc;
+	}
+	rc = len;
+ out:
+	if (!retry) {
+		up_read(&c->xattr_sem);
+	} else {
+		up_write(&c->xattr_sem);
+	}
+	return rc;
+}
+
+int do_jffs2_getxattr(struct inode *inode, int xprefix, const char *xname,
+		      char *buffer, size_t size)
+{
+	struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode);
+	struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb);
+	struct jffs2_inode_cache *ic = f->inocache;
+	struct jffs2_xattr_datum *xd;
+	struct jffs2_xattr_ref *ref;
+	int rc, retry = 0;
+
+	rc = check_xattr_ref_ilist(c, ic);
+	if (unlikely(rc))
+		return rc;
+
+	down_read(&c->xattr_sem);
+ retry:
+	list_for_each_entry(ref, &ic->ilist, ilist) {
+		BUG_ON(ref->ic!=ic);
+
+		xd = ref->xd;
+		if (xd->xprefix != xprefix)
+			continue;
+		if (!xd->xname) {
+			/* xdatum is unchached */
+			if (!retry) {
+				retry = 1;
+				up_read(&c->xattr_sem);
+				down_write(&c->xattr_sem);
+				goto retry;
+			} else {
+				rc = load_xattr_datum(c, xd);
+				if (unlikely(rc > 0)) {
+					delete_xattr_ref(c, ref);
+					goto retry;
+				} else if (unlikely(rc < 0)) {
+					goto out;
+				}
+			}
+		}
+		if (!strcmp(xname, xd->xname)) {
+			rc = xd->value_len;
+			if (buffer) {
+				if (size < rc) {
+					rc = -ERANGE;
+				} else {
+					memcpy(buffer, xd->xvalue, rc);
+				}
+			}
+			goto out;
+		}
+	}
+	rc = -ENODATA;
+ out:
+	if (!retry) {
+		up_read(&c->xattr_sem);
+	} else {
+		up_write(&c->xattr_sem);
+	}
+	return rc;
+}
+
+int do_jffs2_setxattr(struct inode *inode, int xprefix, const char *xname,
+		      const char *buffer, size_t size, int flags)
+{
+	struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode);
+	struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb);
+	struct jffs2_inode_cache *ic = f->inocache;
+	struct jffs2_xattr_datum *xd;
+	struct jffs2_xattr_ref *ref, *newref;
+	uint32_t phys_ofs, length, request;
+	int rc;
+
+	rc = check_xattr_ref_ilist(c, ic);
+	if (unlikely(rc))
+		return rc;
+
+	request = PAD(sizeof(struct jffs2_raw_xattr) + strlen(xname) + 1 + size);
+	rc = jffs2_reserve_space(c, request, &phys_ofs, &length,
+				 ALLOC_NORMAL, JFFS2_SUMMARY_XATTR_SIZE);
+	if (rc) {
+		JFFS2_WARNING("jffs2_reserve_space()=%d, request=%u\n", rc, request);
+		return rc;
+	}
+
+	/* Find existing xattr */
+	down_write(&c->xattr_sem);
+ retry:
+	list_for_each_entry(ref, &ic->ilist, ilist) {
+		xd = ref->xd;
+		if (xd->xprefix != xprefix)
+			continue;
+		if (!xd->xname) {
+			rc = load_xattr_datum(c, xd);
+			if (unlikely(rc > 0)) {
+				delete_xattr_ref(c, ref);
+				goto retry;
+			} else if (unlikely(rc < 0))
+				goto out;
+		}
+		if (!strcmp(xd->xname, xname)) {
+			if (flags & XATTR_CREATE) {
+				rc = -EEXIST;
+				goto out;
+			}
+			if (!buffer) {
+				delete_xattr_ref(c, ref);
+				rc = 0;
+				goto out;
+			}
+			goto found;
+		}
+	}
+	/* not found */
+	ref = NULL;
+	if (flags & XATTR_REPLACE) {
+		rc = -ENODATA;
+		goto out;
+	}
+	if (!buffer) {
+		rc = -EINVAL;
+		goto out;
+	}
+ found:
+	xd = create_xattr_datum(c, xprefix, xname, buffer, size, phys_ofs);
+	if (IS_ERR(xd)) {
+		rc = PTR_ERR(xd);
+		goto out;
+	}
+	up_write(&c->xattr_sem);
+	jffs2_complete_reservation(c);
+
+	/* create xattr_ref */
+	request = PAD(sizeof(struct jffs2_raw_xref));
+	rc = jffs2_reserve_space(c, request, &phys_ofs, &length,
+				 ALLOC_NORMAL, JFFS2_SUMMARY_XREF_SIZE);
+	if (rc) {
+		JFFS2_WARNING("jffs2_reserve_space()=%d, request=%u\n", rc, request);
+		down_write(&c->xattr_sem);
+		xd->refcnt--;
+		if (!xd->refcnt)
+			delete_xattr_datum(c, xd);
+		up_write(&c->xattr_sem);
+		return rc;
+	}
+	down_write(&c->xattr_sem);
+	newref = create_xattr_ref(c, ic, xd, phys_ofs);
+	if (IS_ERR(newref)) {
+		rc = PTR_ERR(newref);
+		xd->refcnt--;
+		if (!xd->refcnt)
+			delete_xattr_datum(c, xd);
+	} else if (ref) {
+		/* If replaced xattr_ref exists */
+		delete_xattr_ref(c, ref);
+	}
+ out:
+	up_write(&c->xattr_sem);
+	jffs2_complete_reservation(c);
+	return rc;
+}
+
+/* -------- garbage collector functions -------------
+ * jffs2_garbage_collect_xattr_datum(c, xd)
+ *   is used to move xdatum into new node.
+ * jffs2_garbage_collect_xattr_ref(c, ref)
+ *   is used to move xref into new node.
+ * jffs2_garbage_collect_xattr(c, ic)
+ *   is used to call appropriate garbage collector function, if argument
+ *   pointer (ic) is the reference of xdatum/xref.
+ * jffs2_verify_xattr(c)
+ *   is used to call do_verify_xattr_datum() before garbage collecting.
+ * -------------------------------------------------- */
+static int jffs2_garbage_collect_xattr_datum(struct jffs2_sb_info *c,
+					     struct jffs2_xattr_datum *xd)
+{
+	/* must be called under down_write(xattr_sem), and called from GC thread */
+	uint32_t phys_ofs, totlen, length, old_ofs;
+	int rc;
+
+	BUG_ON(!xd->node);
+
+	old_ofs = ref_offset(xd->node);
+	totlen = ref_totlen(c, c->gcblock, xd->node);
+	if (totlen < sizeof(struct jffs2_raw_xattr))
+		return -EINVAL;
+
+	if (!xd->xname) {
+		rc = load_xattr_datum(c, xd);
+		if (unlikely(rc > 0)) {
+			delete_xattr_datum_node(c, xd);
+			return 0;
+		} else if (unlikely(rc < 0))
+			return -EINVAL;
+	}
+	rc = jffs2_reserve_space_gc(c, totlen, &phys_ofs, &length, JFFS2_SUMMARY_XATTR_SIZE);
+	if (rc || length < totlen) {
+		JFFS2_WARNING("jffs2_reserve_space()=%d, request=%u\n", rc, totlen);
+		return rc ? rc : -EBADFD;
+	}
+	rc = save_xattr_datum(c, xd, phys_ofs);
+	if (!rc)
+		dbg_xattr("xdatum (xid=%u, version=%u) GC'ed from %#08x to %08x\n",
+			  xd->xid, xd->version, old_ofs, ref_offset(xd->node));
+	return rc;
+}
+
+
+static int jffs2_garbage_collect_xattr_ref(struct jffs2_sb_info *c,
+					   struct jffs2_xattr_ref *ref)
+{
+	/* must be called under down(alloc_sem) */
+	uint32_t phys_ofs, totlen, length, old_ofs;
+	int rc;
+
+	BUG_ON(!ref->node);
+
+	old_ofs = ref_offset(ref->node);
+	totlen = ref_totlen(c, c->gcblock, ref->node);
+	if (totlen != sizeof(struct jffs2_raw_xref))
+		return -EINVAL;
+	rc = jffs2_reserve_space_gc(c, totlen, &phys_ofs, &length, JFFS2_SUMMARY_XREF_SIZE);
+	if (rc || length < totlen) {
+		JFFS2_WARNING("%s: jffs2_reserve_space() = %d, request = %u\n",
+			      __FUNCTION__, rc, totlen);
+		return rc ? rc : -EBADFD;
+	}
+	rc = save_xattr_ref(c, ref, phys_ofs);
+	if (!rc)
+		dbg_xattr("xref (ino=%u, xid=%u) GC'ed from %#08x to %08x\n",
+			  ref->ic->ino, ref->xd->xid, old_ofs, ref_offset(ref->node));
+	return rc;
+}
+
+int jffs2_garbage_collect_xattr(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic)
+{
+	struct jffs2_xattr_datum *xd;
+	struct jffs2_xattr_ref *ref;
+	int ret;
+
+	switch (ic->class) {
+	case RAWNODE_CLASS_XATTR_DATUM:
+		spin_unlock(&c->erase_completion_lock);
+
+		down_write(&c->xattr_sem);
+		xd = (struct jffs2_xattr_datum *)ic;
+		ret = xd ? jffs2_garbage_collect_xattr_datum(c, xd) : 0;
+		up_write(&c->xattr_sem);
+		break;
+	case RAWNODE_CLASS_XATTR_REF:
+		spin_unlock(&c->erase_completion_lock);
+
+		down_write(&c->xattr_sem);
+		ref = (struct jffs2_xattr_ref *)ic;
+		ret = ref ? jffs2_garbage_collect_xattr_ref(c, ref) : 0;
+		up_write(&c->xattr_sem);
+		break;
+	default:
+		/* This node is not xattr_datum/xattr_ref */
+		ret = 1;
+		break;
+	}
+	return ret;
+}
+
+int jffs2_verify_xattr(struct jffs2_sb_info *c)
+{
+	struct jffs2_xattr_datum *xd, *_xd;
+	int rc;
+
+	down_write(&c->xattr_sem);
+	list_for_each_entry_safe(xd, _xd, &c->xattr_unchecked, xindex) {
+		rc = do_verify_xattr_datum(c, xd);
+		if (rc == 0) {
+			list_del_init(&xd->xindex);
+			break;
+		} else if (rc > 0) {
+			list_del_init(&xd->xindex);
+			delete_xattr_datum_node(c, xd);
+		}
+	}
+	up_write(&c->xattr_sem);
+
+	return list_empty(&c->xattr_unchecked) ? 1 : 0;
+}
diff --git a/fs/jffs2/xattr.h b/fs/jffs2/xattr.h
new file mode 100644
index 000000000000..d157ad641ed4
--- /dev/null
+++ b/fs/jffs2/xattr.h
@@ -0,0 +1,120 @@
+/*-------------------------------------------------------------------------*
+ *  File: fs/jffs2/xattr.c
+ *  XATTR support on JFFS2 FileSystem
+ *
+ *  Implemented by KaiGai Kohei <kaigai@ak.jp.nec.com>
+ *  Copyright (C) 2006 NEC Corporation
+ *
+ *  For licensing information, see the file 'LICENCE' in the jffs2 directory.
+ *-------------------------------------------------------------------------*/
+
+#ifndef _JFFS2_FS_XATTR_H_
+#define _JFFS2_FS_XATTR_H_
+
+#include <linux/xattr.h>
+
+#define JFFS2_XFLAGS_HOT	(0x01)	/* This datum is HOT */
+#define JFFS2_XFLAGS_BIND	(0x02)	/* This datum is not reclaimed */
+
+struct jffs2_xattr_datum
+{
+	void *always_null;
+	u8 class;
+	u8 flags;
+	u16 xprefix;			/* see JFFS2_XATTR_PREFIX_* */
+
+	struct jffs2_raw_node_ref *node;
+	struct list_head xindex;	/* chained from c->xattrindex[n] */
+	uint32_t refcnt;		/* # of xattr_ref refers this */
+	uint32_t xid;
+	uint32_t version;
+
+	uint32_t data_crc;
+	uint32_t hashkey;
+	char *xname;		/* XATTR name without prefix */
+	uint32_t name_len;	/* length of xname */
+	char *xvalue;		/* XATTR value */
+	uint32_t value_len;	/* length of xvalue */
+};
+
+struct jffs2_inode_cache;	/* forward refence */
+struct jffs2_xattr_ref
+{
+	void *always_null;
+	u8 class;
+	u8 flags;		/* Currently unused */
+	u16 unused;
+
+	struct jffs2_raw_node_ref *node;
+	union {
+		struct jffs2_inode_cache *ic;	/* reference to jffs2_inode_cache */
+		uint32_t ino;			/* only used in scanning/building  */
+	};
+	union {
+		struct jffs2_xattr_datum *xd;	/* reference to jffs2_xattr_datum */
+		uint32_t xid;			/* only used in sccanning/building */
+	};
+	struct list_head ilist;			/* chained from ic->ilist */
+};
+
+#ifdef CONFIG_JFFS2_FS_XATTR
+
+extern void jffs2_init_xattr_subsystem(struct jffs2_sb_info *c);
+extern void jffs2_build_xattr_subsystem(struct jffs2_sb_info *c);
+extern void jffs2_clear_xattr_subsystem(struct jffs2_sb_info *c);
+
+extern struct jffs2_xattr_datum *jffs2_setup_xattr_datum(struct jffs2_sb_info *c,
+                                                  uint32_t xid, uint32_t version);
+
+extern void jffs2_xattr_delete_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic);
+extern void jffs2_xattr_free_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic);
+
+extern int jffs2_garbage_collect_xattr(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic);
+extern int jffs2_verify_xattr(struct jffs2_sb_info *c);
+
+extern int do_jffs2_getxattr(struct inode *inode, int xprefix, const char *xname,
+			     char *buffer, size_t size);
+extern int do_jffs2_setxattr(struct inode *inode, int xprefix, const char *xname,
+			     const char *buffer, size_t size, int flags);
+
+extern struct xattr_handler *jffs2_xattr_handlers[];
+extern struct xattr_handler jffs2_user_xattr_handler;
+extern struct xattr_handler jffs2_trusted_xattr_handler;
+
+extern ssize_t jffs2_listxattr(struct dentry *, char *, size_t);
+#define jffs2_getxattr		generic_getxattr
+#define jffs2_setxattr		generic_setxattr
+#define jffs2_removexattr	generic_removexattr
+
+/*---- Any inline initialize functions ----*/
+#define init_xattr_inode_cache(x) INIT_LIST_HEAD(&((x)->ilist))
+
+#else
+
+#define jffs2_init_xattr_subsystem(c)
+#define jffs2_build_xattr_subsystem(c)
+#define jffs2_clear_xattr_subsystem(c)
+
+#define jffs2_xattr_delete_inode(c, ic)
+#define jffs2_xattr_free_inode(c, ic)
+#define jffs2_garbage_collect_xattr(c, ic)	(1)
+#define jffs2_verify_xattr(c)			(1)
+
+#define jffs2_xattr_handlers	NULL
+#define jffs2_listxattr		NULL
+#define jffs2_getxattr		NULL
+#define jffs2_setxattr		NULL
+#define jffs2_removexattr	NULL
+
+#define init_xattr_inode_cache(x)
+
+#endif /* CONFIG_JFFS2_FS_XATTR */
+
+#ifdef CONFIG_JFFS2_FS_SECURITY
+extern int jffs2_init_security(struct inode *inode, struct inode *dir);
+extern struct xattr_handler jffs2_security_xattr_handler;
+#else
+#define jffs2_init_security(inode,dir)	(0)
+#endif /* CONFIG_JFFS2_FS_SECURITY */
+
+#endif /* _JFFS2_FS_XATTR_H_ */
diff --git a/fs/jffs2/xattr_trusted.c b/fs/jffs2/xattr_trusted.c
new file mode 100644
index 000000000000..a018c9c31a60
--- /dev/null
+++ b/fs/jffs2/xattr_trusted.c
@@ -0,0 +1,51 @@
+/*-------------------------------------------------------------------------*
+ *  File: fs/jffs2/xattr_trusted.c
+ *  XATTR support on JFFS2 FileSystem
+ *
+ *  Implemented by KaiGai Kohei <kaigai@ak.jp.nec.com>
+ *  Copyright (C) 2006 NEC Corporation
+ *
+ *  For licensing information, see the file 'LICENCE' in the jffs2 directory.
+ *-------------------------------------------------------------------------*/
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/jffs2.h>
+#include <linux/xattr.h>
+#include <linux/mtd/mtd.h>
+#include "nodelist.h"
+
+static int jffs2_trusted_getxattr(struct inode *inode, const char *name,
+				  void *buffer, size_t size)
+{
+	if (!strcmp(name, ""))
+		return -EINVAL;
+	return do_jffs2_getxattr(inode, JFFS2_XPREFIX_TRUSTED, name, buffer, size);
+}
+
+static int jffs2_trusted_setxattr(struct inode *inode, const char *name, const void *buffer,
+				  size_t size, int flags)
+{
+	if (!strcmp(name, ""))
+		return -EINVAL;
+	return do_jffs2_setxattr(inode, JFFS2_XPREFIX_TRUSTED, name, buffer, size, flags);
+}
+
+static size_t jffs2_trusted_listxattr(struct inode *inode, char *list, size_t list_size,
+				      const char *name, size_t name_len)
+{
+	size_t retlen = XATTR_TRUSTED_PREFIX_LEN + name_len + 1;
+
+	if (list && retlen<=list_size) {
+		strcpy(list, XATTR_TRUSTED_PREFIX);
+		strcpy(list + XATTR_TRUSTED_PREFIX_LEN, name);
+	}
+
+	return retlen;
+}
+
+struct xattr_handler jffs2_trusted_xattr_handler = {
+	.prefix = XATTR_TRUSTED_PREFIX,
+	.list = jffs2_trusted_listxattr,
+	.set = jffs2_trusted_setxattr,
+	.get = jffs2_trusted_getxattr
+};
diff --git a/fs/jffs2/xattr_user.c b/fs/jffs2/xattr_user.c
new file mode 100644
index 000000000000..d8c13636ea4c
--- /dev/null
+++ b/fs/jffs2/xattr_user.c
@@ -0,0 +1,51 @@
+/*-------------------------------------------------------------------------*
+ *  File: fs/jffs2/xattr_user.c
+ *  XATTR support on JFFS2 FileSystem
+ *
+ *  Implemented by KaiGai Kohei <kaigai@ak.jp.nec.com>
+ *  Copyright (C) 2006 NEC Corporation
+ *
+ *  For licensing information, see the file 'LICENCE' in the jffs2 directory.
+ *-------------------------------------------------------------------------*/
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/jffs2.h>
+#include <linux/xattr.h>
+#include <linux/mtd/mtd.h>
+#include "nodelist.h"
+
+static int jffs2_user_getxattr(struct inode *inode, const char *name,
+                               void *buffer, size_t size)
+{
+	if (!strcmp(name, ""))
+		return -EINVAL;
+	return do_jffs2_getxattr(inode, JFFS2_XPREFIX_USER, name, buffer, size);
+}
+
+static int jffs2_user_setxattr(struct inode *inode, const char *name, const void *buffer,
+                               size_t size, int flags)
+{
+	if (!strcmp(name, ""))
+		return -EINVAL;
+	return do_jffs2_setxattr(inode, JFFS2_XPREFIX_USER, name, buffer, size, flags);
+}
+
+static size_t jffs2_user_listxattr(struct inode *inode, char *list, size_t list_size,
+				   const char *name, size_t name_len)
+{
+	size_t retlen = XATTR_USER_PREFIX_LEN + name_len + 1;
+
+	if (list && retlen <= list_size) {
+		strcpy(list, XATTR_USER_PREFIX);
+		strcpy(list + XATTR_USER_PREFIX_LEN, name);
+	}
+
+	return retlen;
+}
+
+struct xattr_handler jffs2_user_xattr_handler = {
+	.prefix = XATTR_USER_PREFIX,
+	.list = jffs2_user_listxattr,
+	.set = jffs2_user_setxattr,
+	.get = jffs2_user_getxattr
+};
diff --git a/include/linux/jffs2.h b/include/linux/jffs2.h
index cf792bb3c726..2cac60e55322 100644
--- a/include/linux/jffs2.h
+++ b/include/linux/jffs2.h
@@ -65,6 +65,18 @@
 
 #define JFFS2_NODETYPE_SUMMARY (JFFS2_FEATURE_RWCOMPAT_DELETE | JFFS2_NODE_ACCURATE | 6)
 
+#define JFFS2_NODETYPE_XATTR (JFFS2_FEATURE_INCOMPAT | JFFS2_NODE_ACCURATE | 8)
+#define JFFS2_NODETYPE_XREF (JFFS2_FEATURE_INCOMPAT | JFFS2_NODE_ACCURATE | 9)
+
+/* XATTR Related */
+#define JFFS2_XPREFIX_USER		1	/* for "user." */
+#define JFFS2_XPREFIX_SECURITY		2	/* for "security." */
+#define JFFS2_XPREFIX_ACL_ACCESS	3	/* for "system.posix_acl_access" */
+#define JFFS2_XPREFIX_ACL_DEFAULT	4	/* for "system.posix_acl_default" */
+#define JFFS2_XPREFIX_TRUSTED		5	/* for "trusted.*" */
+
+#define JFFS2_ACL_VERSION		0x0001
+
 // Maybe later...
 //#define JFFS2_NODETYPE_CHECKPOINT (JFFS2_FEATURE_RWCOMPAT_DELETE | JFFS2_NODE_ACCURATE | 3)
 //#define JFFS2_NODETYPE_OPTIONS (JFFS2_FEATURE_RWCOMPAT_COPY | JFFS2_NODE_ACCURATE | 4)
@@ -151,6 +163,32 @@ struct jffs2_raw_inode
 	uint8_t data[0];
 } __attribute__((packed));
 
+struct jffs2_raw_xattr {
+	jint16_t magic;
+	jint16_t nodetype;	/* = JFFS2_NODETYPE_XATTR */
+	jint32_t totlen;
+	jint32_t hdr_crc;
+	jint32_t xid;		/* XATTR identifier number */
+	jint32_t version;
+	uint8_t xprefix;
+	uint8_t name_len;
+	jint16_t value_len;
+	jint32_t data_crc;
+	jint32_t node_crc;
+	uint8_t data[0];
+} __attribute__((packed));
+
+struct jffs2_raw_xref
+{
+	jint16_t magic;
+	jint16_t nodetype;	/* = JFFS2_NODETYPE_XREF */
+	jint32_t totlen;
+	jint32_t hdr_crc;
+	jint32_t ino;		/* inode number */
+	jint32_t xid;		/* XATTR identifier number */
+	jint32_t node_crc;
+} __attribute__((packed));
+
 struct jffs2_raw_summary
 {
 	jint16_t magic;
@@ -169,6 +207,8 @@ union jffs2_node_union
 {
 	struct jffs2_raw_inode i;
 	struct jffs2_raw_dirent d;
+	struct jffs2_raw_xattr x;
+	struct jffs2_raw_xref r;
 	struct jffs2_raw_summary s;
 	struct jffs2_unknown_node u;
 };
-- 
cgit v1.2.3


From 151e76590f66f5406eb2e1f4270c5323f385d2e8 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Sun, 14 May 2006 01:51:54 +0100
Subject: [MTD] Fix legacy character sets throughout drivers/mtd,
 include/linux/mtd

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 drivers/mtd/chips/cfi_cmdset_0001.c | 2 +-
 drivers/mtd/chips/cfi_probe.c       | 8 ++++----
 drivers/mtd/devices/block2mtd.c     | 2 +-
 drivers/mtd/devices/phram.c         | 6 +++---
 drivers/mtd/maps/cfi_flagadm.c      | 4 ++--
 drivers/mtd/maps/dbox2-flash.c      | 2 +-
 drivers/mtd/maps/mtx-1_flash.c      | 2 +-
 drivers/mtd/nand/edb7312.c          | 2 +-
 drivers/mtd/nand/h1910.c            | 2 +-
 drivers/mtd/nand/ts7250.c           | 2 +-
 include/linux/mtd/mtd.h             | 2 +-
 11 files changed, 17 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/chips/cfi_cmdset_0001.c b/drivers/mtd/chips/cfi_cmdset_0001.c
index fe00af3f9195..898c321ab86d 100644
--- a/drivers/mtd/chips/cfi_cmdset_0001.c
+++ b/drivers/mtd/chips/cfi_cmdset_0001.c
@@ -1475,7 +1475,7 @@ static int __xipram do_write_buffer(struct map_info *map, struct flchip *chip,
 	ENABLE_VPP(map);
 	xip_disable(map, chip, cmd_adr);
 
-	/* �4.8 of the 28FxxxJ3A datasheet says "Any time SR.4 and/or SR.5 is set
+	/* §4.8 of the 28FxxxJ3A datasheet says "Any time SR.4 and/or SR.5 is set
 	   [...], the device will not accept any more Write to Buffer commands".
 	   So we must check here and reset those bits if they're set. Otherwise
 	   we're just pissing in the wind */
diff --git a/drivers/mtd/chips/cfi_probe.c b/drivers/mtd/chips/cfi_probe.c
index e636aa86bc24..4bf9f8cac0dd 100644
--- a/drivers/mtd/chips/cfi_probe.c
+++ b/drivers/mtd/chips/cfi_probe.c
@@ -349,12 +349,12 @@ static void print_cfi_ident(struct cfi_ident *cfip)
 	else
 		printk("No Vpp line\n");
 
-	printk("Typical byte/word write timeout: %d �s\n", 1<<cfip->WordWriteTimeoutTyp);
-	printk("Maximum byte/word write timeout: %d �s\n", (1<<cfip->WordWriteTimeoutMax) * (1<<cfip->WordWriteTimeoutTyp));
+	printk("Typical byte/word write timeout: %d µs\n", 1<<cfip->WordWriteTimeoutTyp);
+	printk("Maximum byte/word write timeout: %d µs\n", (1<<cfip->WordWriteTimeoutMax) * (1<<cfip->WordWriteTimeoutTyp));
 
 	if (cfip->BufWriteTimeoutTyp || cfip->BufWriteTimeoutMax) {
-		printk("Typical full buffer write timeout: %d �s\n", 1<<cfip->BufWriteTimeoutTyp);
-		printk("Maximum full buffer write timeout: %d �s\n", (1<<cfip->BufWriteTimeoutMax) * (1<<cfip->BufWriteTimeoutTyp));
+		printk("Typical full buffer write timeout: %d µs\n", 1<<cfip->BufWriteTimeoutTyp);
+		printk("Maximum full buffer write timeout: %d µs\n", (1<<cfip->BufWriteTimeoutMax) * (1<<cfip->BufWriteTimeoutTyp));
 	}
 	else
 		printk("Full buffer write not supported\n");
diff --git a/drivers/mtd/devices/block2mtd.c b/drivers/mtd/devices/block2mtd.c
index 45606921364b..8ca04f4e03f0 100644
--- a/drivers/mtd/devices/block2mtd.c
+++ b/drivers/mtd/devices/block2mtd.c
@@ -4,7 +4,7 @@
  * block2mtd.c - create an mtd from a block device
  *
  * Copyright (C) 2001,2002	Simon Evans <spse@secret.org.uk>
- * Copyright (C) 2004-2006	J�rn Engel <joern@wh.fh-wedel.de>
+ * Copyright (C) 2004-2006	Jörn Engel <joern@wh.fh-wedel.de>
  *
  * Licence: GPL
  */
diff --git a/drivers/mtd/devices/phram.c b/drivers/mtd/devices/phram.c
index 68d39cc9df71..e09e416667d3 100644
--- a/drivers/mtd/devices/phram.c
+++ b/drivers/mtd/devices/phram.c
@@ -1,8 +1,8 @@
 /**
  * $Id: phram.c,v 1.16 2005/11/07 11:14:25 gleixner Exp $
  *
- * Copyright (c) ????		Jochen Sch�uble <psionic@psionic.de>
- * Copyright (c) 2003-2004	J�rn Engel <joern@wh.fh-wedel.de>
+ * Copyright (c) ????		Jochen Schäuble <psionic@psionic.de>
+ * Copyright (c) 2003-2004	Jörn Engel <joern@wh.fh-wedel.de>
  *
  * Usage:
  *
@@ -300,5 +300,5 @@ module_init(init_phram);
 module_exit(cleanup_phram);
 
 MODULE_LICENSE("GPL");
-MODULE_AUTHOR("J�rn Engel <joern@wh.fh-wedel.de>");
+MODULE_AUTHOR("Jörn Engel <joern@wh.fh-wedel.de>");
 MODULE_DESCRIPTION("MTD driver for physical RAM");
diff --git a/drivers/mtd/maps/cfi_flagadm.c b/drivers/mtd/maps/cfi_flagadm.c
index fd0f0d3187de..92b5d883d7b0 100644
--- a/drivers/mtd/maps/cfi_flagadm.c
+++ b/drivers/mtd/maps/cfi_flagadm.c
@@ -1,5 +1,5 @@
 /*
- *  Copyright � 2001 Flaga hf. Medical Devices, K�ri Dav��sson <kd@flaga.is>
+ *  Copyright © 2001 Flaga hf. Medical Devices, Kári Davíðsson <kd@flaga.is>
  *
  *  $Id: cfi_flagadm.c,v 1.15 2005/11/07 11:14:26 gleixner Exp $
  *
@@ -135,5 +135,5 @@ module_exit(cleanup_flagadm);
 
 
 MODULE_LICENSE("GPL");
-MODULE_AUTHOR("K�ri Dav��sson <kd@flaga.is>");
+MODULE_AUTHOR("Kári Davíðsson <kd@flaga.is>");
 MODULE_DESCRIPTION("MTD map driver for Flaga digital module");
diff --git a/drivers/mtd/maps/dbox2-flash.c b/drivers/mtd/maps/dbox2-flash.c
index 652813cd6c2d..85c2a9e22b1e 100644
--- a/drivers/mtd/maps/dbox2-flash.c
+++ b/drivers/mtd/maps/dbox2-flash.c
@@ -122,5 +122,5 @@ module_exit(cleanup_dbox2_flash);
 
 
 MODULE_LICENSE("GPL");
-MODULE_AUTHOR("K�ri Dav��sson <kd@flaga.is>, Bastian Blank <waldi@tuxbox.org>, Alexander Wild <wild@te-elektronik.com>");
+MODULE_AUTHOR("Kári Davíðsson <kd@flaga.is>, Bastian Blank <waldi@tuxbox.org>, Alexander Wild <wild@te-elektronik.com>");
 MODULE_DESCRIPTION("MTD map driver for D-Box 2 board");
diff --git a/drivers/mtd/maps/mtx-1_flash.c b/drivers/mtd/maps/mtx-1_flash.c
index d1e66e186746..5c25d4e552c6 100644
--- a/drivers/mtd/maps/mtx-1_flash.c
+++ b/drivers/mtd/maps/mtx-1_flash.c
@@ -4,7 +4,7 @@
  * $Id: mtx-1_flash.c,v 1.2 2005/11/07 11:14:27 gleixner Exp $
  *
  * (C) 2005 Bruno Randolf <bruno.randolf@4g-systems.biz>
- * (C) 2005 J�rn Engel <joern@wohnheim.fh-wedel.de>
+ * (C) 2005 Jörn Engel <joern@wohnheim.fh-wedel.de>
  *
  */
 
diff --git a/drivers/mtd/nand/edb7312.c b/drivers/mtd/nand/edb7312.c
index ad4488abfb0b..8e56570af91f 100644
--- a/drivers/mtd/nand/edb7312.c
+++ b/drivers/mtd/nand/edb7312.c
@@ -1,7 +1,7 @@
 /*
  *  drivers/mtd/nand/edb7312.c
  *
- *  Copyright (C) 2002 Marius Gr�ger (mag@sysgo.de)
+ *  Copyright (C) 2002 Marius Gröger (mag@sysgo.de)
  *
  *  Derived from drivers/mtd/nand/autcpu12.c
  *       Copyright (c) 2001 Thomas Gleixner (gleixner@autronix.de)
diff --git a/drivers/mtd/nand/h1910.c b/drivers/mtd/nand/h1910.c
index b47a15c23d1c..9848eb09b884 100644
--- a/drivers/mtd/nand/h1910.c
+++ b/drivers/mtd/nand/h1910.c
@@ -4,7 +4,7 @@
  *  Copyright (C) 2003 Joshua Wise (joshua@joshuawise.com)
  *
  *  Derived from drivers/mtd/nand/edb7312.c
- *       Copyright (C) 2002 Marius Gr�ger (mag@sysgo.de)
+ *       Copyright (C) 2002 Marius Gröger (mag@sysgo.de)
  *       Copyright (c) 2001 Thomas Gleixner (gleixner@autronix.de)
  *
  * $Id: h1910.c,v 1.6 2005/11/07 11:14:30 gleixner Exp $
diff --git a/drivers/mtd/nand/ts7250.c b/drivers/mtd/nand/ts7250.c
index 756ef64b0efd..622db3127f7c 100644
--- a/drivers/mtd/nand/ts7250.c
+++ b/drivers/mtd/nand/ts7250.c
@@ -4,7 +4,7 @@
  * Copyright (C) 2004 Technologic Systems (support@embeddedARM.com)
  *
  * Derived from drivers/mtd/nand/edb7312.c
- *   Copyright (C) 2004 Marius Gr�ger (mag@sysgo.de)
+ *   Copyright (C) 2004 Marius Gröger (mag@sysgo.de)
  *
  * Derived from drivers/mtd/nand/autcpu12.c
  *   Copyright (c) 2001 Thomas Gleixner (gleixner@autronix.de)
diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h
index b6f2fdae65c6..73620ef83364 100644
--- a/include/linux/mtd/mtd.h
+++ b/include/linux/mtd/mtd.h
@@ -61,7 +61,7 @@ struct mtd_info {
 	u_int32_t flags;
 	u_int32_t size;	 // Total size of the MTD
 
-	/* "Major" erase size for the device. Na�ve users may take this
+	/* "Major" erase size for the device. Naïve users may take this
 	 * to be the only erase size available, or may use the more detailed
 	 * information below if they desire
 	 */
-- 
cgit v1.2.3


From 0d4e30d26a279f1b6a008a233a6835ad2af571e4 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Sun, 14 May 2006 12:25:19 +0100
Subject: [MTD] Clean up <linux/mtd/physmap.h> to fix modular build

... and also fix the multiple inclusion guard so it actually _works_

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 include/linux/mtd/physmap.h | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mtd/physmap.h b/include/linux/mtd/physmap.h
index 50f954461aa8..86831e3594f6 100644
--- a/include/linux/mtd/physmap.h
+++ b/include/linux/mtd/physmap.h
@@ -15,10 +15,7 @@
  */
 
 #ifndef __LINUX_MTD_PHYSMAP__
-
-#include <linux/config.h>
-
-#if defined(CONFIG_MTD_PHYSMAP)
+#define __LINUX_MTD_PHYSMAP__
 
 #include <linux/mtd/mtd.h>
 #include <linux/mtd/map.h>
@@ -37,7 +34,7 @@ struct physmap_flash_data {
 void physmap_configure(unsigned long addr, unsigned long size,
 		int bankwidth, void (*set_vpp)(struct map_info *, int) );
 
-#if defined(CONFIG_MTD_PARTITIONS)
+#ifdef CONFIG_MTD_PARTITIONS
 
 /*
  * Machines that wish to do flash partition may want to call this function in
@@ -51,6 +48,5 @@ void physmap_configure(unsigned long addr, unsigned long size,
 void physmap_set_partitions(struct mtd_partition *parts, int num_parts);
 
 #endif /* defined(CONFIG_MTD_PARTITIONS) */
-#endif /* defined(CONFIG_MTD) */
 
 #endif /* __LINUX_MTD_PHYSMAP__ */
-- 
cgit v1.2.3


From 3e68fbb59b3d4e6b47b65e9928b5929e02179759 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Mon, 15 May 2006 00:49:43 +0100
Subject: [JFFS2] Don't pack on-medium structures, because GCC emits crappy
 code

If we use __attribute__((packed)), GCC will _also_ assume that the
structures aren't sensibly aligned, and it'll emit code to cope with
that instead of straight word load/save. This can be _very_ suboptimal
on architectures like ARM.

Ideally, we want an attribute which just tells GCC not to do any
padding, without the alignment side-effects. In the absense of that,
we'll just drop the 'packed' attribute and hope that everything stays as
it was (which to be fair is fairly much what we expect). And add some
paranoia checks in the initialisation code, which should be optimised
away completely in the normal case.

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 fs/jffs2/super.c      | 14 +++++++++++++-
 include/linux/jffs2.h | 14 +++++++-------
 2 files changed, 20 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index ffd8e84b22cc..5f73de586928 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -320,6 +320,18 @@ static int __init init_jffs2_fs(void)
 {
 	int ret;
 
+	/* Paranoia checks for on-medium structures. If we ask GCC
+	   to pack them with __attribute__((packed)) then it _also_
+	   assumes that they're not aligned -- so it emits crappy
+	   code on some architectures. Ideally we want an attribute
+	   which means just 'no padding', without the alignment
+	   thing. But GCC doesn't have that -- we have to just
+	   hope the structs are the right sizes, instead. */
+	BUG_ON(sizeof(struct jffs2_unknown_node) != 12);
+	BUG_ON(sizeof(struct jffs2_raw_dirent) != 40);
+	BUG_ON(sizeof(struct jffs2_raw_inode) != 68);
+	BUG_ON(sizeof(struct jffs2_raw_summary) != 32);
+
 	printk(KERN_INFO "JFFS2 version 2.2."
 #ifdef CONFIG_JFFS2_FS_WRITEBUFFER
 	       " (NAND)"
@@ -327,7 +339,7 @@ static int __init init_jffs2_fs(void)
 #ifdef CONFIG_JFFS2_SUMMARY
 	       " (SUMMARY) "
 #endif
-	       " (C) 2001-2003 Red Hat, Inc.\n");
+	       " (C) 2001-2006 Red Hat, Inc.\n");
 
 	jffs2_inode_cachep = kmem_cache_create("jffs2_i",
 					     sizeof(struct jffs2_inode_info),
diff --git a/include/linux/jffs2.h b/include/linux/jffs2.h
index cf792bb3c726..228ad72f7dd8 100644
--- a/include/linux/jffs2.h
+++ b/include/linux/jffs2.h
@@ -82,15 +82,15 @@
 
 typedef struct {
 	uint32_t v32;
-} __attribute__((packed))  jint32_t;
+} jint32_t;
 
 typedef struct {
 	uint32_t m;
-} __attribute__((packed))  jmode_t;
+} jmode_t;
 
 typedef struct {
 	uint16_t v16;
-} __attribute__((packed)) jint16_t;
+} jint16_t;
 
 struct jffs2_unknown_node
 {
@@ -99,7 +99,7 @@ struct jffs2_unknown_node
 	jint16_t nodetype;
 	jint32_t totlen; /* So we can skip over nodes we don't grok */
 	jint32_t hdr_crc;
-} __attribute__((packed));
+};
 
 struct jffs2_raw_dirent
 {
@@ -117,7 +117,7 @@ struct jffs2_raw_dirent
 	jint32_t node_crc;
 	jint32_t name_crc;
 	uint8_t name[0];
-} __attribute__((packed));
+};
 
 /* The JFFS2 raw inode structure: Used for storage on physical media.  */
 /* The uid, gid, atime, mtime and ctime members could be longer, but
@@ -149,7 +149,7 @@ struct jffs2_raw_inode
 	jint32_t data_crc;   /* CRC for the (compressed) data.  */
 	jint32_t node_crc;   /* CRC for the raw inode (excluding data)  */
 	uint8_t data[0];
-} __attribute__((packed));
+};
 
 struct jffs2_raw_summary
 {
@@ -163,7 +163,7 @@ struct jffs2_raw_summary
 	jint32_t sum_crc;	/* summary information crc */
 	jint32_t node_crc; 	/* node crc */
 	jint32_t sum[0]; 	/* inode summary info */
-} __attribute__((packed));
+};
 
 union jffs2_node_union
 {
-- 
cgit v1.2.3


From 3c567b7d1137633f3ff67cd1df94abc5fd497a85 Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Mon, 15 May 2006 20:57:23 +0900
Subject: [PATCH] libata: rename ata_down_sata_spd_limit() and friends

Rename ata_down_sata_spd_limit() and friends to sata_down_spd_limit()
and likewise for simplicity & consistency.

Signed-off-by: Tejun Heo <htejun@gmail.com>
---
 drivers/scsi/libata-core.c | 32 ++++++++++++++++----------------
 drivers/scsi/libata.h      |  4 ++--
 drivers/scsi/sata_sil24.c  |  2 +-
 include/linux/libata.h     |  2 +-
 4 files changed, 20 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c
index 509178c3700c..196c09ff5e71 100644
--- a/drivers/scsi/libata-core.c
+++ b/drivers/scsi/libata-core.c
@@ -1465,7 +1465,7 @@ static int ata_bus_probe(struct ata_port *ap)
 		tries[dev->devno] = 0;
 		break;
 	case -EIO:
-		ata_down_sata_spd_limit(ap);
+		sata_down_spd_limit(ap);
 		/* fall through */
 	default:
 		tries[dev->devno]--;
@@ -1640,12 +1640,12 @@ void ata_port_disable(struct ata_port *ap)
 }
 
 /**
- *	ata_down_sata_spd_limit - adjust SATA spd limit downward
+ *	sata_down_spd_limit - adjust SATA spd limit downward
  *	@ap: Port to adjust SATA spd limit for
  *
  *	Adjust SATA spd limit of @ap downward.  Note that this
  *	function only adjusts the limit.  The change must be applied
- *	using ata_set_sata_spd().
+ *	using sata_set_spd().
  *
  *	LOCKING:
  *	Inherited from caller.
@@ -1653,7 +1653,7 @@ void ata_port_disable(struct ata_port *ap)
  *	RETURNS:
  *	0 on success, negative errno on failure
  */
-int ata_down_sata_spd_limit(struct ata_port *ap)
+int sata_down_spd_limit(struct ata_port *ap)
 {
 	u32 spd, mask;
 	int highbit;
@@ -1683,7 +1683,7 @@ int ata_down_sata_spd_limit(struct ata_port *ap)
 	return 0;
 }
 
-static int __ata_set_sata_spd_needed(struct ata_port *ap, u32 *scontrol)
+static int __sata_set_spd_needed(struct ata_port *ap, u32 *scontrol)
 {
 	u32 spd, limit;
 
@@ -1699,7 +1699,7 @@ static int __ata_set_sata_spd_needed(struct ata_port *ap, u32 *scontrol)
 }
 
 /**
- *	ata_set_sata_spd_needed - is SATA spd configuration needed
+ *	sata_set_spd_needed - is SATA spd configuration needed
  *	@ap: Port in question
  *
  *	Test whether the spd limit in SControl matches
@@ -1713,7 +1713,7 @@ static int __ata_set_sata_spd_needed(struct ata_port *ap, u32 *scontrol)
  *	RETURNS:
  *	1 if SATA spd configuration is needed, 0 otherwise.
  */
-int ata_set_sata_spd_needed(struct ata_port *ap)
+int sata_set_spd_needed(struct ata_port *ap)
 {
 	u32 scontrol;
 
@@ -1722,11 +1722,11 @@ int ata_set_sata_spd_needed(struct ata_port *ap)
 
 	scontrol = scr_read(ap, SCR_CONTROL);
 
-	return __ata_set_sata_spd_needed(ap, &scontrol);
+	return __sata_set_spd_needed(ap, &scontrol);
 }
 
 /**
- *	ata_set_sata_spd - set SATA spd according to spd limit
+ *	sata_set_spd - set SATA spd according to spd limit
  *	@ap: Port to set SATA spd for
  *
  *	Set SATA spd of @ap according to sata_spd_limit.
@@ -1738,7 +1738,7 @@ int ata_set_sata_spd_needed(struct ata_port *ap)
  *	0 if spd doesn't need to be changed, 1 if spd has been
  *	changed.  -EOPNOTSUPP if SCR registers are inaccessible.
  */
-int ata_set_sata_spd(struct ata_port *ap)
+int sata_set_spd(struct ata_port *ap)
 {
 	u32 scontrol;
 
@@ -1746,7 +1746,7 @@ int ata_set_sata_spd(struct ata_port *ap)
 		return -EOPNOTSUPP;
 
 	scontrol = scr_read(ap, SCR_CONTROL);
-	if (!__ata_set_sata_spd_needed(ap, &scontrol))
+	if (!__sata_set_spd_needed(ap, &scontrol))
 		return 0;
 
 	scr_write(ap, SCR_CONTROL, scontrol);
@@ -2464,7 +2464,7 @@ int sata_std_hardreset(struct ata_port *ap, unsigned int *class)
 
 	DPRINTK("ENTER\n");
 
-	if (ata_set_sata_spd_needed(ap)) {
+	if (sata_set_spd_needed(ap)) {
 		/* SATA spec says nothing about how to reconfigure
 		 * spd.  To be on the safe side, turn off phy during
 		 * reconfiguration.  This works for at least ICH7 AHCI
@@ -2474,7 +2474,7 @@ int sata_std_hardreset(struct ata_port *ap, unsigned int *class)
 		scontrol = (scontrol & 0x0f0) | 0x302;
 		scr_write_flush(ap, SCR_CONTROL, scontrol);
 
-		ata_set_sata_spd(ap);
+		sata_set_spd(ap);
 	}
 
 	/* issue phy wake/reset */
@@ -2657,7 +2657,7 @@ int ata_drive_probe_reset(struct ata_port *ap, ata_probeinit_fn_t probeinit,
 	if (probeinit)
 		probeinit(ap);
 
-	if (softreset && !ata_set_sata_spd_needed(ap)) {
+	if (softreset && !sata_set_spd_needed(ap)) {
 		rc = ata_do_reset(ap, softreset, postreset, classes);
 		if (rc == 0 && classes[0] != ATA_DEV_UNKNOWN)
 			goto done;
@@ -2677,7 +2677,7 @@ int ata_drive_probe_reset(struct ata_port *ap, ata_probeinit_fn_t probeinit,
 			break;
 		}
 
-		if (ata_down_sata_spd_limit(ap))
+		if (sata_down_spd_limit(ap))
 			goto done;
 
 		printk(KERN_INFO "ata%u: hardreset failed, will retry "
@@ -5113,7 +5113,7 @@ EXPORT_SYMBOL_GPL(ata_bmdma_irq_clear);
 EXPORT_SYMBOL_GPL(ata_bmdma_status);
 EXPORT_SYMBOL_GPL(ata_bmdma_stop);
 EXPORT_SYMBOL_GPL(ata_port_probe);
-EXPORT_SYMBOL_GPL(ata_set_sata_spd);
+EXPORT_SYMBOL_GPL(sata_set_spd);
 EXPORT_SYMBOL_GPL(sata_phy_reset);
 EXPORT_SYMBOL_GPL(__sata_phy_reset);
 EXPORT_SYMBOL_GPL(ata_bus_reset);
diff --git a/drivers/scsi/libata.h b/drivers/scsi/libata.h
index 3f8b0a863781..26975df9a3fc 100644
--- a/drivers/scsi/libata.h
+++ b/drivers/scsi/libata.h
@@ -51,8 +51,8 @@ extern void ata_port_flush_task(struct ata_port *ap);
 extern unsigned ata_exec_internal(struct ata_port *ap, struct ata_device *dev,
 				  struct ata_taskfile *tf, const u8 *cdb,
 				  int dma_dir, void *buf, unsigned int buflen);
-extern int ata_down_sata_spd_limit(struct ata_port *ap);
-extern int ata_set_sata_spd_needed(struct ata_port *ap);
+extern int sata_down_spd_limit(struct ata_port *ap);
+extern int sata_set_spd_needed(struct ata_port *ap);
 extern int ata_down_xfermask_limit(struct ata_port *ap, struct ata_device *dev,
 				   int force_pio0);
 extern int ata_set_mode(struct ata_port *ap, struct ata_device **r_failed_dev);
diff --git a/drivers/scsi/sata_sil24.c b/drivers/scsi/sata_sil24.c
index e9fd869140c5..8c167250f4c9 100644
--- a/drivers/scsi/sata_sil24.c
+++ b/drivers/scsi/sata_sil24.c
@@ -528,7 +528,7 @@ static int sil24_hardreset(struct ata_port *ap, unsigned int *class)
 	u32 tmp;
 
 	/* sil24 does the right thing(tm) without any protection */
-	ata_set_sata_spd(ap);
+	sata_set_spd(ap);
 
 	tout_msec = 100;
 	if (sata_dev_present(ap))
diff --git a/include/linux/libata.h b/include/linux/libata.h
index d35b1e3bb7e0..0b67aafd3878 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -504,7 +504,7 @@ extern void ata_port_probe(struct ata_port *);
 extern void __sata_phy_reset(struct ata_port *ap);
 extern void sata_phy_reset(struct ata_port *ap);
 extern void ata_bus_reset(struct ata_port *ap);
-extern int ata_set_sata_spd(struct ata_port *ap);
+extern int sata_set_spd(struct ata_port *ap);
 extern int ata_drive_probe_reset(struct ata_port *ap,
 			ata_probeinit_fn_t probeinit,
 			ata_reset_fn_t softreset, ata_reset_fn_t hardreset,
-- 
cgit v1.2.3


From 6cd727b14f1a6cdcb088d1067c1ba0ba124806a7 Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Mon, 15 May 2006 20:57:28 +0900
Subject: [PATCH] libata: kill duplicate prototypes

Kill duplicate prototypes for ata_eh_qc_complete/retry() in libata.h.

Signed-off-by: Tejun Heo <htejun@gmail.com>
---
 include/linux/libata.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/libata.h b/include/linux/libata.h
index 0b67aafd3878..220b9d7bfc28 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -530,8 +530,6 @@ extern void ata_host_set_remove(struct ata_host_set *host_set);
 extern int ata_scsi_detect(struct scsi_host_template *sht);
 extern int ata_scsi_ioctl(struct scsi_device *dev, int cmd, void __user *arg);
 extern int ata_scsi_queuecmd(struct scsi_cmnd *cmd, void (*done)(struct scsi_cmnd *));
-extern void ata_eh_qc_complete(struct ata_queued_cmd *qc);
-extern void ata_eh_qc_retry(struct ata_queued_cmd *qc);
 extern int ata_scsi_release(struct Scsi_Host *host);
 extern unsigned int ata_host_intr(struct ata_port *ap, struct ata_queued_cmd *qc);
 extern int ata_scsi_device_resume(struct scsi_device *);
-- 
cgit v1.2.3


From fe635c7e91036282e4fd0cc5b4eebc712e43270d Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Mon, 15 May 2006 20:57:35 +0900
Subject: [PATCH] libata: use preallocated buffers

It's not a very good idea to allocate memory during EH.  Use
statically allocated buffer for dev->id[] and add 512byte buffer
ap->sector_buf.  This buffer is owned by EH (or probing) and to be
used as temporary buffer for various purposes (IDENTIFY, NCQ log page
10h, PM GSCR block).

Signed-off-by: Tejun Heo <htejun@gmail.com>
---
 drivers/scsi/libata-core.c | 32 ++++++++------------------------
 include/linux/libata.h     |  4 +++-
 2 files changed, 11 insertions(+), 25 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c
index 13bce43f1915..af55861a96e2 100644
--- a/drivers/scsi/libata-core.c
+++ b/drivers/scsi/libata-core.c
@@ -1099,7 +1099,7 @@ unsigned int ata_pio_need_iordy(const struct ata_device *adev)
  *	@dev: target device
  *	@p_class: pointer to class of the target device (may be changed)
  *	@post_reset: is this read ID post-reset?
- *	@p_id: read IDENTIFY page (newly allocated)
+ *	@id: buffer to read IDENTIFY data into
  *
  *	Read ID data from the specified device.  ATA_CMD_ID_ATA is
  *	performed on ATA devices and ATA_CMD_ID_ATAPI on ATAPI
@@ -1113,12 +1113,11 @@ unsigned int ata_pio_need_iordy(const struct ata_device *adev)
  *	0 on success, -errno otherwise.
  */
 static int ata_dev_read_id(struct ata_port *ap, struct ata_device *dev,
-			   unsigned int *p_class, int post_reset, u16 **p_id)
+			   unsigned int *p_class, int post_reset, u16 *id)
 {
 	unsigned int class = *p_class;
 	struct ata_taskfile tf;
 	unsigned int err_mask = 0;
-	u16 *id;
 	const char *reason;
 	int rc;
 
@@ -1126,13 +1125,6 @@ static int ata_dev_read_id(struct ata_port *ap, struct ata_device *dev,
 
 	ata_dev_select(ap, dev->devno, 1, 1); /* select device 0/1 */
 
-	id = kmalloc(sizeof(id[0]) * ATA_ID_WORDS, GFP_KERNEL);
-	if (id == NULL) {
-		rc = -ENOMEM;
-		reason = "out of memory";
-		goto err_out;
-	}
-
  retry:
 	ata_tf_init(ap, &tf, dev->devno);
 
@@ -1194,13 +1186,12 @@ static int ata_dev_read_id(struct ata_port *ap, struct ata_device *dev,
 	}
 
 	*p_class = class;
-	*p_id = id;
+
 	return 0;
 
  err_out:
 	printk(KERN_WARNING "ata%u: dev %u failed to IDENTIFY (%s)\n",
 	       ap->id, dev->devno, reason);
-	kfree(id);
 	return rc;
 }
 
@@ -1425,9 +1416,7 @@ static int ata_bus_probe(struct ata_port *ap)
 		if (!ata_dev_enabled(dev))
 			continue;
 
-		kfree(dev->id);
-		dev->id = NULL;
-		rc = ata_dev_read_id(ap, dev, &dev->class, 1, &dev->id);
+		rc = ata_dev_read_id(ap, dev, &dev->class, 1, dev->id);
 		if (rc)
 			goto fail;
 
@@ -2788,7 +2777,7 @@ int ata_dev_revalidate(struct ata_port *ap, struct ata_device *dev,
 		       int post_reset)
 {
 	unsigned int class = dev->class;
-	u16 *id = NULL;
+	u16 *id = (void *)ap->sector_buf;
 	int rc;
 
 	if (!ata_dev_enabled(dev)) {
@@ -2796,8 +2785,8 @@ int ata_dev_revalidate(struct ata_port *ap, struct ata_device *dev,
 		goto fail;
 	}
 
-	/* allocate & read ID data */
-	rc = ata_dev_read_id(ap, dev, &class, post_reset, &id);
+	/* read ID data */
+	rc = ata_dev_read_id(ap, dev, &class, post_reset, id);
 	if (rc)
 		goto fail;
 
@@ -2807,8 +2796,7 @@ int ata_dev_revalidate(struct ata_port *ap, struct ata_device *dev,
 		goto fail;
 	}
 
-	kfree(dev->id);
-	dev->id = id;
+	memcpy(dev->id, id, sizeof(id[0]) * ATA_ID_WORDS);
 
 	/* configure device according to the new ID */
 	rc = ata_dev_configure(ap, dev, 0);
@@ -2818,7 +2806,6 @@ int ata_dev_revalidate(struct ata_port *ap, struct ata_device *dev,
  fail:
 	printk(KERN_ERR "ata%u: dev %u revalidation failed (errno=%d)\n",
 	       ap->id, dev->devno, rc);
-	kfree(id);
 	return rc;
 }
 
@@ -4873,14 +4860,11 @@ void ata_host_set_remove(struct ata_host_set *host_set)
 int ata_scsi_release(struct Scsi_Host *host)
 {
 	struct ata_port *ap = ata_shost_to_port(host);
-	int i;
 
 	DPRINTK("ENTER\n");
 
 	ap->ops->port_disable(ap);
 	ata_host_remove(ap, 0);
-	for (i = 0; i < ATA_MAX_DEVICES; i++)
-		kfree(ap->device[i].id);
 
 	DPRINTK("EXIT\n");
 	return 1;
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 220b9d7bfc28..0e1a3be39475 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -360,7 +360,7 @@ struct ata_device {
 	unsigned long		flags;		/* ATA_DFLAG_xxx */
 	unsigned int		class;		/* ATA_DEV_xxx */
 	unsigned int		devno;		/* 0 or 1 */
-	u16			*id;		/* IDENTIFY xxx DEVICE data */
+	u16			id[ATA_ID_WORDS]; /* IDENTIFY xxx DEVICE data */
 	u8			pio_mode;
 	u8			dma_mode;
 	u8			xfer_mode;
@@ -425,6 +425,8 @@ struct ata_port {
 	struct list_head	eh_done_q;
 
 	void			*private_data;
+
+	u8			sector_buf[ATA_SECT_SIZE]; /* owned by EH */
 };
 
 struct ata_port_operations {
-- 
cgit v1.2.3


From e61e067227bc76b4d9411a50d735c9d87f27b0e2 Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Mon, 15 May 2006 20:57:40 +0900
Subject: [PATCH] libata: implement qc->result_tf

Add qc->result_tf and ATA_QCFLAG_RESULT_TF.  This moves the
responsibility of loading result TF from post-compltion path to qc
execution path.  qc->result_tf is loaded if explicitly requested or
the qc failsa.  This allows more efficient completion implementation
and correct handling of result TF for controllers which don't have
global TF representation such as sil3124/32.

Signed-off-by: Tejun Heo <htejun@gmail.com>
---
 drivers/scsi/libata-core.c |  4 ++--
 drivers/scsi/libata-scsi.c | 18 +++++++-----------
 include/linux/libata.h     | 16 ++++++++++++++--
 3 files changed, 23 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c
index 19ae3fa1cfcb..51cb9ca5519c 100644
--- a/drivers/scsi/libata-core.c
+++ b/drivers/scsi/libata-core.c
@@ -955,7 +955,6 @@ void ata_qc_complete_internal(struct ata_queued_cmd *qc)
 {
 	struct completion *waiting = qc->private_data;
 
-	qc->ap->ops->tf_read(qc->ap, &qc->tf);
 	complete(waiting);
 }
 
@@ -997,6 +996,7 @@ unsigned ata_exec_internal(struct ata_port *ap, struct ata_device *dev,
 	qc->tf = *tf;
 	if (cdb)
 		memcpy(qc->cdb, cdb, ATAPI_CDB_LEN);
+	qc->flags |= ATA_QCFLAG_RESULT_TF;
 	qc->dma_dir = dma_dir;
 	if (dma_dir != DMA_NONE) {
 		ata_sg_init_one(qc, buf, buflen);
@@ -1034,7 +1034,7 @@ unsigned ata_exec_internal(struct ata_port *ap, struct ata_device *dev,
 	/* finish up */
 	spin_lock_irqsave(&ap->host_set->lock, flags);
 
-	*tf = qc->tf;
+	*tf = qc->result_tf;
 	err_mask = qc->err_mask;
 
 	ata_qc_free(qc);
diff --git a/drivers/scsi/libata-scsi.c b/drivers/scsi/libata-scsi.c
index b0c83c28d578..ce90b6352a81 100644
--- a/drivers/scsi/libata-scsi.c
+++ b/drivers/scsi/libata-scsi.c
@@ -537,7 +537,7 @@ void ata_to_sense_error(unsigned id, u8 drv_stat, u8 drv_err, u8 *sk, u8 *asc,
 void ata_gen_ata_desc_sense(struct ata_queued_cmd *qc)
 {
 	struct scsi_cmnd *cmd = qc->scsicmd;
-	struct ata_taskfile *tf = &qc->tf;
+	struct ata_taskfile *tf = &qc->result_tf;
 	unsigned char *sb = cmd->sense_buffer;
 	unsigned char *desc = sb + 8;
 
@@ -608,7 +608,7 @@ void ata_gen_ata_desc_sense(struct ata_queued_cmd *qc)
 void ata_gen_fixed_sense(struct ata_queued_cmd *qc)
 {
 	struct scsi_cmnd *cmd = qc->scsicmd;
-	struct ata_taskfile *tf = &qc->tf;
+	struct ata_taskfile *tf = &qc->result_tf;
 	unsigned char *sb = cmd->sense_buffer;
 
 	memset(sb, 0, SCSI_SENSE_BUFFERSIZE);
@@ -1199,14 +1199,11 @@ static void ata_scsi_qc_complete(struct ata_queued_cmd *qc)
 	 */
 	if (((cdb[0] == ATA_16) || (cdb[0] == ATA_12)) &&
  	    ((cdb[2] & 0x20) || need_sense)) {
-		qc->ap->ops->tf_read(qc->ap, &qc->tf);
  		ata_gen_ata_desc_sense(qc);
 	} else {
 		if (!need_sense) {
 			cmd->result = SAM_STAT_GOOD;
 		} else {
-			qc->ap->ops->tf_read(qc->ap, &qc->tf);
-
 			/* TODO: decide which descriptor format to use
 			 * for 48b LBA devices and call that here
 			 * instead of the fixed desc, which is only
@@ -1217,10 +1214,8 @@ static void ata_scsi_qc_complete(struct ata_queued_cmd *qc)
 		}
 	}
 
-	if (need_sense) {
-		/* The ata_gen_..._sense routines fill in tf */
-		ata_dump_status(qc->ap->id, &qc->tf);
-	}
+	if (need_sense)
+		ata_dump_status(qc->ap->id, &qc->result_tf);
 
 	qc->scsidone(cmd);
 
@@ -2004,7 +1999,6 @@ static void atapi_sense_complete(struct ata_queued_cmd *qc)
 		 * a sense descriptors, since that's only
 		 * correct for ATA, not ATAPI
 		 */
-		qc->ap->ops->tf_read(qc->ap, &qc->tf);
 		ata_gen_ata_desc_sense(qc);
 	}
 
@@ -2080,7 +2074,6 @@ static void atapi_qc_complete(struct ata_queued_cmd *qc)
 		 * a sense descriptors, since that's only
 		 * correct for ATA, not ATAPI
 		 */
-		qc->ap->ops->tf_read(qc->ap, &qc->tf);
 		ata_gen_ata_desc_sense(qc);
 	} else {
 		u8 *scsicmd = cmd->cmnd;
@@ -2361,6 +2354,9 @@ ata_scsi_pass_thru(struct ata_queued_cmd *qc, const u8 *scsicmd)
 	 */
 	qc->nsect = cmd->bufflen / ATA_SECT_SIZE;
 
+	/* request result TF */
+	qc->flags |= ATA_QCFLAG_RESULT_TF;
+
 	return 0;
 
  invalid_fld:
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 0e1a3be39475..a4b8a419caad 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -162,7 +162,9 @@ enum {
 	ATA_QCFLAG_SINGLE	= (1 << 2), /* no s/g, just a single buffer */
 	ATA_QCFLAG_DMAMAP	= ATA_QCFLAG_SG | ATA_QCFLAG_SINGLE,
 	ATA_QCFLAG_IO		= (1 << 3), /* standard IO command */
-	ATA_QCFLAG_EH_SCHEDULED = (1 << 4), /* EH scheduled */
+	ATA_QCFLAG_RESULT_TF	= (1 << 4), /* result TF requested */
+
+	ATA_QCFLAG_EH_SCHEDULED = (1 << 16), /* EH scheduled */
 
 	/* host set flags */
 	ATA_HOST_SIMPLEX	= (1 << 0),	/* Host is simplex, one DMA channel per host_set only */
@@ -343,7 +345,7 @@ struct ata_queued_cmd {
 	struct scatterlist	*__sg;
 
 	unsigned int		err_mask;
-
+	struct ata_taskfile	result_tf;
 	ata_qc_cb_t		complete_fn;
 
 	void			*private_data;
@@ -824,6 +826,10 @@ static inline void ata_qc_reinit(struct ata_queued_cmd *qc)
 	qc->err_mask = 0;
 
 	ata_tf_init(qc->ap, &qc->tf, qc->dev->devno);
+
+	/* init result_tf such that it indicates normal completion */
+	qc->result_tf.command = ATA_DRDY;
+	qc->result_tf.feature = 0;
 }
 
 /**
@@ -839,9 +845,15 @@ static inline void ata_qc_reinit(struct ata_queued_cmd *qc)
  */
 static inline void ata_qc_complete(struct ata_queued_cmd *qc)
 {
+	struct ata_port *ap = qc->ap;
+
 	if (unlikely(qc->flags & ATA_QCFLAG_EH_SCHEDULED))
 		return;
 
+	/* read result TF if failed or requested */
+	if (qc->err_mask || qc->flags & ATA_QCFLAG_RESULT_TF)
+		ap->ops->tf_read(ap, &qc->result_tf);
+
 	__ata_qc_complete(qc);
 }
 
-- 
cgit v1.2.3


From 34bf21704c848fe00c516d1c8f163db08b70b137 Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Mon, 15 May 2006 20:57:46 +0900
Subject: [PATCH] libata: implement new SCR handling and port on/offline
 functions

Implement ata_scr_{valid|read|write|write_flush}() and
ata_port_{online|offline}().  These functions replace
scr_{read|write}() and sata_dev_present().

Major difference between between the new SCR functions and the old
ones is that the new ones have a way to signal error to the caller.
This makes handling SCR-available and SCR-unavailable cases in the
same path easier.  Also, it eases later PM implementation where SCR
access can fail due to various reasons.

ata_port_{online|offline}() functions return 1 only when they are
affirmitive of the condition.  e.g.  if SCR is unaccessible or
presence cannot be determined for other reasons, these functions
return 0.  So, ata_port_online() != !ata_port_offline().  This
distinction is useful in many exception handling cases.

Signed-off-by: Tejun Heo <htejun@gmail.com>
---
 drivers/scsi/libata-core.c | 143 +++++++++++++++++++++++++++++++++++++++++++++
 include/linux/libata.h     |   6 ++
 2 files changed, 149 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c
index f29d43cb6991..b9c5cbf0b786 100644
--- a/drivers/scsi/libata-core.c
+++ b/drivers/scsi/libata-core.c
@@ -4365,6 +4365,143 @@ irqreturn_t ata_interrupt (int irq, void *dev_instance, struct pt_regs *regs)
 	return IRQ_RETVAL(handled);
 }
 
+/**
+ *	sata_scr_valid - test whether SCRs are accessible
+ *	@ap: ATA port to test SCR accessibility for
+ *
+ *	Test whether SCRs are accessible for @ap.
+ *
+ *	LOCKING:
+ *	None.
+ *
+ *	RETURNS:
+ *	1 if SCRs are accessible, 0 otherwise.
+ */
+int sata_scr_valid(struct ata_port *ap)
+{
+	return ap->cbl == ATA_CBL_SATA && ap->ops->scr_read;
+}
+
+/**
+ *	sata_scr_read - read SCR register of the specified port
+ *	@ap: ATA port to read SCR for
+ *	@reg: SCR to read
+ *	@val: Place to store read value
+ *
+ *	Read SCR register @reg of @ap into *@val.  This function is
+ *	guaranteed to succeed if the cable type of the port is SATA
+ *	and the port implements ->scr_read.
+ *
+ *	LOCKING:
+ *	None.
+ *
+ *	RETURNS:
+ *	0 on success, negative errno on failure.
+ */
+int sata_scr_read(struct ata_port *ap, int reg, u32 *val)
+{
+	if (sata_scr_valid(ap)) {
+		*val = ap->ops->scr_read(ap, reg);
+		return 0;
+	}
+	return -EOPNOTSUPP;
+}
+
+/**
+ *	sata_scr_write - write SCR register of the specified port
+ *	@ap: ATA port to write SCR for
+ *	@reg: SCR to write
+ *	@val: value to write
+ *
+ *	Write @val to SCR register @reg of @ap.  This function is
+ *	guaranteed to succeed if the cable type of the port is SATA
+ *	and the port implements ->scr_read.
+ *
+ *	LOCKING:
+ *	None.
+ *
+ *	RETURNS:
+ *	0 on success, negative errno on failure.
+ */
+int sata_scr_write(struct ata_port *ap, int reg, u32 val)
+{
+	if (sata_scr_valid(ap)) {
+		ap->ops->scr_write(ap, reg, val);
+		return 0;
+	}
+	return -EOPNOTSUPP;
+}
+
+/**
+ *	sata_scr_write_flush - write SCR register of the specified port and flush
+ *	@ap: ATA port to write SCR for
+ *	@reg: SCR to write
+ *	@val: value to write
+ *
+ *	This function is identical to sata_scr_write() except that this
+ *	function performs flush after writing to the register.
+ *
+ *	LOCKING:
+ *	None.
+ *
+ *	RETURNS:
+ *	0 on success, negative errno on failure.
+ */
+int sata_scr_write_flush(struct ata_port *ap, int reg, u32 val)
+{
+	if (sata_scr_valid(ap)) {
+		ap->ops->scr_write(ap, reg, val);
+		ap->ops->scr_read(ap, reg);
+		return 0;
+	}
+	return -EOPNOTSUPP;
+}
+
+/**
+ *	ata_port_online - test whether the given port is online
+ *	@ap: ATA port to test
+ *
+ *	Test whether @ap is online.  Note that this function returns 0
+ *	if online status of @ap cannot be obtained, so
+ *	ata_port_online(ap) != !ata_port_offline(ap).
+ *
+ *	LOCKING:
+ *	None.
+ *
+ *	RETURNS:
+ *	1 if the port online status is available and online.
+ */
+int ata_port_online(struct ata_port *ap)
+{
+	u32 sstatus;
+
+	if (!sata_scr_read(ap, SCR_STATUS, &sstatus) && (sstatus & 0xf) == 0x3)
+		return 1;
+	return 0;
+}
+
+/**
+ *	ata_port_offline - test whether the given port is offline
+ *	@ap: ATA port to test
+ *
+ *	Test whether @ap is offline.  Note that this function returns
+ *	0 if offline status of @ap cannot be obtained, so
+ *	ata_port_online(ap) != !ata_port_offline(ap).
+ *
+ *	LOCKING:
+ *	None.
+ *
+ *	RETURNS:
+ *	1 if the port offline status is available and offline.
+ */
+int ata_port_offline(struct ata_port *ap)
+{
+	u32 sstatus;
+
+	if (!sata_scr_read(ap, SCR_STATUS, &sstatus) && (sstatus & 0xf) != 0x3)
+		return 1;
+	return 0;
+}
 
 /*
  * Execute a 'simple' command, that only consists of the opcode 'cmd' itself,
@@ -5133,6 +5270,12 @@ EXPORT_SYMBOL_GPL(ata_scsi_queuecmd);
 EXPORT_SYMBOL_GPL(ata_scsi_slave_config);
 EXPORT_SYMBOL_GPL(ata_scsi_release);
 EXPORT_SYMBOL_GPL(ata_host_intr);
+EXPORT_SYMBOL_GPL(sata_scr_valid);
+EXPORT_SYMBOL_GPL(sata_scr_read);
+EXPORT_SYMBOL_GPL(sata_scr_write);
+EXPORT_SYMBOL_GPL(sata_scr_write_flush);
+EXPORT_SYMBOL_GPL(ata_port_online);
+EXPORT_SYMBOL_GPL(ata_port_offline);
 EXPORT_SYMBOL_GPL(ata_id_string);
 EXPORT_SYMBOL_GPL(ata_id_c_string);
 EXPORT_SYMBOL_GPL(ata_scsi_simulate);
diff --git a/include/linux/libata.h b/include/linux/libata.h
index a4b8a419caad..47b97157995d 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -536,6 +536,12 @@ extern int ata_scsi_ioctl(struct scsi_device *dev, int cmd, void __user *arg);
 extern int ata_scsi_queuecmd(struct scsi_cmnd *cmd, void (*done)(struct scsi_cmnd *));
 extern int ata_scsi_release(struct Scsi_Host *host);
 extern unsigned int ata_host_intr(struct ata_port *ap, struct ata_queued_cmd *qc);
+extern int sata_scr_valid(struct ata_port *ap);
+extern int sata_scr_read(struct ata_port *ap, int reg, u32 *val);
+extern int sata_scr_write(struct ata_port *ap, int reg, u32 val);
+extern int sata_scr_write_flush(struct ata_port *ap, int reg, u32 val);
+extern int ata_port_online(struct ata_port *ap);
+extern int ata_port_offline(struct ata_port *ap);
 extern int ata_scsi_device_resume(struct scsi_device *);
 extern int ata_scsi_device_suspend(struct scsi_device *, pm_message_t state);
 extern int ata_device_resume(struct ata_port *, struct ata_device *);
-- 
cgit v1.2.3


From a0ab51cefc95cb7756c4914603fea2b1a0f813c5 Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Mon, 15 May 2006 20:57:49 +0900
Subject: [PATCH] libata: kill old SCR functions and sata_dev_present()

Kill now unused scr_{read|write|write_flush}() and sata_dev_present().

Signed-off-by: Tejun Heo <htejun@gmail.com>
---
 include/linux/libata.h | 22 ----------------------
 1 file changed, 22 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/libata.h b/include/linux/libata.h
index 47b97157995d..cd467cd54473 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -939,28 +939,6 @@ static inline u8 ata_irq_ack(struct ata_port *ap, unsigned int chk_drq)
 	return status;
 }
 
-static inline u32 scr_read(struct ata_port *ap, unsigned int reg)
-{
-	return ap->ops->scr_read(ap, reg);
-}
-
-static inline void scr_write(struct ata_port *ap, unsigned int reg, u32 val)
-{
-	ap->ops->scr_write(ap, reg, val);
-}
-
-static inline void scr_write_flush(struct ata_port *ap, unsigned int reg,
-				   u32 val)
-{
-	ap->ops->scr_write(ap, reg, val);
-	(void) ap->ops->scr_read(ap, reg);
-}
-
-static inline unsigned int sata_dev_present(struct ata_port *ap)
-{
-	return ((scr_read(ap, SCR_STATUS) & 0xf) == 0x3) ? 1 : 0;
-}
-
 static inline int ata_try_flush_cache(const struct ata_device *dev)
 {
 	return ata_id_wcache_enabled(dev->id) ||
-- 
cgit v1.2.3


From 38d87234d6c47ca487fc6344100323d5adc6f32c Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Mon, 15 May 2006 20:57:51 +0900
Subject: [PATCH] libata: add dev->ap

Add dev->ap which points back to the port the device belongs to.  This
makes it unnecessary to pass @ap for silly reasons (e.g. printks).
Also, this change is necessary to accomodate later PM support which
will introduce ATA link inbetween port and device.

Signed-off-by: Tejun Heo <htejun@gmail.com>
---
 drivers/scsi/libata-core.c | 1 +
 include/linux/libata.h     | 1 +
 2 files changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c
index 56f0af208345..31b65e0da0b1 100644
--- a/drivers/scsi/libata-core.c
+++ b/drivers/scsi/libata-core.c
@@ -4749,6 +4749,7 @@ static void ata_host_init(struct ata_port *ap, struct Scsi_Host *host,
 
 	for (i = 0; i < ATA_MAX_DEVICES; i++) {
 		struct ata_device *dev = &ap->device[i];
+		dev->ap = ap;
 		dev->devno = i;
 		dev->pio_mask = UINT_MAX;
 		dev->mwdma_mask = UINT_MAX;
diff --git a/include/linux/libata.h b/include/linux/libata.h
index cd467cd54473..ac2d2cc78b10 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -358,6 +358,7 @@ struct ata_host_stats {
 };
 
 struct ata_device {
+	struct ata_port		*ap;
 	u64			n_sectors;	/* size of device, if ATA */
 	unsigned long		flags;		/* ATA_DFLAG_xxx */
 	unsigned int		class;		/* ATA_DEV_xxx */
-- 
cgit v1.2.3


From 3373efd89dead4ce7818d685729e0431448357c9 Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Mon, 15 May 2006 20:57:53 +0900
Subject: [PATCH] libata: use dev->ap

Use dev->ap where possible and eliminate superflous @ap from functions
and structures.

Signed-off-by: Tejun Heo <htejun@gmail.com>
---
 drivers/scsi/ahci.c        |   2 +-
 drivers/scsi/libata-core.c | 160 +++++++++++++++++++++------------------------
 drivers/scsi/libata-scsi.c |  40 +++++-------
 drivers/scsi/libata.h      |  11 ++--
 include/linux/libata.h     |  21 +++---
 5 files changed, 106 insertions(+), 128 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/scsi/ahci.c b/drivers/scsi/ahci.c
index c2298fb131d8..f6e4c8ea74e3 100644
--- a/drivers/scsi/ahci.c
+++ b/drivers/scsi/ahci.c
@@ -597,7 +597,7 @@ static int ahci_softreset(struct ata_port *ap, unsigned int *class)
 	/* restart engine */
 	ahci_start_engine(ap);
 
-	ata_tf_init(ap, &tf, 0);
+	ata_tf_init(ap->device, &tf);
 	fis = pp->cmd_tbl;
 
 	/* issue the first D2H Register FIS */
diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c
index 31b65e0da0b1..4ced962db812 100644
--- a/drivers/scsi/libata-core.c
+++ b/drivers/scsi/libata-core.c
@@ -61,13 +61,10 @@
 
 #include "libata.h"
 
-static unsigned int ata_dev_init_params(struct ata_port *ap,
-					struct ata_device *dev,
-					u16 heads,
-					u16 sectors);
-static unsigned int ata_dev_set_xfermode(struct ata_port *ap,
-					 struct ata_device *dev);
-static void ata_dev_xfermask(struct ata_port *ap, struct ata_device *dev);
+static unsigned int ata_dev_init_params(struct ata_device *dev,
+					u16 heads, u16 sectors);
+static unsigned int ata_dev_set_xfermode(struct ata_device *dev);
+static void ata_dev_xfermask(struct ata_device *dev);
 
 static unsigned int ata_unique_id = 1;
 static struct workqueue_struct *ata_wq;
@@ -412,11 +409,11 @@ static const char *sata_spd_string(unsigned int spd)
 	return spd_str[spd - 1];
 }
 
-void ata_dev_disable(struct ata_port *ap, struct ata_device *dev)
+void ata_dev_disable(struct ata_device *dev)
 {
 	if (ata_dev_enabled(dev)) {
 		printk(KERN_WARNING "ata%u: dev %u disabled\n",
-		       ap->id, dev->devno);
+		       dev->ap->id, dev->devno);
 		dev->class++;
 	}
 }
@@ -960,7 +957,6 @@ void ata_qc_complete_internal(struct ata_queued_cmd *qc)
 
 /**
  *	ata_exec_internal - execute libata internal command
- *	@ap: Port to which the command is sent
  *	@dev: Device to which the command is sent
  *	@tf: Taskfile registers for the command and the result
  *	@cdb: CDB for packet command
@@ -978,10 +974,11 @@ void ata_qc_complete_internal(struct ata_queued_cmd *qc)
  *	None.  Should be called with kernel context, might sleep.
  */
 
-unsigned ata_exec_internal(struct ata_port *ap, struct ata_device *dev,
+unsigned ata_exec_internal(struct ata_device *dev,
 			   struct ata_taskfile *tf, const u8 *cdb,
 			   int dma_dir, void *buf, unsigned int buflen)
 {
+	struct ata_port *ap = dev->ap;
 	u8 command = tf->command;
 	struct ata_queued_cmd *qc;
 	DECLARE_COMPLETION(wait);
@@ -990,7 +987,7 @@ unsigned ata_exec_internal(struct ata_port *ap, struct ata_device *dev,
 
 	spin_lock_irqsave(&ap->host_set->lock, flags);
 
-	qc = ata_qc_new_init(ap, dev);
+	qc = ata_qc_new_init(dev);
 	BUG_ON(qc == NULL);
 
 	qc->tf = *tf;
@@ -1095,7 +1092,6 @@ unsigned int ata_pio_need_iordy(const struct ata_device *adev)
 
 /**
  *	ata_dev_read_id - Read ID data from the specified device
- *	@ap: port on which target device resides
  *	@dev: target device
  *	@p_class: pointer to class of the target device (may be changed)
  *	@post_reset: is this read ID post-reset?
@@ -1112,9 +1108,10 @@ unsigned int ata_pio_need_iordy(const struct ata_device *adev)
  *	RETURNS:
  *	0 on success, -errno otherwise.
  */
-static int ata_dev_read_id(struct ata_port *ap, struct ata_device *dev,
-			   unsigned int *p_class, int post_reset, u16 *id)
+static int ata_dev_read_id(struct ata_device *dev, unsigned int *p_class,
+			   int post_reset, u16 *id)
 {
+	struct ata_port *ap = dev->ap;
 	unsigned int class = *p_class;
 	struct ata_taskfile tf;
 	unsigned int err_mask = 0;
@@ -1126,7 +1123,7 @@ static int ata_dev_read_id(struct ata_port *ap, struct ata_device *dev,
 	ata_dev_select(ap, dev->devno, 1, 1); /* select device 0/1 */
 
  retry:
-	ata_tf_init(ap, &tf, dev->devno);
+	ata_tf_init(dev, &tf);
 
 	switch (class) {
 	case ATA_DEV_ATA:
@@ -1143,7 +1140,7 @@ static int ata_dev_read_id(struct ata_port *ap, struct ata_device *dev,
 
 	tf.protocol = ATA_PROT_PIO;
 
-	err_mask = ata_exec_internal(ap, dev, &tf, NULL, DMA_FROM_DEVICE,
+	err_mask = ata_exec_internal(dev, &tf, NULL, DMA_FROM_DEVICE,
 				     id, sizeof(id[0]) * ATA_ID_WORDS);
 	if (err_mask) {
 		rc = -EIO;
@@ -1170,7 +1167,7 @@ static int ata_dev_read_id(struct ata_port *ap, struct ata_device *dev,
 		 * Some drives were very specific about that exact sequence.
 		 */
 		if (ata_id_major_version(id) < 4 || !ata_id_has_lba(id)) {
-			err_mask = ata_dev_init_params(ap, dev, id[3], id[6]);
+			err_mask = ata_dev_init_params(dev, id[3], id[6]);
 			if (err_mask) {
 				rc = -EIO;
 				reason = "INIT_DEV_PARAMS failed";
@@ -1195,15 +1192,13 @@ static int ata_dev_read_id(struct ata_port *ap, struct ata_device *dev,
 	return rc;
 }
 
-static inline u8 ata_dev_knobble(const struct ata_port *ap,
-				 struct ata_device *dev)
+static inline u8 ata_dev_knobble(struct ata_device *dev)
 {
-	return ((ap->cbl == ATA_CBL_SATA) && (!ata_id_is_sata(dev->id)));
+	return ((dev->ap->cbl == ATA_CBL_SATA) && (!ata_id_is_sata(dev->id)));
 }
 
 /**
  *	ata_dev_configure - Configure the specified ATA/ATAPI device
- *	@ap: Port on which target device resides
  *	@dev: Target device to configure
  *	@print_info: Enable device info printout
  *
@@ -1216,9 +1211,9 @@ static inline u8 ata_dev_knobble(const struct ata_port *ap,
  *	RETURNS:
  *	0 on success, -errno otherwise
  */
-static int ata_dev_configure(struct ata_port *ap, struct ata_device *dev,
-			     int print_info)
+static int ata_dev_configure(struct ata_device *dev, int print_info)
 {
+	struct ata_port *ap = dev->ap;
 	const u16 *id = dev->id;
 	unsigned int xfer_mask;
 	int i, rc;
@@ -1331,7 +1326,7 @@ static int ata_dev_configure(struct ata_port *ap, struct ata_device *dev,
 					      ap->device[i].cdb_len);
 
 	/* limit bridge transfers to udma5, 200 sectors */
-	if (ata_dev_knobble(ap, dev)) {
+	if (ata_dev_knobble(dev)) {
 		if (print_info)
 			printk(KERN_INFO "ata%u(%u): applying bridge limits\n",
 			       ap->id, dev->devno);
@@ -1416,11 +1411,11 @@ static int ata_bus_probe(struct ata_port *ap)
 		if (!ata_dev_enabled(dev))
 			continue;
 
-		rc = ata_dev_read_id(ap, dev, &dev->class, 1, dev->id);
+		rc = ata_dev_read_id(dev, &dev->class, 1, dev->id);
 		if (rc)
 			goto fail;
 
-		rc = ata_dev_configure(ap, dev, 1);
+		rc = ata_dev_configure(dev, 1);
 		if (rc)
 			goto fail;
 	}
@@ -1453,13 +1448,13 @@ static int ata_bus_probe(struct ata_port *ap)
 	default:
 		tries[dev->devno]--;
 		if (down_xfermask &&
-		    ata_down_xfermask_limit(ap, dev, tries[dev->devno] == 1))
+		    ata_down_xfermask_limit(dev, tries[dev->devno] == 1))
 			tries[dev->devno] = 0;
 	}
 
 	if (!tries[dev->devno]) {
-		ata_down_xfermask_limit(ap, dev, 1);
-		ata_dev_disable(ap, dev);
+		ata_down_xfermask_limit(dev, 1);
+		ata_dev_disable(dev);
 	}
 
 	goto retry;
@@ -1586,15 +1581,15 @@ void sata_phy_reset(struct ata_port *ap)
 
 /**
  *	ata_dev_pair		-	return other device on cable
- *	@ap: port
  *	@adev: device
  *
  *	Obtain the other device on the same cable, or if none is
  *	present NULL is returned
  */
 
-struct ata_device *ata_dev_pair(struct ata_port *ap, struct ata_device *adev)
+struct ata_device *ata_dev_pair(struct ata_device *adev)
 {
+	struct ata_port *ap = adev->ap;
 	struct ata_device *pair = &ap->device[1 - adev->devno];
 	if (!ata_dev_enabled(pair))
 		return NULL;
@@ -1886,7 +1881,6 @@ int ata_timing_compute(struct ata_device *adev, unsigned short speed,
 
 /**
  *	ata_down_xfermask_limit - adjust dev xfer masks downward
- *	@ap: Port associated with device @dev
  *	@dev: Device to adjust xfer masks
  *	@force_pio0: Force PIO0
  *
@@ -1900,9 +1894,9 @@ int ata_timing_compute(struct ata_device *adev, unsigned short speed,
  *	RETURNS:
  *	0 on success, negative errno on failure
  */
-int ata_down_xfermask_limit(struct ata_port *ap, struct ata_device *dev,
-			    int force_pio0)
+int ata_down_xfermask_limit(struct ata_device *dev, int force_pio0)
 {
+	struct ata_port *ap = dev->ap;
 	unsigned long xfer_mask;
 	int highbit;
 
@@ -1934,8 +1928,9 @@ int ata_down_xfermask_limit(struct ata_port *ap, struct ata_device *dev,
 	return -EINVAL;
 }
 
-static int ata_dev_set_mode(struct ata_port *ap, struct ata_device *dev)
+static int ata_dev_set_mode(struct ata_device *dev)
 {
+	struct ata_port *ap = dev->ap;
 	unsigned int err_mask;
 	int rc;
 
@@ -1943,7 +1938,7 @@ static int ata_dev_set_mode(struct ata_port *ap, struct ata_device *dev)
 	if (dev->xfer_shift == ATA_SHIFT_PIO)
 		dev->flags |= ATA_DFLAG_PIO;
 
-	err_mask = ata_dev_set_xfermode(ap, dev);
+	err_mask = ata_dev_set_xfermode(dev);
 	if (err_mask) {
 		printk(KERN_ERR
 		       "ata%u: failed to set xfermode (err_mask=0x%x)\n",
@@ -1951,7 +1946,7 @@ static int ata_dev_set_mode(struct ata_port *ap, struct ata_device *dev)
 		return -EIO;
 	}
 
-	rc = ata_dev_revalidate(ap, dev, 0);
+	rc = ata_dev_revalidate(dev, 0);
 	if (rc)
 		return rc;
 
@@ -2007,7 +2002,7 @@ int ata_set_mode(struct ata_port *ap, struct ata_device **r_failed_dev)
 		if (!ata_dev_enabled(dev))
 			continue;
 
-		ata_dev_xfermask(ap, dev);
+		ata_dev_xfermask(dev);
 
 		pio_mask = ata_pack_xfermask(dev->pio_mask, 0, 0);
 		dma_mask = ata_pack_xfermask(0, dev->mwdma_mask, dev->udma_mask);
@@ -2060,7 +2055,7 @@ int ata_set_mode(struct ata_port *ap, struct ata_device **r_failed_dev)
 		if (!ata_dev_enabled(dev))
 			continue;
 
-		rc = ata_dev_set_mode(ap, dev);
+		rc = ata_dev_set_mode(dev);
 		if (rc)
 			goto out;
 	}
@@ -2712,7 +2707,6 @@ int ata_drive_probe_reset(struct ata_port *ap, ata_probeinit_fn_t probeinit,
 
 /**
  *	ata_dev_same_device - Determine whether new ID matches configured device
- *	@ap: port on which the device to compare against resides
  *	@dev: device to compare against
  *	@new_class: class of the new device
  *	@new_id: IDENTIFY page of the new device
@@ -2727,9 +2721,10 @@ int ata_drive_probe_reset(struct ata_port *ap, ata_probeinit_fn_t probeinit,
  *	RETURNS:
  *	1 if @dev matches @new_class and @new_id, 0 otherwise.
  */
-static int ata_dev_same_device(struct ata_port *ap, struct ata_device *dev,
-			       unsigned int new_class, const u16 *new_id)
+static int ata_dev_same_device(struct ata_device *dev, unsigned int new_class,
+			       const u16 *new_id)
 {
+	struct ata_port *ap = dev->ap;
 	const u16 *old_id = dev->id;
 	unsigned char model[2][41], serial[2][21];
 	u64 new_n_sectors;
@@ -2774,7 +2769,6 @@ static int ata_dev_same_device(struct ata_port *ap, struct ata_device *dev,
 
 /**
  *	ata_dev_revalidate - Revalidate ATA device
- *	@ap: port on which the device to revalidate resides
  *	@dev: device to revalidate
  *	@post_reset: is this revalidation after reset?
  *
@@ -2787,9 +2781,9 @@ static int ata_dev_same_device(struct ata_port *ap, struct ata_device *dev,
  *	RETURNS:
  *	0 on success, negative errno otherwise
  */
-int ata_dev_revalidate(struct ata_port *ap, struct ata_device *dev,
-		       int post_reset)
+int ata_dev_revalidate(struct ata_device *dev, int post_reset)
 {
+	struct ata_port *ap = dev->ap;
 	unsigned int class = dev->class;
 	u16 *id = (void *)ap->sector_buf;
 	int rc;
@@ -2800,12 +2794,12 @@ int ata_dev_revalidate(struct ata_port *ap, struct ata_device *dev,
 	}
 
 	/* read ID data */
-	rc = ata_dev_read_id(ap, dev, &class, post_reset, id);
+	rc = ata_dev_read_id(dev, &class, post_reset, id);
 	if (rc)
 		goto fail;
 
 	/* is the device still there? */
-	if (!ata_dev_same_device(ap, dev, class, id)) {
+	if (!ata_dev_same_device(dev, class, id)) {
 		rc = -ENODEV;
 		goto fail;
 	}
@@ -2813,7 +2807,7 @@ int ata_dev_revalidate(struct ata_port *ap, struct ata_device *dev,
 	memcpy(dev->id, id, sizeof(id[0]) * ATA_ID_WORDS);
 
 	/* configure device according to the new ID */
-	rc = ata_dev_configure(ap, dev, 0);
+	rc = ata_dev_configure(dev, 0);
 	if (rc == 0)
 		return 0;
 
@@ -2895,7 +2889,6 @@ static int ata_dma_blacklisted(const struct ata_device *dev)
 
 /**
  *	ata_dev_xfermask - Compute supported xfermask of the given device
- *	@ap: Port on which the device to compute xfermask for resides
  *	@dev: Device to compute xfermask for
  *
  *	Compute supported xfermask of @dev and store it in
@@ -2910,8 +2903,9 @@ static int ata_dma_blacklisted(const struct ata_device *dev)
  *	LOCKING:
  *	None.
  */
-static void ata_dev_xfermask(struct ata_port *ap, struct ata_device *dev)
+static void ata_dev_xfermask(struct ata_device *dev)
 {
+	struct ata_port *ap = dev->ap;
 	struct ata_host_set *hs = ap->host_set;
 	unsigned long xfer_mask;
 	int i;
@@ -2964,7 +2958,6 @@ static void ata_dev_xfermask(struct ata_port *ap, struct ata_device *dev)
 
 /**
  *	ata_dev_set_xfermode - Issue SET FEATURES - XFER MODE command
- *	@ap: Port associated with device @dev
  *	@dev: Device to which command will be sent
  *
  *	Issue SET FEATURES - XFER MODE command to device @dev
@@ -2977,8 +2970,7 @@ static void ata_dev_xfermask(struct ata_port *ap, struct ata_device *dev)
  *	0 on success, AC_ERR_* mask otherwise.
  */
 
-static unsigned int ata_dev_set_xfermode(struct ata_port *ap,
-					 struct ata_device *dev)
+static unsigned int ata_dev_set_xfermode(struct ata_device *dev)
 {
 	struct ata_taskfile tf;
 	unsigned int err_mask;
@@ -2986,14 +2978,14 @@ static unsigned int ata_dev_set_xfermode(struct ata_port *ap,
 	/* set up set-features taskfile */
 	DPRINTK("set features - xfer mode\n");
 
-	ata_tf_init(ap, &tf, dev->devno);
+	ata_tf_init(dev, &tf);
 	tf.command = ATA_CMD_SET_FEATURES;
 	tf.feature = SETFEATURES_XFER;
 	tf.flags |= ATA_TFLAG_ISADDR | ATA_TFLAG_DEVICE;
 	tf.protocol = ATA_PROT_NODATA;
 	tf.nsect = dev->xfer_mode;
 
-	err_mask = ata_exec_internal(ap, dev, &tf, NULL, DMA_NONE, NULL, 0);
+	err_mask = ata_exec_internal(dev, &tf, NULL, DMA_NONE, NULL, 0);
 
 	DPRINTK("EXIT, err_mask=%x\n", err_mask);
 	return err_mask;
@@ -3001,8 +2993,9 @@ static unsigned int ata_dev_set_xfermode(struct ata_port *ap,
 
 /**
  *	ata_dev_init_params - Issue INIT DEV PARAMS command
- *	@ap: Port associated with device @dev
  *	@dev: Device to which command will be sent
+ *	@heads: Number of heads
+ *	@sectors: Number of sectors
  *
  *	LOCKING:
  *	Kernel thread context (may sleep)
@@ -3010,11 +3003,8 @@ static unsigned int ata_dev_set_xfermode(struct ata_port *ap,
  *	RETURNS:
  *	0 on success, AC_ERR_* mask otherwise.
  */
-
-static unsigned int ata_dev_init_params(struct ata_port *ap,
-					struct ata_device *dev,
-					u16 heads,
-					u16 sectors)
+static unsigned int ata_dev_init_params(struct ata_device *dev,
+					u16 heads, u16 sectors)
 {
 	struct ata_taskfile tf;
 	unsigned int err_mask;
@@ -3026,14 +3016,14 @@ static unsigned int ata_dev_init_params(struct ata_port *ap,
 	/* set up init dev params taskfile */
 	DPRINTK("init dev params \n");
 
-	ata_tf_init(ap, &tf, dev->devno);
+	ata_tf_init(dev, &tf);
 	tf.command = ATA_CMD_INIT_DEV_PARAMS;
 	tf.flags |= ATA_TFLAG_ISADDR | ATA_TFLAG_DEVICE;
 	tf.protocol = ATA_PROT_NODATA;
 	tf.nsect = sectors;
 	tf.device |= (heads - 1) & 0x0f; /* max head = num. of heads - 1 */
 
-	err_mask = ata_exec_internal(ap, dev, &tf, NULL, DMA_NONE, NULL, 0);
+	err_mask = ata_exec_internal(dev, &tf, NULL, DMA_NONE, NULL, 0);
 
 	DPRINTK("EXIT, err_mask=%x\n", err_mask);
 	return err_mask;
@@ -4045,16 +4035,15 @@ static struct ata_queued_cmd *ata_qc_new(struct ata_port *ap)
 
 /**
  *	ata_qc_new_init - Request an available ATA command, and initialize it
- *	@ap: Port associated with device @dev
  *	@dev: Device from whom we request an available command structure
  *
  *	LOCKING:
  *	None.
  */
 
-struct ata_queued_cmd *ata_qc_new_init(struct ata_port *ap,
-				      struct ata_device *dev)
+struct ata_queued_cmd *ata_qc_new_init(struct ata_device *dev)
 {
+	struct ata_port *ap = dev->ap;
 	struct ata_queued_cmd *qc;
 
 	qc = ata_qc_new(ap);
@@ -4520,19 +4509,18 @@ int ata_port_offline(struct ata_port *ap)
  * Execute a 'simple' command, that only consists of the opcode 'cmd' itself,
  * without filling any other registers
  */
-static int ata_do_simple_cmd(struct ata_port *ap, struct ata_device *dev,
-			     u8 cmd)
+static int ata_do_simple_cmd(struct ata_device *dev, u8 cmd)
 {
 	struct ata_taskfile tf;
 	int err;
 
-	ata_tf_init(ap, &tf, dev->devno);
+	ata_tf_init(dev, &tf);
 
 	tf.command = cmd;
 	tf.flags |= ATA_TFLAG_DEVICE;
 	tf.protocol = ATA_PROT_NODATA;
 
-	err = ata_exec_internal(ap, dev, &tf, NULL, DMA_NONE, NULL, 0);
+	err = ata_exec_internal(dev, &tf, NULL, DMA_NONE, NULL, 0);
 	if (err)
 		printk(KERN_ERR "%s: ata command failed: %d\n",
 				__FUNCTION__, err);
@@ -4540,7 +4528,7 @@ static int ata_do_simple_cmd(struct ata_port *ap, struct ata_device *dev,
 	return err;
 }
 
-static int ata_flush_cache(struct ata_port *ap, struct ata_device *dev)
+static int ata_flush_cache(struct ata_device *dev)
 {
 	u8 cmd;
 
@@ -4552,22 +4540,21 @@ static int ata_flush_cache(struct ata_port *ap, struct ata_device *dev)
 	else
 		cmd = ATA_CMD_FLUSH;
 
-	return ata_do_simple_cmd(ap, dev, cmd);
+	return ata_do_simple_cmd(dev, cmd);
 }
 
-static int ata_standby_drive(struct ata_port *ap, struct ata_device *dev)
+static int ata_standby_drive(struct ata_device *dev)
 {
-	return ata_do_simple_cmd(ap, dev, ATA_CMD_STANDBYNOW1);
+	return ata_do_simple_cmd(dev, ATA_CMD_STANDBYNOW1);
 }
 
-static int ata_start_drive(struct ata_port *ap, struct ata_device *dev)
+static int ata_start_drive(struct ata_device *dev)
 {
-	return ata_do_simple_cmd(ap, dev, ATA_CMD_IDLEIMMEDIATE);
+	return ata_do_simple_cmd(dev, ATA_CMD_IDLEIMMEDIATE);
 }
 
 /**
  *	ata_device_resume - wakeup a previously suspended devices
- *	@ap: port the device is connected to
  *	@dev: the device to resume
  *
  *	Kick the drive back into action, by sending it an idle immediate
@@ -4575,39 +4562,42 @@ static int ata_start_drive(struct ata_port *ap, struct ata_device *dev)
  *	and host.
  *
  */
-int ata_device_resume(struct ata_port *ap, struct ata_device *dev)
+int ata_device_resume(struct ata_device *dev)
 {
+	struct ata_port *ap = dev->ap;
+
 	if (ap->flags & ATA_FLAG_SUSPENDED) {
 		struct ata_device *failed_dev;
 		ap->flags &= ~ATA_FLAG_SUSPENDED;
 		while (ata_set_mode(ap, &failed_dev))
-			ata_dev_disable(ap, failed_dev);
+			ata_dev_disable(failed_dev);
 	}
 	if (!ata_dev_enabled(dev))
 		return 0;
 	if (dev->class == ATA_DEV_ATA)
-		ata_start_drive(ap, dev);
+		ata_start_drive(dev);
 
 	return 0;
 }
 
 /**
  *	ata_device_suspend - prepare a device for suspend
- *	@ap: port the device is connected to
  *	@dev: the device to suspend
  *
  *	Flush the cache on the drive, if appropriate, then issue a
  *	standbynow command.
  */
-int ata_device_suspend(struct ata_port *ap, struct ata_device *dev, pm_message_t state)
+int ata_device_suspend(struct ata_device *dev, pm_message_t state)
 {
+	struct ata_port *ap = dev->ap;
+
 	if (!ata_dev_enabled(dev))
 		return 0;
 	if (dev->class == ATA_DEV_ATA)
-		ata_flush_cache(ap, dev);
+		ata_flush_cache(dev);
 
 	if (state.event != PM_EVENT_FREEZE)
-		ata_standby_drive(ap, dev);
+		ata_standby_drive(dev);
 	ap->flags |= ATA_FLAG_SUSPENDED;
 	return 0;
 }
diff --git a/drivers/scsi/libata-scsi.c b/drivers/scsi/libata-scsi.c
index ce90b6352a81..fcbf64e741bb 100644
--- a/drivers/scsi/libata-scsi.c
+++ b/drivers/scsi/libata-scsi.c
@@ -302,7 +302,6 @@ int ata_scsi_ioctl(struct scsi_device *scsidev, int cmd, void __user *arg)
 
 /**
  *	ata_scsi_qc_new - acquire new ata_queued_cmd reference
- *	@ap: ATA port to which the new command is attached
  *	@dev: ATA device to which the new command is attached
  *	@cmd: SCSI command that originated this ATA command
  *	@done: SCSI command completion function
@@ -321,14 +320,13 @@ int ata_scsi_ioctl(struct scsi_device *scsidev, int cmd, void __user *arg)
  *	RETURNS:
  *	Command allocated, or %NULL if none available.
  */
-struct ata_queued_cmd *ata_scsi_qc_new(struct ata_port *ap,
-				       struct ata_device *dev,
+struct ata_queued_cmd *ata_scsi_qc_new(struct ata_device *dev,
 				       struct scsi_cmnd *cmd,
 				       void (*done)(struct scsi_cmnd *))
 {
 	struct ata_queued_cmd *qc;
 
-	qc = ata_qc_new_init(ap, dev);
+	qc = ata_qc_new_init(dev);
 	if (qc) {
 		qc->scsicmd = cmd;
 		qc->scsidone = done;
@@ -398,7 +396,7 @@ int ata_scsi_device_resume(struct scsi_device *sdev)
 	struct ata_port *ap = ata_shost_to_port(sdev->host);
 	struct ata_device *dev = &ap->device[sdev->id];
 
-	return ata_device_resume(ap, dev);
+	return ata_device_resume(dev);
 }
 
 int ata_scsi_device_suspend(struct scsi_device *sdev, pm_message_t state)
@@ -406,7 +404,7 @@ int ata_scsi_device_suspend(struct scsi_device *sdev, pm_message_t state)
 	struct ata_port *ap = ata_shost_to_port(sdev->host);
 	struct ata_device *dev = &ap->device[sdev->id];
 
-	return ata_device_suspend(ap, dev, state);
+	return ata_device_suspend(dev, state);
 }
 
 /**
@@ -1224,7 +1222,6 @@ static void ata_scsi_qc_complete(struct ata_queued_cmd *qc)
 
 /**
  *	ata_scsi_translate - Translate then issue SCSI command to ATA device
- *	@ap: ATA port to which the command is addressed
  *	@dev: ATA device to which the command is addressed
  *	@cmd: SCSI command to execute
  *	@done: SCSI command completion function
@@ -1247,17 +1244,16 @@ static void ata_scsi_qc_complete(struct ata_queued_cmd *qc)
  *	spin_lock_irqsave(host_set lock)
  */
 
-static void ata_scsi_translate(struct ata_port *ap, struct ata_device *dev,
-			      struct scsi_cmnd *cmd,
-			      void (*done)(struct scsi_cmnd *),
-			      ata_xlat_func_t xlat_func)
+static void ata_scsi_translate(struct ata_device *dev, struct scsi_cmnd *cmd,
+			       void (*done)(struct scsi_cmnd *),
+			       ata_xlat_func_t xlat_func)
 {
 	struct ata_queued_cmd *qc;
 	u8 *scsicmd = cmd->cmnd;
 
 	VPRINTK("ENTER\n");
 
-	qc = ata_scsi_qc_new(ap, dev, cmd, done);
+	qc = ata_scsi_qc_new(dev, cmd, done);
 	if (!qc)
 		goto err_mem;
 
@@ -1266,7 +1262,7 @@ static void ata_scsi_translate(struct ata_port *ap, struct ata_device *dev,
 	    cmd->sc_data_direction == DMA_TO_DEVICE) {
 		if (unlikely(cmd->request_bufflen < 1)) {
 			printk(KERN_WARNING "ata%u(%u): WARNING: zero len r/w req\n",
-			       ap->id, dev->devno);
+			       dev->ap->id, dev->devno);
 			goto err_did;
 		}
 
@@ -2433,19 +2429,20 @@ static inline void ata_scsi_dump_cdb(struct ata_port *ap,
 #endif
 }
 
-static inline void __ata_scsi_queuecmd(struct scsi_cmnd *cmd, void (*done)(struct scsi_cmnd *),
-				       struct ata_port *ap, struct ata_device *dev)
+static inline void __ata_scsi_queuecmd(struct scsi_cmnd *cmd,
+				       void (*done)(struct scsi_cmnd *),
+				       struct ata_device *dev)
 {
 	if (dev->class == ATA_DEV_ATA) {
 		ata_xlat_func_t xlat_func = ata_get_xlat_func(dev,
 							      cmd->cmnd[0]);
 
 		if (xlat_func)
-			ata_scsi_translate(ap, dev, cmd, done, xlat_func);
+			ata_scsi_translate(dev, cmd, done, xlat_func);
 		else
-			ata_scsi_simulate(ap, dev, cmd, done);
+			ata_scsi_simulate(dev, cmd, done);
 	} else
-		ata_scsi_translate(ap, dev, cmd, done, atapi_xlat);
+		ata_scsi_translate(dev, cmd, done, atapi_xlat);
 }
 
 /**
@@ -2483,7 +2480,7 @@ int ata_scsi_queuecmd(struct scsi_cmnd *cmd, void (*done)(struct scsi_cmnd *))
 
 	dev = ata_scsi_find_dev(ap, scsidev);
 	if (likely(dev))
-		__ata_scsi_queuecmd(cmd, done, ap, dev);
+		__ata_scsi_queuecmd(cmd, done, dev);
 	else {
 		cmd->result = (DID_BAD_TARGET << 16);
 		done(cmd);
@@ -2496,7 +2493,6 @@ int ata_scsi_queuecmd(struct scsi_cmnd *cmd, void (*done)(struct scsi_cmnd *))
 
 /**
  *	ata_scsi_simulate - simulate SCSI command on ATA device
- *	@ap: port the device is connected to
  *	@dev: the target device
  *	@cmd: SCSI command being sent to device.
  *	@done: SCSI command completion function.
@@ -2508,14 +2504,12 @@ int ata_scsi_queuecmd(struct scsi_cmnd *cmd, void (*done)(struct scsi_cmnd *))
  *	spin_lock_irqsave(host_set lock)
  */
 
-void ata_scsi_simulate(struct ata_port *ap, struct ata_device *dev,
-		      struct scsi_cmnd *cmd,
+void ata_scsi_simulate(struct ata_device *dev, struct scsi_cmnd *cmd,
 		      void (*done)(struct scsi_cmnd *))
 {
 	struct ata_scsi_args args;
 	const u8 *scsicmd = cmd->cmnd;
 
-	args.ap = ap;
 	args.dev = dev;
 	args.id = dev->id;
 	args.cmd = cmd;
diff --git a/drivers/scsi/libata.h b/drivers/scsi/libata.h
index 6442c2f1a80c..c9ff83bbcfae 100644
--- a/drivers/scsi/libata.h
+++ b/drivers/scsi/libata.h
@@ -32,7 +32,6 @@
 #define DRV_VERSION	"1.30"	/* must be exactly four chars */
 
 struct ata_scsi_args {
-	struct ata_port		*ap;
 	struct ata_device	*dev;
 	u16			*id;
 	struct scsi_cmnd	*cmd;
@@ -43,18 +42,16 @@ struct ata_scsi_args {
 extern int atapi_enabled;
 extern int atapi_dmadir;
 extern int libata_fua;
-extern struct ata_queued_cmd *ata_qc_new_init(struct ata_port *ap,
-				      struct ata_device *dev);
+extern struct ata_queued_cmd *ata_qc_new_init(struct ata_device *dev);
 extern int ata_rwcmd_protocol(struct ata_queued_cmd *qc);
-extern void ata_dev_disable(struct ata_port *ap, struct ata_device *dev);
+extern void ata_dev_disable(struct ata_device *dev);
 extern void ata_port_flush_task(struct ata_port *ap);
-extern unsigned ata_exec_internal(struct ata_port *ap, struct ata_device *dev,
+extern unsigned ata_exec_internal(struct ata_device *dev,
 				  struct ata_taskfile *tf, const u8 *cdb,
 				  int dma_dir, void *buf, unsigned int buflen);
 extern int sata_down_spd_limit(struct ata_port *ap);
 extern int sata_set_spd_needed(struct ata_port *ap);
-extern int ata_down_xfermask_limit(struct ata_port *ap, struct ata_device *dev,
-				   int force_pio0);
+extern int ata_down_xfermask_limit(struct ata_device *dev, int force_pio0);
 extern int ata_set_mode(struct ata_port *ap, struct ata_device **r_failed_dev);
 extern int ata_do_reset(struct ata_port *ap, ata_reset_fn_t reset,
 			unsigned int *classes);
diff --git a/include/linux/libata.h b/include/linux/libata.h
index ac2d2cc78b10..8154b366bbd1 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -518,8 +518,7 @@ extern void ata_std_probeinit(struct ata_port *ap);
 extern int ata_std_softreset(struct ata_port *ap, unsigned int *classes);
 extern int sata_std_hardreset(struct ata_port *ap, unsigned int *class);
 extern void ata_std_postreset(struct ata_port *ap, unsigned int *classes);
-extern int ata_dev_revalidate(struct ata_port *ap, struct ata_device *dev,
-			      int post_reset);
+extern int ata_dev_revalidate(struct ata_device *dev, int post_reset);
 extern void ata_port_disable(struct ata_port *);
 extern void ata_std_ports(struct ata_ioports *ioaddr);
 #ifdef CONFIG_PCI
@@ -545,8 +544,8 @@ extern int ata_port_online(struct ata_port *ap);
 extern int ata_port_offline(struct ata_port *ap);
 extern int ata_scsi_device_resume(struct scsi_device *);
 extern int ata_scsi_device_suspend(struct scsi_device *, pm_message_t state);
-extern int ata_device_resume(struct ata_port *, struct ata_device *);
-extern int ata_device_suspend(struct ata_port *, struct ata_device *, pm_message_t state);
+extern int ata_device_resume(struct ata_device *);
+extern int ata_device_suspend(struct ata_device *, pm_message_t state);
 extern int ata_ratelimit(void);
 extern unsigned int ata_busy_sleep(struct ata_port *ap,
 				   unsigned long timeout_pat,
@@ -592,15 +591,13 @@ extern void ata_bmdma_stop(struct ata_queued_cmd *qc);
 extern u8   ata_bmdma_status(struct ata_port *ap);
 extern void ata_bmdma_irq_clear(struct ata_port *ap);
 extern void __ata_qc_complete(struct ata_queued_cmd *qc);
-extern void ata_scsi_simulate(struct ata_port *ap, struct ata_device *dev,
-			      struct scsi_cmnd *cmd,
+extern void ata_scsi_simulate(struct ata_device *dev, struct scsi_cmnd *cmd,
 			      void (*done)(struct scsi_cmnd *));
 extern int ata_std_bios_param(struct scsi_device *sdev,
 			      struct block_device *bdev,
 			      sector_t capacity, int geom[]);
 extern int ata_scsi_slave_config(struct scsi_device *sdev);
-extern struct ata_device *ata_dev_pair(struct ata_port *ap, 
-				       struct ata_device *adev);
+extern struct ata_device *ata_dev_pair(struct ata_device *adev);
 
 /*
  * Timing helpers
@@ -812,12 +809,12 @@ static inline struct ata_queued_cmd *ata_qc_from_tag (struct ata_port *ap,
 	return NULL;
 }
 
-static inline void ata_tf_init(struct ata_port *ap, struct ata_taskfile *tf, unsigned int device)
+static inline void ata_tf_init(struct ata_device *dev, struct ata_taskfile *tf)
 {
 	memset(tf, 0, sizeof(*tf));
 
-	tf->ctl = ap->ctl;
-	if (device == 0)
+	tf->ctl = dev->ap->ctl;
+	if (dev->devno == 0)
 		tf->device = ATA_DEVICE_OBS;
 	else
 		tf->device = ATA_DEVICE_OBS | ATA_DEV1;
@@ -832,7 +829,7 @@ static inline void ata_qc_reinit(struct ata_queued_cmd *qc)
 	qc->nbytes = qc->curbytes = 0;
 	qc->err_mask = 0;
 
-	ata_tf_init(qc->ap, &qc->tf, qc->dev->devno);
+	ata_tf_init(qc->dev, &qc->tf);
 
 	/* init result_tf such that it indicates normal completion */
 	qc->result_tf.command = ATA_DRDY;
-- 
cgit v1.2.3


From 61440db61fe4945ad9f7b32b4d6a22b17174aa1f Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Mon, 15 May 2006 20:57:55 +0900
Subject: [PATCH] libata: implement ATA printk helpers

Implement ata_{port|dev}_printk() which prefixes the message with
proper identification string.  This change is necessary for later PM
support because devices and links should be identified differently
depending on how they are attached.

This also helps unifying device id strings.  Currently, there are two
forms in use (P is the port number D device number) - 'ataP(D):', and
'ataP: dev D '.  These macros also make it harder to forget proper ID
string (e.g. printing only port number when a device is in question).

Debug message handling can be integrated into these printk macros by
passing debug type and level via @lv.

Signed-off-by: Tejun Heo <htejun@gmail.com>
---
 include/linux/libata.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/libata.h b/include/linux/libata.h
index 8154b366bbd1..91e10e6b7565 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -650,7 +650,18 @@ extern void ata_eng_timeout(struct ata_port *ap);
 extern void ata_eh_qc_complete(struct ata_queued_cmd *qc);
 extern void ata_eh_qc_retry(struct ata_queued_cmd *qc);
 
+/*
+ * printk helpers
+ */
+#define ata_port_printk(ap, lv, fmt, args...) \
+	printk(lv"ata%u: "fmt, (ap)->id , ##args)
+
+#define ata_dev_printk(dev, lv, fmt, args...) \
+	printk(lv"ata%u.%02u: "fmt, (dev)->ap->id, (dev)->devno , ##args)
 
+/*
+ * qc helpers
+ */
 static inline int
 ata_sg_is_last(struct scatterlist *sg, struct ata_queued_cmd *qc)
 {
-- 
cgit v1.2.3


From 9ec957f2002bd2994be659bbc0ec28397fa251ee Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Mon, 15 May 2006 20:57:58 +0900
Subject: [PATCH] libata-eh-fw: add flags and operations for new EH

Add ATA_FLAG_EH_{PENDING|FROZEN}, ATA_ATA_QCFLAG_{FAILED|SENSE_VALID}
and ops->freeze, thaw, error_handler, post_internal_cmd() for new EH.

Signed-off-by: Tejun Heo <htejun@gmail.com>
---
 include/linux/libata.h | 27 ++++++++++++++++++++-------
 1 file changed, 20 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/libata.h b/include/linux/libata.h
index 91e10e6b7565..e5d6d7f8e6dc 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -146,13 +146,16 @@ enum {
 	ATA_FLAG_PIO_LBA48	= (1 << 8), /* Host DMA engine is LBA28 only */
 	ATA_FLAG_IRQ_MASK	= (1 << 9), /* Mask IRQ in PIO xfers */
 
-	ATA_FLAG_NOINTR		= (1 << 16), /* FIXME: Remove this once
+	ATA_FLAG_NOINTR		= (1 << 13), /* FIXME: Remove this once
 					      * proper HSM is in place. */
-	ATA_FLAG_DEBUGMSG	= (1 << 17),
-	ATA_FLAG_FLUSH_PORT_TASK = (1 << 18), /* flush port task */
+	ATA_FLAG_DEBUGMSG	= (1 << 14),
+	ATA_FLAG_FLUSH_PORT_TASK = (1 << 15), /* flush port task */
 
-	ATA_FLAG_DISABLED	= (1 << 19), /* port is disabled, ignore it */
-	ATA_FLAG_SUSPENDED	= (1 << 20), /* port is suspended */
+	ATA_FLAG_EH_PENDING	= (1 << 16), /* EH pending */
+	ATA_FLAG_FROZEN		= (1 << 17), /* port is frozen */
+
+	ATA_FLAG_DISABLED	= (1 << 22), /* port is disabled, ignore it */
+	ATA_FLAG_SUSPENDED	= (1 << 23), /* port is suspended (power) */
 
 	/* bits 24:31 of ap->flags are reserved for LLDD specific flags */
 
@@ -164,7 +167,9 @@ enum {
 	ATA_QCFLAG_IO		= (1 << 3), /* standard IO command */
 	ATA_QCFLAG_RESULT_TF	= (1 << 4), /* result TF requested */
 
-	ATA_QCFLAG_EH_SCHEDULED = (1 << 16), /* EH scheduled */
+	ATA_QCFLAG_FAILED	= (1 << 16), /* cmd failed and is owned by EH */
+	ATA_QCFLAG_SENSE_VALID	= (1 << 17), /* sense data valid */
+	ATA_QCFLAG_EH_SCHEDULED = (1 << 18), /* EH scheduled (obsolete) */
 
 	/* host set flags */
 	ATA_HOST_SIMPLEX	= (1 << 0),	/* Host is simplex, one DMA channel per host_set only */
@@ -463,7 +468,15 @@ struct ata_port_operations {
 	void (*qc_prep) (struct ata_queued_cmd *qc);
 	unsigned int (*qc_issue) (struct ata_queued_cmd *qc);
 
-	void (*eng_timeout) (struct ata_port *ap);
+	/* Error handlers.  ->error_handler overrides ->eng_timeout and
+	 * indicates that new-style EH is in place.
+	 */
+	void (*eng_timeout) (struct ata_port *ap); /* obsolete */
+
+	void (*freeze) (struct ata_port *ap);
+	void (*thaw) (struct ata_port *ap);
+	void (*error_handler) (struct ata_port *ap);
+	void (*post_internal_cmd) (struct ata_queued_cmd *qc);
 
 	irqreturn_t (*irq_handler)(int, void *, struct pt_regs *);
 	void (*irq_clear) (struct ata_port *);
-- 
cgit v1.2.3


From 2ab7db1ff1d64a2ba389d0692d532f42a15f1f72 Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Mon, 15 May 2006 20:58:02 +0900
Subject: [PATCH] libata-eh-fw: use special reserved tag and qc for internal
 commands

New EH may issue internal commands to recover from error while failed
qc's are still hanging around.  To allow such usage, reserve tag
ATA_MAX_QUEUE-1 for internal command.  This also makes it easy to tell
whether a qc is for internal command or not.  ata_tag_internal() test
implements this test.

To avoid breaking existing drivers, ata_exec_internal() uses
ATA_TAG_INTERNAL only for drivers which implement ->error_handler.
For drivers using old EH, tag 0 is used.  Note that this makes
ata_tag_internal() test valid only when ->error_handler is
implemented.  This is okay as drivers on old EH should not and does
not have any reason to use ata_tag_internal().

Signed-off-by: Tejun Heo <htejun@gmail.com>
---
 drivers/scsi/libata-core.c | 32 +++++++++++++++++++++++++++++---
 include/linux/libata.h     |  9 ++++++++-
 2 files changed, 37 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c
index de2cd61a264d..966abb5f423e 100644
--- a/drivers/scsi/libata-core.c
+++ b/drivers/scsi/libata-core.c
@@ -980,15 +980,39 @@ unsigned ata_exec_internal(struct ata_device *dev,
 	struct ata_port *ap = dev->ap;
 	u8 command = tf->command;
 	struct ata_queued_cmd *qc;
+	unsigned int tag, preempted_tag;
 	DECLARE_COMPLETION(wait);
 	unsigned long flags;
 	unsigned int err_mask;
 
 	spin_lock_irqsave(&ap->host_set->lock, flags);
 
-	qc = ata_qc_new_init(dev);
-	BUG_ON(qc == NULL);
+	/* initialize internal qc */
 
+	/* XXX: Tag 0 is used for drivers with legacy EH as some
+	 * drivers choke if any other tag is given.  This breaks
+	 * ata_tag_internal() test for those drivers.  Don't use new
+	 * EH stuff without converting to it.
+	 */
+	if (ap->ops->error_handler)
+		tag = ATA_TAG_INTERNAL;
+	else
+		tag = 0;
+
+	if (test_and_set_bit(tag, &ap->qactive))
+		BUG();
+	qc = ata_qc_from_tag(ap, tag);
+
+	qc->tag = tag;
+	qc->scsicmd = NULL;
+	qc->ap = ap;
+	qc->dev = dev;
+	ata_qc_reinit(qc);
+
+	preempted_tag = ap->active_tag;
+	ap->active_tag = ATA_TAG_POISON;
+
+	/* prepare & issue qc */
 	qc->tf = *tf;
 	if (cdb)
 		memcpy(qc->cdb, cdb, ATAPI_CDB_LEN);
@@ -1035,6 +1059,7 @@ unsigned ata_exec_internal(struct ata_device *dev,
 	err_mask = qc->err_mask;
 
 	ata_qc_free(qc);
+	ap->active_tag = preempted_tag;
 
 	/* XXX - Some LLDDs (sata_mv) disable port on command failure.
 	 * Until those drivers are fixed, we detect the condition
@@ -4014,7 +4039,8 @@ static struct ata_queued_cmd *ata_qc_new(struct ata_port *ap)
 	struct ata_queued_cmd *qc = NULL;
 	unsigned int i;
 
-	for (i = 0; i < ATA_MAX_QUEUE; i++)
+	/* the last tag is reserved for internal command. */
+	for (i = 0; i < ATA_MAX_QUEUE - 1; i++)
 		if (!test_and_set_bit(i, &ap->qactive)) {
 			qc = ata_qc_from_tag(ap, i);
 			break;
diff --git a/include/linux/libata.h b/include/linux/libata.h
index e5d6d7f8e6dc..5a403e434ff8 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -108,7 +108,9 @@ enum {
 	LIBATA_MAX_PRD		= ATA_MAX_PRD / 2,
 	ATA_MAX_PORTS		= 8,
 	ATA_DEF_QUEUE		= 1,
-	ATA_MAX_QUEUE		= 1,
+	/* tag ATA_MAX_QUEUE - 1 is reserved for internal commands */
+	ATA_MAX_QUEUE		= 2,
+	ATA_TAG_INTERNAL	= ATA_MAX_QUEUE - 1,
 	ATA_MAX_SECTORS		= 200,	/* FIXME */
 	ATA_MAX_BUS		= 2,
 	ATA_DEF_BUSY_WAIT	= 10000,
@@ -717,6 +719,11 @@ static inline unsigned int ata_tag_valid(unsigned int tag)
 	return (tag < ATA_MAX_QUEUE) ? 1 : 0;
 }
 
+static inline unsigned int ata_tag_internal(unsigned int tag)
+{
+	return tag == ATA_MAX_QUEUE - 1;
+}
+
 static inline unsigned int ata_class_enabled(unsigned int class)
 {
 	return class == ATA_DEV_ATA || class == ATA_DEV_ATAPI;
-- 
cgit v1.2.3


From f69499f42caf74194df678c9c293f2ee0fe90bc3 Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Mon, 15 May 2006 20:58:03 +0900
Subject: [PATCH] libata-eh-fw: update ata_qc_from_tag() to enforce normal/EH
 qc ownership

New EH framework has clear distinction about who owns a qc.  Every qc
starts owned by normal execution path - PIO, interrupt or whatever.
When an exception condition occurs which affects the qc, the qc gets
scheduled for EH.  Note that some events (say, link lost and regained,
command timeout) may schedule qc's which are not directly related but
could have been affected for EH too.  Scheduling for EH is atomic
w.r.t. ap->host_set->lock and once schedule for EH, normal execution
path is not allowed to access the qc in whatever way.  (PIO
synchronization acts a bit different and will be dealt with later)

This patch make ata_qc_from_tag() check whether a qc is active and
owned by normal path before returning it.  If conditions don't match,
NULL is returned and thus access to the qc is denied.
__ata_qc_from_tag() is the original ata_qc_from_tag() and is used by
libata core/EH layers to access inactive/failed qc's.

This change is applied only if the associated LLDD implements new EH
as indicated by non-NULL ->error_handler

Signed-off-by: Tejun Heo <htejun@gmail.com>
---
 drivers/scsi/libata-core.c |  4 ++--
 include/linux/libata.h     | 19 +++++++++++++++++--
 2 files changed, 19 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c
index 966abb5f423e..1c34c1427aa3 100644
--- a/drivers/scsi/libata-core.c
+++ b/drivers/scsi/libata-core.c
@@ -1001,7 +1001,7 @@ unsigned ata_exec_internal(struct ata_device *dev,
 
 	if (test_and_set_bit(tag, &ap->qactive))
 		BUG();
-	qc = ata_qc_from_tag(ap, tag);
+	qc = __ata_qc_from_tag(ap, tag);
 
 	qc->tag = tag;
 	qc->scsicmd = NULL;
@@ -4042,7 +4042,7 @@ static struct ata_queued_cmd *ata_qc_new(struct ata_port *ap)
 	/* the last tag is reserved for internal command. */
 	for (i = 0; i < ATA_MAX_QUEUE - 1; i++)
 		if (!test_and_set_bit(i, &ap->qactive)) {
-			qc = ata_qc_from_tag(ap, i);
+			qc = __ata_qc_from_tag(ap, i);
 			break;
 		}
 
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 5a403e434ff8..bfcefdca0616 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -832,14 +832,29 @@ static inline void ata_qc_set_polling(struct ata_queued_cmd *qc)
 	qc->tf.ctl |= ATA_NIEN;
 }
 
-static inline struct ata_queued_cmd *ata_qc_from_tag (struct ata_port *ap,
-						      unsigned int tag)
+static inline struct ata_queued_cmd *__ata_qc_from_tag(struct ata_port *ap,
+						       unsigned int tag)
 {
 	if (likely(ata_tag_valid(tag)))
 		return &ap->qcmd[tag];
 	return NULL;
 }
 
+static inline struct ata_queued_cmd *ata_qc_from_tag(struct ata_port *ap,
+						     unsigned int tag)
+{
+	struct ata_queued_cmd *qc = __ata_qc_from_tag(ap, tag);
+
+	if (unlikely(!qc) || !ap->ops->error_handler)
+		return qc;
+
+	if ((qc->flags & (ATA_QCFLAG_ACTIVE |
+			  ATA_QCFLAG_FAILED)) == ATA_QCFLAG_ACTIVE)
+		return qc;
+
+	return NULL;
+}
+
 static inline void ata_tf_init(struct ata_device *dev, struct ata_taskfile *tf)
 {
 	memset(tf, 0, sizeof(*tf));
-- 
cgit v1.2.3


From f686bcb8078ac7505ec88818886c2c72639f4fc5 Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Mon, 15 May 2006 20:58:05 +0900
Subject: [PATCH] libata-eh-fw: implement new EH scheduling via error
 completion

There are several ways a qc can get schedule for EH in new EH.  This
patch implements one of them - completing a qc with ATA_QCFLAG_FAILED
set or with non-zero qc->err_mask.  ALL such qc's are examined by EH.

New EH schedules a qc for EH from completion iff ->error_handler is
implemented, qc is marked as failed or qc->err_mask is non-zero and
the command is not an internal command (internal cmd is handled via
->post_internal_cmd).  The EH scheduling itself is performed by asking
SCSI midlayer to schedule EH for the specified scmd.

For drivers implementing old-EH, nothing changes.  As this change
makes ata_qc_complete() rather large, it's not inlined anymore and
__ata_qc_complete() is exported to other parts of libata for later
use.

Signed-off-by: Tejun Heo <htejun@gmail.com>
---
 drivers/scsi/libata-core.c | 62 +++++++++++++++++++++++++++++++++++++++++++++-
 drivers/scsi/libata-eh.c   | 27 ++++++++++++++++++++
 drivers/scsi/libata.h      |  2 ++
 include/linux/libata.h     | 27 +-------------------
 4 files changed, 91 insertions(+), 27 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c
index 1c34c1427aa3..1f5c3270992a 100644
--- a/drivers/scsi/libata-core.c
+++ b/drivers/scsi/libata-core.c
@@ -4123,6 +4123,66 @@ void __ata_qc_complete(struct ata_queued_cmd *qc)
 	qc->complete_fn(qc);
 }
 
+/**
+ *	ata_qc_complete - Complete an active ATA command
+ *	@qc: Command to complete
+ *	@err_mask: ATA Status register contents
+ *
+ *	Indicate to the mid and upper layers that an ATA
+ *	command has completed, with either an ok or not-ok status.
+ *
+ *	LOCKING:
+ *	spin_lock_irqsave(host_set lock)
+ */
+void ata_qc_complete(struct ata_queued_cmd *qc)
+{
+	struct ata_port *ap = qc->ap;
+
+	/* XXX: New EH and old EH use different mechanisms to
+	 * synchronize EH with regular execution path.
+	 *
+	 * In new EH, a failed qc is marked with ATA_QCFLAG_FAILED.
+	 * Normal execution path is responsible for not accessing a
+	 * failed qc.  libata core enforces the rule by returning NULL
+	 * from ata_qc_from_tag() for failed qcs.
+	 *
+	 * Old EH depends on ata_qc_complete() nullifying completion
+	 * requests if ATA_QCFLAG_EH_SCHEDULED is set.  Old EH does
+	 * not synchronize with interrupt handler.  Only PIO task is
+	 * taken care of.
+	 */
+	if (ap->ops->error_handler) {
+		WARN_ON(ap->flags & ATA_FLAG_FROZEN);
+
+		if (unlikely(qc->err_mask))
+			qc->flags |= ATA_QCFLAG_FAILED;
+
+		if (unlikely(qc->flags & ATA_QCFLAG_FAILED)) {
+			if (!ata_tag_internal(qc->tag)) {
+				/* always fill result TF for failed qc */
+				ap->ops->tf_read(ap, &qc->result_tf);
+				ata_qc_schedule_eh(qc);
+				return;
+			}
+		}
+
+		/* read result TF if requested */
+		if (qc->flags & ATA_QCFLAG_RESULT_TF)
+			ap->ops->tf_read(ap, &qc->result_tf);
+
+		__ata_qc_complete(qc);
+	} else {
+		if (qc->flags & ATA_QCFLAG_EH_SCHEDULED)
+			return;
+
+		/* read result TF if failed or requested */
+		if (qc->err_mask || qc->flags & ATA_QCFLAG_RESULT_TF)
+			ap->ops->tf_read(ap, &qc->result_tf);
+
+		__ata_qc_complete(qc);
+	}
+}
+
 static inline int ata_should_dma_map(struct ata_queued_cmd *qc)
 {
 	struct ata_port *ap = qc->ap;
@@ -5245,7 +5305,7 @@ EXPORT_SYMBOL_GPL(ata_device_add);
 EXPORT_SYMBOL_GPL(ata_host_set_remove);
 EXPORT_SYMBOL_GPL(ata_sg_init);
 EXPORT_SYMBOL_GPL(ata_sg_init_one);
-EXPORT_SYMBOL_GPL(__ata_qc_complete);
+EXPORT_SYMBOL_GPL(ata_qc_complete);
 EXPORT_SYMBOL_GPL(ata_qc_issue_prot);
 EXPORT_SYMBOL_GPL(ata_tf_load);
 EXPORT_SYMBOL_GPL(ata_tf_read);
diff --git a/drivers/scsi/libata-eh.c b/drivers/scsi/libata-eh.c
index 959a1cdffac2..471846fe4b73 100644
--- a/drivers/scsi/libata-eh.c
+++ b/drivers/scsi/libata-eh.c
@@ -210,6 +210,33 @@ void ata_eng_timeout(struct ata_port *ap)
 	DPRINTK("EXIT\n");
 }
 
+/**
+ *	ata_qc_schedule_eh - schedule qc for error handling
+ *	@qc: command to schedule error handling for
+ *
+ *	Schedule error handling for @qc.  EH will kick in as soon as
+ *	other commands are drained.
+ *
+ *	LOCKING:
+ *	spin_lock_irqsave(host_set lock)
+ */
+void ata_qc_schedule_eh(struct ata_queued_cmd *qc)
+{
+	struct ata_port *ap = qc->ap;
+
+	WARN_ON(!ap->ops->error_handler);
+
+	qc->flags |= ATA_QCFLAG_FAILED;
+	qc->ap->flags |= ATA_FLAG_EH_PENDING;
+
+	/* The following will fail if timeout has already expired.
+	 * ata_scsi_error() takes care of such scmds on EH entry.
+	 * Note that ATA_QCFLAG_FAILED is unconditionally set after
+	 * this function completes.
+	 */
+	scsi_req_abort_cmd(qc->scsicmd);
+}
+
 static void ata_eh_scsidone(struct scsi_cmnd *scmd)
 {
 	/* nada */
diff --git a/drivers/scsi/libata.h b/drivers/scsi/libata.h
index c9ff83bbcfae..52622b7f8a9e 100644
--- a/drivers/scsi/libata.h
+++ b/drivers/scsi/libata.h
@@ -57,6 +57,7 @@ extern int ata_do_reset(struct ata_port *ap, ata_reset_fn_t reset,
 			unsigned int *classes);
 extern void ata_qc_free(struct ata_queued_cmd *qc);
 extern void ata_qc_issue(struct ata_queued_cmd *qc);
+extern void __ata_qc_complete(struct ata_queued_cmd *qc);
 extern int ata_check_atapi_dma(struct ata_queued_cmd *qc);
 extern void ata_dev_select(struct ata_port *ap, unsigned int device,
                            unsigned int wait, unsigned int can_sleep);
@@ -101,5 +102,6 @@ extern void ata_scsi_rbuf_fill(struct ata_scsi_args *args,
 /* libata-eh.c */
 extern enum scsi_eh_timer_return ata_scsi_timed_out(struct scsi_cmnd *cmd);
 extern void ata_scsi_error(struct Scsi_Host *host);
+extern void ata_qc_schedule_eh(struct ata_queued_cmd *qc);
 
 #endif /* __LIBATA_H__ */
diff --git a/include/linux/libata.h b/include/linux/libata.h
index bfcefdca0616..6023f324e68e 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -605,7 +605,7 @@ extern void ata_bmdma_start (struct ata_queued_cmd *qc);
 extern void ata_bmdma_stop(struct ata_queued_cmd *qc);
 extern u8   ata_bmdma_status(struct ata_port *ap);
 extern void ata_bmdma_irq_clear(struct ata_port *ap);
-extern void __ata_qc_complete(struct ata_queued_cmd *qc);
+extern void ata_qc_complete(struct ata_queued_cmd *qc);
 extern void ata_scsi_simulate(struct ata_device *dev, struct scsi_cmnd *cmd,
 			      void (*done)(struct scsi_cmnd *));
 extern int ata_std_bios_param(struct scsi_device *sdev,
@@ -882,31 +882,6 @@ static inline void ata_qc_reinit(struct ata_queued_cmd *qc)
 	qc->result_tf.feature = 0;
 }
 
-/**
- *	ata_qc_complete - Complete an active ATA command
- *	@qc: Command to complete
- *	@err_mask: ATA Status register contents
- *
- *	Indicate to the mid and upper layers that an ATA
- *	command has completed, with either an ok or not-ok status.
- *
- *	LOCKING:
- *	spin_lock_irqsave(host_set lock)
- */
-static inline void ata_qc_complete(struct ata_queued_cmd *qc)
-{
-	struct ata_port *ap = qc->ap;
-
-	if (unlikely(qc->flags & ATA_QCFLAG_EH_SCHEDULED))
-		return;
-
-	/* read result TF if failed or requested */
-	if (qc->err_mask || qc->flags & ATA_QCFLAG_RESULT_TF)
-		ap->ops->tf_read(ap, &qc->result_tf);
-
-	__ata_qc_complete(qc);
-}
-
 /**
  *	ata_irq_on - Enable interrupts on a port.
  *	@ap: Port on which interrupts are enabled.
-- 
cgit v1.2.3


From 7b70fc039824bc7303e4007a5f758f832de56611 Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Mon, 15 May 2006 20:58:07 +0900
Subject: [PATCH] libata-eh-fw: implement ata_port_schedule_eh() and
 ata_port_abort()

ata_port_schedule_eh() directly schedules EH for @ap without
associated qc.  Once EH scheduled, no further qc is allowed and EH
kicks in as soon as all currently active qc's are drained.

ata_port_abort() schedules all currently active commands for EH by
qc_completing them with ATA_QCFLAG_FAILED set.  If ata_port_abort()
doesn't find any qc to abort, it directly schedule EH using
ata_port_schedule_eh().

These two functions provide ways to invoke EH for conditions which
aren't directly related to any specfic qc.

Signed-off-by: Tejun Heo <htejun@gmail.com>
---
 drivers/scsi/libata-core.c |  2 ++
 drivers/scsi/libata-eh.c   | 54 ++++++++++++++++++++++++++++++++++++++++++++++
 drivers/scsi/libata-scsi.c | 23 ++++++++++++++++++++
 drivers/scsi/libata.h      |  1 +
 include/linux/libata.h     |  4 ++++
 5 files changed, 84 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c
index 1f5c3270992a..9c97783462d6 100644
--- a/drivers/scsi/libata-core.c
+++ b/drivers/scsi/libata-core.c
@@ -5383,5 +5383,7 @@ EXPORT_SYMBOL_GPL(ata_scsi_device_suspend);
 EXPORT_SYMBOL_GPL(ata_scsi_device_resume);
 
 EXPORT_SYMBOL_GPL(ata_eng_timeout);
+EXPORT_SYMBOL_GPL(ata_port_schedule_eh);
+EXPORT_SYMBOL_GPL(ata_port_abort);
 EXPORT_SYMBOL_GPL(ata_eh_qc_complete);
 EXPORT_SYMBOL_GPL(ata_eh_qc_retry);
diff --git a/drivers/scsi/libata-eh.c b/drivers/scsi/libata-eh.c
index 471846fe4b73..037a561809f5 100644
--- a/drivers/scsi/libata-eh.c
+++ b/drivers/scsi/libata-eh.c
@@ -237,6 +237,60 @@ void ata_qc_schedule_eh(struct ata_queued_cmd *qc)
 	scsi_req_abort_cmd(qc->scsicmd);
 }
 
+/**
+ *	ata_port_schedule_eh - schedule error handling without a qc
+ *	@ap: ATA port to schedule EH for
+ *
+ *	Schedule error handling for @ap.  EH will kick in as soon as
+ *	all commands are drained.
+ *
+ *	LOCKING:
+ *	spin_lock_irqsave(host_set lock)
+ */
+void ata_port_schedule_eh(struct ata_port *ap)
+{
+	WARN_ON(!ap->ops->error_handler);
+
+	ap->flags |= ATA_FLAG_EH_PENDING;
+	ata_schedule_scsi_eh(ap->host);
+
+	DPRINTK("port EH scheduled\n");
+}
+
+/**
+ *	ata_port_abort - abort all qc's on the port
+ *	@ap: ATA port to abort qc's for
+ *
+ *	Abort all active qc's of @ap and schedule EH.
+ *
+ *	LOCKING:
+ *	spin_lock_irqsave(host_set lock)
+ *
+ *	RETURNS:
+ *	Number of aborted qc's.
+ */
+int ata_port_abort(struct ata_port *ap)
+{
+	int tag, nr_aborted = 0;
+
+	WARN_ON(!ap->ops->error_handler);
+
+	for (tag = 0; tag < ATA_MAX_QUEUE; tag++) {
+		struct ata_queued_cmd *qc = ata_qc_from_tag(ap, tag);
+
+		if (qc) {
+			qc->flags |= ATA_QCFLAG_FAILED;
+			ata_qc_complete(qc);
+			nr_aborted++;
+		}
+	}
+
+	if (!nr_aborted)
+		ata_port_schedule_eh(ap);
+
+	return nr_aborted;
+}
+
 static void ata_eh_scsidone(struct scsi_cmnd *scmd)
 {
 	/* nada */
diff --git a/drivers/scsi/libata-scsi.c b/drivers/scsi/libata-scsi.c
index a9b4083a4f67..fd7064b9697d 100644
--- a/drivers/scsi/libata-scsi.c
+++ b/drivers/scsi/libata-scsi.c
@@ -2596,3 +2596,26 @@ void ata_scsi_scan_host(struct ata_port *ap)
 	}
 }
 
+/**
+ *	ata_schedule_scsi_eh - schedule EH for SCSI host
+ *	@shost:	SCSI host to invoke error handling on.
+ *
+ *	Schedule SCSI EH without scmd.  This is a hack.
+ *
+ *	LOCKING:
+ *	spin_lock_irqsave(host_set lock)
+ **/
+void ata_schedule_scsi_eh(struct Scsi_Host *shost)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(shost->host_lock, flags);
+
+	if (scsi_host_set_state(shost, SHOST_RECOVERY) == 0 ||
+	    scsi_host_set_state(shost, SHOST_CANCEL_RECOVERY) == 0) {
+		shost->host_eh_scheduled++;
+		scsi_eh_wakeup(shost);
+	}
+
+	spin_unlock_irqrestore(shost->host_lock, flags);
+}
diff --git a/drivers/scsi/libata.h b/drivers/scsi/libata.h
index 52622b7f8a9e..b76ad7d7062a 100644
--- a/drivers/scsi/libata.h
+++ b/drivers/scsi/libata.h
@@ -98,6 +98,7 @@ extern void ata_scsi_set_sense(struct scsi_cmnd *cmd,
 extern void ata_scsi_rbuf_fill(struct ata_scsi_args *args,
                         unsigned int (*actor) (struct ata_scsi_args *args,
                                            u8 *rbuf, unsigned int buflen));
+extern void ata_schedule_scsi_eh(struct Scsi_Host *shost);
 
 /* libata-eh.c */
 extern enum scsi_eh_timer_return ata_scsi_timed_out(struct scsi_cmnd *cmd);
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 6023f324e68e..086e14690954 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -662,6 +662,10 @@ extern unsigned long ata_pci_default_filter(const struct ata_port *, struct ata_
  * EH
  */
 extern void ata_eng_timeout(struct ata_port *ap);
+
+extern void ata_port_schedule_eh(struct ata_port *ap);
+extern int ata_port_abort(struct ata_port *ap);
+
 extern void ata_eh_qc_complete(struct ata_queued_cmd *qc);
 extern void ata_eh_qc_retry(struct ata_queued_cmd *qc);
 
-- 
cgit v1.2.3


From e318049949b07152d851dbfebbd93e560af45ebe Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Mon, 15 May 2006 20:58:09 +0900
Subject: [PATCH] libata-eh-fw: implement freeze/thaw

Freezing is performed atomic w.r.t. host_set->lock and once frozen
LLDD is not allowed to access the port or any qc on it.  Also, libata
makes sure that no new qc gets issued to a frozen port.

A frozen port is thawed after a reset operation completes
successfully, so reset methods must do its job while the port is
frozen.  During initialization all ports get frozen before requesting
IRQ, so reset methods are always invoked on a frozen port.

Optional ->freeze and ->thaw operations notify LLDD that the port is
being frozen and thawed, respectively.  LLDD can disable/enable
hardware interrupt in these callbacks if the controller's IRQ mask can
be changed dynamically.  If the controller doesn't allow such
operation, LLDD can check for frozen state in the interrupt handler
and ack/clear interrupts unconditionally while frozen.

Signed-off-by: Tejun Heo <htejun@gmail.com>
---
 drivers/scsi/libata-core.c |  26 +++++++++++-
 drivers/scsi/libata-eh.c   | 103 +++++++++++++++++++++++++++++++++++++++++++++
 include/linux/libata.h     |   4 ++
 3 files changed, 131 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c
index 9c97783462d6..63857a90ac28 100644
--- a/drivers/scsi/libata-core.c
+++ b/drivers/scsi/libata-core.c
@@ -987,6 +987,12 @@ unsigned ata_exec_internal(struct ata_device *dev,
 
 	spin_lock_irqsave(&ap->host_set->lock, flags);
 
+	/* no internal command while frozen */
+	if (ap->flags & ATA_FLAG_FROZEN) {
+		spin_unlock_irqrestore(&ap->host_set->lock, flags);
+		return AC_ERR_SYSTEM;
+	}
+
 	/* initialize internal qc */
 
 	/* XXX: Tag 0 is used for drivers with legacy EH as some
@@ -2565,8 +2571,11 @@ void ata_std_postreset(struct ata_port *ap, unsigned int *classes)
 		sata_scr_write(ap, SCR_ERROR, serror);
 
 	/* re-enable interrupts */
-	if (ap->ioaddr.ctl_addr)	/* FIXME: hack. create a hook instead */
-		ata_irq_on(ap);
+	if (!ap->ops->error_handler) {
+		/* FIXME: hack. create a hook instead */
+		if (ap->ioaddr.ctl_addr)
+			ata_irq_on(ap);
+	}
 
 	/* is double-select really necessary? */
 	if (classes[0] != ATA_DEV_NONE)
@@ -2681,6 +2690,8 @@ int ata_drive_probe_reset(struct ata_port *ap, ata_probeinit_fn_t probeinit,
 {
 	int rc = -EINVAL;
 
+	ata_eh_freeze_port(ap);
+
 	if (probeinit)
 		probeinit(ap);
 
@@ -2725,6 +2736,9 @@ int ata_drive_probe_reset(struct ata_port *ap, ata_probeinit_fn_t probeinit,
 	if (rc == 0) {
 		if (postreset)
 			postreset(ap, classes);
+
+		ata_eh_thaw_port(ap);
+
 		if (classes[0] == ATA_DEV_UNKNOWN)
 			rc = -ENODEV;
 	}
@@ -4039,6 +4053,10 @@ static struct ata_queued_cmd *ata_qc_new(struct ata_port *ap)
 	struct ata_queued_cmd *qc = NULL;
 	unsigned int i;
 
+	/* no command while frozen */
+	if (unlikely(ap->flags & ATA_FLAG_FROZEN))
+		return NULL;
+
 	/* the last tag is reserved for internal command. */
 	for (i = 0; i < ATA_MAX_QUEUE - 1; i++)
 		if (!test_and_set_bit(i, &ap->qactive)) {
@@ -4953,6 +4971,7 @@ int ata_device_add(const struct ata_probe_ent *ent)
 
 		ata_chk_status(ap);
 		host_set->ops->irq_clear(ap);
+		ata_eh_freeze_port(ap);	/* freeze port before requesting IRQ */
 		count++;
 	}
 
@@ -5385,5 +5404,8 @@ EXPORT_SYMBOL_GPL(ata_scsi_device_resume);
 EXPORT_SYMBOL_GPL(ata_eng_timeout);
 EXPORT_SYMBOL_GPL(ata_port_schedule_eh);
 EXPORT_SYMBOL_GPL(ata_port_abort);
+EXPORT_SYMBOL_GPL(ata_port_freeze);
+EXPORT_SYMBOL_GPL(ata_eh_freeze_port);
+EXPORT_SYMBOL_GPL(ata_eh_thaw_port);
 EXPORT_SYMBOL_GPL(ata_eh_qc_complete);
 EXPORT_SYMBOL_GPL(ata_eh_qc_retry);
diff --git a/drivers/scsi/libata-eh.c b/drivers/scsi/libata-eh.c
index 037a561809f5..cb4e2b8d32d9 100644
--- a/drivers/scsi/libata-eh.c
+++ b/drivers/scsi/libata-eh.c
@@ -291,6 +291,109 @@ int ata_port_abort(struct ata_port *ap)
 	return nr_aborted;
 }
 
+/**
+ *	__ata_port_freeze - freeze port
+ *	@ap: ATA port to freeze
+ *
+ *	This function is called when HSM violation or some other
+ *	condition disrupts normal operation of the port.  Frozen port
+ *	is not allowed to perform any operation until the port is
+ *	thawed, which usually follows a successful reset.
+ *
+ *	ap->ops->freeze() callback can be used for freezing the port
+ *	hardware-wise (e.g. mask interrupt and stop DMA engine).  If a
+ *	port cannot be frozen hardware-wise, the interrupt handler
+ *	must ack and clear interrupts unconditionally while the port
+ *	is frozen.
+ *
+ *	LOCKING:
+ *	spin_lock_irqsave(host_set lock)
+ */
+static void __ata_port_freeze(struct ata_port *ap)
+{
+	WARN_ON(!ap->ops->error_handler);
+
+	if (ap->ops->freeze)
+		ap->ops->freeze(ap);
+
+	ap->flags |= ATA_FLAG_FROZEN;
+
+	DPRINTK("ata%u port frozen\n", ap->id);
+}
+
+/**
+ *	ata_port_freeze - abort & freeze port
+ *	@ap: ATA port to freeze
+ *
+ *	Abort and freeze @ap.
+ *
+ *	LOCKING:
+ *	spin_lock_irqsave(host_set lock)
+ *
+ *	RETURNS:
+ *	Number of aborted commands.
+ */
+int ata_port_freeze(struct ata_port *ap)
+{
+	int nr_aborted;
+
+	WARN_ON(!ap->ops->error_handler);
+
+	nr_aborted = ata_port_abort(ap);
+	__ata_port_freeze(ap);
+
+	return nr_aborted;
+}
+
+/**
+ *	ata_eh_freeze_port - EH helper to freeze port
+ *	@ap: ATA port to freeze
+ *
+ *	Freeze @ap.
+ *
+ *	LOCKING:
+ *	None.
+ */
+void ata_eh_freeze_port(struct ata_port *ap)
+{
+	unsigned long flags;
+
+	if (!ap->ops->error_handler)
+		return;
+
+	spin_lock_irqsave(&ap->host_set->lock, flags);
+	__ata_port_freeze(ap);
+	spin_unlock_irqrestore(&ap->host_set->lock, flags);
+}
+
+/**
+ *	ata_port_thaw_port - EH helper to thaw port
+ *	@ap: ATA port to thaw
+ *
+ *	Thaw frozen port @ap.
+ *
+ *	LOCKING:
+ *	None.
+ */
+void ata_eh_thaw_port(struct ata_port *ap)
+{
+	unsigned long flags;
+
+	if (!ap->ops->error_handler)
+		return;
+
+	spin_lock_irqsave(&ap->host_set->lock, flags);
+
+	ap->flags &= ~ATA_FLAG_FROZEN;
+
+	if (ap->ops->thaw)
+		ap->ops->thaw(ap);
+
+	spin_unlock_irqrestore(&ap->host_set->lock, flags);
+
+	DPRINTK("ata%u port thawed\n", ap->id);
+}
+
 static void ata_eh_scsidone(struct scsi_cmnd *scmd)
 {
 	/* nada */
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 086e14690954..6758b4d374a0 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -665,6 +665,10 @@ extern void ata_eng_timeout(struct ata_port *ap);
 
 extern void ata_port_schedule_eh(struct ata_port *ap);
 extern int ata_port_abort(struct ata_port *ap);
+extern int ata_port_freeze(struct ata_port *ap);
+
+extern void ata_eh_freeze_port(struct ata_port *ap);
+extern void ata_eh_thaw_port(struct ata_port *ap);
 
 extern void ata_eh_qc_complete(struct ata_queued_cmd *qc);
 extern void ata_eh_qc_retry(struct ata_queued_cmd *qc);
-- 
cgit v1.2.3


From ad9e27624479bd167dd7eac0cea4bb3ad13bc926 Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Mon, 15 May 2006 20:58:12 +0900
Subject: [PATCH] libata-eh-fw: update ata_scsi_error() for new EH

Update ata_scsi_error() for new EH.  ata_scsi_error() is responsible
for claiming timed out qcs and invoking ->error_handler in safe and
synchronized manner.  As the state of the controller is unknown if a
qc has timed out, the port is frozen in such cases.

Note that ata_scsi_timed_out() isn't used for new EH.  This is because
a timed out qc cannot be claimed by EH without freezing the port and
freezing the port in ata_scsi_timed_out() results in unnecessary
abortion of other active qcs.  ata_scsi_timed_out() can be removed
once all drivers are converted to new EH.

While at it, add 'TODO: kill' comments to old EH functions.

Signed-off-by: Tejun Heo <htejun@gmail.com>
---
 drivers/scsi/libata-eh.c | 136 +++++++++++++++++++++++++++++++++++++++++++++--
 include/linux/libata.h   |   3 ++
 2 files changed, 134 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/scsi/libata-eh.c b/drivers/scsi/libata-eh.c
index cb4e2b8d32d9..0803231f6577 100644
--- a/drivers/scsi/libata-eh.c
+++ b/drivers/scsi/libata-eh.c
@@ -44,6 +44,8 @@
 
 #include "libata.h"
 
+static void __ata_port_freeze(struct ata_port *ap);
+
 /**
  *	ata_scsi_timed_out - SCSI layer time out callback
  *	@cmd: timed out SCSI command
@@ -55,6 +57,8 @@
  *	from finishing it by setting EH_SCHEDULED and return
  *	EH_NOT_HANDLED.
  *
+ *	TODO: kill this function once old EH is gone.
+ *
  *	LOCKING:
  *	Called from timer context
  *
@@ -67,10 +71,16 @@ enum scsi_eh_timer_return ata_scsi_timed_out(struct scsi_cmnd *cmd)
 	struct ata_port *ap = ata_shost_to_port(host);
 	unsigned long flags;
 	struct ata_queued_cmd *qc;
-	enum scsi_eh_timer_return ret = EH_HANDLED;
+	enum scsi_eh_timer_return ret;
 
 	DPRINTK("ENTER\n");
 
+	if (ap->ops->error_handler) {
+		ret = EH_NOT_HANDLED;
+		goto out;
+	}
+
+	ret = EH_HANDLED;
 	spin_lock_irqsave(&ap->host_set->lock, flags);
 	qc = ata_qc_from_tag(ap, ap->active_tag);
 	if (qc) {
@@ -81,6 +91,7 @@ enum scsi_eh_timer_return ata_scsi_timed_out(struct scsi_cmnd *cmd)
 	}
 	spin_unlock_irqrestore(&ap->host_set->lock, flags);
 
+ out:
 	DPRINTK("EXIT, ret=%d\n", ret);
 	return ret;
 }
@@ -100,21 +111,132 @@ enum scsi_eh_timer_return ata_scsi_timed_out(struct scsi_cmnd *cmd)
 void ata_scsi_error(struct Scsi_Host *host)
 {
 	struct ata_port *ap = ata_shost_to_port(host);
+	spinlock_t *hs_lock = &ap->host_set->lock;
+	int i, repeat_cnt = ATA_EH_MAX_REPEAT;
+	unsigned long flags;
 
 	DPRINTK("ENTER\n");
 
-	/* synchronize with IRQ handler and port task */
-	spin_unlock_wait(&ap->host_set->lock);
+	/* synchronize with port task */
 	ata_port_flush_task(ap);
 
-	WARN_ON(ata_qc_from_tag(ap, ap->active_tag) == NULL);
+	/* synchronize with host_set lock and sort out timeouts */
+
+	/* For new EH, all qcs are finished in one of three ways -
+	 * normal completion, error completion, and SCSI timeout.
+	 * Both cmpletions can race against SCSI timeout.  When normal
+	 * completion wins, the qc never reaches EH.  When error
+	 * completion wins, the qc has ATA_QCFLAG_FAILED set.
+	 *
+	 * When SCSI timeout wins, things are a bit more complex.
+	 * Normal or error completion can occur after the timeout but
+	 * before this point.  In such cases, both types of
+	 * completions are honored.  A scmd is determined to have
+	 * timed out iff its associated qc is active and not failed.
+	 */
+	if (ap->ops->error_handler) {
+		struct scsi_cmnd *scmd, *tmp;
+		int nr_timedout = 0;
+
+		spin_lock_irqsave(hs_lock, flags);
+
+		list_for_each_entry_safe(scmd, tmp, &host->eh_cmd_q, eh_entry) {
+			struct ata_queued_cmd *qc;
+
+			for (i = 0; i < ATA_MAX_QUEUE; i++) {
+				qc = __ata_qc_from_tag(ap, i);
+				if (qc->flags & ATA_QCFLAG_ACTIVE &&
+				    qc->scsicmd == scmd)
+					break;
+			}
+
+			if (i < ATA_MAX_QUEUE) {
+				/* the scmd has an associated qc */
+				if (!(qc->flags & ATA_QCFLAG_FAILED)) {
+					/* which hasn't failed yet, timeout */
+					qc->err_mask |= AC_ERR_TIMEOUT;
+					qc->flags |= ATA_QCFLAG_FAILED;
+					nr_timedout++;
+				}
+			} else {
+				/* Normal completion occurred after
+				 * SCSI timeout but before this point.
+				 * Successfully complete it.
+				 */
+				scmd->retries = scmd->allowed;
+				scsi_eh_finish_cmd(scmd, &ap->eh_done_q);
+			}
+		}
+
+		/* If we have timed out qcs.  They belong to EH from
+		 * this point but the state of the controller is
+		 * unknown.  Freeze the port to make sure the IRQ
+		 * handler doesn't diddle with those qcs.  This must
+		 * be done atomically w.r.t. setting QCFLAG_FAILED.
+		 */
+		if (nr_timedout)
+			__ata_port_freeze(ap);
+
+		spin_unlock_irqrestore(hs_lock, flags);
+	} else
+		spin_unlock_wait(hs_lock);
+
+ repeat:
+	/* invoke error handler */
+	if (ap->ops->error_handler) {
+		/* clear EH pending */
+		spin_lock_irqsave(hs_lock, flags);
+		ap->flags &= ~ATA_FLAG_EH_PENDING;
+		spin_unlock_irqrestore(hs_lock, flags);
+
+		/* invoke EH */
+		ap->ops->error_handler(ap);
+
+		/* Exception might have happend after ->error_handler
+		 * recovered the port but before this point.  Repeat
+		 * EH in such case.
+		 */
+		spin_lock_irqsave(hs_lock, flags);
+
+		if (ap->flags & ATA_FLAG_EH_PENDING) {
+			if (--repeat_cnt) {
+				ata_port_printk(ap, KERN_INFO,
+					"EH pending after completion, "
+					"repeating EH (cnt=%d)\n", repeat_cnt);
+				spin_unlock_irqrestore(hs_lock, flags);
+				goto repeat;
+			}
+			ata_port_printk(ap, KERN_ERR, "EH pending after %d "
+					"tries, giving up\n", ATA_EH_MAX_REPEAT);
+		}
 
-	ap->ops->eng_timeout(ap);
+		/* Clear host_eh_scheduled while holding hs_lock such
+		 * that if exception occurs after this point but
+		 * before EH completion, SCSI midlayer will
+		 * re-initiate EH.
+		 */
+		host->host_eh_scheduled = 0;
+
+		spin_unlock_irqrestore(hs_lock, flags);
+	} else {
+		WARN_ON(ata_qc_from_tag(ap, ap->active_tag) == NULL);
+		ap->ops->eng_timeout(ap);
+	}
 
+	/* finish or retry handled scmd's and clean up */
 	WARN_ON(host->host_failed || !list_empty(&host->eh_cmd_q));
 
 	scsi_eh_flush_done_q(&ap->eh_done_q);
 
+	/* clean up */
+	spin_lock_irqsave(hs_lock, flags);
+
+	if (ap->flags & ATA_FLAG_RECOVERED)
+		ata_port_printk(ap, KERN_INFO, "EH complete\n");
+	ap->flags &= ~ATA_FLAG_RECOVERED;
+
+	spin_unlock_irqrestore(hs_lock, flags);
+
 	DPRINTK("EXIT\n");
 }
 
@@ -133,6 +255,8 @@ void ata_scsi_error(struct Scsi_Host *host)
  *	an interrupt was not delivered to the driver, even though the
  *	transaction completed successfully.
  *
+ *	TODO: kill this function once old EH is gone.
+ *
  *	LOCKING:
  *	Inherited from SCSI layer (none, can sleep)
  */
@@ -198,6 +322,8 @@ static void ata_qc_timeout(struct ata_queued_cmd *qc)
  *	an interrupt was not delivered to the driver, even though the
  *	transaction completed successfully.
  *
+ *	TODO: kill this function once old EH is gone.
+ *
  *	LOCKING:
  *	Inherited from SCSI layer (none, can sleep)
  */
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 6758b4d374a0..5ad50163c8ef 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -225,6 +225,9 @@ enum {
 	ATA_PORT_PRIMARY	= (1 << 0),
 	ATA_PORT_SECONDARY	= (1 << 1),
 
+	/* max repeat if error condition is still set after ->error_handler */
+	ATA_EH_MAX_REPEAT	= 5,
+
 	/* how hard are we gonna try to probe/recover devices */
 	ATA_PROBE_MAX_TRIES	= 3,
 };
-- 
cgit v1.2.3


From 9be1e979f2e1e57a091a658fa88dac266f9fd6fe Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Mon, 15 May 2006 20:58:17 +0900
Subject: [PATCH] libata-eh: add ATA and libata flags for new EH

Add ATA and libata flags to be used by new EH.

Signed-off-by: Tejun Heo <htejun@gmail.com>
---
 include/linux/ata.h    | 13 +++++++++++++
 include/linux/libata.h |  8 ++++++++
 2 files changed, 21 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ata.h b/include/linux/ata.h
index 312a2c0c64e6..a7c41f3df8f4 100644
--- a/include/linux/ata.h
+++ b/include/linux/ata.h
@@ -97,6 +97,9 @@ enum {
 	ATA_DRQ			= (1 << 3),	/* data request i/o */
 	ATA_ERR			= (1 << 0),	/* have an error */
 	ATA_SRST		= (1 << 2),	/* software reset */
+	ATA_ICRC		= (1 << 7),	/* interface CRC error */
+	ATA_UNC			= (1 << 6),	/* uncorrectable media error */
+	ATA_IDNF		= (1 << 4),	/* ID not found */
 	ATA_ABORTED		= (1 << 2),	/* command aborted */
 
 	/* ATA command block registers */
@@ -192,6 +195,16 @@ enum {
 	SCR_ACTIVE		= 3,
 	SCR_NOTIFICATION	= 4,
 
+	/* SError bits */
+	SERR_DATA_RECOVERED	= (1 << 0), /* recovered data error */
+	SERR_COMM_RECOVERED	= (1 << 1), /* recovered comm failure */
+	SERR_DATA		= (1 << 8), /* unrecovered data error */
+	SERR_PERSISTENT		= (1 << 9), /* persistent data/comm error */
+	SERR_PROTOCOL		= (1 << 10), /* protocol violation */
+	SERR_INTERNAL		= (1 << 11), /* host internal error */
+	SERR_PHYRDY_CHG		= (1 << 16), /* PHY RDY changed */
+	SERR_DEV_XCHG		= (1 << 26), /* device exchanged */
+
 	/* struct ata_taskfile flags */
 	ATA_TFLAG_LBA48		= (1 << 0), /* enable 48-bit LBA and "HOB" */
 	ATA_TFLAG_ISADDR	= (1 << 1), /* enable r/w to nsect/lba regs */
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 5ad50163c8ef..6fe5ed8eabf5 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -155,6 +155,7 @@ enum {
 
 	ATA_FLAG_EH_PENDING	= (1 << 16), /* EH pending */
 	ATA_FLAG_FROZEN		= (1 << 17), /* port is frozen */
+	ATA_FLAG_RECOVERED	= (1 << 18), /* recovery action performed */
 
 	ATA_FLAG_DISABLED	= (1 << 22), /* port is disabled, ignore it */
 	ATA_FLAG_SUSPENDED	= (1 << 23), /* port is suspended (power) */
@@ -225,6 +226,13 @@ enum {
 	ATA_PORT_PRIMARY	= (1 << 0),
 	ATA_PORT_SECONDARY	= (1 << 1),
 
+	/* reset / recovery action types */
+	ATA_EH_REVALIDATE	= (1 << 0),
+	ATA_EH_SOFTRESET	= (1 << 1),
+	ATA_EH_HARDRESET	= (1 << 2),
+
+	ATA_EH_RESET_MASK	= ATA_EH_SOFTRESET | ATA_EH_HARDRESET,
+
 	/* max repeat if error condition is still set after ->error_handler */
 	ATA_EH_MAX_REPEAT	= 5,
 
-- 
cgit v1.2.3


From 0c247c559cd70f85ba9f0764ce13ae00e20fcad8 Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Mon, 15 May 2006 20:58:19 +0900
Subject: [PATCH] libata-eh: implement dev->ering

This patch implements ata_ering and uses it to define dev->ering.

ata_ering is a ring buffer which records libata errors - whether a
command was for normar IO request, err_mask and timestamp.  Errors are
recorded per-device in dev->ering.  This will be used by EH to
determine recovery actions.

Signed-off-by: Tejun Heo <htejun@gmail.com>
---
 drivers/scsi/libata-eh.c | 45 +++++++++++++++++++++++++++++++++++++++++++++
 include/linux/libata.h   | 17 +++++++++++++++++
 2 files changed, 62 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/scsi/libata-eh.c b/drivers/scsi/libata-eh.c
index 0803231f6577..71ad18b7cff6 100644
--- a/drivers/scsi/libata-eh.c
+++ b/drivers/scsi/libata-eh.c
@@ -46,6 +46,51 @@
 
 static void __ata_port_freeze(struct ata_port *ap);
 
+static void ata_ering_record(struct ata_ering *ering, int is_io,
+			     unsigned int err_mask)
+{
+	struct ata_ering_entry *ent;
+
+	WARN_ON(!err_mask);
+
+	ering->cursor++;
+	ering->cursor %= ATA_ERING_SIZE;
+
+	ent = &ering->ring[ering->cursor];
+	ent->is_io = is_io;
+	ent->err_mask = err_mask;
+	ent->timestamp = get_jiffies_64();
+}
+
+static struct ata_ering_entry * ata_ering_top(struct ata_ering *ering)
+{
+	struct ata_ering_entry *ent = &ering->ring[ering->cursor];
+	if (!ent->err_mask)
+		return NULL;
+	return ent;
+}
+
+static int ata_ering_map(struct ata_ering *ering,
+			 int (*map_fn)(struct ata_ering_entry *, void *),
+			 void *arg)
+{
+	int idx, rc = 0;
+	struct ata_ering_entry *ent;
+
+	idx = ering->cursor;
+	do {
+		ent = &ering->ring[idx];
+		if (!ent->err_mask)
+			break;
+		rc = map_fn(ent, arg);
+		if (rc)
+			break;
+		idx = (idx - 1 + ATA_ERING_SIZE) % ATA_ERING_SIZE;
+	} while (idx != ering->cursor);
+
+	return rc;
+}
+
 /**
  *	ata_scsi_timed_out - SCSI layer time out callback
  *	@cmd: timed out SCSI command
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 6fe5ed8eabf5..f5cea13599c3 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -226,6 +226,9 @@ enum {
 	ATA_PORT_PRIMARY	= (1 << 0),
 	ATA_PORT_SECONDARY	= (1 << 1),
 
+	/* ering size */
+	ATA_ERING_SIZE		= 32,
+
 	/* reset / recovery action types */
 	ATA_EH_REVALIDATE	= (1 << 0),
 	ATA_EH_SOFTRESET	= (1 << 1),
@@ -375,6 +378,17 @@ struct ata_host_stats {
 	unsigned long		rw_reqbuf;
 };
 
+struct ata_ering_entry {
+	int			is_io;
+	unsigned int		err_mask;
+	u64			timestamp;
+};
+
+struct ata_ering {
+	int			cursor;
+	struct ata_ering_entry	ring[ATA_ERING_SIZE];
+};
+
 struct ata_device {
 	struct ata_port		*ap;
 	u64			n_sectors;	/* size of device, if ATA */
@@ -401,6 +415,9 @@ struct ata_device {
 	u16			cylinders;	/* Number of cylinders */
 	u16			heads;		/* Number of heads */
 	u16			sectors;	/* Number of sectors per track */
+
+	/* error history */
+	struct ata_ering	ering;
 };
 
 struct ata_port {
-- 
cgit v1.2.3


From f3e81b19aac23c0e8c55d5961324ef7de44c23bb Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Mon, 15 May 2006 20:58:21 +0900
Subject: [PATCH] libata-eh: implement ata_eh_info and ata_eh_context

struct ata_eh_info serves as the communication channel between
execution path and EH.  Execution path describes detected error
condition in ap->eh_info and EH recovers the port using it.  To avoid
missing error conditions detected during EH, EH makes its own copy of
eh_info and clears it on entry allowing error info to accumulate
during EH.

Most EH states including EH's copy of eh_info are stored in
ap->eh_context (struct ata_eh_context) which is owned by EH and thus
doesn't require any synchronization to access and alter.  This
standardized context makes it easy to integrate various parts of EH
and extend EH to handle multiple links (for PM).

Signed-off-by: Tejun Heo <htejun@gmail.com>
---
 drivers/scsi/libata-eh.c | 11 ++++++++++-
 include/linux/libata.h   | 40 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 50 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/scsi/libata-eh.c b/drivers/scsi/libata-eh.c
index 71ad18b7cff6..1968f2d140f3 100644
--- a/drivers/scsi/libata-eh.c
+++ b/drivers/scsi/libata-eh.c
@@ -229,9 +229,15 @@ void ata_scsi_error(struct Scsi_Host *host)
  repeat:
 	/* invoke error handler */
 	if (ap->ops->error_handler) {
-		/* clear EH pending */
+		/* fetch & clear EH info */
 		spin_lock_irqsave(hs_lock, flags);
+
+		memset(&ap->eh_context, 0, sizeof(ap->eh_context));
+		ap->eh_context.i = ap->eh_info;
+		memset(&ap->eh_info, 0, sizeof(ap->eh_info));
+
 		ap->flags &= ~ATA_FLAG_EH_PENDING;
+
 		spin_unlock_irqrestore(hs_lock, flags);
 
 		/* invoke EH */
@@ -255,6 +261,9 @@ void ata_scsi_error(struct Scsi_Host *host)
 					"tries, giving up\n", ATA_EH_MAX_REPEAT);
 		}
 
+		/* this run is complete, make sure EH info is clear */
+		memset(&ap->eh_info, 0, sizeof(ap->eh_info));
+
 		/* Clear host_eh_scheduled while holding hs_lock such
 		 * that if exception occurs after this point but
 		 * before EH completion, SCSI midlayer will
diff --git a/include/linux/libata.h b/include/linux/libata.h
index f5cea13599c3..298f9918e375 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -229,6 +229,9 @@ enum {
 	/* ering size */
 	ATA_ERING_SIZE		= 32,
 
+	/* desc_len for ata_eh_info and context */
+	ATA_EH_DESC_LEN		= 80,
+
 	/* reset / recovery action types */
 	ATA_EH_REVALIDATE	= (1 << 0),
 	ATA_EH_SOFTRESET	= (1 << 1),
@@ -236,6 +239,9 @@ enum {
 
 	ATA_EH_RESET_MASK	= ATA_EH_SOFTRESET | ATA_EH_HARDRESET,
 
+	/* ata_eh_info->flags */
+	ATA_EHI_DID_RESET	= (1 << 0), /* already reset this port */
+
 	/* max repeat if error condition is still set after ->error_handler */
 	ATA_EH_MAX_REPEAT	= 5,
 
@@ -420,6 +426,21 @@ struct ata_device {
 	struct ata_ering	ering;
 };
 
+struct ata_eh_info {
+	struct ata_device	*dev;		/* offending device */
+	u32			serror;		/* SError from LLDD */
+	unsigned int		err_mask;	/* port-wide err_mask */
+	unsigned int		action;		/* ATA_EH_* action mask */
+	unsigned int		flags;		/* ATA_EHI_* flags */
+	char			desc[ATA_EH_DESC_LEN];
+	int			desc_len;
+};
+
+struct ata_eh_context {
+	struct ata_eh_info	i;
+	int			tries[ATA_MAX_DEVICES];
+};
+
 struct ata_port {
 	struct Scsi_Host	*host;	/* our co-allocated scsi host */
 	const struct ata_port_operations *ops;
@@ -444,6 +465,11 @@ struct ata_port {
 	unsigned int		cbl;	/* cable type; ATA_CBL_xxx */
 	unsigned int		sata_spd_limit;	/* SATA PHY speed limit */
 
+	/* record runtime error info, protected by host_set lock */
+	struct ata_eh_info	eh_info;
+	/* EH context owned by EH */
+	struct ata_eh_context	eh_context;
+
 	struct ata_device	device[ATA_MAX_DEVICES];
 
 	struct ata_queued_cmd	qcmd[ATA_MAX_QUEUE];
@@ -710,6 +736,20 @@ extern void ata_eh_qc_retry(struct ata_queued_cmd *qc);
 #define ata_dev_printk(dev, lv, fmt, args...) \
 	printk(lv"ata%u.%02u: "fmt, (dev)->ap->id, (dev)->devno , ##args)
 
+/*
+ * ata_eh_info helpers
+ */
+#define ata_ehi_push_desc(ehi, fmt, args...) do { \
+	(ehi)->desc_len += scnprintf((ehi)->desc + (ehi)->desc_len, \
+				     ATA_EH_DESC_LEN - (ehi)->desc_len, \
+				     fmt , ##args); \
+} while (0)
+
+#define ata_ehi_clear_desc(ehi) do { \
+	(ehi)->desc[0] = '\0'; \
+	(ehi)->desc_len = 0; \
+} while (0)
+
 /*
  * qc helpers
  */
-- 
cgit v1.2.3


From 022bdb075b9e1f224088a0b268de56268d7bc5b6 Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Mon, 15 May 2006 20:58:22 +0900
Subject: [PATCH] libata-eh: implement new EH

Implement new EH.  The exported interface is ata_do_eh() which is to
be called from ->error_handler and performs the following steps to
recover the failed port.

ata_eh_autopsy() : analyze SError/TF, determine the cause of failure
		   and required recovery actions and record it in
		   ap->eh_context
ata_eh_report()	 : report the failure to user
ata_eh_recover() : perform recovery actions described in ap->eh_context
ata_eh_finish()	 : finish failed qcs

LLDDs can customize error handling by modifying eh_context before
calling ata_do_eh() or, if necessary, doing so inbetween each major
steps by calling each step explicitly.

Signed-off-by: Tejun Heo <htejun@gmail.com>
---
 drivers/scsi/libata-core.c |   1 +
 drivers/scsi/libata-eh.c   | 775 +++++++++++++++++++++++++++++++++++++++++++++
 include/linux/libata.h     |   5 +
 3 files changed, 781 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c
index 4def48ed6f46..ddc47097d37e 100644
--- a/drivers/scsi/libata-core.c
+++ b/drivers/scsi/libata-core.c
@@ -5440,3 +5440,4 @@ EXPORT_SYMBOL_GPL(ata_eh_freeze_port);
 EXPORT_SYMBOL_GPL(ata_eh_thaw_port);
 EXPORT_SYMBOL_GPL(ata_eh_qc_complete);
 EXPORT_SYMBOL_GPL(ata_eh_qc_retry);
+EXPORT_SYMBOL_GPL(ata_do_eh);
diff --git a/drivers/scsi/libata-eh.c b/drivers/scsi/libata-eh.c
index 1968f2d140f3..cd133f83e595 100644
--- a/drivers/scsi/libata-eh.c
+++ b/drivers/scsi/libata-eh.c
@@ -626,3 +626,778 @@ void ata_eh_qc_retry(struct ata_queued_cmd *qc)
 		scmd->retries--;
 	__ata_eh_qc_complete(qc);
 }
+
+/**
+ *	ata_eh_about_to_do - about to perform eh_action
+ *	@ap: target ATA port
+ *	@action: action about to be performed
+ *
+ *	Called just before performing EH actions to clear related bits
+ *	in @ap->eh_info such that eh actions are not unnecessarily
+ *	repeated.
+ *
+ *	LOCKING:
+ *	None.
+ */
+static void ata_eh_about_to_do(struct ata_port *ap, unsigned int action)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&ap->host_set->lock, flags);
+	ap->eh_info.action &= ~action;
+	ap->flags |= ATA_FLAG_RECOVERED;
+	spin_unlock_irqrestore(&ap->host_set->lock, flags);
+}
+
+/**
+ *	ata_err_string - convert err_mask to descriptive string
+ *	@err_mask: error mask to convert to string
+ *
+ *	Convert @err_mask to descriptive string.  Errors are
+ *	prioritized according to severity and only the most severe
+ *	error is reported.
+ *
+ *	LOCKING:
+ *	None.
+ *
+ *	RETURNS:
+ *	Descriptive string for @err_mask
+ */
+static const char * ata_err_string(unsigned int err_mask)
+{
+	if (err_mask & AC_ERR_HOST_BUS)
+		return "host bus error";
+	if (err_mask & AC_ERR_ATA_BUS)
+		return "ATA bus error";
+	if (err_mask & AC_ERR_TIMEOUT)
+		return "timeout";
+	if (err_mask & AC_ERR_HSM)
+		return "HSM violation";
+	if (err_mask & AC_ERR_SYSTEM)
+		return "internal error";
+	if (err_mask & AC_ERR_MEDIA)
+		return "media error";
+	if (err_mask & AC_ERR_INVALID)
+		return "invalid argument";
+	if (err_mask & AC_ERR_DEV)
+		return "device error";
+	return "unknown error";
+}
+
+/**
+ *	atapi_eh_request_sense - perform ATAPI REQUEST_SENSE
+ *	@dev: device to perform REQUEST_SENSE to
+ *	@sense_buf: result sense data buffer (SCSI_SENSE_BUFFERSIZE bytes long)
+ *
+ *	Perform ATAPI REQUEST_SENSE after the device reported CHECK
+ *	SENSE.  This function is EH helper.
+ *
+ *	LOCKING:
+ *	Kernel thread context (may sleep).
+ *
+ *	RETURNS:
+ *	0 on success, AC_ERR_* mask on failure
+ */
+static unsigned int atapi_eh_request_sense(struct ata_device *dev,
+					   unsigned char *sense_buf)
+{
+	struct ata_port *ap = dev->ap;
+	struct ata_taskfile tf;
+	u8 cdb[ATAPI_CDB_LEN];
+
+	DPRINTK("ATAPI request sense\n");
+
+	ata_tf_init(dev, &tf);
+
+	/* FIXME: is this needed? */
+	memset(sense_buf, 0, SCSI_SENSE_BUFFERSIZE);
+
+	/* XXX: why tf_read here? */
+	ap->ops->tf_read(ap, &tf);
+
+	/* fill these in, for the case where they are -not- overwritten */
+	sense_buf[0] = 0x70;
+	sense_buf[2] = tf.feature >> 4;
+
+	memset(cdb, 0, ATAPI_CDB_LEN);
+	cdb[0] = REQUEST_SENSE;
+	cdb[4] = SCSI_SENSE_BUFFERSIZE;
+
+	tf.flags |= ATA_TFLAG_ISADDR | ATA_TFLAG_DEVICE;
+	tf.command = ATA_CMD_PACKET;
+
+	/* is it pointless to prefer PIO for "safety reasons"? */
+	if (ap->flags & ATA_FLAG_PIO_DMA) {
+		tf.protocol = ATA_PROT_ATAPI_DMA;
+		tf.feature |= ATAPI_PKT_DMA;
+	} else {
+		tf.protocol = ATA_PROT_ATAPI;
+		tf.lbam = (8 * 1024) & 0xff;
+		tf.lbah = (8 * 1024) >> 8;
+	}
+
+	return ata_exec_internal(dev, &tf, cdb, DMA_FROM_DEVICE,
+				 sense_buf, SCSI_SENSE_BUFFERSIZE);
+}
+
+/**
+ *	ata_eh_analyze_serror - analyze SError for a failed port
+ *	@ap: ATA port to analyze SError for
+ *
+ *	Analyze SError if available and further determine cause of
+ *	failure.
+ *
+ *	LOCKING:
+ *	None.
+ */
+static void ata_eh_analyze_serror(struct ata_port *ap)
+{
+	struct ata_eh_context *ehc = &ap->eh_context;
+	u32 serror = ehc->i.serror;
+	unsigned int err_mask = 0, action = 0;
+
+	if (serror & SERR_PERSISTENT) {
+		err_mask |= AC_ERR_ATA_BUS;
+		action |= ATA_EH_HARDRESET;
+	}
+	if (serror &
+	    (SERR_DATA_RECOVERED | SERR_COMM_RECOVERED | SERR_DATA)) {
+		err_mask |= AC_ERR_ATA_BUS;
+		action |= ATA_EH_SOFTRESET;
+	}
+	if (serror & SERR_PROTOCOL) {
+		err_mask |= AC_ERR_HSM;
+		action |= ATA_EH_SOFTRESET;
+	}
+	if (serror & SERR_INTERNAL) {
+		err_mask |= AC_ERR_SYSTEM;
+		action |= ATA_EH_SOFTRESET;
+	}
+	if (serror & (SERR_PHYRDY_CHG | SERR_DEV_XCHG)) {
+		err_mask |= AC_ERR_ATA_BUS;
+		action |= ATA_EH_HARDRESET;
+	}
+
+	ehc->i.err_mask |= err_mask;
+	ehc->i.action |= action;
+}
+
+/**
+ *	ata_eh_analyze_tf - analyze taskfile of a failed qc
+ *	@qc: qc to analyze
+ *	@tf: Taskfile registers to analyze
+ *
+ *	Analyze taskfile of @qc and further determine cause of
+ *	failure.  This function also requests ATAPI sense data if
+ *	avaliable.
+ *
+ *	LOCKING:
+ *	Kernel thread context (may sleep).
+ *
+ *	RETURNS:
+ *	Determined recovery action
+ */
+static unsigned int ata_eh_analyze_tf(struct ata_queued_cmd *qc,
+				      const struct ata_taskfile *tf)
+{
+	unsigned int tmp, action = 0;
+	u8 stat = tf->command, err = tf->feature;
+
+	if ((stat & (ATA_BUSY | ATA_DRQ | ATA_DRDY)) != ATA_DRDY) {
+		qc->err_mask |= AC_ERR_HSM;
+		return ATA_EH_SOFTRESET;
+	}
+
+	if (!(qc->err_mask & AC_ERR_DEV))
+		return 0;
+
+	switch (qc->dev->class) {
+	case ATA_DEV_ATA:
+		if (err & ATA_ICRC)
+			qc->err_mask |= AC_ERR_ATA_BUS;
+		if (err & ATA_UNC)
+			qc->err_mask |= AC_ERR_MEDIA;
+		if (err & ATA_IDNF)
+			qc->err_mask |= AC_ERR_INVALID;
+		break;
+
+	case ATA_DEV_ATAPI:
+		tmp = atapi_eh_request_sense(qc->dev,
+					     qc->scsicmd->sense_buffer);
+		if (!tmp) {
+			/* ATA_QCFLAG_SENSE_VALID is used to tell
+			 * atapi_qc_complete() that sense data is
+			 * already valid.
+			 *
+			 * TODO: interpret sense data and set
+			 * appropriate err_mask.
+			 */
+			qc->flags |= ATA_QCFLAG_SENSE_VALID;
+		} else
+			qc->err_mask |= tmp;
+	}
+
+	if (qc->err_mask & (AC_ERR_HSM | AC_ERR_TIMEOUT | AC_ERR_ATA_BUS))
+		action |= ATA_EH_SOFTRESET;
+
+	return action;
+}
+
+static int ata_eh_categorize_ering_entry(struct ata_ering_entry *ent)
+{
+	if (ent->err_mask & (AC_ERR_ATA_BUS | AC_ERR_TIMEOUT))
+		return 1;
+
+	if (ent->is_io) {
+		if (ent->err_mask & AC_ERR_HSM)
+			return 1;
+		if ((ent->err_mask &
+		     (AC_ERR_DEV|AC_ERR_MEDIA|AC_ERR_INVALID)) == AC_ERR_DEV)
+			return 2;
+	}
+
+	return 0;
+}
+
+struct speed_down_needed_arg {
+	u64 since;
+	int nr_errors[3];
+};
+
+static int speed_down_needed_cb(struct ata_ering_entry *ent, void *void_arg)
+{
+	struct speed_down_needed_arg *arg = void_arg;
+
+	if (ent->timestamp < arg->since)
+		return -1;
+
+	arg->nr_errors[ata_eh_categorize_ering_entry(ent)]++;
+	return 0;
+}
+
+/**
+ *	ata_eh_speed_down_needed - Determine wheter speed down is necessary
+ *	@dev: Device of interest
+ *
+ *	This function examines error ring of @dev and determines
+ *	whether speed down is necessary.  Speed down is necessary if
+ *	there have been more than 3 of Cat-1 errors or 10 of Cat-2
+ *	errors during last 15 minutes.
+ *
+ *	Cat-1 errors are ATA_BUS, TIMEOUT for any command and HSM
+ *	violation for known supported commands.
+ *
+ *	Cat-2 errors are unclassified DEV error for known supported
+ *	command.
+ *
+ *	LOCKING:
+ *	Inherited from caller.
+ *
+ *	RETURNS:
+ *	1 if speed down is necessary, 0 otherwise
+ */
+static int ata_eh_speed_down_needed(struct ata_device *dev)
+{
+	const u64 interval = 15LLU * 60 * HZ;
+	static const int err_limits[3] = { -1, 3, 10 };
+	struct speed_down_needed_arg arg;
+	struct ata_ering_entry *ent;
+	int err_cat;
+	u64 j64;
+
+	ent = ata_ering_top(&dev->ering);
+	if (!ent)
+		return 0;
+
+	err_cat = ata_eh_categorize_ering_entry(ent);
+	if (err_cat == 0)
+		return 0;
+
+	memset(&arg, 0, sizeof(arg));
+
+	j64 = get_jiffies_64();
+	if (j64 >= interval)
+		arg.since = j64 - interval;
+	else
+		arg.since = 0;
+
+	ata_ering_map(&dev->ering, speed_down_needed_cb, &arg);
+
+	return arg.nr_errors[err_cat] > err_limits[err_cat];
+}
+
+/**
+ *	ata_eh_speed_down - record error and speed down if necessary
+ *	@dev: Failed device
+ *	@is_io: Did the device fail during normal IO?
+ *	@err_mask: err_mask of the error
+ *
+ *	Record error and examine error history to determine whether
+ *	adjusting transmission speed is necessary.  It also sets
+ *	transmission limits appropriately if such adjustment is
+ *	necessary.
+ *
+ *	LOCKING:
+ *	Kernel thread context (may sleep).
+ *
+ *	RETURNS:
+ *	0 on success, -errno otherwise
+ */
+static int ata_eh_speed_down(struct ata_device *dev, int is_io,
+			     unsigned int err_mask)
+{
+	if (!err_mask)
+		return 0;
+
+	/* record error and determine whether speed down is necessary */
+	ata_ering_record(&dev->ering, is_io, err_mask);
+
+	if (!ata_eh_speed_down_needed(dev))
+		return 0;
+
+	/* speed down SATA link speed if possible */
+	if (sata_down_spd_limit(dev->ap) == 0)
+		return ATA_EH_HARDRESET;
+
+	/* lower transfer mode */
+	if (ata_down_xfermask_limit(dev, 0) == 0)
+		return ATA_EH_SOFTRESET;
+
+	ata_dev_printk(dev, KERN_ERR,
+		       "speed down requested but no transfer mode left\n");
+	return 0;
+}
+
+/**
+ *	ata_eh_autopsy - analyze error and determine recovery action
+ *	@ap: ATA port to perform autopsy on
+ *
+ *	Analyze why @ap failed and determine which recovery action is
+ *	needed.  This function also sets more detailed AC_ERR_* values
+ *	and fills sense data for ATAPI CHECK SENSE.
+ *
+ *	LOCKING:
+ *	Kernel thread context (may sleep).
+ */
+static void ata_eh_autopsy(struct ata_port *ap)
+{
+	struct ata_eh_context *ehc = &ap->eh_context;
+	unsigned int action = ehc->i.action;
+	struct ata_device *failed_dev = NULL;
+	unsigned int all_err_mask = 0;
+	int tag, is_io = 0;
+	u32 serror;
+	int rc;
+
+	DPRINTK("ENTER\n");
+
+	/* obtain and analyze SError */
+	rc = sata_scr_read(ap, SCR_ERROR, &serror);
+	if (rc == 0) {
+		ehc->i.serror |= serror;
+		ata_eh_analyze_serror(ap);
+	} else if (rc != -EOPNOTSUPP)
+		action |= ATA_EH_HARDRESET;
+
+	/* any real error trumps AC_ERR_OTHER */
+	if (ehc->i.err_mask & ~AC_ERR_OTHER)
+		ehc->i.err_mask &= ~AC_ERR_OTHER;
+
+	all_err_mask |= ehc->i.err_mask;
+
+	for (tag = 0; tag < ATA_MAX_QUEUE; tag++) {
+		struct ata_queued_cmd *qc = __ata_qc_from_tag(ap, tag);
+
+		if (!(qc->flags & ATA_QCFLAG_FAILED))
+			continue;
+
+		/* inherit upper level err_mask */
+		qc->err_mask |= ehc->i.err_mask;
+
+		if (qc->err_mask & AC_ERR_TIMEOUT)
+			action |= ATA_EH_SOFTRESET;
+
+		/* analyze TF */
+		action |= ata_eh_analyze_tf(qc, &qc->result_tf);
+
+		/* DEV errors are probably spurious in case of ATA_BUS error */
+		if (qc->err_mask & AC_ERR_ATA_BUS)
+			qc->err_mask &= ~(AC_ERR_DEV | AC_ERR_MEDIA |
+					  AC_ERR_INVALID);
+
+		/* any real error trumps unknown error */
+		if (qc->err_mask & ~AC_ERR_OTHER)
+			qc->err_mask &= ~AC_ERR_OTHER;
+
+		/* SENSE_VALID trumps dev/unknown error and revalidation */
+		if (qc->flags & ATA_QCFLAG_SENSE_VALID) {
+			qc->err_mask &= ~(AC_ERR_DEV | AC_ERR_OTHER);
+			action &= ~ATA_EH_REVALIDATE;
+		}
+
+		/* accumulate error info */
+		failed_dev = qc->dev;
+		all_err_mask |= qc->err_mask;
+		if (qc->flags & ATA_QCFLAG_IO)
+			is_io = 1;
+	}
+
+	/* speed down iff command was in progress */
+	if (failed_dev)
+		action |= ata_eh_speed_down(failed_dev, is_io, all_err_mask);
+
+	if (all_err_mask)
+		action |= ATA_EH_REVALIDATE;
+
+	ehc->i.dev = failed_dev;
+	ehc->i.action = action;
+
+	DPRINTK("EXIT\n");
+}
+
+/**
+ *	ata_eh_report - report error handling to user
+ *	@ap: ATA port EH is going on
+ *
+ *	Report EH to user.
+ *
+ *	LOCKING:
+ *	None.
+ */
+static void ata_eh_report(struct ata_port *ap)
+{
+	struct ata_eh_context *ehc = &ap->eh_context;
+	const char *frozen, *desc;
+	int tag, nr_failed = 0;
+
+	desc = NULL;
+	if (ehc->i.desc[0] != '\0')
+		desc = ehc->i.desc;
+
+	for (tag = 0; tag < ATA_MAX_QUEUE; tag++) {
+		struct ata_queued_cmd *qc = __ata_qc_from_tag(ap, tag);
+
+		if (!(qc->flags & ATA_QCFLAG_FAILED))
+			continue;
+		if (qc->flags & ATA_QCFLAG_SENSE_VALID && !qc->err_mask)
+			continue;
+
+		nr_failed++;
+	}
+
+	if (!nr_failed && !ehc->i.err_mask)
+		return;
+
+	frozen = "";
+	if (ap->flags & ATA_FLAG_FROZEN)
+		frozen = " frozen";
+
+	if (ehc->i.dev) {
+		ata_dev_printk(ehc->i.dev, KERN_ERR,
+			       "exception Emask 0x%x SErr 0x%x action 0x%x%s\n",
+			       ehc->i.err_mask, ehc->i.serror, ehc->i.action,
+			       frozen);
+		if (desc)
+			ata_dev_printk(ehc->i.dev, KERN_ERR, "(%s)\n", desc);
+	} else {
+		ata_port_printk(ap, KERN_ERR,
+				"exception Emask 0x%x SErr 0x%x action 0x%x%s\n",
+				ehc->i.err_mask, ehc->i.serror, ehc->i.action,
+				frozen);
+		if (desc)
+			ata_port_printk(ap, KERN_ERR, "(%s)\n", desc);
+	}
+
+	for (tag = 0; tag < ATA_MAX_QUEUE; tag++) {
+		struct ata_queued_cmd *qc = __ata_qc_from_tag(ap, tag);
+
+		if (!(qc->flags & ATA_QCFLAG_FAILED) || !qc->err_mask)
+			continue;
+
+		ata_dev_printk(qc->dev, KERN_ERR, "tag %d cmd 0x%x "
+			       "Emask 0x%x stat 0x%x err 0x%x (%s)\n",
+			       qc->tag, qc->tf.command, qc->err_mask,
+			       qc->result_tf.command, qc->result_tf.feature,
+			       ata_err_string(qc->err_mask));
+	}
+}
+
+static int ata_eh_reset(struct ata_port *ap, ata_reset_fn_t softreset,
+			ata_reset_fn_t hardreset, ata_postreset_fn_t postreset)
+{
+	struct ata_eh_context *ehc = &ap->eh_context;
+	unsigned int classes[ATA_MAX_DEVICES];
+	int tries = ATA_EH_RESET_TRIES;
+	ata_reset_fn_t reset;
+	int rc;
+
+	if (softreset && (!hardreset || (!sata_set_spd_needed(ap) &&
+					 !(ehc->i.action & ATA_EH_HARDRESET))))
+		reset = softreset;
+	else
+		reset = hardreset;
+
+ retry:
+	ata_port_printk(ap, KERN_INFO, "%s resetting port\n",
+			reset == softreset ? "soft" : "hard");
+
+	/* reset */
+	ata_eh_about_to_do(ap, ATA_EH_RESET_MASK);
+	ehc->i.flags |= ATA_EHI_DID_RESET;
+
+	rc = ata_do_reset(ap, reset, classes);
+
+	if (rc && --tries) {
+		ata_port_printk(ap, KERN_WARNING,
+				"%sreset failed, retrying in 5 secs\n",
+				reset == softreset ? "soft" : "hard");
+		ssleep(5);
+
+		if (reset == hardreset)
+			sata_down_spd_limit(ap);
+		if (hardreset)
+			reset = hardreset;
+		goto retry;
+	}
+
+	if (rc == 0) {
+		if (postreset)
+			postreset(ap, classes);
+
+		/* reset successful, schedule revalidation */
+		ehc->i.dev = NULL;
+		ehc->i.action &= ~ATA_EH_RESET_MASK;
+		ehc->i.action |= ATA_EH_REVALIDATE;
+	}
+
+	return rc;
+}
+
+static int ata_eh_revalidate(struct ata_port *ap,
+			     struct ata_device **r_failed_dev)
+{
+	struct ata_eh_context *ehc = &ap->eh_context;
+	struct ata_device *dev;
+	int i, rc = 0;
+
+	DPRINTK("ENTER\n");
+
+	for (i = 0; i < ATA_MAX_DEVICES; i++) {
+		dev = &ap->device[i];
+
+		if (ehc->i.action & ATA_EH_REVALIDATE && ata_dev_enabled(dev) &&
+		    (!ehc->i.dev || ehc->i.dev == dev)) {
+			if (ata_port_offline(ap)) {
+				rc = -EIO;
+				break;
+			}
+
+			ata_eh_about_to_do(ap, ATA_EH_REVALIDATE);
+			rc = ata_dev_revalidate(dev,
+					ehc->i.flags & ATA_EHI_DID_RESET);
+			if (rc)
+				break;
+
+			ehc->i.action &= ~ATA_EH_REVALIDATE;
+		}
+	}
+
+	if (rc)
+		*r_failed_dev = dev;
+
+	DPRINTK("EXIT\n");
+	return rc;
+}
+
+static int ata_port_nr_enabled(struct ata_port *ap)
+{
+	int i, cnt = 0;
+
+	for (i = 0; i < ATA_MAX_DEVICES; i++)
+		if (ata_dev_enabled(&ap->device[i]))
+			cnt++;
+	return cnt;
+}
+
+/**
+ *	ata_eh_recover - recover host port after error
+ *	@ap: host port to recover
+ *	@softreset: softreset method (can be NULL)
+ *	@hardreset: hardreset method (can be NULL)
+ *	@postreset: postreset method (can be NULL)
+ *
+ *	This is the alpha and omega, eum and yang, heart and soul of
+ *	libata exception handling.  On entry, actions required to
+ *	recover each devices are recorded in eh_context.  This
+ *	function executes all the operations with appropriate retrials
+ *	and fallbacks to resurrect failed devices.
+ *
+ *	LOCKING:
+ *	Kernel thread context (may sleep).
+ *
+ *	RETURNS:
+ *	0 on success, -errno on failure.
+ */
+static int ata_eh_recover(struct ata_port *ap, ata_reset_fn_t softreset,
+			  ata_reset_fn_t hardreset,
+			  ata_postreset_fn_t postreset)
+{
+	struct ata_eh_context *ehc = &ap->eh_context;
+	struct ata_device *dev;
+	int down_xfermask, i, rc;
+
+	DPRINTK("ENTER\n");
+
+	/* prep for recovery */
+	for (i = 0; i < ATA_MAX_DEVICES; i++) {
+		dev = &ap->device[i];
+
+		ehc->tries[dev->devno] = ATA_EH_DEV_TRIES;
+	}
+
+ retry:
+	down_xfermask = 0;
+	rc = 0;
+
+	/* skip EH if possible. */
+	if (!ata_port_nr_enabled(ap) && !(ap->flags & ATA_FLAG_FROZEN))
+		ehc->i.action = 0;
+
+	/* reset */
+	if (ehc->i.action & ATA_EH_RESET_MASK) {
+		ata_eh_freeze_port(ap);
+
+		rc = ata_eh_reset(ap, softreset, hardreset, postreset);
+		if (rc) {
+			ata_port_printk(ap, KERN_ERR,
+					"reset failed, giving up\n");
+			goto out;
+		}
+
+		ata_eh_thaw_port(ap);
+	}
+
+	/* revalidate existing devices */
+	rc = ata_eh_revalidate(ap, &dev);
+	if (rc)
+		goto dev_fail;
+
+	/* configure transfer mode if the port has been reset */
+	if (ehc->i.flags & ATA_EHI_DID_RESET) {
+		rc = ata_set_mode(ap, &dev);
+		if (rc) {
+			down_xfermask = 1;
+			goto dev_fail;
+		}
+	}
+
+	goto out;
+
+ dev_fail:
+	switch (rc) {
+	case -ENODEV:
+	case -EINVAL:
+		ehc->tries[dev->devno] = 0;
+		break;
+	case -EIO:
+		sata_down_spd_limit(ap);
+	default:
+		ehc->tries[dev->devno]--;
+		if (down_xfermask &&
+		    ata_down_xfermask_limit(dev, ehc->tries[dev->devno] == 1))
+			ehc->tries[dev->devno] = 0;
+	}
+
+	/* disable device if it has used up all its chances */
+	if (ata_dev_enabled(dev) && !ehc->tries[dev->devno])
+		ata_dev_disable(dev);
+
+	/* soft didn't work?  be haaaaard */
+	if (ehc->i.flags & ATA_EHI_DID_RESET)
+		ehc->i.action |= ATA_EH_HARDRESET;
+	else
+		ehc->i.action |= ATA_EH_SOFTRESET;
+
+	if (ata_port_nr_enabled(ap)) {
+		ata_port_printk(ap, KERN_WARNING, "failed to recover some "
+				"devices, retrying in 5 secs\n");
+		ssleep(5);
+	} else {
+		/* no device left, repeat fast */
+		msleep(500);
+	}
+
+	goto retry;
+
+ out:
+	if (rc) {
+		for (i = 0; i < ATA_MAX_DEVICES; i++)
+			ata_dev_disable(&ap->device[i]);
+	}
+
+	DPRINTK("EXIT, rc=%d\n", rc);
+	return rc;
+}
+
+/**
+ *	ata_eh_finish - finish up EH
+ *	@ap: host port to finish EH for
+ *
+ *	Recovery is complete.  Clean up EH states and retry or finish
+ *	failed qcs.
+ *
+ *	LOCKING:
+ *	None.
+ */
+static void ata_eh_finish(struct ata_port *ap)
+{
+	int tag;
+
+	/* retry or finish qcs */
+	for (tag = 0; tag < ATA_MAX_QUEUE; tag++) {
+		struct ata_queued_cmd *qc = __ata_qc_from_tag(ap, tag);
+
+		if (!(qc->flags & ATA_QCFLAG_FAILED))
+			continue;
+
+		if (qc->err_mask) {
+			/* FIXME: Once EH migration is complete,
+			 * generate sense data in this function,
+			 * considering both err_mask and tf.
+			 */
+			if (qc->err_mask & AC_ERR_INVALID)
+				ata_eh_qc_complete(qc);
+			else
+				ata_eh_qc_retry(qc);
+		} else {
+			if (qc->flags & ATA_QCFLAG_SENSE_VALID) {
+				ata_eh_qc_complete(qc);
+			} else {
+				/* feed zero TF to sense generation */
+				memset(&qc->result_tf, 0, sizeof(qc->result_tf));
+				ata_eh_qc_retry(qc);
+			}
+		}
+	}
+}
+
+/**
+ *	ata_do_eh - do standard error handling
+ *	@ap: host port to handle error for
+ *	@softreset: softreset method (can be NULL)
+ *	@hardreset: hardreset method (can be NULL)
+ *	@postreset: postreset method (can be NULL)
+ *
+ *	Perform standard error handling sequence.
+ *
+ *	LOCKING:
+ *	Kernel thread context (may sleep).
+ */
+void ata_do_eh(struct ata_port *ap, ata_reset_fn_t softreset,
+	       ata_reset_fn_t hardreset, ata_postreset_fn_t postreset)
+{
+	ata_eh_autopsy(ap);
+	ata_eh_report(ap);
+	ata_eh_recover(ap, softreset, hardreset, postreset);
+	ata_eh_finish(ap);
+}
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 298f9918e375..9fe46073cf8c 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -247,6 +247,8 @@ enum {
 
 	/* how hard are we gonna try to probe/recover devices */
 	ATA_PROBE_MAX_TRIES	= 3,
+	ATA_EH_RESET_TRIES	= 3,
+	ATA_EH_DEV_TRIES	= 3,
 };
 
 enum hsm_task_states {
@@ -727,6 +729,9 @@ extern void ata_eh_thaw_port(struct ata_port *ap);
 extern void ata_eh_qc_complete(struct ata_queued_cmd *qc);
 extern void ata_eh_qc_retry(struct ata_queued_cmd *qc);
 
+extern void ata_do_eh(struct ata_port *ap, ata_reset_fn_t softreset,
+		      ata_reset_fn_t hardreset, ata_postreset_fn_t postreset);
+
 /*
  * printk helpers
  */
-- 
cgit v1.2.3


From 6d97dbd72da31a0e334f251fa9df4be9fab6fde2 Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Mon, 15 May 2006 20:58:24 +0900
Subject: [PATCH] libata-eh: implement BMDMA EH

Implement stock BMDMA error handling methods.

Signed-off-by: Tejun Heo <htejun@gmail.com>
---
 drivers/scsi/libata-bmdma.c | 144 ++++++++++++++++++++++++++++++++++++++++++++
 drivers/scsi/libata-core.c  |   5 ++
 include/linux/libata.h      |   8 +++
 3 files changed, 157 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/scsi/libata-bmdma.c b/drivers/scsi/libata-bmdma.c
index 835dff0bafdc..49eff18a67e3 100644
--- a/drivers/scsi/libata-bmdma.c
+++ b/drivers/scsi/libata-bmdma.c
@@ -652,6 +652,150 @@ void ata_bmdma_stop(struct ata_queued_cmd *qc)
 	ata_altstatus(ap);        /* dummy read */
 }
 
+/**
+ *	ata_bmdma_freeze - Freeze BMDMA controller port
+ *	@ap: port to freeze
+ *
+ *	Freeze BMDMA controller port.
+ *
+ *	LOCKING:
+ *	Inherited from caller.
+ */
+void ata_bmdma_freeze(struct ata_port *ap)
+{
+	struct ata_ioports *ioaddr = &ap->ioaddr;
+
+	ap->ctl |= ATA_NIEN;
+	ap->last_ctl = ap->ctl;
+
+	if (ap->flags & ATA_FLAG_MMIO)
+		writeb(ap->ctl, (void __iomem *)ioaddr->ctl_addr);
+	else
+		outb(ap->ctl, ioaddr->ctl_addr);
+}
+
+/**
+ *	ata_bmdma_thaw - Thaw BMDMA controller port
+ *	@ap: port to thaw
+ *
+ *	Thaw BMDMA controller port.
+ *
+ *	LOCKING:
+ *	Inherited from caller.
+ */
+void ata_bmdma_thaw(struct ata_port *ap)
+{
+	/* clear & re-enable interrupts */
+	ata_chk_status(ap);
+	ap->ops->irq_clear(ap);
+	if (ap->ioaddr.ctl_addr)	/* FIXME: hack. create a hook instead */
+		ata_irq_on(ap);
+}
+
+/**
+ *	ata_bmdma_drive_eh - Perform EH with given methods for BMDMA controller
+ *	@ap: port to handle error for
+ *	@softreset: softreset method (can be NULL)
+ *	@hardreset: hardreset method (can be NULL)
+ *	@postreset: postreset method (can be NULL)
+ *
+ *	Handle error for ATA BMDMA controller.  It can handle both
+ *	PATA and SATA controllers.  Many controllers should be able to
+ *	use this EH as-is or with some added handling before and
+ *	after.
+ *
+ *	This function is intended to be used for constructing
+ *	->error_handler callback by low level drivers.
+ *
+ *	LOCKING:
+ *	Kernel thread context (may sleep)
+ */
+void ata_bmdma_drive_eh(struct ata_port *ap, ata_reset_fn_t softreset,
+			ata_reset_fn_t hardreset, ata_postreset_fn_t postreset)
+{
+	struct ata_host_set *host_set = ap->host_set;
+	struct ata_eh_context *ehc = &ap->eh_context;
+	struct ata_queued_cmd *qc;
+	unsigned long flags;
+	int thaw = 0;
+
+	qc = __ata_qc_from_tag(ap, ap->active_tag);
+	if (qc && !(qc->flags & ATA_QCFLAG_FAILED))
+		qc = NULL;
+
+	/* reset PIO HSM and stop DMA engine */
+	spin_lock_irqsave(&host_set->lock, flags);
+
+	ap->flags &= ~ATA_FLAG_NOINTR;
+	ap->hsm_task_state = HSM_ST_IDLE;
+
+	if (qc && (qc->tf.protocol == ATA_PROT_DMA ||
+		   qc->tf.protocol == ATA_PROT_ATAPI_DMA)) {
+		u8 host_stat;
+
+		host_stat = ata_bmdma_status(ap);
+
+		ata_ehi_push_desc(&ehc->i, "BMDMA stat 0x%x", host_stat);
+
+		/* BMDMA controllers indicate host bus error by
+		 * setting DMA_ERR bit and timing out.  As it wasn't
+		 * really a timeout event, adjust error mask and
+		 * cancel frozen state.
+		 */
+		if (qc->err_mask == AC_ERR_TIMEOUT && host_stat & ATA_DMA_ERR) {
+			qc->err_mask = AC_ERR_HOST_BUS;
+			thaw = 1;
+		}
+
+		ap->ops->bmdma_stop(qc);
+	}
+
+	ata_altstatus(ap);
+	ata_chk_status(ap);
+	ap->ops->irq_clear(ap);
+
+	spin_unlock_irqrestore(&host_set->lock, flags);
+
+	if (thaw)
+		ata_eh_thaw_port(ap);
+
+	/* PIO and DMA engines have been stopped, perform recovery */
+	ata_do_eh(ap, softreset, hardreset, postreset);
+}
+
+/**
+ *	ata_bmdma_error_handler - Stock error handler for BMDMA controller
+ *	@ap: port to handle error for
+ *
+ *	Stock error handler for BMDMA controller.
+ *
+ *	LOCKING:
+ *	Kernel thread context (may sleep)
+ */
+void ata_bmdma_error_handler(struct ata_port *ap)
+{
+	ata_reset_fn_t hardreset;
+
+	hardreset = NULL;
+	if (sata_scr_valid(ap))
+		hardreset = sata_std_hardreset;
+
+	ata_bmdma_drive_eh(ap, ata_std_softreset, hardreset, ata_std_postreset);
+}
+
+/**
+ *	ata_bmdma_post_internal_cmd - Stock post_internal_cmd for
+ *				      BMDMA controller
+ *	@qc: internal command to clean up
+ *
+ *	LOCKING:
+ *	Kernel thread context (may sleep)
+ */
+void ata_bmdma_post_internal_cmd(struct ata_queued_cmd *qc)
+{
+	ata_bmdma_stop(qc);
+}
+
 #ifdef CONFIG_PCI
 static struct ata_probe_ent *
 ata_probe_ent_alloc(struct device *dev, const struct ata_port_info *port)
diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c
index ddc47097d37e..2969599ec0b9 100644
--- a/drivers/scsi/libata-core.c
+++ b/drivers/scsi/libata-core.c
@@ -5377,6 +5377,11 @@ EXPORT_SYMBOL_GPL(ata_bmdma_start);
 EXPORT_SYMBOL_GPL(ata_bmdma_irq_clear);
 EXPORT_SYMBOL_GPL(ata_bmdma_status);
 EXPORT_SYMBOL_GPL(ata_bmdma_stop);
+EXPORT_SYMBOL_GPL(ata_bmdma_freeze);
+EXPORT_SYMBOL_GPL(ata_bmdma_thaw);
+EXPORT_SYMBOL_GPL(ata_bmdma_drive_eh);
+EXPORT_SYMBOL_GPL(ata_bmdma_error_handler);
+EXPORT_SYMBOL_GPL(ata_bmdma_post_internal_cmd);
 EXPORT_SYMBOL_GPL(ata_port_probe);
 EXPORT_SYMBOL_GPL(sata_set_spd);
 EXPORT_SYMBOL_GPL(sata_phy_reset);
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 9fe46073cf8c..6ccacbf889e3 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -661,6 +661,14 @@ extern void ata_bmdma_start (struct ata_queued_cmd *qc);
 extern void ata_bmdma_stop(struct ata_queued_cmd *qc);
 extern u8   ata_bmdma_status(struct ata_port *ap);
 extern void ata_bmdma_irq_clear(struct ata_port *ap);
+extern void ata_bmdma_freeze(struct ata_port *ap);
+extern void ata_bmdma_thaw(struct ata_port *ap);
+extern void ata_bmdma_drive_eh(struct ata_port *ap,
+			       ata_reset_fn_t softreset,
+			       ata_reset_fn_t hardreset,
+			       ata_postreset_fn_t postreset);
+extern void ata_bmdma_error_handler(struct ata_port *ap);
+extern void ata_bmdma_post_internal_cmd(struct ata_queued_cmd *qc);
 extern void ata_qc_complete(struct ata_queued_cmd *qc);
 extern void ata_scsi_simulate(struct ata_device *dev, struct scsi_cmnd *cmd,
 			      void (*done)(struct scsi_cmnd *));
-- 
cgit v1.2.3


From 88e490340ea4c3a2ebc0187a4339912e2fc1a081 Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Mon, 15 May 2006 21:03:38 +0900
Subject: [PATCH] libata-ncq: add NCQ related ATA/libata constants and macros

Add NCQ related ATA/libata constants and macros.

Signed-off-by: Tejun Heo <htejun@gmail.com>
---
 include/linux/ata.h    | 9 +++++++++
 include/linux/libata.h | 2 ++
 2 files changed, 11 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ata.h b/include/linux/ata.h
index 1cbeb434af9a..c494e1c0531e 100644
--- a/include/linux/ata.h
+++ b/include/linux/ata.h
@@ -133,6 +133,8 @@ enum {
 	ATA_CMD_WRITE		= 0xCA,
 	ATA_CMD_WRITE_EXT	= 0x35,
 	ATA_CMD_WRITE_FUA_EXT	= 0x3D,
+	ATA_CMD_FPDMA_READ	= 0x60,
+	ATA_CMD_FPDMA_WRITE	= 0x61,
 	ATA_CMD_PIO_READ	= 0x20,
 	ATA_CMD_PIO_READ_EXT	= 0x24,
 	ATA_CMD_PIO_WRITE	= 0x30,
@@ -151,6 +153,10 @@ enum {
 	ATA_CMD_INIT_DEV_PARAMS	= 0x91,
 	ATA_CMD_READ_NATIVE_MAX	= 0xF8,
 	ATA_CMD_READ_NATIVE_MAX_EXT = 0x27,
+	ATA_CMD_READ_LOG_EXT	= 0x2f,
+
+	/* READ_LOG_EXT pages */
+	ATA_LOG_SATA_NCQ	= 0x10,
 
 	/* SETFEATURES stuff */
 	SETFEATURES_XFER	= 0x03,
@@ -221,6 +227,7 @@ enum ata_tf_protocols {
 	ATA_PROT_NODATA,	/* no data */
 	ATA_PROT_PIO,		/* PIO single sector */
 	ATA_PROT_DMA,		/* DMA */
+	ATA_PROT_NCQ,		/* NCQ */
 	ATA_PROT_ATAPI,		/* packet command, PIO data xfer*/
 	ATA_PROT_ATAPI_NODATA,	/* packet command, no data */
 	ATA_PROT_ATAPI_DMA,	/* packet command with special DMA sauce */
@@ -276,6 +283,8 @@ struct ata_taskfile {
 #define ata_id_has_pm(id)	((id)[82] & (1 << 3))
 #define ata_id_has_lba(id)	((id)[49] & (1 << 9))
 #define ata_id_has_dma(id)	((id)[49] & (1 << 8))
+#define ata_id_has_ncq(id)	((id)[76] & (1 << 8))
+#define ata_id_queue_depth(id)	(((id)[75] & 0x1f) + 1)
 #define ata_id_removeable(id)	((id)[0] & (1 << 7))
 #define ata_id_has_dword_io(id)	((id)[50] & (1 << 0))
 #define ata_id_u32(id,n)	\
diff --git a/include/linux/libata.h b/include/linux/libata.h
index db17723e23fb..7c9e280a4829 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -125,6 +125,7 @@ enum {
 	ATA_DFLAG_LBA		= (1 << 0), /* device supports LBA */
 	ATA_DFLAG_LBA48		= (1 << 1), /* device supports LBA48 */
 	ATA_DFLAG_CDB_INTR	= (1 << 2), /* device asserts INTRQ when ready for CDB */
+	ATA_DFLAG_NCQ		= (1 << 3), /* device supports NCQ */
 	ATA_DFLAG_CFG_MASK	= (1 << 8) - 1,
 
 	ATA_DFLAG_PIO		= (1 << 8), /* device currently in PIO mode */
@@ -150,6 +151,7 @@ enum {
 	ATA_FLAG_IRQ_MASK	= (1 << 9), /* Mask IRQ in PIO xfers */
 	ATA_FLAG_PIO_POLLING	= (1 << 10), /* use polling PIO if LLD
 					      * doesn't handle PIO interrupts */
+	ATA_FLAG_NCQ		= (1 << 11), /* host supports NCQ */
 
 	ATA_FLAG_DEBUGMSG	= (1 << 14),
 	ATA_FLAG_FLUSH_PORT_TASK = (1 << 15), /* flush port task */
-- 
cgit v1.2.3


From 6cec4a3943bdfe46e2952bc246f17670f747be8d Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Mon, 15 May 2006 21:03:41 +0900
Subject: [PATCH] libata-ncq: rename ap->qactive to ap->qc_allocated

Rename ap->qactive to ap->qc_allocated.  This is to accomodate
addition of ap->qc_active, mask of active qcs.

Signed-off-by: Tejun Heo <htejun@gmail.com>
---
 drivers/scsi/libata-core.c | 6 +++---
 include/linux/libata.h     | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c
index 487b8f22981f..f8401800dc1a 100644
--- a/drivers/scsi/libata-core.c
+++ b/drivers/scsi/libata-core.c
@@ -1006,7 +1006,7 @@ unsigned ata_exec_internal(struct ata_device *dev,
 	else
 		tag = 0;
 
-	if (test_and_set_bit(tag, &ap->qactive))
+	if (test_and_set_bit(tag, &ap->qc_allocated))
 		BUG();
 	qc = __ata_qc_from_tag(ap, tag);
 
@@ -4207,7 +4207,7 @@ static struct ata_queued_cmd *ata_qc_new(struct ata_port *ap)
 
 	/* the last tag is reserved for internal command. */
 	for (i = 0; i < ATA_MAX_QUEUE - 1; i++)
-		if (!test_and_set_bit(i, &ap->qactive)) {
+		if (!test_and_set_bit(i, &ap->qc_allocated)) {
 			qc = __ata_qc_from_tag(ap, i);
 			break;
 		}
@@ -4264,7 +4264,7 @@ void ata_qc_free(struct ata_queued_cmd *qc)
 	tag = qc->tag;
 	if (likely(ata_tag_valid(tag))) {
 		qc->tag = ATA_TAG_POISON;
-		clear_bit(tag, &ap->qactive);
+		clear_bit(tag, &ap->qc_allocated);
 	}
 }
 
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 7c9e280a4829..b3a4f8bea828 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -474,7 +474,7 @@ struct ata_port {
 	struct ata_device	device[ATA_MAX_DEVICES];
 
 	struct ata_queued_cmd	qcmd[ATA_MAX_QUEUE];
-	unsigned long		qactive;
+	unsigned long		qc_allocated;
 	unsigned int		active_tag;
 
 	struct ata_host_stats	stats;
-- 
cgit v1.2.3


From dedaf2b0365ccec50714fbde0b3215e7e94fa47c Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Mon, 15 May 2006 21:03:43 +0900
Subject: [PATCH] libata-ncq: implement ap->qc_active, ap->sactive and complete
 helper

Add ap->qc_active and ap->sactive, mask of all active qcs and libata's
view of the SActive register, respectively.  Also, implement
ata_qc_complete_multiple() which takes new qc_active mask and complete
multiple qcs according to the mask.

These will be used to track NCQ commands and complete them.  The
distinction between ap->qc_active and ap->sactive is also useful for
later PM implementation.

Signed-off-by: Tejun Heo <htejun@gmail.com>
---
 drivers/scsi/libata-core.c | 81 ++++++++++++++++++++++++++++++++++++++++++++--
 include/linux/libata.h     |  5 +++
 2 files changed, 84 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c
index f8401800dc1a..eea1fe9c8b79 100644
--- a/drivers/scsi/libata-core.c
+++ b/drivers/scsi/libata-core.c
@@ -981,6 +981,7 @@ unsigned ata_exec_internal(struct ata_device *dev,
 	u8 command = tf->command;
 	struct ata_queued_cmd *qc;
 	unsigned int tag, preempted_tag;
+	u32 preempted_sactive, preempted_qc_active;
 	DECLARE_COMPLETION(wait);
 	unsigned long flags;
 	unsigned int err_mask;
@@ -1017,7 +1018,11 @@ unsigned ata_exec_internal(struct ata_device *dev,
 	ata_qc_reinit(qc);
 
 	preempted_tag = ap->active_tag;
+	preempted_sactive = ap->sactive;
+	preempted_qc_active = ap->qc_active;
 	ap->active_tag = ATA_TAG_POISON;
+	ap->sactive = 0;
+	ap->qc_active = 0;
 
 	/* prepare & issue qc */
 	qc->tf = *tf;
@@ -1082,6 +1087,8 @@ unsigned ata_exec_internal(struct ata_device *dev,
 
 	ata_qc_free(qc);
 	ap->active_tag = preempted_tag;
+	ap->sactive = preempted_sactive;
+	ap->qc_active = preempted_qc_active;
 
 	/* XXX - Some LLDDs (sata_mv) disable port on command failure.
 	 * Until those drivers are fixed, we detect the condition
@@ -4270,6 +4277,8 @@ void ata_qc_free(struct ata_queued_cmd *qc)
 
 void __ata_qc_complete(struct ata_queued_cmd *qc)
 {
+	struct ata_port *ap = qc->ap;
+
 	WARN_ON(qc == NULL);	/* ata_qc_from_tag _might_ return NULL */
 	WARN_ON(!(qc->flags & ATA_QCFLAG_ACTIVE));
 
@@ -4277,13 +4286,17 @@ void __ata_qc_complete(struct ata_queued_cmd *qc)
 		ata_sg_clean(qc);
 
 	/* command should be marked inactive atomically with qc completion */
-	qc->ap->active_tag = ATA_TAG_POISON;
+	if (qc->tf.protocol == ATA_PROT_NCQ)
+		ap->sactive &= ~(1 << qc->tag);
+	else
+		ap->active_tag = ATA_TAG_POISON;
 
 	/* atapi: mark qc as inactive to prevent the interrupt handler
 	 * from completing the command twice later, before the error handler
 	 * is called. (when rc != 0 and atapi request sense is needed)
 	 */
 	qc->flags &= ~ATA_QCFLAG_ACTIVE;
+	ap->qc_active &= ~(1 << qc->tag);
 
 	/* call completion callback */
 	qc->complete_fn(qc);
@@ -4349,6 +4362,55 @@ void ata_qc_complete(struct ata_queued_cmd *qc)
 	}
 }
 
+/**
+ *	ata_qc_complete_multiple - Complete multiple qcs successfully
+ *	@ap: port in question
+ *	@qc_active: new qc_active mask
+ *	@finish_qc: LLDD callback invoked before completing a qc
+ *
+ *	Complete in-flight commands.  This functions is meant to be
+ *	called from low-level driver's interrupt routine to complete
+ *	requests normally.  ap->qc_active and @qc_active is compared
+ *	and commands are completed accordingly.
+ *
+ *	LOCKING:
+ *	spin_lock_irqsave(host_set lock)
+ *
+ *	RETURNS:
+ *	Number of completed commands on success, -errno otherwise.
+ */
+int ata_qc_complete_multiple(struct ata_port *ap, u32 qc_active,
+			     void (*finish_qc)(struct ata_queued_cmd *))
+{
+	int nr_done = 0;
+	u32 done_mask;
+	int i;
+
+	done_mask = ap->qc_active ^ qc_active;
+
+	if (unlikely(done_mask & qc_active)) {
+		ata_port_printk(ap, KERN_ERR, "illegal qc_active transition "
+				"(%08x->%08x)\n", ap->qc_active, qc_active);
+		return -EINVAL;
+	}
+
+	for (i = 0; i < ATA_MAX_QUEUE; i++) {
+		struct ata_queued_cmd *qc;
+
+		if (!(done_mask & (1 << i)))
+			continue;
+
+		if ((qc = ata_qc_from_tag(ap, i))) {
+			if (finish_qc)
+				finish_qc(qc);
+			ata_qc_complete(qc);
+			nr_done++;
+		}
+	}
+
+	return nr_done;
+}
+
 static inline int ata_should_dma_map(struct ata_queued_cmd *qc)
 {
 	struct ata_port *ap = qc->ap;
@@ -4388,8 +4450,22 @@ void ata_qc_issue(struct ata_queued_cmd *qc)
 {
 	struct ata_port *ap = qc->ap;
 
-	qc->ap->active_tag = qc->tag;
+	/* Make sure only one non-NCQ command is outstanding.  The
+	 * check is skipped for old EH because it reuses active qc to
+	 * request ATAPI sense.
+	 */
+	WARN_ON(ap->ops->error_handler && ata_tag_valid(ap->active_tag));
+
+	if (qc->tf.protocol == ATA_PROT_NCQ) {
+		WARN_ON(ap->sactive & (1 << qc->tag));
+		ap->sactive |= 1 << qc->tag;
+	} else {
+		WARN_ON(ap->sactive);
+		ap->active_tag = qc->tag;
+	}
+
 	qc->flags |= ATA_QCFLAG_ACTIVE;
+	ap->qc_active |= 1 << qc->tag;
 
 	if (ata_should_dma_map(qc)) {
 		if (qc->flags & ATA_QCFLAG_SG) {
@@ -5549,6 +5625,7 @@ EXPORT_SYMBOL_GPL(ata_host_set_remove);
 EXPORT_SYMBOL_GPL(ata_sg_init);
 EXPORT_SYMBOL_GPL(ata_sg_init_one);
 EXPORT_SYMBOL_GPL(ata_qc_complete);
+EXPORT_SYMBOL_GPL(ata_qc_complete_multiple);
 EXPORT_SYMBOL_GPL(ata_qc_issue_prot);
 EXPORT_SYMBOL_GPL(ata_tf_load);
 EXPORT_SYMBOL_GPL(ata_tf_read);
diff --git a/include/linux/libata.h b/include/linux/libata.h
index b3a4f8bea828..dd0db2d21bc5 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -475,7 +475,10 @@ struct ata_port {
 
 	struct ata_queued_cmd	qcmd[ATA_MAX_QUEUE];
 	unsigned long		qc_allocated;
+	unsigned int		qc_active;
+
 	unsigned int		active_tag;
+	u32			sactive;
 
 	struct ata_host_stats	stats;
 	struct ata_host_set	*host_set;
@@ -668,6 +671,8 @@ extern void ata_bmdma_drive_eh(struct ata_port *ap,
 extern void ata_bmdma_error_handler(struct ata_port *ap);
 extern void ata_bmdma_post_internal_cmd(struct ata_queued_cmd *qc);
 extern void ata_qc_complete(struct ata_queued_cmd *qc);
+extern int ata_qc_complete_multiple(struct ata_port *ap, u32 qc_active,
+				    void (*finish_qc)(struct ata_queued_cmd *));
 extern void ata_scsi_simulate(struct ata_device *dev, struct scsi_cmnd *cmd,
 			      void (*done)(struct scsi_cmnd *));
 extern int ata_std_bios_param(struct scsi_device *sdev,
-- 
cgit v1.2.3


From a6e6ce8e8dc907a2cf2b994b0ea4099423f046bf Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Mon, 15 May 2006 21:03:48 +0900
Subject: [PATCH] libata-ncq: implement NCQ device configuration

Now that all NCQ related stuff are in place, implement NCQ device
configuration and bump ATA_MAX_QUEUE to 32 thus activating NCQ
support.

Original implementation is from Jens Axboe.

Signed-off-by: Tejun Heo <htejun@gmail.com>
---
 drivers/scsi/libata-core.c | 31 +++++++++++++++++++++++++++++--
 drivers/scsi/libata-scsi.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/libata.h     |  4 +++-
 3 files changed, 78 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c
index 14ffb5264b65..9051b6821c1c 100644
--- a/drivers/scsi/libata-core.c
+++ b/drivers/scsi/libata-core.c
@@ -1251,6 +1251,28 @@ static inline u8 ata_dev_knobble(struct ata_device *dev)
 	return ((dev->ap->cbl == ATA_CBL_SATA) && (!ata_id_is_sata(dev->id)));
 }
 
+static void ata_dev_config_ncq(struct ata_device *dev,
+			       char *desc, size_t desc_sz)
+{
+	struct ata_port *ap = dev->ap;
+	int hdepth = 0, ddepth = ata_id_queue_depth(dev->id);
+
+	if (!ata_id_has_ncq(dev->id)) {
+		desc[0] = '\0';
+		return;
+	}
+
+	if (ap->flags & ATA_FLAG_NCQ) {
+		hdepth = min(ap->host->can_queue, ATA_MAX_QUEUE - 1);
+		dev->flags |= ATA_DFLAG_NCQ;
+	}
+
+	if (hdepth >= ddepth)
+		snprintf(desc, desc_sz, "NCQ (depth %d)", ddepth);
+	else
+		snprintf(desc, desc_sz, "NCQ (depth %d/%d)", hdepth, ddepth);
+}
+
 /**
  *	ata_dev_configure - Configure the specified ATA/ATAPI device
  *	@dev: Target device to configure
@@ -1311,6 +1333,7 @@ static int ata_dev_configure(struct ata_device *dev, int print_info)
 
 		if (ata_id_has_lba(id)) {
 			const char *lba_desc;
+			char ncq_desc[20];
 
 			lba_desc = "LBA";
 			dev->flags |= ATA_DFLAG_LBA;
@@ -1319,14 +1342,17 @@ static int ata_dev_configure(struct ata_device *dev, int print_info)
 				lba_desc = "LBA48";
 			}
 
+			/* config NCQ */
+			ata_dev_config_ncq(dev, ncq_desc, sizeof(ncq_desc));
+
 			/* print device info to dmesg */
 			if (print_info)
 				ata_dev_printk(dev, KERN_INFO, "ATA-%d, "
-					"max %s, %Lu sectors: %s\n",
+					"max %s, %Lu sectors: %s %s\n",
 					ata_id_major_version(id),
 					ata_mode_string(xfer_mask),
 					(unsigned long long)dev->n_sectors,
-					lba_desc);
+					lba_desc, ncq_desc);
 		} else {
 			/* CHS */
 
@@ -5675,6 +5701,7 @@ EXPORT_SYMBOL_GPL(ata_port_queue_task);
 EXPORT_SYMBOL_GPL(ata_scsi_ioctl);
 EXPORT_SYMBOL_GPL(ata_scsi_queuecmd);
 EXPORT_SYMBOL_GPL(ata_scsi_slave_config);
+EXPORT_SYMBOL_GPL(ata_scsi_change_queue_depth);
 EXPORT_SYMBOL_GPL(ata_scsi_release);
 EXPORT_SYMBOL_GPL(ata_host_intr);
 EXPORT_SYMBOL_GPL(sata_scr_valid);
diff --git a/drivers/scsi/libata-scsi.c b/drivers/scsi/libata-scsi.c
index 9bef68c7c1de..996058af1bcd 100644
--- a/drivers/scsi/libata-scsi.c
+++ b/drivers/scsi/libata-scsi.c
@@ -41,6 +41,7 @@
 #include <scsi/scsi_eh.h>
 #include <scsi/scsi_device.h>
 #include <scsi/scsi_request.h>
+#include <scsi/scsi_tcq.h>
 #include <scsi/scsi_transport.h>
 #include <linux/libata.h>
 #include <linux/hdreg.h>
@@ -684,6 +685,14 @@ static void ata_scsi_dev_config(struct scsi_device *sdev,
 		request_queue_t *q = sdev->request_queue;
 		blk_queue_max_hw_segments(q, q->max_hw_segments - 1);
 	}
+
+	if (dev->flags & ATA_DFLAG_NCQ) {
+		int depth;
+
+		depth = min(sdev->host->can_queue, ata_id_queue_depth(dev->id));
+		depth = min(ATA_MAX_QUEUE - 1, depth);
+		scsi_adjust_queue_depth(sdev, MSG_SIMPLE_TAG, depth);
+	}
 }
 
 /**
@@ -717,6 +726,43 @@ int ata_scsi_slave_config(struct scsi_device *sdev)
 	return 0;	/* scsi layer doesn't check return value, sigh */
 }
 
+/**
+ *	ata_scsi_change_queue_depth - SCSI callback for queue depth config
+ *	@sdev: SCSI device to configure queue depth for
+ *	@queue_depth: new queue depth
+ *
+ *	This is libata standard hostt->change_queue_depth callback.
+ *	SCSI will call into this callback when user tries to set queue
+ *	depth via sysfs.
+ *
+ *	LOCKING:
+ *	SCSI layer (we don't care)
+ *
+ *	RETURNS:
+ *	Newly configured queue depth.
+ */
+int ata_scsi_change_queue_depth(struct scsi_device *sdev, int queue_depth)
+{
+	struct ata_port *ap = ata_shost_to_port(sdev->host);
+	struct ata_device *dev;
+	int max_depth;
+
+	if (queue_depth < 1)
+		return sdev->queue_depth;
+
+	dev = ata_scsi_find_dev(ap, sdev);
+	if (!dev || !ata_dev_enabled(dev))
+		return sdev->queue_depth;
+
+	max_depth = min(sdev->host->can_queue, ata_id_queue_depth(dev->id));
+	max_depth = min(ATA_MAX_QUEUE - 1, max_depth);
+	if (queue_depth > max_depth)
+		queue_depth = max_depth;
+
+	scsi_adjust_queue_depth(sdev, MSG_SIMPLE_TAG, queue_depth);
+	return queue_depth;
+}
+
 /**
  *	ata_scsi_start_stop_xlat - Translate SCSI START STOP UNIT command
  *	@qc: Storage for translated ATA taskfile
diff --git a/include/linux/libata.h b/include/linux/libata.h
index dd0db2d21bc5..fcdd798bb086 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -109,7 +109,7 @@ enum {
 	ATA_MAX_PORTS		= 8,
 	ATA_DEF_QUEUE		= 1,
 	/* tag ATA_MAX_QUEUE - 1 is reserved for internal commands */
-	ATA_MAX_QUEUE		= 2,
+	ATA_MAX_QUEUE		= 32,
 	ATA_TAG_INTERNAL	= ATA_MAX_QUEUE - 1,
 	ATA_MAX_SECTORS		= 200,	/* FIXME */
 	ATA_MAX_BUS		= 2,
@@ -679,6 +679,8 @@ extern int ata_std_bios_param(struct scsi_device *sdev,
 			      struct block_device *bdev,
 			      sector_t capacity, int geom[]);
 extern int ata_scsi_slave_config(struct scsi_device *sdev);
+extern int ata_scsi_change_queue_depth(struct scsi_device *sdev,
+				       int queue_depth);
 extern struct ata_device *ata_dev_pair(struct ata_device *adev);
 
 /*
-- 
cgit v1.2.3


From ba9627b85fcb5ed67285ca0711f0f4d1e965746e Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Tue, 16 May 2006 23:03:08 +0100
Subject: [JFFS2] Repack some on-medium structures. ARM is weirder than I
 thought.

We have to pack at least the jint16_t structure, because otherwise it'll
be four bytes in size. Thankfully, we can do that and _not_ pack the
actual node structures, and the compiler still doesn't emit stupid code.

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 include/linux/jffs2.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/jffs2.h b/include/linux/jffs2.h
index 228ad72f7dd8..a26fbd498c79 100644
--- a/include/linux/jffs2.h
+++ b/include/linux/jffs2.h
@@ -82,15 +82,15 @@
 
 typedef struct {
 	uint32_t v32;
-} jint32_t;
+} __attribute__((packed)) jint32_t;
 
 typedef struct {
 	uint32_t m;
-} jmode_t;
+} __attribute__((packed)) jmode_t;
 
 typedef struct {
 	uint16_t v16;
-} jint16_t;
+} __attribute__((packed)) jint16_t;
 
 struct jffs2_unknown_node
 {
-- 
cgit v1.2.3


From 3ac6c7b44560fdf2ea8865536bd52d4ff038107e Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Tue, 16 May 2006 23:25:37 +0100
Subject: Remove struct fddi_statistics from user view in <linux/if_fddi.h>

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 include/linux/if_fddi.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/if_fddi.h b/include/linux/if_fddi.h
index 1288a161bc0b..e0a150046208 100644
--- a/include/linux/if_fddi.h
+++ b/include/linux/if_fddi.h
@@ -102,6 +102,7 @@ struct fddihdr
 		} hdr;
 	} __attribute__ ((packed));
 
+#ifdef __KERNEL__
 /* Define FDDI statistics structure */
 struct fddi_statistics {
 
@@ -193,5 +194,6 @@ struct fddi_statistics {
 	__u32	port_ler_flag[2];
 	__u32	port_hardware_present[2];
 	};
+#endif /* __KERNEL__ */
 
 #endif	/* _LINUX_IF_FDDI_H */
-- 
cgit v1.2.3


From aef9ab47841af45888d950baa6448072cc70bdd5 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Fri, 19 May 2006 00:28:49 +0100
Subject: [JFFS2] Support new device nodes

Device node major/minor numbers are just stored in the payload of a single
data node. Just extend that to 4 bytes and use new_encode_dev() for it.

We only use the 4-byte format if we _need_ to, if !old_valid_dev(foo).
This preserves backwards compatibility with older code as much as
possible. If we do make devices with major or minor numbers above 255, and
then mount the file system with the old code, it'll just read the first
two bytes and get the numbers wrong. If it comes to garbage-collect it,
it'll then write back those wrong numbers. But that's about the best we
can expect.

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 fs/jffs2/dir.c        | 12 +++++-------
 fs/jffs2/fs.c         | 25 ++++++++++++++++++-------
 fs/jffs2/gc.c         |  7 ++-----
 fs/jffs2/nodelist.h   | 11 +++++++++++
 fs/jffs2/os-linux.h   |  4 +---
 include/linux/jffs2.h |  6 ++++++
 6 files changed, 43 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index 1c8e8c0f6cea..a6c11cef1b73 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -591,12 +591,12 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, int mode, de
 	struct jffs2_full_dnode *fn;
 	struct jffs2_full_dirent *fd;
 	int namelen;
-	jint16_t dev;
+	union jffs2_device_node dev;
 	int devlen = 0;
 	uint32_t alloclen, phys_ofs;
 	int ret;
 
-	if (!old_valid_dev(rdev))
+	if (!new_valid_dev(rdev))
 		return -EINVAL;
 
 	ri = jffs2_alloc_raw_inode();
@@ -605,17 +605,15 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, int mode, de
 
 	c = JFFS2_SB_INFO(dir_i->i_sb);
 
-	if (S_ISBLK(mode) || S_ISCHR(mode)) {
-		dev = cpu_to_je16(old_encode_dev(rdev));
-		devlen = sizeof(dev);
-	}
+	if (S_ISBLK(mode) || S_ISCHR(mode))
+		devlen = jffs2_encode_dev(&dev, rdev);
 
 	/* Try to reserve enough space for both node and dirent.
 	 * Just the node will do for now, though
 	 */
 	namelen = dentry->d_name.len;
 	ret = jffs2_reserve_space(c, sizeof(*ri) + devlen, &phys_ofs, &alloclen,
-				ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE);
+				  ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE);
 
 	if (ret) {
 		jffs2_free_raw_inode(ri);
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index ea1f37d4fc58..24cb4c688efc 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -33,7 +33,7 @@ static int jffs2_do_setattr (struct inode *inode, struct iattr *iattr)
 	struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode);
 	struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb);
 	struct jffs2_raw_inode *ri;
-	unsigned short dev;
+	union jffs2_device_node dev;
 	unsigned char *mdata = NULL;
 	int mdatalen = 0;
 	unsigned int ivalid;
@@ -51,9 +51,8 @@ static int jffs2_do_setattr (struct inode *inode, struct iattr *iattr)
 	   it out again with the appropriate data attached */
 	if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode)) {
 		/* For these, we don't actually need to read the old node */
-		dev = old_encode_dev(inode->i_rdev);
+		mdatalen = jffs2_encode_dev(&dev, inode->i_rdev);
 		mdata = (char *)&dev;
-		mdatalen = sizeof(dev);
 		D1(printk(KERN_DEBUG "jffs2_setattr(): Writing %d bytes of kdev_t\n", mdatalen));
 	} else if (S_ISLNK(inode->i_mode)) {
 		down(&f->sem);
@@ -232,6 +231,8 @@ void jffs2_read_inode (struct inode *inode)
 	struct jffs2_inode_info *f;
 	struct jffs2_sb_info *c;
 	struct jffs2_raw_inode latest_node;
+	union jffs2_device_node jdev;
+	dev_t rdev = 0;
 	int ret;
 
 	D1(printk(KERN_DEBUG "jffs2_read_inode(): inode->i_ino == %lu\n", inode->i_ino));
@@ -263,7 +264,6 @@ void jffs2_read_inode (struct inode *inode)
 	inode->i_blocks = (inode->i_size + 511) >> 9;
 
 	switch (inode->i_mode & S_IFMT) {
-		jint16_t rdev;
 
 	case S_IFLNK:
 		inode->i_op = &jffs2_symlink_inode_operations;
@@ -297,8 +297,16 @@ void jffs2_read_inode (struct inode *inode)
 	case S_IFBLK:
 	case S_IFCHR:
 		/* Read the device numbers from the media */
+		if (f->metadata->size != sizeof(jdev.old) &&
+		    f->metadata->size != sizeof(jdev.new)) {
+			printk(KERN_NOTICE "Device node has strange size %d\n", f->metadata->size);
+			up(&f->sem);
+			jffs2_do_clear_inode(c, f);
+			make_bad_inode(inode);
+			return;
+		}
 		D1(printk(KERN_DEBUG "Reading device numbers from flash\n"));
-		if (jffs2_read_dnode(c, f, f->metadata, (char *)&rdev, 0, sizeof(rdev)) < 0) {
+		if (jffs2_read_dnode(c, f, f->metadata, (char *)&jdev, 0, f->metadata->size) < 0) {
 			/* Eep */
 			printk(KERN_NOTICE "Read device numbers for inode %lu failed\n", (unsigned long)inode->i_ino);
 			up(&f->sem);
@@ -306,12 +314,15 @@ void jffs2_read_inode (struct inode *inode)
 			make_bad_inode(inode);
 			return;
 		}
+		if (f->metadata->size == sizeof(jdev.old))
+			rdev = old_decode_dev(je16_to_cpu(jdev.old));
+		else
+			rdev = new_decode_dev(je32_to_cpu(jdev.new));
 
 	case S_IFSOCK:
 	case S_IFIFO:
 		inode->i_op = &jffs2_file_inode_operations;
-		init_special_inode(inode, inode->i_mode,
-				   old_decode_dev((je16_to_cpu(rdev))));
+		init_special_inode(inode, inode->i_mode, rdev);
 		break;
 
 	default:
diff --git a/fs/jffs2/gc.c b/fs/jffs2/gc.c
index 967fb2cf8e21..77d30707de56 100644
--- a/fs/jffs2/gc.c
+++ b/fs/jffs2/gc.c
@@ -679,7 +679,7 @@ static int jffs2_garbage_collect_metadata(struct jffs2_sb_info *c, struct jffs2_
 	struct jffs2_full_dnode *new_fn;
 	struct jffs2_raw_inode ri;
 	struct jffs2_node_frag *last_frag;
-	jint16_t dev;
+	union jffs2_device_node dev;
 	char *mdata = NULL, mdatalen = 0;
 	uint32_t alloclen, phys_ofs, ilen;
 	int ret;
@@ -687,11 +687,8 @@ static int jffs2_garbage_collect_metadata(struct jffs2_sb_info *c, struct jffs2_
 	if (S_ISBLK(JFFS2_F_I_MODE(f)) ||
 	    S_ISCHR(JFFS2_F_I_MODE(f)) ) {
 		/* For these, we don't actually need to read the old node */
-		/* FIXME: for minor or major > 255. */
-		dev = cpu_to_je16(((JFFS2_F_I_RDEV_MAJ(f) << 8) |
-			JFFS2_F_I_RDEV_MIN(f)));
+		mdatalen = jffs2_encode_dev(&dev, JFFS2_F_I_RDEV(f));
 		mdata = (char *)&dev;
-		mdatalen = sizeof(dev);
 		D1(printk(KERN_DEBUG "jffs2_garbage_collect_metadata(): Writing %d bytes of kdev_t\n", mdatalen));
 	} else if (S_ISLNK(JFFS2_F_I_MODE(f))) {
 		mdatalen = fn->size;
diff --git a/fs/jffs2/nodelist.h b/fs/jffs2/nodelist.h
index f6645afe88e4..24e0f28a8bac 100644
--- a/fs/jffs2/nodelist.h
+++ b/fs/jffs2/nodelist.h
@@ -268,6 +268,17 @@ static inline uint32_t ref_totlen(struct jffs2_sb_info *c,
 
 #define PAD(x) (((x)+3)&~3)
 
+static inline int jffs2_encode_dev(union jffs2_device_node *jdev, dev_t rdev)
+{
+	if (old_valid_dev(rdev)) {
+		jdev->old = cpu_to_je16(old_encode_dev(rdev));
+		return sizeof(jdev->old);
+	} else {
+		jdev->new = cpu_to_je32(new_encode_dev(rdev));
+		return sizeof(jdev->new);
+	}
+}
+
 static inline struct jffs2_inode_cache *jffs2_raw_ref_to_ic(struct jffs2_raw_node_ref *raw)
 {
 	while(raw->next_in_ino) {
diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h
index d307cf548625..a10eb03ac95b 100644
--- a/fs/jffs2/os-linux.h
+++ b/fs/jffs2/os-linux.h
@@ -31,9 +31,7 @@ struct kvec;
 #define JFFS2_F_I_MODE(f) (OFNI_EDONI_2SFFJ(f)->i_mode)
 #define JFFS2_F_I_UID(f) (OFNI_EDONI_2SFFJ(f)->i_uid)
 #define JFFS2_F_I_GID(f) (OFNI_EDONI_2SFFJ(f)->i_gid)
-
-#define JFFS2_F_I_RDEV_MIN(f) (iminor(OFNI_EDONI_2SFFJ(f)))
-#define JFFS2_F_I_RDEV_MAJ(f) (imajor(OFNI_EDONI_2SFFJ(f)))
+#define JFFS2_F_I_RDEV(f) (OFNI_EDONI_2SFFJ(f)->i_rdev)
 
 #define ITIME(sec) ((struct timespec){sec, 0})
 #define I_SEC(tv) ((tv).tv_sec)
diff --git a/include/linux/jffs2.h b/include/linux/jffs2.h
index a26fbd498c79..007d76d290cb 100644
--- a/include/linux/jffs2.h
+++ b/include/linux/jffs2.h
@@ -173,4 +173,10 @@ union jffs2_node_union
 	struct jffs2_unknown_node u;
 };
 
+/* Data payload for device nodes. */
+union jffs2_device_node {
+	jint16_t old;
+	jint32_t new;
+};
+
 #endif /* __LINUX_JFFS2_H__ */
-- 
cgit v1.2.3


From 3655d1d323386e001c786af10f0a3f39f438f03b Mon Sep 17 00:00:00 2001
From: Albert Lee <albertcc@tw.ibm.com>
Date: Fri, 19 May 2006 11:43:04 +0800
Subject: [PATCH] libata: Fix the HSM error_mask mapping (was: Re: libata-tj
 and SMART)

Fix the HSM error_mask mapping.

Changes:
- Better mapping in ac_err_mask()
- In HSM_ST_FIRST ans HSM_ST state, check ATA_ERR|ATA_DF and map it to AC_ERR_DEV instead of AC_ERR_HSM.
- In HSM_ST_FIRST and HSM_ST state, map DRQ=1 ERR=1 to AC_ERR_HSM.
- For PIO data in and DRQ=1 ERR=1, add check after the junk data block is read.

Signed-off-by: Albert Lee <albertcc@tw.ibm.com>
Signed-off-by: Jeff Garzik <jeff@garzik.org>
---
 drivers/scsi/libata-core.c | 31 ++++++++++++++++++++++++-------
 include/linux/libata.h     |  2 +-
 2 files changed, 25 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c
index 00881226f8dd..aa38ed3e59a8 100644
--- a/drivers/scsi/libata-core.c
+++ b/drivers/scsi/libata-core.c
@@ -4009,9 +4009,15 @@ fsm_start:
 		poll_next = (qc->tf.flags & ATA_TFLAG_POLLING);
 
 		/* check device status */
-		if (unlikely((status & (ATA_BUSY | ATA_DRQ)) != ATA_DRQ)) {
-			/* Wrong status. Let EH handle this */
-			qc->err_mask |= AC_ERR_HSM;
+		if (unlikely((status & ATA_DRQ) == 0)) {
+			/* handle BSY=0, DRQ=0 as error */
+			if (likely(status & (ATA_ERR | ATA_DF)))
+				/* device stops HSM for abort/error */
+				qc->err_mask |= AC_ERR_DEV;
+			else
+				/* HSM violation. Let EH handle this */
+				qc->err_mask |= AC_ERR_HSM;
+
 			ap->hsm_task_state = HSM_ST_ERR;
 			goto fsm_start;
 		}
@@ -4025,7 +4031,7 @@ fsm_start:
 		if (unlikely(status & (ATA_ERR | ATA_DF))) {
 			printk(KERN_WARNING "ata%d: DRQ=1 with device error, dev_stat 0x%X\n",
 			       ap->id, status);
-			qc->err_mask |= AC_ERR_DEV;
+			qc->err_mask |= AC_ERR_HSM;
 			ap->hsm_task_state = HSM_ST_ERR;
 			goto fsm_start;
 		}
@@ -4067,7 +4073,9 @@ fsm_start:
 		if (qc->tf.protocol == ATA_PROT_ATAPI) {
 			/* ATAPI PIO protocol */
 			if ((status & ATA_DRQ) == 0) {
-				/* no more data to transfer */
+				/* No more data to transfer or device error.
+				 * Device error will be tagged in HSM_ST_LAST.
+				 */
 				ap->hsm_task_state = HSM_ST_LAST;
 				goto fsm_start;
 			}
@@ -4081,7 +4089,7 @@ fsm_start:
 			if (unlikely(status & (ATA_ERR | ATA_DF))) {
 				printk(KERN_WARNING "ata%d: DRQ=1 with device error, dev_stat 0x%X\n",
 				       ap->id, status);
-				qc->err_mask |= AC_ERR_DEV;
+				qc->err_mask |= AC_ERR_HSM;
 				ap->hsm_task_state = HSM_ST_ERR;
 				goto fsm_start;
 			}
@@ -4096,7 +4104,13 @@ fsm_start:
 			/* ATA PIO protocol */
 			if (unlikely((status & ATA_DRQ) == 0)) {
 				/* handle BSY=0, DRQ=0 as error */
-				qc->err_mask |= AC_ERR_HSM;
+				if (likely(status & (ATA_ERR | ATA_DF)))
+					/* device stops HSM for abort/error */
+					qc->err_mask |= AC_ERR_DEV;
+				else
+					/* HSM violation. Let EH handle this */
+					qc->err_mask |= AC_ERR_HSM;
+
 				ap->hsm_task_state = HSM_ST_ERR;
 				goto fsm_start;
 			}
@@ -4121,6 +4135,9 @@ fsm_start:
 					status = ata_wait_idle(ap);
 				}
 
+				if (status & (ATA_BUSY | ATA_DRQ))
+					qc->err_mask |= AC_ERR_HSM;
+
 				/* ata_pio_sectors() might change the
 				 * state to HSM_ST_LAST. so, the state
 				 * is changed after ata_pio_sectors().
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 2803ab8e9243..c51502c047a4 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -1062,7 +1062,7 @@ static inline int ata_try_flush_cache(const struct ata_device *dev)
 
 static inline unsigned int ac_err_mask(u8 status)
 {
-	if (status & ATA_BUSY)
+	if (status & (ATA_BUSY | ATA_DRQ))
 		return AC_ERR_HSM;
 	if (status & (ATA_ERR | ATA_DF))
 		return AC_ERR_DEV;
-- 
cgit v1.2.3


From 28318776a80bc3261f9af91ef79e6e38bb9f5bec Mon Sep 17 00:00:00 2001
From: Joern Engel <joern@wh.fh-wedel.de>
Date: Mon, 22 May 2006 23:18:05 +0200
Subject: [MTD] Introduce writesize

At least two flashes exists that have the concept of a minimum write unit,
similar to NAND pages, but no other NAND characteristics.  Therefore, rename
the minimum write unit to "writesize" for all flashes, including NAND.

Signed-off-by: Joern Engel <joern@wh.fh-wedel.de>
---
 drivers/mtd/chips/cfi_cmdset_0001.c |  4 +--
 drivers/mtd/devices/doc2000.c       |  2 +-
 drivers/mtd/devices/doc2001.c       |  2 +-
 drivers/mtd/devices/doc2001plus.c   |  2 +-
 drivers/mtd/mtdconcat.c             | 10 +++---
 drivers/mtd/mtdpart.c               |  2 +-
 drivers/mtd/nand/au1550nd.c         |  4 +--
 drivers/mtd/nand/diskonchip.c       | 16 ++++-----
 drivers/mtd/nand/nand_base.c        | 64 ++++++++++++++++-----------------
 drivers/mtd/nand/nand_bbt.c         | 30 ++++++++--------
 drivers/mtd/nand/nandsim.c          |  2 +-
 drivers/mtd/nand/rtc_from4.c        |  2 +-
 drivers/mtd/onenand/onenand_base.c  | 72 ++++++++++++++++++-------------------
 drivers/mtd/onenand/onenand_bbt.c   |  4 +--
 fs/jffs2/wbuf.c                     |  6 ++--
 include/linux/mtd/mtd.h             |  7 ++--
 include/mtd/mtd-abi.h               |  2 +-
 17 files changed, 117 insertions(+), 114 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/chips/cfi_cmdset_0001.c b/drivers/mtd/chips/cfi_cmdset_0001.c
index d0d5e521b564..35c3689bc5c1 100644
--- a/drivers/mtd/chips/cfi_cmdset_0001.c
+++ b/drivers/mtd/chips/cfi_cmdset_0001.c
@@ -545,12 +545,12 @@ static int cfi_intelext_partition_fixup(struct mtd_info *mtd,
 		if (extp->MinorVersion >= '4') {
 			struct cfi_intelext_programming_regioninfo *prinfo;
 			prinfo = (struct cfi_intelext_programming_regioninfo *)&extp->extra[offs];
-			MTD_PROGREGION_SIZE(mtd) = cfi->interleave << prinfo->ProgRegShift;
+			mtd->writesize = cfi->interleave << prinfo->ProgRegShift;
 			MTD_PROGREGION_CTRLMODE_VALID(mtd) = cfi->interleave * prinfo->ControlValid;
 			MTD_PROGREGION_CTRLMODE_INVALID(mtd) = cfi->interleave * prinfo->ControlInvalid;
 			mtd->flags |= MTD_PROGRAM_REGIONS;
 			printk(KERN_DEBUG "%s: program region size/ctrl_valid/ctrl_inval = %d/%d/%d\n",
-			       map->name, MTD_PROGREGION_SIZE(mtd),
+			       map->name, mtd->writesize,
 			       MTD_PROGREGION_CTRLMODE_VALID(mtd),
 			       MTD_PROGREGION_CTRLMODE_INVALID(mtd));
 		}
diff --git a/drivers/mtd/devices/doc2000.c b/drivers/mtd/devices/doc2000.c
index 40cc20f6d164..423a34f4638c 100644
--- a/drivers/mtd/devices/doc2000.c
+++ b/drivers/mtd/devices/doc2000.c
@@ -579,7 +579,7 @@ void DoC2k_init(struct mtd_info *mtd)
 	mtd->ecctype = MTD_ECC_RS_DiskOnChip;
 	mtd->size = 0;
 	mtd->erasesize = 0;
-	mtd->oobblock = 512;
+	mtd->writesize = 512;
 	mtd->oobsize = 16;
 	mtd->owner = THIS_MODULE;
 	mtd->erase = doc_erase;
diff --git a/drivers/mtd/devices/doc2001.c b/drivers/mtd/devices/doc2001.c
index 1670eb8b9755..e6eaef28a2b0 100644
--- a/drivers/mtd/devices/doc2001.c
+++ b/drivers/mtd/devices/doc2001.c
@@ -361,7 +361,7 @@ void DoCMil_init(struct mtd_info *mtd)
 	/* FIXME: erase size is not always 8KiB */
 	mtd->erasesize = 0x2000;
 
-	mtd->oobblock = 512;
+	mtd->writesize = 512;
 	mtd->oobsize = 16;
 	mtd->owner = THIS_MODULE;
 	mtd->erase = doc_erase;
diff --git a/drivers/mtd/devices/doc2001plus.c b/drivers/mtd/devices/doc2001plus.c
index 0dc5d108f7b5..8422c5e92d27 100644
--- a/drivers/mtd/devices/doc2001plus.c
+++ b/drivers/mtd/devices/doc2001plus.c
@@ -483,7 +483,7 @@ void DoCMilPlus_init(struct mtd_info *mtd)
 	mtd->size = 0;
 
 	mtd->erasesize = 0;
-	mtd->oobblock = 512;
+	mtd->writesize = 512;
 	mtd->oobsize = 16;
 	mtd->owner = THIS_MODULE;
 	mtd->erase = doc_erase;
diff --git a/drivers/mtd/mtdconcat.c b/drivers/mtd/mtdconcat.c
index 3c61a980c56c..a5e8373349a5 100644
--- a/drivers/mtd/mtdconcat.c
+++ b/drivers/mtd/mtdconcat.c
@@ -278,9 +278,9 @@ concat_writev_ecc(struct mtd_info *mtd, const struct kvec *vecs,
 		return -EINVAL;
 
 	/* Check alignment */
-	if (mtd->oobblock > 1) {
+	if (mtd->writesize > 1) {
 		loff_t __to = to;
-		if (do_div(__to, mtd->oobblock) || (total_len % mtd->oobblock))
+		if (do_div(__to, mtd->writesize) || (total_len % mtd->writesize))
 			return -EINVAL;
 	}
 
@@ -334,7 +334,7 @@ concat_writev_ecc(struct mtd_info *mtd, const struct kvec *vecs,
 		*retlen += retsize;
 		total_len -= wsize;
 		if (concat->mtd.type == MTD_NANDFLASH && eccbuf)
-			eccbuf += mtd->oobavail * (wsize / mtd->oobblock);
+			eccbuf += mtd->oobavail * (wsize / mtd->writesize);
 
 		if (total_len == 0)
 			break;
@@ -833,7 +833,7 @@ struct mtd_info *mtd_concat_create(struct mtd_info *subdev[],	/* subdevices to c
 	concat->mtd.flags = subdev[0]->flags;
 	concat->mtd.size = subdev[0]->size;
 	concat->mtd.erasesize = subdev[0]->erasesize;
-	concat->mtd.oobblock = subdev[0]->oobblock;
+	concat->mtd.writesize = subdev[0]->writesize;
 	concat->mtd.oobsize = subdev[0]->oobsize;
 	concat->mtd.ecctype = subdev[0]->ecctype;
 	concat->mtd.eccsize = subdev[0]->eccsize;
@@ -881,7 +881,7 @@ struct mtd_info *mtd_concat_create(struct mtd_info *subdev[],	/* subdevices to c
 				    subdev[i]->flags & MTD_WRITEABLE;
 		}
 		concat->mtd.size += subdev[i]->size;
-		if (concat->mtd.oobblock   !=  subdev[i]->oobblock ||
+		if (concat->mtd.writesize   !=  subdev[i]->writesize ||
 		    concat->mtd.oobsize    !=  subdev[i]->oobsize ||
 		    concat->mtd.ecctype    !=  subdev[i]->ecctype ||
 		    concat->mtd.eccsize    !=  subdev[i]->eccsize ||
diff --git a/drivers/mtd/mtdpart.c b/drivers/mtd/mtdpart.c
index 29ed5abe70c4..082662f90481 100644
--- a/drivers/mtd/mtdpart.c
+++ b/drivers/mtd/mtdpart.c
@@ -398,7 +398,7 @@ int add_mtd_partitions(struct mtd_info *master,
 		slave->mtd.type = master->type;
 		slave->mtd.flags = master->flags & ~parts[i].mask_flags;
 		slave->mtd.size = parts[i].size;
-		slave->mtd.oobblock = master->oobblock;
+		slave->mtd.writesize = master->writesize;
 		slave->mtd.oobsize = master->oobsize;
 		slave->mtd.oobavail = master->oobavail;
 		slave->mtd.ecctype = master->ecctype;
diff --git a/drivers/mtd/nand/au1550nd.c b/drivers/mtd/nand/au1550nd.c
index d9a0143e1d3a..4253b9309789 100644
--- a/drivers/mtd/nand/au1550nd.c
+++ b/drivers/mtd/nand/au1550nd.c
@@ -356,9 +356,9 @@ static void au1550_command(struct mtd_info *mtd, unsigned command, int column, i
 	if (command == NAND_CMD_SEQIN) {
 		int readcmd;
 
-		if (column >= mtd->oobblock) {
+		if (column >= mtd->writesize) {
 			/* OOB area */
-			column -= mtd->oobblock;
+			column -= mtd->writesize;
 			readcmd = NAND_CMD_READOOB;
 		} else if (column < 256) {
 			/* First 256 bytes --> READ0 */
diff --git a/drivers/mtd/nand/diskonchip.c b/drivers/mtd/nand/diskonchip.c
index a2391c66a63f..d160930276d6 100644
--- a/drivers/mtd/nand/diskonchip.c
+++ b/drivers/mtd/nand/diskonchip.c
@@ -761,9 +761,9 @@ static void doc2001plus_command(struct mtd_info *mtd, unsigned command, int colu
 	if (command == NAND_CMD_SEQIN) {
 		int readcmd;
 
-		if (column >= mtd->oobblock) {
+		if (column >= mtd->writesize) {
 			/* OOB area */
-			column -= mtd->oobblock;
+			column -= mtd->writesize;
 			readcmd = NAND_CMD_READOOB;
 		} else if (column < 256) {
 			/* First 256 bytes --> READ0 */
@@ -1093,8 +1093,8 @@ static int __init find_media_headers(struct mtd_info *mtd, u_char *buf, const ch
 	size_t retlen;
 
 	for (offs = 0; offs < mtd->size; offs += mtd->erasesize) {
-		ret = mtd->read(mtd, offs, mtd->oobblock, &retlen, buf);
-		if (retlen != mtd->oobblock)
+		ret = mtd->read(mtd, offs, mtd->writesize, &retlen, buf);
+		if (retlen != mtd->writesize)
 			continue;
 		if (ret) {
 			printk(KERN_WARNING "ECC error scanning DOC at 0x%x\n", offs);
@@ -1118,8 +1118,8 @@ static int __init find_media_headers(struct mtd_info *mtd, u_char *buf, const ch
 	/* Only one mediaheader was found.  We want buf to contain a
 	   mediaheader on return, so we'll have to re-read the one we found. */
 	offs = doc->mh0_page << this->page_shift;
-	ret = mtd->read(mtd, offs, mtd->oobblock, &retlen, buf);
-	if (retlen != mtd->oobblock) {
+	ret = mtd->read(mtd, offs, mtd->writesize, &retlen, buf);
+	if (retlen != mtd->writesize) {
 		/* Insanity.  Give up. */
 		printk(KERN_ERR "Read DiskOnChip Media Header once, but can't reread it???\n");
 		return 0;
@@ -1139,7 +1139,7 @@ static inline int __init nftl_partscan(struct mtd_info *mtd, struct mtd_partitio
 	unsigned blocks, maxblocks;
 	int offs, numheaders;
 
-	buf = kmalloc(mtd->oobblock, GFP_KERNEL);
+	buf = kmalloc(mtd->writesize, GFP_KERNEL);
 	if (!buf) {
 		printk(KERN_ERR "DiskOnChip mediaheader kmalloc failed!\n");
 		return 0;
@@ -1247,7 +1247,7 @@ static inline int __init inftl_partscan(struct mtd_info *mtd, struct mtd_partiti
 	if (inftl_bbt_write)
 		end -= (INFTL_BBT_RESERVED_BLOCKS << this->phys_erase_shift);
 
-	buf = kmalloc(mtd->oobblock, GFP_KERNEL);
+	buf = kmalloc(mtd->writesize, GFP_KERNEL);
 	if (!buf) {
 		printk(KERN_ERR "DiskOnChip mediaheader kmalloc failed!\n");
 		return 0;
diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c
index 08dffb7a9389..055f6608a2ec 100644
--- a/drivers/mtd/nand/nand_base.c
+++ b/drivers/mtd/nand/nand_base.c
@@ -560,9 +560,9 @@ static void nand_command(struct mtd_info *mtd, unsigned command, int column, int
 	if (command == NAND_CMD_SEQIN) {
 		int readcmd;
 
-		if (column >= mtd->oobblock) {
+		if (column >= mtd->writesize) {
 			/* OOB area */
-			column -= mtd->oobblock;
+			column -= mtd->writesize;
 			readcmd = NAND_CMD_READOOB;
 		} else if (column < 256) {
 			/* First 256 bytes --> READ0 */
@@ -658,7 +658,7 @@ static void nand_command_lp(struct mtd_info *mtd, unsigned command, int column,
 
 	/* Emulate NAND_CMD_READOOB */
 	if (command == NAND_CMD_READOOB) {
-		column += mtd->oobblock;
+		column += mtd->writesize;
 		command = NAND_CMD_READ0;
 	}
 
@@ -889,7 +889,7 @@ static int nand_write_page(struct mtd_info *mtd, struct nand_chip *this, int pag
 		/* No ecc, write all */
 	case NAND_ECC_NONE:
 		printk(KERN_WARNING "Writing data without ECC to NAND-FLASH is not recommended\n");
-		this->write_buf(mtd, this->data_poi, mtd->oobblock);
+		this->write_buf(mtd, this->data_poi, mtd->writesize);
 		break;
 
 		/* Software ecc 3/256, write all */
@@ -900,7 +900,7 @@ static int nand_write_page(struct mtd_info *mtd, struct nand_chip *this, int pag
 				oob_buf[oob_config[eccidx]] = ecc_code[i];
 			datidx += this->eccsize;
 		}
-		this->write_buf(mtd, this->data_poi, mtd->oobblock);
+		this->write_buf(mtd, this->data_poi, mtd->writesize);
 		break;
 	default:
 		eccbytes = this->eccbytes;
@@ -1161,9 +1161,9 @@ int nand_do_read_ecc(struct mtd_info *mtd, loff_t from, size_t len,
 	page = realpage & this->pagemask;
 
 	/* Get raw starting column */
-	col = from & (mtd->oobblock - 1);
+	col = from & (mtd->writesize - 1);
 
-	end = mtd->oobblock;
+	end = mtd->writesize;
 	ecc = this->eccsize;
 	eccbytes = this->eccbytes;
 
@@ -1321,7 +1321,7 @@ int nand_do_read_ecc(struct mtd_info *mtd, loff_t from, size_t len,
 				buf[read++] = data_poi[j];
 			this->pagebuf = realpage;
 		} else
-			read += mtd->oobblock;
+			read += mtd->writesize;
 
 		/* Apply delay or wait for ready/busy pin
 		 * Do this before the AUTOINCR check, so no problems
@@ -1479,7 +1479,7 @@ int nand_read_raw(struct mtd_info *mtd, uint8_t *buf, loff_t from, size_t len, s
 	int chip = (int)(from >> this->chip_shift);
 	int sndcmd = 1;
 	int cnt = 0;
-	int pagesize = mtd->oobblock + mtd->oobsize;
+	int pagesize = mtd->writesize + mtd->oobsize;
 	int blockcheck = (1 << (this->phys_erase_shift - this->page_shift)) - 1;
 
 	/* Do not allow reads past end of device */
@@ -1581,7 +1581,7 @@ static u_char *nand_prepare_oobbuf(struct mtd_info *mtd, u_char *fsbuf, struct n
 	return this->oob_buf;
 }
 
-#define NOTALIGNED(x) (x & (mtd->oobblock-1)) != 0
+#define NOTALIGNED(x) (x & (mtd->writesize-1)) != 0
 
 /**
  * nand_write - [MTD Interface] compability function for nand_write_ecc
@@ -1694,7 +1694,7 @@ static int nand_write_ecc(struct mtd_info *mtd, loff_t to, size_t len,
 		/* Next oob page */
 		oob += mtd->oobsize;
 		/* Update written bytes count */
-		written += mtd->oobblock;
+		written += mtd->writesize;
 		if (written == len)
 			goto cmp;
 
@@ -1805,7 +1805,7 @@ static int nand_write_oob(struct mtd_info *mtd, loff_t to, size_t len, size_t *r
 
 	if (NAND_MUST_PAD(this)) {
 		/* Write out desired data */
-		this->cmdfunc(mtd, NAND_CMD_SEQIN, mtd->oobblock, page & this->pagemask);
+		this->cmdfunc(mtd, NAND_CMD_SEQIN, mtd->writesize, page & this->pagemask);
 		/* prepad 0xff for partial programming */
 		this->write_buf(mtd, ffchars, column);
 		/* write data */
@@ -1814,7 +1814,7 @@ static int nand_write_oob(struct mtd_info *mtd, loff_t to, size_t len, size_t *r
 		this->write_buf(mtd, ffchars, mtd->oobsize - (len + column));
 	} else {
 		/* Write out desired data */
-		this->cmdfunc(mtd, NAND_CMD_SEQIN, mtd->oobblock + column, page & this->pagemask);
+		this->cmdfunc(mtd, NAND_CMD_SEQIN, mtd->writesize + column, page & this->pagemask);
 		/* write data */
 		this->write_buf(mtd, buf, len);
 	}
@@ -1947,7 +1947,7 @@ static int nand_writev_ecc(struct mtd_info *mtd, const struct kvec *vecs, unsign
 		/* If the given tuple is >= pagesize then
 		 * write it out from the iov
 		 */
-		if ((vecs->iov_len - len) >= mtd->oobblock) {
+		if ((vecs->iov_len - len) >= mtd->writesize) {
 			/* Calc number of pages we can write
 			 * out of this iov in one go */
 			numpages = (vecs->iov_len - len) >> this->page_shift;
@@ -1967,8 +1967,8 @@ static int nand_writev_ecc(struct mtd_info *mtd, const struct kvec *vecs, unsign
 						      &oobbuf[oob], oobsel, i != numpages);
 				if (ret)
 					goto out;
-				this->data_poi += mtd->oobblock;
-				len += mtd->oobblock;
+				this->data_poi += mtd->writesize;
+				len += mtd->writesize;
 				oob += mtd->oobsize;
 				page++;
 			}
@@ -1983,7 +1983,7 @@ static int nand_writev_ecc(struct mtd_info *mtd, const struct kvec *vecs, unsign
 			 * tuple until we have a full page to write
 			 */
 			int cnt = 0;
-			while (cnt < mtd->oobblock) {
+			while (cnt < mtd->writesize) {
 				if (vecs->iov_base != NULL && vecs->iov_len)
 					this->data_buf[cnt++] = ((u_char *) vecs->iov_base)[len++];
 				/* Check, if we have to switch to the next tuple */
@@ -2009,7 +2009,7 @@ static int nand_writev_ecc(struct mtd_info *mtd, const struct kvec *vecs, unsign
 		if (ret)
 			goto out;
 
-		written += mtd->oobblock * numpages;
+		written += mtd->writesize * numpages;
 		/* All done ? */
 		if (!count)
 			break;
@@ -2411,10 +2411,10 @@ int nand_scan(struct mtd_info *mtd, int maxchips)
 			/* The 4th id byte is the important one */
 			extid = this->read_byte(mtd);
 			/* Calc pagesize */
-			mtd->oobblock = 1024 << (extid & 0x3);
+			mtd->writesize = 1024 << (extid & 0x3);
 			extid >>= 2;
 			/* Calc oobsize */
-			mtd->oobsize = (8 << (extid & 0x01)) * (mtd->oobblock >> 9);
+			mtd->oobsize = (8 << (extid & 0x01)) * (mtd->writesize >> 9);
 			extid >>= 2;
 			/* Calc blocksize. Blocksize is multiples of 64KiB */
 			mtd->erasesize = (64 * 1024) << (extid & 0x03);
@@ -2426,8 +2426,8 @@ int nand_scan(struct mtd_info *mtd, int maxchips)
 			/* Old devices have this data hardcoded in the
 			 * device id table */
 			mtd->erasesize = nand_flash_ids[i].erasesize;
-			mtd->oobblock = nand_flash_ids[i].pagesize;
-			mtd->oobsize = mtd->oobblock / 32;
+			mtd->writesize = nand_flash_ids[i].pagesize;
+			mtd->oobsize = mtd->writesize / 32;
 			busw = nand_flash_ids[i].options & NAND_BUSWIDTH_16;
 		}
 
@@ -2451,12 +2451,12 @@ int nand_scan(struct mtd_info *mtd, int maxchips)
 		}
 
 		/* Calculate the address shift from the page size */
-		this->page_shift = ffs(mtd->oobblock) - 1;
+		this->page_shift = ffs(mtd->writesize) - 1;
 		this->bbt_erase_shift = this->phys_erase_shift = ffs(mtd->erasesize) - 1;
 		this->chip_shift = ffs(this->chipsize) - 1;
 
 		/* Set the bad block position */
-		this->badblockpos = mtd->oobblock > 512 ? NAND_LARGE_BADBLOCK_POS : NAND_SMALL_BADBLOCK_POS;
+		this->badblockpos = mtd->writesize > 512 ? NAND_LARGE_BADBLOCK_POS : NAND_SMALL_BADBLOCK_POS;
 
 		/* Get chip options, preserve non chip based options */
 		this->options &= ~NAND_CHIPOPTIONS_MSK;
@@ -2476,7 +2476,7 @@ int nand_scan(struct mtd_info *mtd, int maxchips)
 			this->erase_cmd = single_erase_cmd;
 
 		/* Do not replace user supplied command function ! */
-		if (mtd->oobblock > 512 && this->cmdfunc == nand_command)
+		if (mtd->writesize > 512 && this->cmdfunc == nand_command)
 			this->cmdfunc = nand_command_lp;
 
 		printk(KERN_INFO "NAND device: Manufacturer ID:"
@@ -2519,7 +2519,7 @@ int nand_scan(struct mtd_info *mtd, int maxchips)
 
 	if (!this->data_buf) {
 		size_t len;
-		len = mtd->oobblock + mtd->oobsize;
+		len = mtd->writesize + mtd->oobsize;
 		this->data_buf = kmalloc(len, GFP_KERNEL);
 		if (!this->data_buf) {
 			if (this->options & NAND_OOBBUF_ALLOC)
@@ -2575,9 +2575,9 @@ int nand_scan(struct mtd_info *mtd, int maxchips)
 
 	switch (this->eccmode) {
 	case NAND_ECC_HW12_2048:
-		if (mtd->oobblock < 2048) {
+		if (mtd->writesize < 2048) {
 			printk(KERN_WARNING "2048 byte HW ECC not possible on %d byte page size, fallback to SW ECC\n",
-			       mtd->oobblock);
+			       mtd->writesize);
 			this->eccmode = NAND_ECC_SOFT;
 			this->calculate_ecc = nand_calculate_ecc;
 			this->correct_data = nand_correct_data;
@@ -2588,7 +2588,7 @@ int nand_scan(struct mtd_info *mtd, int maxchips)
 	case NAND_ECC_HW3_512:
 	case NAND_ECC_HW6_512:
 	case NAND_ECC_HW8_512:
-		if (mtd->oobblock == 256) {
+		if (mtd->writesize == 256) {
 			printk(KERN_WARNING "512 byte HW ECC not possible on 256 Byte pagesize, fallback to SW ECC \n");
 			this->eccmode = NAND_ECC_SOFT;
 			this->calculate_ecc = nand_calculate_ecc;
@@ -2638,16 +2638,16 @@ int nand_scan(struct mtd_info *mtd, int maxchips)
 	/* Set the number of read / write steps for one page to ensure ECC generation */
 	switch (this->eccmode) {
 	case NAND_ECC_HW12_2048:
-		this->eccsteps = mtd->oobblock / 2048;
+		this->eccsteps = mtd->writesize / 2048;
 		break;
 	case NAND_ECC_HW3_512:
 	case NAND_ECC_HW6_512:
 	case NAND_ECC_HW8_512:
-		this->eccsteps = mtd->oobblock / 512;
+		this->eccsteps = mtd->writesize / 512;
 		break;
 	case NAND_ECC_HW3_256:
 	case NAND_ECC_SOFT:
-		this->eccsteps = mtd->oobblock / 256;
+		this->eccsteps = mtd->writesize / 256;
 		break;
 
 	case NAND_ECC_NONE:
diff --git a/drivers/mtd/nand/nand_bbt.c b/drivers/mtd/nand/nand_bbt.c
index 9adc6d62332a..fbccb2a25186 100644
--- a/drivers/mtd/nand/nand_bbt.c
+++ b/drivers/mtd/nand/nand_bbt.c
@@ -247,15 +247,15 @@ static int read_abs_bbts(struct mtd_info *mtd, uint8_t *buf, struct nand_bbt_des
 
 	/* Read the primary version, if available */
 	if (td->options & NAND_BBT_VERSION) {
-		nand_read_raw(mtd, buf, td->pages[0] << this->page_shift, mtd->oobblock, mtd->oobsize);
-		td->version[0] = buf[mtd->oobblock + td->veroffs];
+		nand_read_raw(mtd, buf, td->pages[0] << this->page_shift, mtd->writesize, mtd->oobsize);
+		td->version[0] = buf[mtd->writesize + td->veroffs];
 		printk(KERN_DEBUG "Bad block table at page %d, version 0x%02X\n", td->pages[0], td->version[0]);
 	}
 
 	/* Read the mirror version, if available */
 	if (md && (md->options & NAND_BBT_VERSION)) {
-		nand_read_raw(mtd, buf, md->pages[0] << this->page_shift, mtd->oobblock, mtd->oobsize);
-		md->version[0] = buf[mtd->oobblock + md->veroffs];
+		nand_read_raw(mtd, buf, md->pages[0] << this->page_shift, mtd->writesize, mtd->oobsize);
+		md->version[0] = buf[mtd->writesize + md->veroffs];
 		printk(KERN_DEBUG "Bad block table at page %d, version 0x%02X\n", md->pages[0], md->version[0]);
 	}
 
@@ -298,8 +298,8 @@ static int create_bbt(struct mtd_info *mtd, uint8_t *buf, struct nand_bbt_descr
 		readlen = bd->len;
 	} else {
 		/* Full page content should be read */
-		scanlen = mtd->oobblock + mtd->oobsize;
-		readlen = len * mtd->oobblock;
+		scanlen = mtd->writesize + mtd->oobsize;
+		readlen = len * mtd->writesize;
 		ooblen = len * mtd->oobsize;
 	}
 
@@ -334,7 +334,7 @@ static int create_bbt(struct mtd_info *mtd, uint8_t *buf, struct nand_bbt_descr
 
 				/* Read the full oob until read_oob is fixed to
 				 * handle single byte reads for 16 bit buswidth */
-				ret = mtd->read_oob(mtd, from + j * mtd->oobblock, mtd->oobsize, &retlen, buf);
+				ret = mtd->read_oob(mtd, from + j * mtd->writesize, mtd->oobsize, &retlen, buf);
 				if (ret)
 					return ret;
 
@@ -345,7 +345,7 @@ static int create_bbt(struct mtd_info *mtd, uint8_t *buf, struct nand_bbt_descr
 					break;
 				}
 			} else {
-				if (check_pattern(&buf[j * scanlen], scanlen, mtd->oobblock, bd)) {
+				if (check_pattern(&buf[j * scanlen], scanlen, mtd->writesize, bd)) {
 					this->bbt[i >> 3] |= 0x03 << (i & 0x6);
 					printk(KERN_WARNING "Bad eraseblock %d at 0x%08x\n",
 					       i >> 1, (unsigned int)from);
@@ -381,7 +381,7 @@ static int search_bbt(struct mtd_info *mtd, uint8_t *buf, struct nand_bbt_descr
 	struct nand_chip *this = mtd->priv;
 	int i, chips;
 	int bits, startblock, block, dir;
-	int scanlen = mtd->oobblock + mtd->oobsize;
+	int scanlen = mtd->writesize + mtd->oobsize;
 	int bbtblocks;
 
 	/* Search direction top -> down ? */
@@ -414,11 +414,11 @@ static int search_bbt(struct mtd_info *mtd, uint8_t *buf, struct nand_bbt_descr
 		for (block = 0; block < td->maxblocks; block++) {
 			int actblock = startblock + dir * block;
 			/* Read first page */
-			nand_read_raw(mtd, buf, actblock << this->bbt_erase_shift, mtd->oobblock, mtd->oobsize);
-			if (!check_pattern(buf, scanlen, mtd->oobblock, td)) {
+			nand_read_raw(mtd, buf, actblock << this->bbt_erase_shift, mtd->writesize, mtd->oobsize);
+			if (!check_pattern(buf, scanlen, mtd->writesize, td)) {
 				td->pages[i] = actblock << (this->bbt_erase_shift - this->page_shift);
 				if (td->options & NAND_BBT_VERSION) {
-					td->version[i] = buf[mtd->oobblock + td->veroffs];
+					td->version[i] = buf[mtd->writesize + td->veroffs];
 				}
 				break;
 			}
@@ -586,7 +586,7 @@ static int write_bbt(struct mtd_info *mtd, uint8_t *buf,
 			/* Calc length */
 			len = (size_t) (numblocks >> sft);
 			/* Make it page aligned ! */
-			len = (len + (mtd->oobblock - 1)) & ~(mtd->oobblock - 1);
+			len = (len + (mtd->writesize - 1)) & ~(mtd->writesize - 1);
 			/* Preset the buffer with 0xff */
 			memset(buf, 0xff, len + (len >> this->page_shift) * mtd->oobsize);
 			offs = 0;
@@ -1063,13 +1063,13 @@ int nand_default_bbt(struct mtd_info *mtd)
 			this->bbt_md = &bbt_mirror_descr;
 		}
 		if (!this->badblock_pattern) {
-			this->badblock_pattern = (mtd->oobblock > 512) ? &largepage_flashbased : &smallpage_flashbased;
+			this->badblock_pattern = (mtd->writesize > 512) ? &largepage_flashbased : &smallpage_flashbased;
 		}
 	} else {
 		this->bbt_td = NULL;
 		this->bbt_md = NULL;
 		if (!this->badblock_pattern) {
-			this->badblock_pattern = (mtd->oobblock > 512) ?
+			this->badblock_pattern = (mtd->writesize > 512) ?
 			    &largepage_memorybased : &smallpage_memorybased;
 		}
 	}
diff --git a/drivers/mtd/nand/nandsim.c b/drivers/mtd/nand/nandsim.c
index 6903f5b903c6..8674f1e9d3c6 100644
--- a/drivers/mtd/nand/nandsim.c
+++ b/drivers/mtd/nand/nandsim.c
@@ -369,7 +369,7 @@ init_nandsim(struct mtd_info *mtd)
 	/* Initialize the NAND flash parameters */
 	ns->busw = chip->options & NAND_BUSWIDTH_16 ? 16 : 8;
 	ns->geom.totsz    = mtd->size;
-	ns->geom.pgsz     = mtd->oobblock;
+	ns->geom.pgsz     = mtd->writesize;
 	ns->geom.oobsz    = mtd->oobsize;
 	ns->geom.secsz    = mtd->erasesize;
 	ns->geom.pgszoob  = ns->geom.pgsz + ns->geom.oobsz;
diff --git a/drivers/mtd/nand/rtc_from4.c b/drivers/mtd/nand/rtc_from4.c
index bc9d849fbd5d..64ccf4c9613f 100644
--- a/drivers/mtd/nand/rtc_from4.c
+++ b/drivers/mtd/nand/rtc_from4.c
@@ -487,7 +487,7 @@ static int rtc_from4_errstat(struct mtd_info *mtd, struct nand_chip *this, int s
 		if (!(rtn & ERR_STAT_ECC_AVAILABLE)) {
 			er_stat |= 1 << 1;	/* err_ecc_not_avail */
 		} else {
-			len = mtd->oobblock;
+			len = mtd->writesize;
 			buf = kmalloc(len, GFP_KERNEL);
 			if (!buf) {
 				printk(KERN_ERR "rtc_from4_errstat: Out of memory!\n");
diff --git a/drivers/mtd/onenand/onenand_base.c b/drivers/mtd/onenand/onenand_base.c
index fe5b48997275..198bb8562d93 100644
--- a/drivers/mtd/onenand/onenand_base.c
+++ b/drivers/mtd/onenand/onenand_base.c
@@ -354,7 +354,7 @@ static inline int onenand_bufferram_offset(struct mtd_info *mtd, int area)
 
 	if (ONENAND_CURRENT_BUFFERRAM(this)) {
 		if (area == ONENAND_DATARAM)
-			return mtd->oobblock;
+			return mtd->writesize;
 		if (area == ONENAND_SPARERAM)
 			return mtd->oobsize;
 	}
@@ -632,14 +632,14 @@ static int onenand_read_ecc(struct mtd_info *mtd, loff_t from, size_t len,
 	/* TODO handling oob */
 
 	while (read < len) {
-		thislen = min_t(int, mtd->oobblock, len - read);
+		thislen = min_t(int, mtd->writesize, len - read);
 
-		column = from & (mtd->oobblock - 1);
-		if (column + thislen > mtd->oobblock)
-			thislen = mtd->oobblock - column;
+		column = from & (mtd->writesize - 1);
+		if (column + thislen > mtd->writesize)
+			thislen = mtd->writesize - column;
 
 		if (!onenand_check_bufferram(mtd, from)) {
-			this->command(mtd, ONENAND_CMD_READ, from, mtd->oobblock);
+			this->command(mtd, ONENAND_CMD_READ, from, mtd->writesize);
 
 			ret = this->wait(mtd, FL_READING);
 			/* First copy data and check return value for ECC handling */
@@ -752,7 +752,7 @@ static int onenand_read_oob(struct mtd_info *mtd, loff_t from, size_t len,
 		/* Read more? */
 		if (read < len) {
 			/* Page size */
-			from += mtd->oobblock;
+			from += mtd->writesize;
 			column = 0;
 		}
 	}
@@ -809,7 +809,7 @@ static int onenand_verify_page(struct mtd_info *mtd, u_char *buf, loff_t addr)
 	void __iomem *dataram0, *dataram1;
 	int ret = 0;
 
-	this->command(mtd, ONENAND_CMD_READ, addr, mtd->oobblock);
+	this->command(mtd, ONENAND_CMD_READ, addr, mtd->writesize);
 
 	ret = this->wait(mtd, FL_READING);
 	if (ret)
@@ -819,9 +819,9 @@ static int onenand_verify_page(struct mtd_info *mtd, u_char *buf, loff_t addr)
 
 	/* Check, if the two dataram areas are same */
 	dataram0 = this->base + ONENAND_DATARAM;
-	dataram1 = dataram0 + mtd->oobblock;
+	dataram1 = dataram0 + mtd->writesize;
 
-	if (memcmp(dataram0, dataram1, mtd->oobblock))
+	if (memcmp(dataram0, dataram1, mtd->writesize))
 		return -EBADMSG;
 
 	return 0;
@@ -831,7 +831,7 @@ static int onenand_verify_page(struct mtd_info *mtd, u_char *buf, loff_t addr)
 #define onenand_verify_oob(...)		(0)
 #endif
 
-#define NOTALIGNED(x)	((x & (mtd->oobblock - 1)) != 0)
+#define NOTALIGNED(x)	((x & (mtd->writesize - 1)) != 0)
 
 /**
  * onenand_write_ecc - [MTD Interface] OneNAND write with ECC
@@ -875,14 +875,14 @@ static int onenand_write_ecc(struct mtd_info *mtd, loff_t to, size_t len,
 
 	/* Loop until all data write */
 	while (written < len) {
-		int thislen = min_t(int, mtd->oobblock, len - written);
+		int thislen = min_t(int, mtd->writesize, len - written);
 
-		this->command(mtd, ONENAND_CMD_BUFFERRAM, to, mtd->oobblock);
+		this->command(mtd, ONENAND_CMD_BUFFERRAM, to, mtd->writesize);
 
 		this->write_bufferram(mtd, ONENAND_DATARAM, buf, 0, thislen);
 		this->write_bufferram(mtd, ONENAND_SPARERAM, ffchars, 0, mtd->oobsize);
 
-		this->command(mtd, ONENAND_CMD_PROG, to, mtd->oobblock);
+		this->command(mtd, ONENAND_CMD_PROG, to, mtd->writesize);
 
 		onenand_update_bufferram(mtd, to, 1);
 
@@ -1070,10 +1070,10 @@ static int onenand_writev_ecc(struct mtd_info *mtd, const struct kvec *vecs,
 		 * If the given tuple is >= pagesize then
 		 * write it out from the iov
 		 */
-		if ((vecs->iov_len - len) >= mtd->oobblock) {
+		if ((vecs->iov_len - len) >= mtd->writesize) {
 			pbuf = vecs->iov_base + len;
 
-			len += mtd->oobblock;
+			len += mtd->writesize;
 
 			/* Check, if we have to switch to the next tuple */
 			if (len >= (int) vecs->iov_len) {
@@ -1083,8 +1083,8 @@ static int onenand_writev_ecc(struct mtd_info *mtd, const struct kvec *vecs,
 			}
 		} else {
 			int cnt = 0, thislen;
-			while (cnt < mtd->oobblock) {
-				thislen = min_t(int, mtd->oobblock - cnt, vecs->iov_len - len);
+			while (cnt < mtd->writesize) {
+				thislen = min_t(int, mtd->writesize - cnt, vecs->iov_len - len);
 				memcpy(this->page_buf + cnt, vecs->iov_base + len, thislen);
 				cnt += thislen;
 				len += thislen;
@@ -1098,12 +1098,12 @@ static int onenand_writev_ecc(struct mtd_info *mtd, const struct kvec *vecs,
 			}
 		}
 
-		this->command(mtd, ONENAND_CMD_BUFFERRAM, to, mtd->oobblock);
+		this->command(mtd, ONENAND_CMD_BUFFERRAM, to, mtd->writesize);
 
-		this->write_bufferram(mtd, ONENAND_DATARAM, pbuf, 0, mtd->oobblock);
+		this->write_bufferram(mtd, ONENAND_DATARAM, pbuf, 0, mtd->writesize);
 		this->write_bufferram(mtd, ONENAND_SPARERAM, ffchars, 0, mtd->oobsize);
 
-		this->command(mtd, ONENAND_CMD_PROG, to, mtd->oobblock);
+		this->command(mtd, ONENAND_CMD_PROG, to, mtd->writesize);
 
 		onenand_update_bufferram(mtd, to, 1);
 
@@ -1121,9 +1121,9 @@ static int onenand_writev_ecc(struct mtd_info *mtd, const struct kvec *vecs,
 			goto out;
 		}
 
-		written += mtd->oobblock;
+		written += mtd->writesize;
 
-		to += mtd->oobblock;
+		to += mtd->writesize;
 	}
 
 out:
@@ -1467,11 +1467,11 @@ static int do_otp_write(struct mtd_info *mtd, loff_t from, size_t len,
 	int ret;
 
 	/* Force buffer page aligned */
-	if (len < mtd->oobblock) {
+	if (len < mtd->writesize) {
 		memcpy(this->page_buf, buf, len);
-		memset(this->page_buf + len, 0xff, mtd->oobblock - len);
+		memset(this->page_buf + len, 0xff, mtd->writesize - len);
 		pbuf = this->page_buf;
-		len = mtd->oobblock;
+		len = mtd->writesize;
 	}
 
 	/* Enter OTP access mode */
@@ -1546,12 +1546,12 @@ static int onenand_otp_walk(struct mtd_info *mtd, loff_t from, size_t len,
 		otp_pages = 10;
 
 	if (mode == MTD_OTP_FACTORY) {
-		from += mtd->oobblock * otp_pages;
+		from += mtd->writesize * otp_pages;
 		otp_pages = 64 - otp_pages;
 	}
 
 	/* Check User/Factory boundary */
-	if (((mtd->oobblock * otp_pages) - (from + len)) < 0)
+	if (((mtd->writesize * otp_pages) - (from + len)) < 0)
 		return 0;
 
 	while (len > 0 && otp_pages > 0) {
@@ -1564,10 +1564,10 @@ static int onenand_otp_walk(struct mtd_info *mtd, loff_t from, size_t len,
 
 			otpinfo = (struct otp_info *) buf;
 			otpinfo->start = from;
-			otpinfo->length = mtd->oobblock;
+			otpinfo->length = mtd->writesize;
 			otpinfo->locked = 0;
 
-			from += mtd->oobblock;
+			from += mtd->writesize;
 			buf += sizeof(struct otp_info);
 			*retlen += sizeof(struct otp_info);
 		} else {
@@ -1811,15 +1811,15 @@ static int onenand_probe(struct mtd_info *mtd)
 
 	/* OneNAND page size & block size */
 	/* The data buffer size is equal to page size */
-	mtd->oobblock = this->read_word(this->base + ONENAND_REG_DATA_BUFFER_SIZE);
-	mtd->oobsize = mtd->oobblock >> 5;
+	mtd->writesize = this->read_word(this->base + ONENAND_REG_DATA_BUFFER_SIZE);
+	mtd->oobsize = mtd->writesize >> 5;
 	/* Pagers per block is always 64 in OneNAND */
-	mtd->erasesize = mtd->oobblock << 6;
+	mtd->erasesize = mtd->writesize << 6;
 
 	this->erase_shift = ffs(mtd->erasesize) - 1;
-	this->page_shift = ffs(mtd->oobblock) - 1;
+	this->page_shift = ffs(mtd->writesize) - 1;
 	this->ppb_shift = (this->erase_shift - this->page_shift);
-	this->page_mask = (mtd->erasesize / mtd->oobblock) - 1;
+	this->page_mask = (mtd->erasesize / mtd->writesize) - 1;
 
 	/* REVIST: Multichip handling */
 
@@ -1909,7 +1909,7 @@ int onenand_scan(struct mtd_info *mtd, int maxchips)
 	/* Allocate buffers, if necessary */
 	if (!this->page_buf) {
 		size_t len;
-		len = mtd->oobblock + mtd->oobsize;
+		len = mtd->writesize + mtd->oobsize;
 		this->page_buf = kmalloc(len, GFP_KERNEL);
 		if (!this->page_buf) {
 			printk(KERN_ERR "onenand_scan(): Can't allocate page_buf\n");
diff --git a/drivers/mtd/onenand/onenand_bbt.c b/drivers/mtd/onenand/onenand_bbt.c
index 4510d3361eaa..aafd7c2f7802 100644
--- a/drivers/mtd/onenand/onenand_bbt.c
+++ b/drivers/mtd/onenand/onenand_bbt.c
@@ -87,13 +87,13 @@ static int create_bbt(struct mtd_info *mtd, uint8_t *buf, struct nand_bbt_descr
 
 			/* No need to read pages fully,
 			 * just read required OOB bytes */
-			ret = mtd->read_oob(mtd, from + j * mtd->oobblock + bd->offs,
+			ret = mtd->read_oob(mtd, from + j * mtd->writesize + bd->offs,
 						readlen, &retlen, &buf[0]);
 
 			if (ret)
 				return ret;
 
-			if (check_short_pattern(&buf[j * scanlen], scanlen, mtd->oobblock, bd)) {
+			if (check_short_pattern(&buf[j * scanlen], scanlen, mtd->writesize, bd)) {
 				bbm->bbt[i >> 3] |= 0x03 << (i & 0x6);
 				printk(KERN_WARNING "Bad eraseblock %d at 0x%08x\n",
 					i >> 1, (unsigned int) from);
diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c
index 62f685faeba8..355226d8ce29 100644
--- a/fs/jffs2/wbuf.c
+++ b/fs/jffs2/wbuf.c
@@ -1173,7 +1173,7 @@ int jffs2_nand_flash_setup(struct jffs2_sb_info *c)
 
 	/* Initialise write buffer */
 	init_rwsem(&c->wbuf_sem);
-	c->wbuf_pagesize = c->mtd->oobblock;
+	c->wbuf_pagesize = c->mtd->writesize;
 	c->wbuf_ofs = 0xFFFFFFFF;
 
 	c->wbuf = kmalloc(c->wbuf_pagesize, GFP_KERNEL);
@@ -1266,11 +1266,11 @@ void jffs2_nor_ecc_flash_cleanup(struct jffs2_sb_info *c) {
 
 int jffs2_nor_wbuf_flash_setup(struct jffs2_sb_info *c) {
 	/* Cleanmarker currently occupies a whole programming region */
-	c->cleanmarker_size = MTD_PROGREGION_SIZE(c->mtd);
+	c->cleanmarker_size = c->mtd->writesize;
 
 	/* Initialize write buffer */
 	init_rwsem(&c->wbuf_sem);
-	c->wbuf_pagesize = MTD_PROGREGION_SIZE(c->mtd);
+	c->wbuf_pagesize = c->mtd->writesize;
 	c->wbuf_ofs = 0xFFFFFFFF;
 
 	c->wbuf = kmalloc(c->wbuf_pagesize, GFP_KERNEL);
diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h
index 73620ef83364..d48c7492392b 100644
--- a/include/linux/mtd/mtd.h
+++ b/include/linux/mtd/mtd.h
@@ -66,8 +66,12 @@ struct mtd_info {
 	 * information below if they desire
 	 */
 	u_int32_t erasesize;
+	/* Smallest availlable size for writing to the device.  For NAND,
+	 * this is the page size, for some NOR chips, the size of ECC
+	 * covered blocks.
+	 */
+	u_int32_t writesize;
 
-	u_int32_t oobblock;  // Size of OOB blocks (e.g. 512)
 	u_int32_t oobsize;   // Amount of OOB data per block (e.g. 16)
 	u_int32_t ecctype;
 	u_int32_t eccsize;
@@ -79,7 +83,6 @@ struct mtd_info {
 	 * MTD_PROGRAM_REGIONS flag is set.
 	 * (Maybe we should have an union for those?)
 	 */
-#define MTD_PROGREGION_SIZE(mtd)  (mtd)->oobblock
 #define MTD_PROGREGION_CTRLMODE_VALID(mtd)  (mtd)->oobsize
 #define MTD_PROGREGION_CTRLMODE_INVALID(mtd)  (mtd)->ecctype
 
diff --git a/include/mtd/mtd-abi.h b/include/mtd/mtd-abi.h
index e4d61f33d5bb..520a3b483100 100644
--- a/include/mtd/mtd-abi.h
+++ b/include/mtd/mtd-abi.h
@@ -63,7 +63,7 @@ struct mtd_info_user {
 	uint32_t flags;
 	uint32_t size;	 // Total size of the MTD
 	uint32_t erasesize;
-	uint32_t oobblock;  // Size of OOB blocks (e.g. 512)
+	uint32_t writesize;
 	uint32_t oobsize;   // Amount of OOB data per block (e.g. 16)
 	uint32_t ecctype;
 	uint32_t eccsize;
-- 
cgit v1.2.3


From a36ed2995c56d4f858ecb524a78837473e7115ae Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@cruncher.tec.linutronix.de>
Date: Tue, 23 May 2006 11:37:03 +0200
Subject: [MTD] Simplify NAND locking

Replace the chip lock by a the controller lock. For simple drivers a
dummy controller structure is created by the scan code.
This simplifies the locking algorithm in nand_get/release_chip().

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 drivers/mtd/nand/nand_base.c | 81 +++++++++++++++++++++++---------------------
 include/linux/mtd/nand.h     |  7 ++--
 2 files changed, 47 insertions(+), 41 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c
index 08dffb7a9389..7933ca273c95 100644
--- a/drivers/mtd/nand/nand_base.c
+++ b/drivers/mtd/nand/nand_base.c
@@ -172,20 +172,12 @@ static void nand_release_device(struct mtd_info *mtd)
 	/* De-select the NAND device */
 	this->select_chip(mtd, -1);
 
-	if (this->controller) {
-		/* Release the controller and the chip */
-		spin_lock(&this->controller->lock);
-		this->controller->active = NULL;
-		this->state = FL_READY;
-		wake_up(&this->controller->wq);
-		spin_unlock(&this->controller->lock);
-	} else {
-		/* Release the chip */
-		spin_lock(&this->chip_lock);
-		this->state = FL_READY;
-		wake_up(&this->wq);
-		spin_unlock(&this->chip_lock);
-	}
+	/* Release the controller and the chip */
+	spin_lock(&this->controller->lock);
+	this->controller->active = NULL;
+	this->state = FL_READY;
+	wake_up(&this->controller->wq);
+	spin_unlock(&this->controller->lock);
 }
 
 /**
@@ -765,25 +757,18 @@ static void nand_command_lp(struct mtd_info *mtd, unsigned command, int column,
  */
 static int nand_get_device(struct nand_chip *this, struct mtd_info *mtd, int new_state)
 {
-	struct nand_chip *active;
-	spinlock_t *lock;
-	wait_queue_head_t *wq;
+	spinlock_t *lock = &this->controller->lock;
+	wait_queue_head_t *wq = &this->controller->wq;
 	DECLARE_WAITQUEUE(wait, current);
-
-	lock = (this->controller) ? &this->controller->lock : &this->chip_lock;
-	wq = (this->controller) ? &this->controller->wq : &this->wq;
  retry:
-	active = this;
 	spin_lock(lock);
 
 	/* Hardware controller shared among independend devices */
-	if (this->controller) {
-		if (this->controller->active)
-			active = this->controller->active;
-		else
-			this->controller->active = this;
-	}
-	if (active == this && this->state == FL_READY) {
+	/* Hardware controller shared among independend devices */
+	if (!this->controller->active)
+		this->controller->active = this;
+
+	if (this->controller->active == this && this->state == FL_READY) {
 		this->state = new_state;
 		spin_unlock(lock);
 		return 0;
@@ -2312,6 +2297,22 @@ static void nand_resume(struct mtd_info *mtd)
 
 }
 
+/*
+ * Free allocated data structures
+ */
+static void nand_free_kmem(struct nand_chip *this)
+{
+	/* Buffer allocated by nand_scan ? */
+	if (this->options & NAND_OOBBUF_ALLOC)
+		kfree(this->oob_buf);
+	/* Buffer allocated by nand_scan ? */
+	if (this->options & NAND_DATABUF_ALLOC)
+		kfree(this->data_buf);
+	/* Controller allocated by nand_scan ? */
+	if (this->options & NAND_CONTROLLER_ALLOC)
+		kfree(this->controller);
+}
+
 /* module_text_address() isn't exported, and it's mostly a pointless
    test if this is a module _anyway_ -- they'd have to try _really_ hard
    to call us from in-kernel code if the core NAND support is modular. */
@@ -2522,9 +2523,8 @@ int nand_scan(struct mtd_info *mtd, int maxchips)
 		len = mtd->oobblock + mtd->oobsize;
 		this->data_buf = kmalloc(len, GFP_KERNEL);
 		if (!this->data_buf) {
-			if (this->options & NAND_OOBBUF_ALLOC)
-				kfree(this->oob_buf);
 			printk(KERN_ERR "nand_scan(): Cannot allocate data_buf\n");
+			nand_free_kmem(this);
 			return -ENOMEM;
 		}
 		this->options |= NAND_DATABUF_ALLOC;
@@ -2657,8 +2657,17 @@ int nand_scan(struct mtd_info *mtd, int maxchips)
 
 	/* Initialize state, waitqueue and spinlock */
 	this->state = FL_READY;
-	init_waitqueue_head(&this->wq);
-	spin_lock_init(&this->chip_lock);
+	if (!this->controller) {
+		this->controller = kzalloc(sizeof(struct nand_hw_control),
+					   GFP_KERNEL);
+		if (!this->controller) {
+			nand_free_kmem(this);
+			return -ENOMEM;
+		}
+		this->options |= NAND_CONTROLLER_ALLOC;
+	}
+	init_waitqueue_head(&this->controller->wq);
+	spin_lock_init(&this->controller->lock);
 
 	/* De-select the device */
 	this->select_chip(mtd, -1);
@@ -2718,12 +2727,8 @@ void nand_release(struct mtd_info *mtd)
 
 	/* Free bad block table memory */
 	kfree(this->bbt);
-	/* Buffer allocated by nand_scan ? */
-	if (this->options & NAND_OOBBUF_ALLOC)
-		kfree(this->oob_buf);
-	/* Buffer allocated by nand_scan ? */
-	if (this->options & NAND_DATABUF_ALLOC)
-		kfree(this->data_buf);
+	/* Free buffers */
+	nand_free_kmem(this);
 }
 
 EXPORT_SYMBOL_GPL(nand_scan);
diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h
index da5e67b3fc70..b8792be3c4e0 100644
--- a/include/linux/mtd/nand.h
+++ b/include/linux/mtd/nand.h
@@ -227,6 +227,8 @@ extern int nand_read_raw (struct mtd_info *mtd, uint8_t *buf, loff_t from, size_
 #define NAND_SKIP_BBTSCAN	0x00040000
 
 /* Options set by nand scan */
+/* Nand scan has allocated controller struct */
+#define NAND_CONTROLLER_ALLOC	0x20000000
 /* Nand scan has allocated oob_buf */
 #define NAND_OOBBUF_ALLOC	0x40000000
 /* Nand scan has allocated data_buf */
@@ -294,7 +296,6 @@ struct nand_hw_control {
  * @eccbytes: 		[INTERN] number of ecc bytes per ecc-calculation step
  * @eccsteps:		[INTERN] number of ecc calculation steps per page
  * @chip_delay:		[BOARDSPECIFIC] chip dependent delay for transfering data from array to read regs (tR)
- * @chip_lock:		[INTERN] spinlock used to protect access to this structure and the chip
  * @wq:			[INTERN] wait queue to sleep on if a NAND operation is in progress
  * @state: 		[INTERN] the current state of the NAND device
  * @page_shift:		[INTERN] number of address bits in a page (column address bits)
@@ -317,7 +318,8 @@ struct nand_hw_control {
  * @bbt_td:		[REPLACEABLE] bad block table descriptor for flash lookup
  * @bbt_md:		[REPLACEABLE] bad block table mirror descriptor
  * @badblock_pattern:	[REPLACEABLE] bad block scan pattern used for initial bad block scan
- * @controller:		[OPTIONAL] a pointer to a hardware controller structure which is shared among multiple independend devices
+ * @controller:		[REPLACEABLE] a pointer to a hardware controller structure
+ *			which is shared among multiple independend devices
  * @priv:		[OPTIONAL] pointer to private chip date
  * @errstat:		[OPTIONAL] hardware specific function to perform additional error status checks
  *			(determine if errors are correctable)
@@ -352,7 +354,6 @@ struct nand_chip {
 	int		eccbytes;
 	int		eccsteps;
 	int 		chip_delay;
-	spinlock_t	chip_lock;
 	wait_queue_head_t wq;
 	nand_state_t 	state;
 	int 		page_shift;
-- 
cgit v1.2.3


From 41796c2ea9b74cdf3bc2c368193d15b8ae8950ca Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@cruncher.tec.linutronix.de>
Date: Tue, 23 May 2006 11:38:59 +0200
Subject: [MTD] Add platform support for NAND

Add the data structures necessary to provide platform device support
for NAND

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/mtd/nand.h | 47 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 47 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h
index b8792be3c4e0..05c6ecc07036 100644
--- a/include/linux/mtd/nand.h
+++ b/include/linux/mtd/nand.h
@@ -510,4 +510,51 @@ extern int nand_do_read_ecc (struct mtd_info *mtd, loff_t from, size_t len,
 #define NAND_SMALL_BADBLOCK_POS		5
 #define NAND_LARGE_BADBLOCK_POS		0
 
+/**
+ * struct platform_nand_chip - chip level device structure
+ *
+ * @nr_chips:		max. number of chips to scan for
+ * @chip_offs:		chip number offset
+ * @nr_partitions:	number of partitions pointed to be partitoons (or zero)
+ * @partitions:		mtd partition list
+ * @chip_delay:		R/B delay value in us
+ * @options:		Option flags, e.g. 16bit buswidth
+ * @priv:		hardware controller specific settings
+ */
+struct platform_nand_chip {
+	int			nr_chips;
+	int			chip_offset;
+	int			nr_partitions;
+	struct mtd_partition	*partitions;
+	int 			chip_delay;
+	unsigned int		options;
+	void			*priv;
+};
+
+/**
+ * struct platform_nand_ctrl - controller level device structure
+ *
+ * @hwcontrol:		platform specific hardware control structure
+ * @dev_ready:		platform specific function to read ready/busy pin
+ * @select_chip:	platform specific chip select function
+ * @priv_data:		private data to transport driver specific settings
+ *
+ * All fields are optional and depend on the hardware driver requirements
+ */
+struct platform_nand_ctrl {
+	void 		(*hwcontrol)(struct mtd_info *mtd, int cmd);
+	int  		(*dev_ready)(struct mtd_info *mtd);
+	void		(*select_chip)(struct mtd_info *mtd, int chip);
+	void		*priv;
+};
+
+/* Some helpers to access the data structures */
+static inline
+struct platform_nand_chip *get_platform_nandchip(struct mtd_info *mtd)
+{
+	struct nand_chip *chip = mtd->priv;
+
+	return chip->priv;
+}
+
 #endif /* __LINUX_MTD_NAND_H */
-- 
cgit v1.2.3


From ce4c61f184864991881ec789f7524f4b332eaafc Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@cruncher.tec.linutronix.de>
Date: Tue, 23 May 2006 11:43:28 +0200
Subject: [MTD] Add support for NDFC NAND controller

NDFC NAND Flash controller is embedded in PPC EP44x SoCs.
Add platform driver based support.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 drivers/mtd/nand/Kconfig  |   6 +
 drivers/mtd/nand/Makefile |   1 +
 drivers/mtd/nand/ndfc.c   | 317 ++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/mtd/ndfc.h  |  66 ++++++++++
 4 files changed, 390 insertions(+)
 create mode 100644 drivers/mtd/nand/ndfc.c
 create mode 100644 include/linux/mtd/ndfc.h

(limited to 'include/linux')

diff --git a/drivers/mtd/nand/Kconfig b/drivers/mtd/nand/Kconfig
index 4d235b91267d..c2cb87fc4cb8 100644
--- a/drivers/mtd/nand/Kconfig
+++ b/drivers/mtd/nand/Kconfig
@@ -129,6 +129,12 @@ config MTD_NAND_S3C2410_HWECC
 	  currently not be able to switch to software, as there is no
 	  implementation for ECC method used by the S3C2410
 
+config MTD_NAND_NDFC
+	tristate "NDFC NanD Flash Controller"
+	depends on MTD_NAND && 44x
+	help
+	 NDFC Nand Flash Controllers are integrated in EP44x SoCs
+
 config MTD_NAND_DISKONCHIP
 	tristate "DiskOnChip 2000, Millennium and Millennium Plus (NAND reimplementation) (EXPERIMENTAL)"
 	depends on MTD_NAND && EXPERIMENTAL
diff --git a/drivers/mtd/nand/Makefile b/drivers/mtd/nand/Makefile
index 33475087dbff..f74759351c91 100644
--- a/drivers/mtd/nand/Makefile
+++ b/drivers/mtd/nand/Makefile
@@ -21,5 +21,6 @@ obj-$(CONFIG_MTD_NAND_SHARPSL)		+= sharpsl.o
 obj-$(CONFIG_MTD_NAND_TS7250)		+= ts7250.o
 obj-$(CONFIG_MTD_NAND_NANDSIM)		+= nandsim.o
 obj-$(CONFIG_MTD_NAND_CS553X)		+= cs553x_nand.o
+obj-$(CONFIG_MTD_NAND_NDFC)		+= ndfc.o
 
 nand-objs = nand_base.o nand_bbt.o
diff --git a/drivers/mtd/nand/ndfc.c b/drivers/mtd/nand/ndfc.c
new file mode 100644
index 000000000000..22fd682b70ca
--- /dev/null
+++ b/drivers/mtd/nand/ndfc.c
@@ -0,0 +1,317 @@
+/*
+ *  drivers/mtd/ndfc.c
+ *
+ *  Overview:
+ *   Platform independend driver for NDFC (NanD Flash Controller)
+ *   integrated into EP440 cores
+ *
+ *  Author: Thomas Gleixner
+ *
+ *  Copyright 2006 IBM
+ *
+ *  This program is free software; you can redistribute	 it and/or modify it
+ *  under  the terms of	 the GNU General  Public License as published by the
+ *  Free Software Foundation;  either version 2 of the	License, or (at your
+ *  option) any later version.
+ *
+ */
+#include <linux/module.h>
+#include <linux/mtd/nand.h>
+#include <linux/mtd/nand_ecc.h>
+#include <linux/mtd/partitions.h>
+#include <linux/mtd/ndfc.h>
+#include <linux/mtd/ubi.h>
+#include <linux/mtd/mtd.h>
+#include <linux/platform_device.h>
+
+#include <asm/io.h>
+#include <asm/ibm44x.h>
+
+struct ndfc_nand_mtd {
+	struct mtd_info			mtd;
+	struct nand_chip		chip;
+	struct platform_nand_chip	*pl_chip;
+};
+
+static struct ndfc_nand_mtd ndfc_mtd[NDFC_MAX_BANKS];
+
+struct ndfc_controller {
+	void __iomem		*ndfcbase;
+	struct nand_hw_control	ndfc_control;
+	atomic_t		childs_active;
+};
+
+static struct ndfc_controller ndfc_ctrl;
+
+static void ndfc_select_chip(struct mtd_info *mtd, int chip)
+{
+	uint32_t ccr;
+	struct ndfc_controller *ndfc = &ndfc_ctrl;
+	struct nand_chip *nandchip = mtd->priv;
+	struct ndfc_nand_mtd *nandmtd = nandchip->priv;
+	struct platform_nand_chip *pchip = nandmtd->pl_chip;
+
+	ccr = __raw_readl(ndfc->ndfcbase + NDFC_CCR);
+	if (chip >= 0) {
+		ccr &= ~NDFC_CCR_BS_MASK;
+		ccr |= NDFC_CCR_BS(chip + pchip->chip_offset);
+	} else
+		ccr |= NDFC_CCR_RESET_CE;
+	writel(ccr, ndfc->ndfcbase + NDFC_CCR);
+}
+
+static void ndfc_hwcontrol(struct mtd_info *mtd, int cmd)
+{
+	struct ndfc_controller *ndfc = &ndfc_ctrl;
+	struct nand_chip *chip = mtd->priv;
+
+	switch (cmd) {
+	case NAND_CTL_SETCLE:
+		chip->IO_ADDR_W = ndfc->ndfcbase + NDFC_CMD;
+		break;
+	case NAND_CTL_SETALE:
+		chip->IO_ADDR_W = ndfc->ndfcbase + NDFC_ALE;
+		break;
+	default:
+		chip->IO_ADDR_W = ndfc->ndfcbase + NDFC_DATA;
+		break;
+	}
+}
+
+static int ndfc_ready(struct mtd_info *mtd)
+{
+	struct ndfc_controller *ndfc = &ndfc_ctrl;
+
+	return __raw_readl(ndfc->ndfcbase + NDFC_STAT) & NDFC_STAT_IS_READY;
+}
+
+static void ndfc_enable_hwecc(struct mtd_info *mtd, int mode)
+{
+	uint32_t ccr;
+	struct ndfc_controller *ndfc = &ndfc_ctrl;
+
+	ccr = __raw_readl(ndfc->ndfcbase + NDFC_CCR);
+	ccr |= NDFC_CCR_RESET_ECC;
+	__raw_writel(ccr, ndfc->ndfcbase + NDFC_CCR);
+	wmb();
+}
+
+static int ndfc_calculate_ecc(struct mtd_info *mtd,
+			      const u_char *dat, u_char *ecc_code)
+{
+	struct ndfc_controller *ndfc = &ndfc_ctrl;
+	uint32_t ecc;
+	uint8_t *p = (uint8_t *)&ecc;
+
+	wmb();
+	ecc = __raw_readl(ndfc->ndfcbase + NDFC_ECC);
+	ecc_code[0] = p[1];
+	ecc_code[1] = p[2];
+	ecc_code[2] = p[3];
+
+	return 0;
+}
+
+/*
+ * Speedups for buffer read/write/verify
+ *
+ * NDFC allows 32bit read/write of data. So we can speed up the buffer
+ * functions. No further checking, as nand_base will always read/write
+ * page aligned.
+ */
+static void ndfc_read_buf(struct mtd_info *mtd, uint8_t *buf, int len)
+{
+	struct ndfc_controller *ndfc = &ndfc_ctrl;
+	uint32_t *p = (uint32_t *) buf;
+
+	for(;len > 0; len -= 4)
+		*p++ = __raw_readl(ndfc->ndfcbase + NDFC_DATA);
+}
+
+static void ndfc_write_buf(struct mtd_info *mtd, const uint8_t *buf, int len)
+{
+	struct ndfc_controller *ndfc = &ndfc_ctrl;
+	uint32_t *p = (uint32_t *) buf;
+
+	for(;len > 0; len -= 4)
+		__raw_writel(*p++, ndfc->ndfcbase + NDFC_DATA);
+}
+
+static int ndfc_verify_buf(struct mtd_info *mtd, const uint8_t *buf, int len)
+{
+	struct ndfc_controller *ndfc = &ndfc_ctrl;
+	uint32_t *p = (uint32_t *) buf;
+
+	for(;len > 0; len -= 4)
+		if (*p++ != __raw_readl(ndfc->ndfcbase + NDFC_DATA))
+			return -EFAULT;
+	return 0;
+}
+
+/*
+ * Initialize chip structure
+ */
+static void ndfc_chip_init(struct ndfc_nand_mtd *mtd)
+{
+	struct ndfc_controller *ndfc = &ndfc_ctrl;
+	struct nand_chip *chip = &mtd->chip;
+
+	chip->IO_ADDR_R = ndfc->ndfcbase + NDFC_DATA;
+	chip->IO_ADDR_W = ndfc->ndfcbase + NDFC_DATA;
+	chip->hwcontrol = ndfc_hwcontrol;
+	chip->dev_ready = ndfc_ready;
+	chip->select_chip = ndfc_select_chip;
+	chip->chip_delay = 50;
+	chip->priv = mtd;
+	chip->options = mtd->pl_chip->options;
+	chip->controller = &ndfc->ndfc_control;
+	chip->read_buf = ndfc_read_buf;
+	chip->write_buf = ndfc_write_buf;
+	chip->verify_buf = ndfc_verify_buf;
+	chip->correct_data  = nand_correct_data;
+	chip->enable_hwecc  = ndfc_enable_hwecc;
+	chip->calculate_ecc = ndfc_calculate_ecc;
+	chip->eccmode = NAND_ECC_HW3_256;
+	chip->autooob = mtd->pl_chip->autooob;
+	mtd->mtd.priv = chip;
+	mtd->mtd.owner = THIS_MODULE;
+}
+
+static int ndfc_chip_probe(struct platform_device *pdev)
+{
+	int rc;
+	struct platform_nand_chip *nc = pdev->dev.platform_data;
+	struct ndfc_chip_settings *settings = nc->priv;
+	struct ndfc_controller *ndfc = &ndfc_ctrl;
+	struct ndfc_nand_mtd *nandmtd;
+
+	if (nc->chip_offset >= NDFC_MAX_BANKS || nc->nr_chips > NDFC_MAX_BANKS)
+		return -EINVAL;
+
+	/* Set the bank settings */
+	__raw_writel(settings->bank_settings,
+		     ndfc->ndfcbase + NDFC_BCFG0 + (nc->chip_offset << 2));
+
+	nandmtd = &ndfc_mtd[pdev->id];
+	if (nandmtd->pl_chip)
+		return -EBUSY;
+
+	nandmtd->pl_chip = nc;
+	ndfc_chip_init(nandmtd);
+
+	/* Scan for chips */
+	if (nand_scan(&nandmtd->mtd, nc->nr_chips)) {
+		nandmtd->pl_chip = NULL;
+		return -ENODEV;
+	}
+
+#ifdef CONFIG_MTD_PARTITIONS
+	printk("Number of partitions %d\n", nc->nr_partitions);
+	if (nc->nr_partitions) {
+		struct mtd_info *mtd_ubi;
+		nc->partitions[NAND_PARTS_CONTENT_IDX].mtdp = &mtd_ubi;
+
+		add_mtd_device(&nandmtd->mtd); /* for testing */
+		add_mtd_partitions(&nandmtd->mtd,
+				   nc->partitions,
+				   nc->nr_partitions);
+
+		add_mtd_device(mtd_ubi);
+
+	} else
+#else
+		add_mtd_device(&nandmtd->mtd);
+#endif
+
+	atomic_inc(&ndfc->childs_active);
+	return 0;
+}
+
+static int ndfc_chip_remove(struct platform_device *pdev)
+{
+	return 0;
+}
+
+static int ndfc_nand_probe(struct platform_device *pdev)
+{
+	struct platform_nand_ctrl *nc = pdev->dev.platform_data;
+	struct ndfc_controller_settings *settings = nc->priv;
+	struct resource *res = pdev->resource;
+	struct ndfc_controller *ndfc = &ndfc_ctrl;
+	unsigned long long phys = NDFC_PHYSADDR_OFFS | res->start;
+
+	ndfc->ndfcbase = ioremap64(phys, res->end - res->start + 1);
+	if (!ndfc->ndfcbase) {
+		printk(KERN_ERR "NDFC: ioremap failed\n");
+		return -EIO;
+	}
+
+	__raw_writel(settings->ccr_settings, ndfc->ndfcbase + NDFC_CCR);
+
+	spin_lock_init(&ndfc->ndfc_control.lock);
+	init_waitqueue_head(&ndfc->ndfc_control.wq);
+
+	platform_set_drvdata(pdev, ndfc);
+
+	printk("NDFC NAND Driver initialized. Chip-Rev: 0x%08x\n",
+	       __raw_readl(ndfc->ndfcbase + NDFC_REVID));
+
+	return 0;
+}
+
+static int ndfc_nand_remove(struct platform_device *pdev)
+{
+	struct ndfc_controller *ndfc = platform_get_drvdata(pdev);
+
+	if (atomic_read(&ndfc->childs_active))
+		return -EBUSY;
+
+	if (ndfc) {
+		platform_set_drvdata(pdev, NULL);
+		iounmap(ndfc_ctrl.ndfcbase);
+		ndfc_ctrl.ndfcbase = NULL;
+	}
+	return 0;
+}
+
+/* driver device registration */
+
+static struct platform_driver ndfc_chip_driver = {
+	.probe		= ndfc_chip_probe,
+	.remove		= ndfc_chip_remove,
+	.driver		= {
+		.name	= "ndfc-chip",
+		.owner	= THIS_MODULE,
+	},
+};
+
+static struct platform_driver ndfc_nand_driver = {
+	.probe		= ndfc_nand_probe,
+	.remove		= ndfc_nand_remove,
+	.driver		= {
+		.name	= "ndfc-nand",
+		.owner	= THIS_MODULE,
+	},
+};
+
+static int __init ndfc_nand_init(void)
+{
+	int ret = platform_driver_register(&ndfc_nand_driver);
+
+	if (!ret)
+		ret = platform_driver_register(&ndfc_chip_driver);
+	return ret;
+}
+
+static void __exit ndfc_nand_exit(void)
+{
+	platform_driver_unregister(&ndfc_chip_driver);
+	platform_driver_unregister(&ndfc_nand_driver);
+}
+
+module_init(ndfc_nand_init);
+module_exit(ndfc_nand_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Thomas Gleixner <tglx@linutronix.de>");
+MODULE_DESCRIPTION("Platform driver for NDFC");
diff --git a/include/linux/mtd/ndfc.h b/include/linux/mtd/ndfc.h
new file mode 100644
index 000000000000..31d61f07d768
--- /dev/null
+++ b/include/linux/mtd/ndfc.h
@@ -0,0 +1,66 @@
+/*
+ *  linux/include/linux/mtd/ndfc.h
+ *
+ *  Copyright (c) 2006 Thomas Gleixner <tglx@linutronix.de>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ *  Info:
+ *   Contains defines, datastructures for ndfc nand controller
+ *
+ */
+#ifndef __LINUX_MTD_NDFC_H
+#define __LINUX_MTD_NDFC_H
+
+/* NDFC Register definitions */
+#define NDFC_CMD		0x00
+#define NDFC_ALE		0x04
+#define NDFC_DATA		0x08
+#define NDFC_ECC		0x10
+#define NDFC_BCFG0		0x30
+#define NDFC_BCFG1		0x34
+#define NDFC_BCFG2		0x38
+#define NDFC_BCFG3		0x3c
+#define NDFC_CCR		0x40
+#define NDFC_STAT		0x44
+#define NDFC_HWCTL		0x48
+#define NDFC_REVID		0x50
+
+#define NDFC_STAT_IS_READY	0x01000000
+
+#define NDFC_CCR_RESET_CE	0x80000000 /* CE Reset */
+#define NDFC_CCR_RESET_ECC	0x40000000 /* ECC Reset */
+#define NDFC_CCR_RIE		0x20000000 /* Interrupt Enable on Device Rdy */
+#define NDFC_CCR_REN		0x10000000 /* Enable wait for Rdy in LinearR */
+#define NDFC_CCR_ROMEN		0x08000000 /* Enable ROM In LinearR */
+#define NDFC_CCR_ARE		0x04000000 /* Auto-Read Enable */
+#define NDFC_CCR_BS(x)		(((x) & 0x3) << 24) /* Select Bank on CE[x] */
+#define NDFC_CCR_BS_MASK	0x03000000 /* Select Bank */
+#define NDFC_CCR_ARAC0		0x00000000 /* 3 Addr, 1 Col 2 Row 512b page */
+#define NDFC_CCR_ARAC1		0x00001000 /* 4 Addr, 1 Col 3 Row 512b page */
+#define NDFC_CCR_ARAC2		0x00002000 /* 4 Addr, 2 Col 2 Row 2K page */
+#define NDFC_CCR_ARAC3		0x00003000 /* 5 Addr, 2 Col 3 Row 2K page */
+#define NDFC_CCR_ARAC_MASK	0x00003000 /* Auto-Read mode Addr Cycles */
+#define NDFC_CCR_RPG		0x0000C000 /* Auto-Read Page */
+#define NDFC_CCR_EBCC		0x00000004 /* EBC Configuration Completed */
+#define NDFC_CCR_DHC		0x00000002 /* Direct Hardware Control Enable */
+
+#define NDFC_BxCFG_EN		0x80000000 /* Bank Enable */
+#define NDFC_BxCFG_CED		0x40000000 /* nCE Style */
+#define NDFC_BxCFG_SZ_MASK	0x08000000 /* Bank Size */
+#define NDFC_BxCFG_SZ_8BIT	0x00000000 /* 8bit */
+#define NDFC_BxCFG_SZ_16BIT	0x08000000 /* 16bit */
+
+#define NDFC_MAX_BANKS		4
+
+struct ndfc_controller_settings {
+	uint32_t		ccr_settings;
+};
+
+struct ndfc_chip_settings {
+	uint32_t	bank_settings;
+};
+
+#endif
-- 
cgit v1.2.3


From 2c0a2bed9276ebbec5794edc07f66e21e9a1735c Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@cruncher.tec.linutronix.de>
Date: Tue, 23 May 2006 11:50:56 +0200
Subject: [MTD] NAND whitespace and formatting cleanup

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 drivers/mtd/nand/nand_base.c | 104 ++++++++++++++++++++++++++-----------------
 include/linux/mtd/nand.h     | 103 ++++++++++++++----------------------------
 2 files changed, 96 insertions(+), 111 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c
index 7933ca273c95..6ef1893996ce 100644
--- a/drivers/mtd/nand/nand_base.c
+++ b/drivers/mtd/nand/nand_base.c
@@ -10,7 +10,7 @@
  *	http://www.linux-mtd.infradead.org/tech/nand.html
  *
  *  Copyright (C) 2000 Steven J. Hill (sjhill@realitydiluted.com)
- * 		  2002 Thomas Gleixner (tglx@linutronix.de)
+ *		  2002 Thomas Gleixner (tglx@linutronix.de)
  *
  *  02-08-2004  tglx: support for strange chips, which cannot auto increment
  *		pages on read / read_oob
@@ -25,26 +25,30 @@
  *  05-19-2004  tglx: Basic support for Renesas AG-AND chips
  *
  *  09-24-2004  tglx: add support for hardware controllers (e.g. ECC) shared
- *		among multiple independend devices. Suggestions and initial patch
- *		from Ben Dooks <ben-mtd@fluff.org>
- *
- *  12-05-2004	dmarlin: add workaround for Renesas AG-AND chips "disturb" issue.
- *		Basically, any block not rewritten may lose data when surrounding blocks
- *		are rewritten many times.  JFFS2 ensures this doesn't happen for blocks
- *		it uses, but the Bad Block Table(s) may not be rewritten.  To ensure they
- *		do not lose data, force them to be rewritten when some of the surrounding
- *		blocks are erased.  Rather than tracking a specific nearby block (which
- *		could itself go bad), use a page address 'mask' to select several blocks
- *		in the same area, and rewrite the BBT when any of them are erased.
- *
- *  01-03-2005	dmarlin: added support for the device recovery command sequence for Renesas
- *		AG-AND chips.  If there was a sudden loss of power during an erase operation,
- * 		a "device recovery" operation must be performed when power is restored
- * 		to ensure correct operation.
- *
- *  01-20-2005	dmarlin: added support for optional hardware specific callback routine to
- *		perform extra error status checks on erase and write failures.  This required
- *		adding a wrapper function for nand_read_ecc.
+ *		among multiple independend devices. Suggestions and initial
+ *		patch from Ben Dooks <ben-mtd@fluff.org>
+ *
+ *  12-05-2004	dmarlin: add workaround for Renesas AG-AND chips "disturb"
+ *		issue. Basically, any block not rewritten may lose data when
+ *		surrounding blocks are rewritten many times.  JFFS2 ensures
+ *		this doesn't happen for blocks it uses, but the Bad Block
+ *		Table(s) may not be rewritten.  To ensure they do not lose
+ *		data, force them to be rewritten when some of the surrounding
+ *		blocks are erased.  Rather than tracking a specific nearby
+ *		block (which could itself go bad), use a page address 'mask' to
+ *		select several blocks in the same area, and rewrite the BBT
+ *		when any of them are erased.
+ *
+ *  01-03-2005	dmarlin: added support for the device recovery command sequence
+ *		for Renesas AG-AND chips.  If there was a sudden loss of power
+ *		during an erase operation, a "device recovery" operation must
+ *		be performed when power is restored to ensure correct
+ *		operation.
+ *
+ *  01-20-2005	dmarlin: added support for optional hardware specific callback
+ *		routine to perform extra error status checks on erase and write
+ *		failures.  This required adding a wrapper function for
+ *		nand_read_ecc.
  *
  * 08-20-2005	vwool: suspend/resume added
  *
@@ -132,32 +136,43 @@ static void nand_write_buf(struct mtd_info *mtd, const u_char *buf, int len);
 static void nand_read_buf(struct mtd_info *mtd, u_char *buf, int len);
 static int nand_verify_buf(struct mtd_info *mtd, const u_char *buf, int len);
 
-static int nand_read(struct mtd_info *mtd, loff_t from, size_t len, size_t *retlen, u_char *buf);
+static int nand_read(struct mtd_info *mtd, loff_t from, size_t len,
+		     size_t *retlen, u_char *buf);
 static int nand_read_ecc(struct mtd_info *mtd, loff_t from, size_t len,
-			 size_t *retlen, u_char *buf, u_char *eccbuf, struct nand_oobinfo *oobsel);
-static int nand_read_oob(struct mtd_info *mtd, loff_t from, size_t len, size_t *retlen, u_char *buf);
-static int nand_write(struct mtd_info *mtd, loff_t to, size_t len, size_t *retlen, const u_char *buf);
+			 size_t *retlen, u_char *buf, u_char *eccbuf,
+			 struct nand_oobinfo *oobsel);
+static int nand_read_oob(struct mtd_info *mtd, loff_t from, size_t len,
+			 size_t *retlen, u_char *buf);
+static int nand_write(struct mtd_info *mtd, loff_t to, size_t len,
+		      size_t *retlen, const u_char *buf);
 static int nand_write_ecc(struct mtd_info *mtd, loff_t to, size_t len,
-			  size_t *retlen, const u_char *buf, u_char *eccbuf, struct nand_oobinfo *oobsel);
-static int nand_write_oob(struct mtd_info *mtd, loff_t to, size_t len, size_t *retlen, const u_char *buf);
-static int nand_writev(struct mtd_info *mtd, const struct kvec *vecs, unsigned long count, loff_t to, size_t *retlen);
+			  size_t *retlen, const u_char *buf, u_char *eccbuf,
+			  struct nand_oobinfo *oobsel);
+static int nand_write_oob(struct mtd_info *mtd, loff_t to, size_t len,
+			  size_t *retlen, const u_char *buf);
+static int nand_writev(struct mtd_info *mtd, const struct kvec *vecs,
+		       unsigned long count, loff_t to, size_t *retlen);
 static int nand_writev_ecc(struct mtd_info *mtd, const struct kvec *vecs,
-			   unsigned long count, loff_t to, size_t *retlen, u_char *eccbuf,
-			   struct nand_oobinfo *oobsel);
+			   unsigned long count, loff_t to, size_t *retlen,
+			   u_char *eccbuf, struct nand_oobinfo *oobsel);
 static int nand_erase(struct mtd_info *mtd, struct erase_info *instr);
 static void nand_sync(struct mtd_info *mtd);
 
 /* Some internal functions */
-static int nand_write_page(struct mtd_info *mtd, struct nand_chip *this, int page, u_char * oob_buf,
+static int nand_write_page(struct mtd_info *mtd, struct nand_chip *this,
+			   int page, u_char * oob_buf,
 			   struct nand_oobinfo *oobsel, int mode);
 #ifdef CONFIG_MTD_NAND_VERIFY_WRITE
-static int nand_verify_pages(struct mtd_info *mtd, struct nand_chip *this, int page, int numpages,
-			     u_char *oob_buf, struct nand_oobinfo *oobsel, int chipnr, int oobmode);
+static int nand_verify_pages(struct mtd_info *mtd, struct nand_chip *this,
+			     int page, int numpages, u_char *oob_buf,
+			     struct nand_oobinfo *oobsel, int chipnr,
+			     int oobmode);
 #else
 #define nand_verify_pages(...) (0)
 #endif
 
-static int nand_get_device(struct nand_chip *this, struct mtd_info *mtd, int new_state);
+static int nand_get_device(struct nand_chip *this, struct mtd_info *mtd,
+			   int new_state);
 
 /**
  * nand_release_device - [GENERIC] release chip
@@ -424,14 +439,16 @@ static int nand_block_bad(struct mtd_info *mtd, loff_t ofs, int getchip)
 		page = (int)ofs;
 
 	if (this->options & NAND_BUSWIDTH_16) {
-		this->cmdfunc(mtd, NAND_CMD_READOOB, this->badblockpos & 0xFE, page & this->pagemask);
+		this->cmdfunc(mtd, NAND_CMD_READOOB, this->badblockpos & 0xFE,
+			      page & this->pagemask);
 		bad = cpu_to_le16(this->read_word(mtd));
 		if (this->badblockpos & 0x1)
 			bad >>= 8;
 		if ((bad & 0xFF) != 0xff)
 			res = 1;
 	} else {
-		this->cmdfunc(mtd, NAND_CMD_READOOB, this->badblockpos, page & this->pagemask);
+		this->cmdfunc(mtd, NAND_CMD_READOOB, this->badblockpos,
+			      page & this->pagemask);
 		if (this->read_byte(mtd) != 0xff)
 			res = 1;
 	}
@@ -498,7 +515,8 @@ static int nand_check_wp(struct mtd_info *mtd)
  * Check, if the block is bad. Either by reading the bad block table or
  * calling of the scan function.
  */
-static int nand_block_checkbad(struct mtd_info *mtd, loff_t ofs, int getchip, int allowbbt)
+static int nand_block_checkbad(struct mtd_info *mtd, loff_t ofs, int getchip,
+			       int allowbbt)
 {
 	struct nand_chip *this = mtd->priv;
 
@@ -540,7 +558,8 @@ static void nand_wait_ready(struct mtd_info *mtd)
  * Send command to NAND device. This function is used for small page
  * devices (256/512 Bytes per page)
  */
-static void nand_command(struct mtd_info *mtd, unsigned command, int column, int page_addr)
+static void nand_command(struct mtd_info *mtd, unsigned command, int column,
+			 int page_addr)
 {
 	register struct nand_chip *this = mtd->priv;
 
@@ -755,7 +774,8 @@ static void nand_command_lp(struct mtd_info *mtd, unsigned command, int column,
  *
  * Get the device and lock it for exclusive access
  */
-static int nand_get_device(struct nand_chip *this, struct mtd_info *mtd, int new_state)
+static int
+nand_get_device(struct nand_chip *this, struct mtd_info *mtd, int new_state)
 {
 	spinlock_t *lock = &this->controller->lock;
 	wait_queue_head_t *wq = &this->controller->wq;
@@ -942,7 +962,7 @@ static int nand_write_page(struct mtd_info *mtd, struct nand_chip *this, int pag
  * nand_verify_pages - [GENERIC] verify the chip contents after a write
  * @mtd:	MTD device structure
  * @this:	NAND chip structure
- * @page: 	startpage inside the chip, must be called with (page & this->pagemask)
+ * @page:	startpage inside the chip, must be called with (page & this->pagemask)
  * @numpages:	number of pages to verify
  * @oob_buf:	out of band data buffer
  * @oobsel:	out of band selecttion structre
@@ -2293,8 +2313,8 @@ static void nand_resume(struct mtd_info *mtd)
 	if (this->state == FL_PM_SUSPENDED)
 		nand_release_device(mtd);
 	else
-		printk(KERN_ERR "resume() called for the chip which is not in suspended state\n");
-
+		printk(KERN_ERR "nand_resume() called for a chip which is not "
+		       "in suspended state\n");
 }
 
 /*
diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h
index 05c6ecc07036..014ceefbec0e 100644
--- a/include/linux/mtd/nand.h
+++ b/include/linux/mtd/nand.h
@@ -11,47 +11,11 @@
  * it under the terms of the GNU General Public License version 2 as
  * published by the Free Software Foundation.
  *
- *  Info:
- *   Contains standard defines and IDs for NAND flash devices
+ * Info:
+ *	Contains standard defines and IDs for NAND flash devices
  *
- *  Changelog:
- *   01-31-2000 DMW     Created
- *   09-18-2000 SJH     Moved structure out of the Disk-On-Chip drivers
- *			so it can be used by other NAND flash device
- *			drivers. I also changed the copyright since none
- *			of the original contents of this file are specific
- *			to DoC devices. David can whack me with a baseball
- *			bat later if I did something naughty.
- *   10-11-2000 SJH     Added private NAND flash structure for driver
- *   10-24-2000 SJH     Added prototype for 'nand_scan' function
- *   10-29-2001 TG	changed nand_chip structure to support
- *			hardwarespecific function for accessing control lines
- *   02-21-2002 TG	added support for different read/write adress and
- *			ready/busy line access function
- *   02-26-2002 TG	added chip_delay to nand_chip structure to optimize
- *			command delay times for different chips
- *   04-28-2002 TG	OOB config defines moved from nand.c to avoid duplicate
- *			defines in jffs2/wbuf.c
- *   08-07-2002 TG	forced bad block location to byte 5 of OOB, even if
- *			CONFIG_MTD_NAND_ECC_JFFS2 is not set
- *   08-10-2002 TG	extensions to nand_chip structure to support HW-ECC
- *
- *   08-29-2002 tglx 	nand_chip structure: data_poi for selecting
- *			internal / fs-driver buffer
- *			support for 6byte/512byte hardware ECC
- *			read_ecc, write_ecc extended for different oob-layout
- *			oob layout selections: NAND_NONE_OOB, NAND_JFFS2_OOB,
- *			NAND_YAFFS_OOB
- *  11-25-2002 tglx	Added Manufacturer code FUJITSU, NATIONAL
- *			Split manufacturer and device ID structures
- *
- *  02-08-2004 tglx 	added option field to nand structure for chip anomalities
- *  05-25-2004 tglx 	added bad block table support, ST-MICRO manufacturer id
- *			update of nand_chip structure description
- *  01-17-2005 dmarlin	added extended commands for AG-AND device and added option
- * 			for BBT_AUTO_REFRESH.
- *  01-20-2005 dmarlin	added optional pointer to hardware specific callback for
- *			extra error status checks.
+ * Changelog:
+ *	See git changelog.
  */
 #ifndef __LINUX_MTD_NAND_H
 #define __LINUX_MTD_NAND_H
@@ -68,7 +32,8 @@ extern int nand_scan (struct mtd_info *mtd, int max_chips);
 extern void nand_release (struct mtd_info *mtd);
 
 /* Read raw data from the device without ECC */
-extern int nand_read_raw (struct mtd_info *mtd, uint8_t *buf, loff_t from, size_t len, size_t ooblen);
+extern int nand_read_raw (struct mtd_info *mtd, uint8_t *buf, loff_t from,
+			  size_t len, size_t ooblen);
 
 
 /* The maximum number of NAND chips in an array */
@@ -84,7 +49,7 @@ extern int nand_read_raw (struct mtd_info *mtd, uint8_t *buf, loff_t from, size_
  * Constants for hardware specific CLE/ALE/NCE function
 */
 /* Select the chip by setting nCE to low */
-#define NAND_CTL_SETNCE 	1
+#define NAND_CTL_SETNCE		1
 /* Deselect the chip by setting nCE to high */
 #define NAND_CTL_CLRNCE		2
 /* Select the command latch by setting CLE to high */
@@ -285,19 +250,19 @@ struct nand_hw_control {
  *			is read from the chip status register
  * @cmdfunc:		[REPLACEABLE] hardwarespecific function for writing commands to the chip
  * @waitfunc:		[REPLACEABLE] hardwarespecific function for wait on ready
- * @calculate_ecc: 	[REPLACEABLE] function for ecc calculation or readback from ecc hardware
+ * @calculate_ecc:	[REPLACEABLE] function for ecc calculation or readback from ecc hardware
  * @correct_data:	[REPLACEABLE] function for ecc correction, matching to ecc generator (sw/hw)
  * @enable_hwecc:	[BOARDSPECIFIC] function to enable (reset) hardware ecc generator. Must only
  *			be provided if a hardware ECC is available
  * @erase_cmd:		[INTERN] erase command write function, selectable due to AND support
  * @scan_bbt:		[REPLACEABLE] function to scan bad block table
  * @eccmode:		[BOARDSPECIFIC] mode of ecc, see defines
- * @eccsize: 		[INTERN] databytes used per ecc-calculation
- * @eccbytes: 		[INTERN] number of ecc bytes per ecc-calculation step
+ * @eccsize:		[INTERN] databytes used per ecc-calculation
+ * @eccbytes:		[INTERN] number of ecc bytes per ecc-calculation step
  * @eccsteps:		[INTERN] number of ecc calculation steps per page
  * @chip_delay:		[BOARDSPECIFIC] chip dependent delay for transfering data from array to read regs (tR)
  * @wq:			[INTERN] wait queue to sleep on if a NAND operation is in progress
- * @state: 		[INTERN] the current state of the NAND device
+ * @state:		[INTERN] the current state of the NAND device
  * @page_shift:		[INTERN] number of address bits in a page (column address bits)
  * @phys_erase_shift:	[INTERN] number of address bits in a physical eraseblock
  * @bbt_erase_shift:	[INTERN] number of address bits in a bbt entry
@@ -327,7 +292,7 @@ struct nand_hw_control {
 
 struct nand_chip {
 	void  __iomem	*IO_ADDR_R;
-	void  __iomem 	*IO_ADDR_W;
+	void  __iomem	*IO_ADDR_W;
 
 	u_char		(*read_byte)(struct mtd_info *mtd);
 	void		(*write_byte)(struct mtd_info *mtd, u_char byte);
@@ -340,12 +305,12 @@ struct nand_chip {
 	void		(*select_chip)(struct mtd_info *mtd, int chip);
 	int		(*block_bad)(struct mtd_info *mtd, loff_t ofs, int getchip);
 	int		(*block_markbad)(struct mtd_info *mtd, loff_t ofs);
-	void 		(*hwcontrol)(struct mtd_info *mtd, int cmd);
-	int  		(*dev_ready)(struct mtd_info *mtd);
-	void 		(*cmdfunc)(struct mtd_info *mtd, unsigned command, int column, int page_addr);
-	int 		(*waitfunc)(struct mtd_info *mtd, struct nand_chip *this, int state);
+	void		(*hwcontrol)(struct mtd_info *mtd, int cmd);
+	int		(*dev_ready)(struct mtd_info *mtd);
+	void		(*cmdfunc)(struct mtd_info *mtd, unsigned command, int column, int page_addr);
+	int		(*waitfunc)(struct mtd_info *mtd, struct nand_chip *this, int state);
 	int		(*calculate_ecc)(struct mtd_info *mtd, const u_char *dat, u_char *ecc_code);
-	int 		(*correct_data)(struct mtd_info *mtd, u_char *dat, u_char *read_ecc, u_char *calc_ecc);
+	int		(*correct_data)(struct mtd_info *mtd, u_char *dat, u_char *read_ecc, u_char *calc_ecc);
 	void		(*enable_hwecc)(struct mtd_info *mtd, int mode);
 	void		(*erase_cmd)(struct mtd_info *mtd, int page);
 	int		(*scan_bbt)(struct mtd_info *mtd);
@@ -353,14 +318,14 @@ struct nand_chip {
 	int		eccsize;
 	int		eccbytes;
 	int		eccsteps;
-	int 		chip_delay;
+	int		chip_delay;
 	wait_queue_head_t wq;
-	nand_state_t 	state;
-	int 		page_shift;
+	nand_state_t	state;
+	int		page_shift;
 	int		phys_erase_shift;
 	int		bbt_erase_shift;
 	int		chip_shift;
-	u_char 		*data_buf;
+	u_char		*data_buf;
 	u_char		*oob_buf;
 	int		oobdirty;
 	u_char		*data_poi;
@@ -389,19 +354,19 @@ struct nand_chip {
 #define NAND_MFR_NATIONAL	0x8f
 #define NAND_MFR_RENESAS	0x07
 #define NAND_MFR_STMICRO	0x20
-#define NAND_MFR_HYNIX          0xad
+#define NAND_MFR_HYNIX		0xad
 
 /**
  * struct nand_flash_dev - NAND Flash Device ID Structure
  *
- * @name:  	Identify the device type
- * @id:   	device ID code
- * @pagesize:  	Pagesize in bytes. Either 256 or 512 or 0
+ * @name:	Identify the device type
+ * @id:		device ID code
+ * @pagesize:	Pagesize in bytes. Either 256 or 512 or 0
  *		If the pagesize is 0, then the real pagesize
  *		and the eraseize are determined from the
  *		extended id bytes in the chip
- * @erasesize: 	Size of an erase block in the flash device.
- * @chipsize:  	Total chipsize in Mega Bytes
+ * @erasesize:	Size of an erase block in the flash device.
+ * @chipsize:	Total chipsize in Mega Bytes
  * @options:	Bitfield to store chip relevant options
  */
 struct nand_flash_dev {
@@ -416,7 +381,7 @@ struct nand_flash_dev {
 /**
  * struct nand_manufacturers - NAND Flash Manufacturer ID Structure
  * @name:	Manufacturer name
- * @id: 	manufacturer ID code of device.
+ * @id:		manufacturer ID code of device.
 */
 struct nand_manufacturers {
 	int id;
@@ -456,7 +421,7 @@ struct nand_bbt_descr {
 	int	veroffs;
 	uint8_t	version[NAND_MAX_CHIPS];
 	int	len;
-	int 	maxblocks;
+	int	maxblocks;
 	int	reserved_block_code;
 	uint8_t	*pattern;
 };
@@ -501,8 +466,8 @@ extern int nand_default_bbt (struct mtd_info *mtd);
 extern int nand_isbad_bbt (struct mtd_info *mtd, loff_t offs, int allowbbt);
 extern int nand_erase_nand (struct mtd_info *mtd, struct erase_info *instr, int allowbbt);
 extern int nand_do_read_ecc (struct mtd_info *mtd, loff_t from, size_t len,
-                             size_t * retlen, u_char * buf, u_char * oob_buf,
-                             struct nand_oobinfo *oobsel, int flags);
+			     size_t * retlen, u_char * buf, u_char * oob_buf,
+			     struct nand_oobinfo *oobsel, int flags);
 
 /*
 * Constants for oob configuration
@@ -526,7 +491,7 @@ struct platform_nand_chip {
 	int			chip_offset;
 	int			nr_partitions;
 	struct mtd_partition	*partitions;
-	int 			chip_delay;
+	int			chip_delay;
 	unsigned int		options;
 	void			*priv;
 };
@@ -542,8 +507,8 @@ struct platform_nand_chip {
  * All fields are optional and depend on the hardware driver requirements
  */
 struct platform_nand_ctrl {
-	void 		(*hwcontrol)(struct mtd_info *mtd, int cmd);
-	int  		(*dev_ready)(struct mtd_info *mtd);
+	void		(*hwcontrol)(struct mtd_info *mtd, int cmd);
+	int		(*dev_ready)(struct mtd_info *mtd);
 	void		(*select_chip)(struct mtd_info *mtd, int chip);
 	void		*priv;
 };
-- 
cgit v1.2.3


From 58dd8f2bfdcad1b219a4a92a2aadd8ea8c819f79 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@cruncher.tec.linutronix.de>
Date: Tue, 23 May 2006 11:52:35 +0200
Subject: [MTD] NAND consolidate data types

The NAND driver used a mix of unsigned char, u_char amd uint8_t
data types. Consolidate to uint8_t usage

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 drivers/mtd/nand/nand_base.c | 106 +++++++++++++++++++++----------------------
 include/linux/mtd/nand.h     |  22 ++++-----
 2 files changed, 64 insertions(+), 64 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c
index 6ef1893996ce..afa77d1ed900 100644
--- a/drivers/mtd/nand/nand_base.c
+++ b/drivers/mtd/nand/nand_base.c
@@ -118,7 +118,7 @@ static struct nand_oobinfo nand_oob_64 = {
 };
 
 /* This is used for padding purposes in nand_write_oob */
-static u_char ffchars[] = {
+static uint8_t ffchars[] = {
 	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
@@ -132,39 +132,39 @@ static u_char ffchars[] = {
 /*
  * NAND low-level MTD interface functions
  */
-static void nand_write_buf(struct mtd_info *mtd, const u_char *buf, int len);
-static void nand_read_buf(struct mtd_info *mtd, u_char *buf, int len);
-static int nand_verify_buf(struct mtd_info *mtd, const u_char *buf, int len);
+static void nand_write_buf(struct mtd_info *mtd, const uint8_t *buf, int len);
+static void nand_read_buf(struct mtd_info *mtd, uint8_t *buf, int len);
+static int nand_verify_buf(struct mtd_info *mtd, const uint8_t *buf, int len);
 
 static int nand_read(struct mtd_info *mtd, loff_t from, size_t len,
-		     size_t *retlen, u_char *buf);
+		     size_t *retlen, uint8_t *buf);
 static int nand_read_ecc(struct mtd_info *mtd, loff_t from, size_t len,
-			 size_t *retlen, u_char *buf, u_char *eccbuf,
+			 size_t *retlen, uint8_t *buf, uint8_t *eccbuf,
 			 struct nand_oobinfo *oobsel);
 static int nand_read_oob(struct mtd_info *mtd, loff_t from, size_t len,
-			 size_t *retlen, u_char *buf);
+			 size_t *retlen, uint8_t *buf);
 static int nand_write(struct mtd_info *mtd, loff_t to, size_t len,
-		      size_t *retlen, const u_char *buf);
+		      size_t *retlen, const uint8_t *buf);
 static int nand_write_ecc(struct mtd_info *mtd, loff_t to, size_t len,
-			  size_t *retlen, const u_char *buf, u_char *eccbuf,
+			  size_t *retlen, const uint8_t *buf, uint8_t *eccbuf,
 			  struct nand_oobinfo *oobsel);
 static int nand_write_oob(struct mtd_info *mtd, loff_t to, size_t len,
-			  size_t *retlen, const u_char *buf);
+			  size_t *retlen, const uint8_t *buf);
 static int nand_writev(struct mtd_info *mtd, const struct kvec *vecs,
 		       unsigned long count, loff_t to, size_t *retlen);
 static int nand_writev_ecc(struct mtd_info *mtd, const struct kvec *vecs,
 			   unsigned long count, loff_t to, size_t *retlen,
-			   u_char *eccbuf, struct nand_oobinfo *oobsel);
+			   uint8_t *eccbuf, struct nand_oobinfo *oobsel);
 static int nand_erase(struct mtd_info *mtd, struct erase_info *instr);
 static void nand_sync(struct mtd_info *mtd);
 
 /* Some internal functions */
 static int nand_write_page(struct mtd_info *mtd, struct nand_chip *this,
-			   int page, u_char * oob_buf,
+			   int page, uint8_t * oob_buf,
 			   struct nand_oobinfo *oobsel, int mode);
 #ifdef CONFIG_MTD_NAND_VERIFY_WRITE
 static int nand_verify_pages(struct mtd_info *mtd, struct nand_chip *this,
-			     int page, int numpages, u_char *oob_buf,
+			     int page, int numpages, uint8_t *oob_buf,
 			     struct nand_oobinfo *oobsel, int chipnr,
 			     int oobmode);
 #else
@@ -201,7 +201,7 @@ static void nand_release_device(struct mtd_info *mtd)
  *
  * Default read function for 8bit buswith
  */
-static u_char nand_read_byte(struct mtd_info *mtd)
+static uint8_t nand_read_byte(struct mtd_info *mtd)
 {
 	struct nand_chip *this = mtd->priv;
 	return readb(this->IO_ADDR_R);
@@ -214,7 +214,7 @@ static u_char nand_read_byte(struct mtd_info *mtd)
  *
  * Default write function for 8it buswith
  */
-static void nand_write_byte(struct mtd_info *mtd, u_char byte)
+static void nand_write_byte(struct mtd_info *mtd, uint8_t byte)
 {
 	struct nand_chip *this = mtd->priv;
 	writeb(byte, this->IO_ADDR_W);
@@ -227,10 +227,10 @@ static void nand_write_byte(struct mtd_info *mtd, u_char byte)
  * Default read function for 16bit buswith with
  * endianess conversion
  */
-static u_char nand_read_byte16(struct mtd_info *mtd)
+static uint8_t nand_read_byte16(struct mtd_info *mtd)
 {
 	struct nand_chip *this = mtd->priv;
-	return (u_char) cpu_to_le16(readw(this->IO_ADDR_R));
+	return (uint8_t) cpu_to_le16(readw(this->IO_ADDR_R));
 }
 
 /**
@@ -241,7 +241,7 @@ static u_char nand_read_byte16(struct mtd_info *mtd)
  * Default write function for 16bit buswith with
  * endianess conversion
  */
-static void nand_write_byte16(struct mtd_info *mtd, u_char byte)
+static void nand_write_byte16(struct mtd_info *mtd, uint8_t byte)
 {
 	struct nand_chip *this = mtd->priv;
 	writew(le16_to_cpu((u16) byte), this->IO_ADDR_W);
@@ -305,7 +305,7 @@ static void nand_select_chip(struct mtd_info *mtd, int chip)
  *
  * Default write function for 8bit buswith
  */
-static void nand_write_buf(struct mtd_info *mtd, const u_char *buf, int len)
+static void nand_write_buf(struct mtd_info *mtd, const uint8_t *buf, int len)
 {
 	int i;
 	struct nand_chip *this = mtd->priv;
@@ -322,7 +322,7 @@ static void nand_write_buf(struct mtd_info *mtd, const u_char *buf, int len)
  *
  * Default read function for 8bit buswith
  */
-static void nand_read_buf(struct mtd_info *mtd, u_char *buf, int len)
+static void nand_read_buf(struct mtd_info *mtd, uint8_t *buf, int len)
 {
 	int i;
 	struct nand_chip *this = mtd->priv;
@@ -339,7 +339,7 @@ static void nand_read_buf(struct mtd_info *mtd, u_char *buf, int len)
  *
  * Default verify function for 8bit buswith
  */
-static int nand_verify_buf(struct mtd_info *mtd, const u_char *buf, int len)
+static int nand_verify_buf(struct mtd_info *mtd, const uint8_t *buf, int len)
 {
 	int i;
 	struct nand_chip *this = mtd->priv;
@@ -359,7 +359,7 @@ static int nand_verify_buf(struct mtd_info *mtd, const u_char *buf, int len)
  *
  * Default write function for 16bit buswith
  */
-static void nand_write_buf16(struct mtd_info *mtd, const u_char *buf, int len)
+static void nand_write_buf16(struct mtd_info *mtd, const uint8_t *buf, int len)
 {
 	int i;
 	struct nand_chip *this = mtd->priv;
@@ -379,7 +379,7 @@ static void nand_write_buf16(struct mtd_info *mtd, const u_char *buf, int len)
  *
  * Default read function for 16bit buswith
  */
-static void nand_read_buf16(struct mtd_info *mtd, u_char *buf, int len)
+static void nand_read_buf16(struct mtd_info *mtd, uint8_t *buf, int len)
 {
 	int i;
 	struct nand_chip *this = mtd->priv;
@@ -398,7 +398,7 @@ static void nand_read_buf16(struct mtd_info *mtd, u_char *buf, int len)
  *
  * Default verify function for 16bit buswith
  */
-static int nand_verify_buf16(struct mtd_info *mtd, const u_char *buf, int len)
+static int nand_verify_buf16(struct mtd_info *mtd, const uint8_t *buf, int len)
 {
 	int i;
 	struct nand_chip *this = mtd->priv;
@@ -472,7 +472,7 @@ static int nand_block_bad(struct mtd_info *mtd, loff_t ofs, int getchip)
 static int nand_default_block_markbad(struct mtd_info *mtd, loff_t ofs)
 {
 	struct nand_chip *this = mtd->priv;
-	u_char buf[2] = { 0, 0 };
+	uint8_t buf[2] = { 0, 0 };
 	size_t retlen;
 	int block;
 
@@ -600,11 +600,11 @@ static void nand_command(struct mtd_info *mtd, unsigned command, int column,
 			this->write_byte(mtd, column);
 		}
 		if (page_addr != -1) {
-			this->write_byte(mtd, (unsigned char)(page_addr & 0xff));
-			this->write_byte(mtd, (unsigned char)((page_addr >> 8) & 0xff));
+			this->write_byte(mtd, (uint8_t)(page_addr & 0xff));
+			this->write_byte(mtd, (uint8_t)((page_addr >> 8) & 0xff));
 			/* One more address cycle for devices > 32MiB */
 			if (this->chipsize > (32 << 20))
-				this->write_byte(mtd, (unsigned char)((page_addr >> 16) & 0x0f));
+				this->write_byte(mtd, (uint8_t)((page_addr >> 16) & 0x0f));
 		}
 		/* Latch in address */
 		this->hwcontrol(mtd, NAND_CTL_CLRALE);
@@ -692,11 +692,11 @@ static void nand_command_lp(struct mtd_info *mtd, unsigned command, int column,
 			this->write_byte(mtd, column >> 8);
 		}
 		if (page_addr != -1) {
-			this->write_byte(mtd, (unsigned char)(page_addr & 0xff));
-			this->write_byte(mtd, (unsigned char)((page_addr >> 8) & 0xff));
+			this->write_byte(mtd, (uint8_t)(page_addr & 0xff));
+			this->write_byte(mtd, (uint8_t)((page_addr >> 8) & 0xff));
 			/* One more address cycle for devices > 128MiB */
 			if (this->chipsize > (128 << 20))
-				this->write_byte(mtd, (unsigned char)((page_addr >> 16) & 0xff));
+				this->write_byte(mtd, (uint8_t)((page_addr >> 16) & 0xff));
 		}
 		/* Latch in address */
 		this->hwcontrol(mtd, NAND_CTL_CLRALE);
@@ -874,10 +874,10 @@ static int nand_wait(struct mtd_info *mtd, struct nand_chip *this, int state)
  * Cached programming is not supported yet.
  */
 static int nand_write_page(struct mtd_info *mtd, struct nand_chip *this, int page,
-			   u_char *oob_buf, struct nand_oobinfo *oobsel, int cached)
+			   uint8_t *oob_buf, struct nand_oobinfo *oobsel, int cached)
 {
 	int i, status;
-	u_char ecc_code[32];
+	uint8_t ecc_code[32];
 	int eccmode = oobsel->useecc ? this->eccmode : NAND_ECC_NONE;
 	int *oob_config = oobsel->eccpos;
 	int datidx = 0, eccidx = 0, eccsteps = this->eccsteps;
@@ -978,12 +978,12 @@ static int nand_write_page(struct mtd_info *mtd, struct nand_chip *this, int pag
  * it early in the page write stage. Better to write no data than invalid data.
  */
 static int nand_verify_pages(struct mtd_info *mtd, struct nand_chip *this, int page, int numpages,
-			     u_char *oob_buf, struct nand_oobinfo *oobsel, int chipnr, int oobmode)
+			     uint8_t *oob_buf, struct nand_oobinfo *oobsel, int chipnr, int oobmode)
 {
 	int i, j, datidx = 0, oobofs = 0, res = -EIO;
 	int eccsteps = this->eccsteps;
 	int hweccbytes;
-	u_char oobdata[64];
+	uint8_t oobdata[64];
 
 	hweccbytes = (this->options & NAND_HWECC_SYNDROME) ? (oobsel->eccbytes / eccsteps) : 0;
 
@@ -1078,7 +1078,7 @@ static int nand_verify_pages(struct mtd_info *mtd, struct nand_chip *this, int p
  * This function simply calls nand_do_read_ecc with oob buffer and oobsel = NULL
  * and flags = 0xff
  */
-static int nand_read(struct mtd_info *mtd, loff_t from, size_t len, size_t *retlen, u_char *buf)
+static int nand_read(struct mtd_info *mtd, loff_t from, size_t len, size_t *retlen, uint8_t *buf)
 {
 	return nand_do_read_ecc(mtd, from, len, retlen, buf, NULL, &mtd->oobinfo, 0xff);
 }
@@ -1096,7 +1096,7 @@ static int nand_read(struct mtd_info *mtd, loff_t from, size_t len, size_t *retl
  * This function simply calls nand_do_read_ecc with flags = 0xff
  */
 static int nand_read_ecc(struct mtd_info *mtd, loff_t from, size_t len,
-			 size_t *retlen, u_char *buf, u_char *oob_buf, struct nand_oobinfo *oobsel)
+			 size_t *retlen, uint8_t *buf, uint8_t *oob_buf, struct nand_oobinfo *oobsel)
 {
 	/* use userspace supplied oobinfo, if zero */
 	if (oobsel == NULL)
@@ -1121,15 +1121,15 @@ static int nand_read_ecc(struct mtd_info *mtd, loff_t from, size_t len,
  * NAND read with ECC
  */
 int nand_do_read_ecc(struct mtd_info *mtd, loff_t from, size_t len,
-		     size_t *retlen, u_char *buf, u_char *oob_buf, struct nand_oobinfo *oobsel, int flags)
+		     size_t *retlen, uint8_t *buf, uint8_t *oob_buf, struct nand_oobinfo *oobsel, int flags)
 {
 
 	int i, j, col, realpage, page, end, ecc, chipnr, sndcmd = 1;
 	int read = 0, oob = 0, ecc_status = 0, ecc_failed = 0;
 	struct nand_chip *this = mtd->priv;
-	u_char *data_poi, *oob_data = oob_buf;
-	u_char ecc_calc[32];
-	u_char ecc_code[32];
+	uint8_t *data_poi, *oob_data = oob_buf;
+	uint8_t ecc_calc[32];
+	uint8_t ecc_code[32];
 	int eccmode, eccsteps;
 	int *oob_config, datidx;
 	int blockcheck = (1 << (this->phys_erase_shift - this->page_shift)) - 1;
@@ -1383,7 +1383,7 @@ int nand_do_read_ecc(struct mtd_info *mtd, loff_t from, size_t len,
  *
  * NAND read out-of-band data from the spare area
  */
-static int nand_read_oob(struct mtd_info *mtd, loff_t from, size_t len, size_t *retlen, u_char *buf)
+static int nand_read_oob(struct mtd_info *mtd, loff_t from, size_t len, size_t *retlen, uint8_t *buf)
 {
 	int i, col, page, chipnr;
 	struct nand_chip *this = mtd->priv;
@@ -1550,7 +1550,7 @@ int nand_read_raw(struct mtd_info *mtd, uint8_t *buf, loff_t from, size_t len, s
  * forces the 0xff fill before using the buffer again.
  *
 */
-static u_char *nand_prepare_oobbuf(struct mtd_info *mtd, u_char *fsbuf, struct nand_oobinfo *oobsel,
+static uint8_t *nand_prepare_oobbuf(struct mtd_info *mtd, uint8_t *fsbuf, struct nand_oobinfo *oobsel,
 				   int autoplace, int numpages)
 {
 	struct nand_chip *this = mtd->priv;
@@ -1599,7 +1599,7 @@ static u_char *nand_prepare_oobbuf(struct mtd_info *mtd, u_char *fsbuf, struct n
  * This function simply calls nand_write_ecc with oob buffer and oobsel = NULL
  *
 */
-static int nand_write(struct mtd_info *mtd, loff_t to, size_t len, size_t *retlen, const u_char *buf)
+static int nand_write(struct mtd_info *mtd, loff_t to, size_t len, size_t *retlen, const uint8_t *buf)
 {
 	return (nand_write_ecc(mtd, to, len, retlen, buf, NULL, NULL));
 }
@@ -1617,13 +1617,13 @@ static int nand_write(struct mtd_info *mtd, loff_t to, size_t len, size_t *retle
  * NAND write with ECC
  */
 static int nand_write_ecc(struct mtd_info *mtd, loff_t to, size_t len,
-			  size_t *retlen, const u_char *buf, u_char *eccbuf,
+			  size_t *retlen, const uint8_t *buf, uint8_t *eccbuf,
 			  struct nand_oobinfo *oobsel)
 {
 	int startpage, page, ret = -EIO, oob = 0, written = 0, chipnr;
 	int autoplace = 0, numpages, totalpages;
 	struct nand_chip *this = mtd->priv;
-	u_char *oobbuf, *bufstart;
+	uint8_t *oobbuf, *bufstart;
 	int ppblock = (1 << (this->phys_erase_shift - this->page_shift));
 
 	DEBUG(MTD_DEBUG_LEVEL3, "nand_write_ecc: to = 0x%08x, len = %i\n", (unsigned int)to, (int)len);
@@ -1680,12 +1680,12 @@ static int nand_write_ecc(struct mtd_info *mtd, loff_t to, size_t len,
 	/* Calc number of pages we can write in one go */
 	numpages = min(ppblock - (startpage & (ppblock - 1)), totalpages);
 	oobbuf = nand_prepare_oobbuf(mtd, eccbuf, oobsel, autoplace, numpages);
-	bufstart = (u_char *) buf;
+	bufstart = (uint8_t *) buf;
 
 	/* Loop until all data is written */
 	while (written < len) {
 
-		this->data_poi = (u_char *) &buf[written];
+		this->data_poi = (uint8_t *) &buf[written];
 		/* Write one page. If this is the last page to write
 		 * or the last page in this block, then use the
 		 * real pageprogram command, else select cached programming
@@ -1764,7 +1764,7 @@ static int nand_write_ecc(struct mtd_info *mtd, loff_t to, size_t len,
  *
  * NAND write out-of-band
  */
-static int nand_write_oob(struct mtd_info *mtd, loff_t to, size_t len, size_t *retlen, const u_char *buf)
+static int nand_write_oob(struct mtd_info *mtd, loff_t to, size_t len, size_t *retlen, const uint8_t *buf)
 {
 	int column, page, status, ret = -EIO, chipnr;
 	struct nand_chip *this = mtd->priv;
@@ -1884,13 +1884,13 @@ static int nand_writev(struct mtd_info *mtd, const struct kvec *vecs, unsigned l
  * NAND write with iovec with ecc
  */
 static int nand_writev_ecc(struct mtd_info *mtd, const struct kvec *vecs, unsigned long count,
-			   loff_t to, size_t *retlen, u_char *eccbuf, struct nand_oobinfo *oobsel)
+			   loff_t to, size_t *retlen, uint8_t *eccbuf, struct nand_oobinfo *oobsel)
 {
 	int i, page, len, total_len, ret = -EIO, written = 0, chipnr;
 	int oob, numpages, autoplace = 0, startpage;
 	struct nand_chip *this = mtd->priv;
 	int ppblock = (1 << (this->phys_erase_shift - this->page_shift));
-	u_char *oobbuf, *bufstart;
+	uint8_t *oobbuf, *bufstart;
 
 	/* Preset written len for early exit */
 	*retlen = 0;
@@ -1959,7 +1959,7 @@ static int nand_writev_ecc(struct mtd_info *mtd, const struct kvec *vecs, unsign
 			/* Do not cross block boundaries */
 			numpages = min(ppblock - (startpage & (ppblock - 1)), numpages);
 			oobbuf = nand_prepare_oobbuf(mtd, NULL, oobsel, autoplace, numpages);
-			bufstart = (u_char *) vecs->iov_base;
+			bufstart = (uint8_t *) vecs->iov_base;
 			bufstart += len;
 			this->data_poi = bufstart;
 			oob = 0;
@@ -1990,7 +1990,7 @@ static int nand_writev_ecc(struct mtd_info *mtd, const struct kvec *vecs, unsign
 			int cnt = 0;
 			while (cnt < mtd->oobblock) {
 				if (vecs->iov_base != NULL && vecs->iov_len)
-					this->data_buf[cnt++] = ((u_char *) vecs->iov_base)[len++];
+					this->data_buf[cnt++] = ((uint8_t *) vecs->iov_base)[len++];
 				/* Check, if we have to switch to the next tuple */
 				if (len >= (int)vecs->iov_len) {
 					vecs++;
diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h
index 014ceefbec0e..601c5c703a05 100644
--- a/include/linux/mtd/nand.h
+++ b/include/linux/mtd/nand.h
@@ -294,14 +294,14 @@ struct nand_chip {
 	void  __iomem	*IO_ADDR_R;
 	void  __iomem	*IO_ADDR_W;
 
-	u_char		(*read_byte)(struct mtd_info *mtd);
-	void		(*write_byte)(struct mtd_info *mtd, u_char byte);
+	uint8_t		(*read_byte)(struct mtd_info *mtd);
+	void		(*write_byte)(struct mtd_info *mtd, uint8_t byte);
 	u16		(*read_word)(struct mtd_info *mtd);
 	void		(*write_word)(struct mtd_info *mtd, u16 word);
 
-	void		(*write_buf)(struct mtd_info *mtd, const u_char *buf, int len);
-	void		(*read_buf)(struct mtd_info *mtd, u_char *buf, int len);
-	int		(*verify_buf)(struct mtd_info *mtd, const u_char *buf, int len);
+	void		(*write_buf)(struct mtd_info *mtd, const uint8_t *buf, int len);
+	void		(*read_buf)(struct mtd_info *mtd, uint8_t *buf, int len);
+	int		(*verify_buf)(struct mtd_info *mtd, const uint8_t *buf, int len);
 	void		(*select_chip)(struct mtd_info *mtd, int chip);
 	int		(*block_bad)(struct mtd_info *mtd, loff_t ofs, int getchip);
 	int		(*block_markbad)(struct mtd_info *mtd, loff_t ofs);
@@ -309,8 +309,8 @@ struct nand_chip {
 	int		(*dev_ready)(struct mtd_info *mtd);
 	void		(*cmdfunc)(struct mtd_info *mtd, unsigned command, int column, int page_addr);
 	int		(*waitfunc)(struct mtd_info *mtd, struct nand_chip *this, int state);
-	int		(*calculate_ecc)(struct mtd_info *mtd, const u_char *dat, u_char *ecc_code);
-	int		(*correct_data)(struct mtd_info *mtd, u_char *dat, u_char *read_ecc, u_char *calc_ecc);
+	int		(*calculate_ecc)(struct mtd_info *mtd, const uint8_t *dat, uint8_t *ecc_code);
+	int		(*correct_data)(struct mtd_info *mtd, uint8_t *dat, uint8_t *read_ecc, uint8_t *calc_ecc);
 	void		(*enable_hwecc)(struct mtd_info *mtd, int mode);
 	void		(*erase_cmd)(struct mtd_info *mtd, int page);
 	int		(*scan_bbt)(struct mtd_info *mtd);
@@ -325,10 +325,10 @@ struct nand_chip {
 	int		phys_erase_shift;
 	int		bbt_erase_shift;
 	int		chip_shift;
-	u_char		*data_buf;
-	u_char		*oob_buf;
+	uint8_t		*data_buf;
+	uint8_t		*oob_buf;
 	int		oobdirty;
-	u_char		*data_poi;
+	uint8_t		*data_poi;
 	unsigned int	options;
 	int		badblockpos;
 	int		numchips;
@@ -466,7 +466,7 @@ extern int nand_default_bbt (struct mtd_info *mtd);
 extern int nand_isbad_bbt (struct mtd_info *mtd, loff_t offs, int allowbbt);
 extern int nand_erase_nand (struct mtd_info *mtd, struct erase_info *instr, int allowbbt);
 extern int nand_do_read_ecc (struct mtd_info *mtd, loff_t from, size_t len,
-			     size_t * retlen, u_char * buf, u_char * oob_buf,
+			     size_t * retlen, uint8_t * buf, uint8_t * oob_buf,
 			     struct nand_oobinfo *oobsel, int flags);
 
 /*
-- 
cgit v1.2.3


From 6dfc6d250d0b7ebaa6423c44dcd09fcfe68deabd Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@cruncher.tec.linutronix.de>
Date: Tue, 23 May 2006 12:00:46 +0200
Subject: [MTD] NAND modularize ECC

First step of modularizing ECC support.
- Move ECC related functionality into a seperate embedded data structure
- Get rid of the hardware dependend constants to simplify new ECC models

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 drivers/mtd/nand/ams-delta.c      |   2 +-
 drivers/mtd/nand/au1550nd.c       |   2 +-
 drivers/mtd/nand/autcpu12.c       |   2 +-
 drivers/mtd/nand/cs553x_nand.c    |  12 ++--
 drivers/mtd/nand/diskonchip.c     |  10 +--
 drivers/mtd/nand/h1910.c          |   2 +-
 drivers/mtd/nand/nand_base.c      | 146 +++++++++++++-------------------------
 drivers/mtd/nand/nandsim.c        |   2 +-
 drivers/mtd/nand/ndfc.c           |  10 +--
 drivers/mtd/nand/ppchameleonevb.c |   4 +-
 drivers/mtd/nand/rtc_from4.c      |  12 ++--
 drivers/mtd/nand/s3c2410.c        |  16 +++--
 drivers/mtd/nand/sharpsl.c        |  10 +--
 drivers/mtd/nand/toto.c           |   2 +-
 drivers/mtd/nand/ts7250.c         |   2 +-
 include/linux/mtd/nand.h          |  63 ++++++++--------
 16 files changed, 131 insertions(+), 166 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/nand/ams-delta.c b/drivers/mtd/nand/ams-delta.c
index 5a349eb316f5..aeaf2dece095 100644
--- a/drivers/mtd/nand/ams-delta.c
+++ b/drivers/mtd/nand/ams-delta.c
@@ -192,7 +192,7 @@ static int __init ams_delta_init(void)
 	}
 	/* 25 us command delay time */
 	this->chip_delay = 30;
-	this->eccmode = NAND_ECC_SOFT;
+	this->ecc.mode = NAND_ECC_SOFT;
 
 	/* Set chip enabled, but  */
 	ams_delta_latch2_write(NAND_MASK, AMS_DELTA_LATCH2_NAND_NRE |
diff --git a/drivers/mtd/nand/au1550nd.c b/drivers/mtd/nand/au1550nd.c
index d9a0143e1d3a..d7f04abfe18e 100644
--- a/drivers/mtd/nand/au1550nd.c
+++ b/drivers/mtd/nand/au1550nd.c
@@ -578,7 +578,7 @@ static int __init au1xxx_nand_init(void)
 
 	/* 30 us command delay time */
 	this->chip_delay = 30;
-	this->eccmode = NAND_ECC_SOFT;
+	this->ecc.mode = NAND_ECC_SOFT;
 
 	this->options = NAND_NO_AUTOINCR;
 
diff --git a/drivers/mtd/nand/autcpu12.c b/drivers/mtd/nand/autcpu12.c
index 43b296040d7f..dbb1b6267ade 100644
--- a/drivers/mtd/nand/autcpu12.c
+++ b/drivers/mtd/nand/autcpu12.c
@@ -163,7 +163,7 @@ static int __init autcpu12_init(void)
 	this->dev_ready = autcpu12_device_ready;
 	/* 20 us command delay time */
 	this->chip_delay = 20;
-	this->eccmode = NAND_ECC_SOFT;
+	this->ecc.mode = NAND_ECC_SOFT;
 
 	/* Enable the following for a flash based bad block table */
 	/*
diff --git a/drivers/mtd/nand/cs553x_nand.c b/drivers/mtd/nand/cs553x_nand.c
index bf251253ea1f..064f3feadf53 100644
--- a/drivers/mtd/nand/cs553x_nand.c
+++ b/drivers/mtd/nand/cs553x_nand.c
@@ -242,11 +242,13 @@ static int __init cs553x_init_one(int cs, int mmio, unsigned long adr)
 
 	this->chip_delay = 0;
 
-	this->eccmode = NAND_ECC_HW3_256;
-	this->enable_hwecc  = cs_enable_hwecc;
-	this->calculate_ecc = cs_calculate_ecc;
-	this->correct_data  = nand_correct_data;
-	
+	this->ecc.mode = NAND_ECC_HW;
+	this->ecc.size = 256;
+	this->ecc.bytes = 3;
+	this->ecc.hwctl  = cs_enable_hwecc;
+	this->ecc.calculate = cs_calculate_ecc;
+	this->ecc.correct  = nand_correct_data;
+
 	/* Enable the following for a flash based bad block table */
 	this->options = NAND_USE_FLASH_BBT | NAND_NO_AUTOINCR;
 
diff --git a/drivers/mtd/nand/diskonchip.c b/drivers/mtd/nand/diskonchip.c
index a2391c66a63f..128c937af32f 100644
--- a/drivers/mtd/nand/diskonchip.c
+++ b/drivers/mtd/nand/diskonchip.c
@@ -1674,12 +1674,14 @@ static int __init doc_probe(unsigned long physadr)
 	nand->dev_ready		= doc200x_dev_ready;
 	nand->waitfunc		= doc200x_wait;
 	nand->block_bad		= doc200x_block_bad;
-	nand->enable_hwecc	= doc200x_enable_hwecc;
-	nand->calculate_ecc	= doc200x_calculate_ecc;
-	nand->correct_data	= doc200x_correct_data;
+	nand->ecc.hwctl		= doc200x_enable_hwecc;
+	nand->ecc.calculate	= doc200x_calculate_ecc;
+	nand->ecc.correct	= doc200x_correct_data;
 
 	nand->autooob		= &doc200x_oobinfo;
-	nand->eccmode		= NAND_ECC_HW6_512;
+	nand->ecc.mode		= NAND_ECC_HW_SYNDROME;
+	nand->ecc.size		= 512;
+	nand->ecc.bytes		= 6;
 	nand->options		= NAND_USE_FLASH_BBT | NAND_HWECC_SYNDROME;
 
 	doc->physadr		= physadr;
diff --git a/drivers/mtd/nand/h1910.c b/drivers/mtd/nand/h1910.c
index 9848eb09b884..06e91fa11b34 100644
--- a/drivers/mtd/nand/h1910.c
+++ b/drivers/mtd/nand/h1910.c
@@ -149,7 +149,7 @@ static int __init h1910_init(void)
 	this->dev_ready = NULL;	/* unknown whether that was correct or not so we will just do it like this */
 	/* 15 us command delay time */
 	this->chip_delay = 50;
-	this->eccmode = NAND_ECC_SOFT;
+	this->ecc.mode = NAND_ECC_SOFT;
 	this->options = NAND_NO_AUTOINCR;
 
 	/* Scan to find existence of the device */
diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c
index 37db98a58c34..98792ec4c2dc 100644
--- a/drivers/mtd/nand/nand_base.c
+++ b/drivers/mtd/nand/nand_base.c
@@ -879,9 +879,9 @@ static int nand_write_page(struct mtd_info *mtd, struct nand_chip *this, int pag
 {
 	int i, status;
 	uint8_t ecc_code[32];
-	int eccmode = oobsel->useecc ? this->eccmode : NAND_ECC_NONE;
+	int eccmode = oobsel->useecc ? this->ecc.mode : NAND_ECC_NONE;
 	int *oob_config = oobsel->eccpos;
-	int datidx = 0, eccidx = 0, eccsteps = this->eccsteps;
+	int datidx = 0, eccidx = 0, eccsteps = this->ecc.steps;
 	int eccbytes = 0;
 
 	/* FIXME: Enable cached programming */
@@ -901,20 +901,20 @@ static int nand_write_page(struct mtd_info *mtd, struct nand_chip *this, int pag
 		/* Software ecc 3/256, write all */
 	case NAND_ECC_SOFT:
 		for (; eccsteps; eccsteps--) {
-			this->calculate_ecc(mtd, &this->data_poi[datidx], ecc_code);
+			this->ecc.calculate(mtd, &this->data_poi[datidx], ecc_code);
 			for (i = 0; i < 3; i++, eccidx++)
 				oob_buf[oob_config[eccidx]] = ecc_code[i];
-			datidx += this->eccsize;
+			datidx += this->ecc.size;
 		}
 		this->write_buf(mtd, this->data_poi, mtd->oobblock);
 		break;
 	default:
-		eccbytes = this->eccbytes;
+		eccbytes = this->ecc.bytes;
 		for (; eccsteps; eccsteps--) {
 			/* enable hardware ecc logic for write */
-			this->enable_hwecc(mtd, NAND_ECC_WRITE);
-			this->write_buf(mtd, &this->data_poi[datidx], this->eccsize);
-			this->calculate_ecc(mtd, &this->data_poi[datidx], ecc_code);
+			this->ecc.hwctl(mtd, NAND_ECC_WRITE);
+			this->write_buf(mtd, &this->data_poi[datidx], this->ecc.size);
+			this->ecc.calculate(mtd, &this->data_poi[datidx], ecc_code);
 			for (i = 0; i < eccbytes; i++, eccidx++)
 				oob_buf[oob_config[eccidx]] = ecc_code[i];
 			/* If the hardware ecc provides syndromes then
@@ -922,7 +922,7 @@ static int nand_write_page(struct mtd_info *mtd, struct nand_chip *this, int pag
 			 * the data bytes (words) */
 			if (this->options & NAND_HWECC_SYNDROME)
 				this->write_buf(mtd, ecc_code, eccbytes);
-			datidx += this->eccsize;
+			datidx += this->ecc.size;
 		}
 		break;
 	}
@@ -1155,7 +1155,7 @@ int nand_do_read_ecc(struct mtd_info *mtd, loff_t from, size_t len,
 	if (oobsel->useecc == MTD_NANDECC_AUTOPLACE)
 		oobsel = this->autooob;
 
-	eccmode = oobsel->useecc ? this->eccmode : NAND_ECC_NONE;
+	eccmode = oobsel->useecc ? this->ecc.mode : NAND_ECC_NONE;
 	oob_config = oobsel->eccpos;
 
 	/* Select the NAND device */
@@ -1170,8 +1170,8 @@ int nand_do_read_ecc(struct mtd_info *mtd, loff_t from, size_t len,
 	col = from & (mtd->oobblock - 1);
 
 	end = mtd->oobblock;
-	ecc = this->eccsize;
-	eccbytes = this->eccbytes;
+	ecc = this->ecc.size;
+	eccbytes = this->ecc.bytes;
 
 	if ((eccmode == NAND_ECC_NONE) || (this->options & NAND_HWECC_SYNDROME))
 		compareecc = 0;
@@ -1216,7 +1216,7 @@ int nand_do_read_ecc(struct mtd_info *mtd, loff_t from, size_t len,
 			oobsel->useecc == MTD_NANDECC_AUTOPL_USR)
 			oob_data = &this->data_buf[end];
 
-		eccsteps = this->eccsteps;
+		eccsteps = this->ecc.steps;
 
 		switch (eccmode) {
 		case NAND_ECC_NONE:{
@@ -1234,12 +1234,12 @@ int nand_do_read_ecc(struct mtd_info *mtd, loff_t from, size_t len,
 		case NAND_ECC_SOFT:	/* Software ECC 3/256: Read in a page + oob data */
 			this->read_buf(mtd, data_poi, end);
 			for (i = 0, datidx = 0; eccsteps; eccsteps--, i += 3, datidx += ecc)
-				this->calculate_ecc(mtd, &data_poi[datidx], &ecc_calc[i]);
+				this->ecc.calculate(mtd, &data_poi[datidx], &ecc_calc[i]);
 			break;
 
 		default:
 			for (i = 0, datidx = 0; eccsteps; eccsteps--, i += eccbytes, datidx += ecc) {
-				this->enable_hwecc(mtd, NAND_ECC_READ);
+				this->ecc.hwctl(mtd, NAND_ECC_READ);
 				this->read_buf(mtd, &data_poi[datidx], ecc);
 
 				/* HW ecc with syndrome calculation must read the
@@ -1247,19 +1247,19 @@ int nand_do_read_ecc(struct mtd_info *mtd, loff_t from, size_t len,
 				if (!compareecc) {
 					/* Some hw ecc generators need to know when the
 					 * syndrome is read from flash */
-					this->enable_hwecc(mtd, NAND_ECC_READSYN);
+					this->ecc.hwctl(mtd, NAND_ECC_READSYN);
 					this->read_buf(mtd, &oob_data[i], eccbytes);
 					/* We calc error correction directly, it checks the hw
 					 * generator for an error, reads back the syndrome and
 					 * does the error correction on the fly */
-					ecc_status = this->correct_data(mtd, &data_poi[datidx], &oob_data[i], &ecc_code[i]);
+					ecc_status = this->ecc.correct(mtd, &data_poi[datidx], &oob_data[i], &ecc_code[i]);
 					if ((ecc_status == -1) || (ecc_status > (flags && 0xff))) {
 						DEBUG(MTD_DEBUG_LEVEL0, "nand_read_ecc: "
 						      "Failed ECC read, page 0x%08x on chip %d\n", page, chipnr);
 						ecc_failed++;
 					}
 				} else {
-					this->calculate_ecc(mtd, &data_poi[datidx], &ecc_calc[i]);
+					this->ecc.calculate(mtd, &data_poi[datidx], &ecc_calc[i]);
 				}
 			}
 			break;
@@ -1277,8 +1277,8 @@ int nand_do_read_ecc(struct mtd_info *mtd, loff_t from, size_t len,
 			ecc_code[j] = oob_data[oob_config[j]];
 
 		/* correct data, if necessary */
-		for (i = 0, j = 0, datidx = 0; i < this->eccsteps; i++, datidx += ecc) {
-			ecc_status = this->correct_data(mtd, &data_poi[datidx], &ecc_code[j], &ecc_calc[j]);
+		for (i = 0, j = 0, datidx = 0; i < this->ecc.steps; i++, datidx += ecc) {
+			ecc_status = this->ecc.correct(mtd, &data_poi[datidx], &ecc_code[j], &ecc_calc[j]);
 
 			/* Get next chunk of ecc bytes */
 			j += eccbytes;
@@ -1315,7 +1315,7 @@ int nand_do_read_ecc(struct mtd_info *mtd, loff_t from, size_t len,
 				break;
 			case MTD_NANDECC_PLACE:
 				/* YAFFS1 legacy mode */
-				oob_data += this->eccsteps * sizeof(int);
+				oob_data += this->ecc.steps * sizeof(int);
 			default:
 				oob_data += mtd->oobsize;
 			}
@@ -2648,99 +2648,49 @@ int nand_scan(struct mtd_info *mtd, int maxchips)
 	 * check ECC mode, default to software if 3byte/512byte hardware ECC is
 	 * selected and we have 256 byte pagesize fallback to software ECC
 	 */
-	this->eccsize = 256;
-	this->eccbytes = 3;
-
-	switch (this->eccmode) {
-	case NAND_ECC_HW12_2048:
-		if (mtd->oobblock < 2048) {
-			printk(KERN_WARNING "2048 byte HW ECC not possible on "
-			       "%d byte page size, fallback to SW ECC\n",
-			       mtd->oobblock);
-			this->eccmode = NAND_ECC_SOFT;
-			this->calculate_ecc = nand_calculate_ecc;
-			this->correct_data = nand_correct_data;
-		} else
-			this->eccsize = 2048;
-		break;
-
-	case NAND_ECC_HW3_512:
-	case NAND_ECC_HW6_512:
-	case NAND_ECC_HW8_512:
-		if (mtd->oobblock == 256) {
-			printk(KERN_WARNING "512 byte HW ECC not possible on "
-			       "256 Byte pagesize, fallback to SW ECC \n");
-			this->eccmode = NAND_ECC_SOFT;
-			this->calculate_ecc = nand_calculate_ecc;
-			this->correct_data = nand_correct_data;
-		} else
-			this->eccsize = 512;	/* set eccsize to 512 */
-		break;
+	switch (this->ecc.mode) {
+	case NAND_ECC_HW:
+	case NAND_ECC_HW_SYNDROME:
+		if (!this->ecc.calculate || !this->ecc.correct ||
+		    !this->ecc.hwctl) {
+			printk(KERN_WARNING "No ECC functions supplied, "
+			       "Hardware ECC not possible\n");
+			BUG();
+		}
+		if (mtd->oobblock >= this->ecc.size)
+			break;
+		printk(KERN_WARNING "%d byte HW ECC not possible on "
+		       "%d byte page size, fallback to SW ECC\n",
+		       this->ecc.size, mtd->oobblock);
+		this->ecc.mode = NAND_ECC_SOFT;
 
-	case NAND_ECC_HW3_256:
+	case NAND_ECC_SOFT:
+		this->ecc.calculate = nand_calculate_ecc;
+		this->ecc.correct = nand_correct_data;
+		this->ecc.size = 256;
+		this->ecc.bytes = 3;
 		break;
 
 	case NAND_ECC_NONE:
 		printk(KERN_WARNING "NAND_ECC_NONE selected by board driver. "
 		       "This is not recommended !!\n");
-		this->eccmode = NAND_ECC_NONE;
+		this->ecc.size = mtd->oobblock;
+		this->ecc.bytes = 0;
 		break;
-
-	case NAND_ECC_SOFT:
-		this->calculate_ecc = nand_calculate_ecc;
-		this->correct_data = nand_correct_data;
-		break;
-
 	default:
 		printk(KERN_WARNING "Invalid NAND_ECC_MODE %d\n",
-		       this->eccmode);
-		BUG();
-	}
-
-	/*
-	 * Check hardware ecc function availability and adjust number of ecc
-	 * bytes per calculation step
-	 */
-	switch (this->eccmode) {
-	case NAND_ECC_HW12_2048:
-		this->eccbytes += 4;
-	case NAND_ECC_HW8_512:
-		this->eccbytes += 2;
-	case NAND_ECC_HW6_512:
-		this->eccbytes += 3;
-	case NAND_ECC_HW3_512:
-	case NAND_ECC_HW3_256:
-		if (this->calculate_ecc && this->correct_data &&
-		    this->enable_hwecc)
-			break;
-		printk(KERN_WARNING "No ECC functions supplied, "
-		       "Hardware ECC not possible\n");
+		       this->ecc.mode);
 		BUG();
 	}
 
-	mtd->eccsize = this->eccsize;
-
 	/*
 	 * Set the number of read / write steps for one page depending on ECC
 	 * mode
 	 */
-	switch (this->eccmode) {
-	case NAND_ECC_HW12_2048:
-		this->eccsteps = mtd->oobblock / 2048;
-		break;
-	case NAND_ECC_HW3_512:
-	case NAND_ECC_HW6_512:
-	case NAND_ECC_HW8_512:
-		this->eccsteps = mtd->oobblock / 512;
-		break;
-	case NAND_ECC_HW3_256:
-	case NAND_ECC_SOFT:
-		this->eccsteps = mtd->oobblock / 256;
-		break;
-
-	case NAND_ECC_NONE:
-		this->eccsteps = 1;
-		break;
+	this->ecc.steps = mtd->oobblock / this->ecc.size;
+	if(this->ecc.steps * this->ecc.size != mtd->oobblock) {
+		printk(KERN_WARNING "Invalid ecc parameters\n");
+		BUG();
 	}
 
 	/* Initialize state, waitqueue and spinlock */
diff --git a/drivers/mtd/nand/nandsim.c b/drivers/mtd/nand/nandsim.c
index 6903f5b903c6..9008bc5493fb 100644
--- a/drivers/mtd/nand/nandsim.c
+++ b/drivers/mtd/nand/nandsim.c
@@ -1523,7 +1523,7 @@ static int __init ns_init_module(void)
 	chip->verify_buf = ns_nand_verify_buf;
 	chip->write_word = ns_nand_write_word;
 	chip->read_word  = ns_nand_read_word;
-	chip->eccmode    = NAND_ECC_SOFT;
+	chip->ecc.mode   = NAND_ECC_SOFT;
 	chip->options   |= NAND_SKIP_BBTSCAN;
 
 	/*
diff --git a/drivers/mtd/nand/ndfc.c b/drivers/mtd/nand/ndfc.c
index 22fd682b70ca..e2dc81de106a 100644
--- a/drivers/mtd/nand/ndfc.c
+++ b/drivers/mtd/nand/ndfc.c
@@ -168,10 +168,12 @@ static void ndfc_chip_init(struct ndfc_nand_mtd *mtd)
 	chip->read_buf = ndfc_read_buf;
 	chip->write_buf = ndfc_write_buf;
 	chip->verify_buf = ndfc_verify_buf;
-	chip->correct_data  = nand_correct_data;
-	chip->enable_hwecc  = ndfc_enable_hwecc;
-	chip->calculate_ecc = ndfc_calculate_ecc;
-	chip->eccmode = NAND_ECC_HW3_256;
+	chip->ecc.correct = nand_correct_data;
+	chip->ecc.hwctl = ndfc_enable_hwecc;
+	chip->ecc.calculate = ndfc_calculate_ecc;
+	chip->ecc.mode = NAND_ECC_HW;
+	chip->ecc.size = 256;
+	chip->ecc.bytes = 3;
 	chip->autooob = mtd->pl_chip->autooob;
 	mtd->mtd.priv = chip;
 	mtd->mtd.owner = THIS_MODULE;
diff --git a/drivers/mtd/nand/ppchameleonevb.c b/drivers/mtd/nand/ppchameleonevb.c
index 5d4d16fb1df6..9fab0998524d 100644
--- a/drivers/mtd/nand/ppchameleonevb.c
+++ b/drivers/mtd/nand/ppchameleonevb.c
@@ -257,7 +257,7 @@ static int __init ppchameleonevb_init(void)
 #endif
 	this->chip_delay = NAND_BIG_DELAY_US;
 	/* ECC mode */
-	this->eccmode = NAND_ECC_SOFT;
+	this->ecc.mode = NAND_ECC_SOFT;
 
 	/* Scan to find existence of the device (it could not be mounted) */
 	if (nand_scan(ppchameleon_mtd, 1)) {
@@ -358,7 +358,7 @@ static int __init ppchameleonevb_init(void)
 	this->chip_delay = NAND_SMALL_DELAY_US;
 
 	/* ECC mode */
-	this->eccmode = NAND_ECC_SOFT;
+	this->ecc.mode = NAND_ECC_SOFT;
 
 	/* Scan to find existence of the device */
 	if (nand_scan(ppchameleonevb_mtd, 1)) {
diff --git a/drivers/mtd/nand/rtc_from4.c b/drivers/mtd/nand/rtc_from4.c
index bc9d849fbd5d..a2122fe4101a 100644
--- a/drivers/mtd/nand/rtc_from4.c
+++ b/drivers/mtd/nand/rtc_from4.c
@@ -570,19 +570,21 @@ static int __init rtc_from4_init(void)
 #ifdef RTC_FROM4_HWECC
 	printk(KERN_INFO "rtc_from4_init: using hardware ECC detection.\n");
 
-	this->eccmode = NAND_ECC_HW8_512;
+	this->ecc.mode = NAND_ECC_HW_SYNDROME;
+	this->ecc.size = 512;
+	this->ecc.bytes = 8;
 	this->options |= NAND_HWECC_SYNDROME;
 	/* return the status of extra status and ECC checks */
 	this->errstat = rtc_from4_errstat;
 	/* set the nand_oobinfo to support FPGA H/W error detection */
 	this->autooob = &rtc_from4_nand_oobinfo;
-	this->enable_hwecc = rtc_from4_enable_hwecc;
-	this->calculate_ecc = rtc_from4_calculate_ecc;
-	this->correct_data = rtc_from4_correct_data;
+	this->ecc.hwctl = rtc_from4_enable_hwecc;
+	this->ecc.calculate = rtc_from4_calculate_ecc;
+	this->ecc.correct = rtc_from4_correct_data;
 #else
 	printk(KERN_INFO "rtc_from4_init: using software ECC detection.\n");
 
-	this->eccmode = NAND_ECC_SOFT;
+	this->ecc.mode = NAND_ECC_SOFT;
 #endif
 
 	/* set the bad block tables to support debugging */
diff --git a/drivers/mtd/nand/s3c2410.c b/drivers/mtd/nand/s3c2410.c
index f8002596de8b..608340a25278 100644
--- a/drivers/mtd/nand/s3c2410.c
+++ b/drivers/mtd/nand/s3c2410.c
@@ -520,18 +520,20 @@ static void s3c2410_nand_init_chip(struct s3c2410_nand_info *info,
 	nmtd->set	   = set;
 
 	if (hardware_ecc) {
-		chip->correct_data  = s3c2410_nand_correct_data;
-		chip->enable_hwecc  = s3c2410_nand_enable_hwecc;
-		chip->calculate_ecc = s3c2410_nand_calculate_ecc;
-		chip->eccmode	    = NAND_ECC_HW3_512;
+		chip->ecc.correct   = s3c2410_nand_correct_data;
+		chip->ecc.hwctl	    = s3c2410_nand_enable_hwecc;
+		chip->ecc.calculate = s3c2410_nand_calculate_ecc;
+		chip->ecc.mode	    = NAND_ECC_HW;
+		chip->ecc.size	    = 512;
+		chip->ecc.bytes	    = 3;
 		chip->autooob       = &nand_hw_eccoob;
 
 		if (info->is_s3c2440) {
-			chip->enable_hwecc  = s3c2440_nand_enable_hwecc;
-			chip->calculate_ecc = s3c2440_nand_calculate_ecc;
+			chip->ecc.hwctl     = s3c2440_nand_enable_hwecc;
+			chip->ecc.calculate = s3c2440_nand_calculate_ecc;
 		}
 	} else {
-		chip->eccmode	    = NAND_ECC_SOFT;
+		chip->ecc.mode	    = NAND_ECC_SOFT;
 	}
 }
 
diff --git a/drivers/mtd/nand/sharpsl.c b/drivers/mtd/nand/sharpsl.c
index 60e10c0d6980..5554d0b97c8c 100644
--- a/drivers/mtd/nand/sharpsl.c
+++ b/drivers/mtd/nand/sharpsl.c
@@ -201,15 +201,17 @@ static int __init sharpsl_nand_init(void)
 	/* 15 us command delay time */
 	this->chip_delay = 15;
 	/* set eccmode using hardware ECC */
-	this->eccmode = NAND_ECC_HW3_256;
+	this->ecc.mode = NAND_ECC_HW;
+	this->ecc.size = 256;
+	this->ecc.bytes = 3;
 	this->badblock_pattern = &sharpsl_bbt;
 	if (machine_is_akita() || machine_is_borzoi()) {
 		this->badblock_pattern = &sharpsl_akita_bbt;
 		this->autooob = &akita_oobinfo;
 	}
-	this->enable_hwecc = sharpsl_nand_enable_hwecc;
-	this->calculate_ecc = sharpsl_nand_calculate_ecc;
-	this->correct_data = nand_correct_data;
+	this->ecc.hwctl = sharpsl_nand_enable_hwecc;
+	this->ecc.calculate = sharpsl_nand_calculate_ecc;
+	this->ecc.correct = nand_correct_data;
 
 	/* Scan to find existence of the device */
 	err = nand_scan(sharpsl_mtd, 1);
diff --git a/drivers/mtd/nand/toto.c b/drivers/mtd/nand/toto.c
index c51c89559514..50aa6a46911f 100644
--- a/drivers/mtd/nand/toto.c
+++ b/drivers/mtd/nand/toto.c
@@ -146,7 +146,7 @@ static int __init toto_init(void)
 	this->dev_ready = NULL;
 	/* 25 us command delay time */
 	this->chip_delay = 30;
-	this->eccmode = NAND_ECC_SOFT;
+	this->ecc.mode = NAND_ECC_SOFT;
 
 	/* Scan to find existance of the device */
 	if (nand_scan(toto_mtd, 1)) {
diff --git a/drivers/mtd/nand/ts7250.c b/drivers/mtd/nand/ts7250.c
index 622db3127f7c..70bce1b0326c 100644
--- a/drivers/mtd/nand/ts7250.c
+++ b/drivers/mtd/nand/ts7250.c
@@ -155,7 +155,7 @@ static int __init ts7250_init(void)
 	this->hwcontrol = ts7250_hwcontrol;
 	this->dev_ready = ts7250_device_ready;
 	this->chip_delay = 15;
-	this->eccmode = NAND_ECC_SOFT;
+	this->ecc.mode = NAND_ECC_SOFT;
 
 	printk("Searching for NAND flash...\n");
 	/* Scan to find existence of the device */
diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h
index 601c5c703a05..460525841a27 100644
--- a/include/linux/mtd/nand.h
+++ b/include/linux/mtd/nand.h
@@ -113,21 +113,12 @@ extern int nand_read_raw (struct mtd_info *mtd, uint8_t *buf, loff_t from,
 /*
  * Constants for ECC_MODES
  */
-
-/* No ECC. Usage is not recommended ! */
-#define NAND_ECC_NONE		0
-/* Software ECC 3 byte ECC per 256 Byte data */
-#define NAND_ECC_SOFT		1
-/* Hardware ECC 3 byte ECC per 256 Byte data */
-#define NAND_ECC_HW3_256	2
-/* Hardware ECC 3 byte ECC per 512 Byte data */
-#define NAND_ECC_HW3_512	3
-/* Hardware ECC 3 byte ECC per 512 Byte data */
-#define NAND_ECC_HW6_512	4
-/* Hardware ECC 8 byte ECC per 512 Byte data */
-#define NAND_ECC_HW8_512	6
-/* Hardware ECC 12 byte ECC per 2048 Byte data */
-#define NAND_ECC_HW12_2048	7
+typedef enum {
+	NAND_ECC_NONE,
+	NAND_ECC_SOFT,
+	NAND_ECC_HW,
+	NAND_ECC_HW_SYNDROME,
+} nand_ecc_modes_t;
 
 /*
  * Constants for Hardware ECC
@@ -230,6 +221,31 @@ struct nand_hw_control {
 	wait_queue_head_t wq;
 };
 
+/**
+ * struct nand_ecc_ctrl - Control structure for ecc
+ * @mode:	ecc mode
+ * @steps:	number of ecc steps per page
+ * @size:	data bytes per ecc step
+ * @bytes:	ecc bytes per step
+ * @hwctl:	function to control hardware ecc generator. Must only
+ *		be provided if an hardware ECC is available
+ * @calculate:	function for ecc calculation or readback from ecc hardware
+ * @correct:	function for ecc correction, matching to ecc generator (sw/hw)
+ */
+struct nand_ecc_ctrl {
+	nand_ecc_modes_t	mode;
+	int			steps;
+	int			size;
+	int			bytes;
+	int			(*hwctl)(struct mtd_info *mtd, int mode);
+	int			(*calculate)(struct mtd_info *mtd,
+					     const uint8_t *dat,
+					     uint8_t *ecc_code);
+	int			(*correct)(struct mtd_info *mtd, uint8_t *dat,
+					   uint8_t *read_ecc,
+					   uint8_t *calc_ecc);
+};
+
 /**
  * struct nand_chip - NAND Private Flash Chip Data
  * @IO_ADDR_R:		[BOARDSPECIFIC] address to read the 8 I/O lines of the flash device
@@ -250,16 +266,9 @@ struct nand_hw_control {
  *			is read from the chip status register
  * @cmdfunc:		[REPLACEABLE] hardwarespecific function for writing commands to the chip
  * @waitfunc:		[REPLACEABLE] hardwarespecific function for wait on ready
- * @calculate_ecc:	[REPLACEABLE] function for ecc calculation or readback from ecc hardware
- * @correct_data:	[REPLACEABLE] function for ecc correction, matching to ecc generator (sw/hw)
- * @enable_hwecc:	[BOARDSPECIFIC] function to enable (reset) hardware ecc generator. Must only
- *			be provided if a hardware ECC is available
+ * @ecc:		[BOARDSPECIFIC] ecc control ctructure
  * @erase_cmd:		[INTERN] erase command write function, selectable due to AND support
  * @scan_bbt:		[REPLACEABLE] function to scan bad block table
- * @eccmode:		[BOARDSPECIFIC] mode of ecc, see defines
- * @eccsize:		[INTERN] databytes used per ecc-calculation
- * @eccbytes:		[INTERN] number of ecc bytes per ecc-calculation step
- * @eccsteps:		[INTERN] number of ecc calculation steps per page
  * @chip_delay:		[BOARDSPECIFIC] chip dependent delay for transfering data from array to read regs (tR)
  * @wq:			[INTERN] wait queue to sleep on if a NAND operation is in progress
  * @state:		[INTERN] the current state of the NAND device
@@ -309,15 +318,9 @@ struct nand_chip {
 	int		(*dev_ready)(struct mtd_info *mtd);
 	void		(*cmdfunc)(struct mtd_info *mtd, unsigned command, int column, int page_addr);
 	int		(*waitfunc)(struct mtd_info *mtd, struct nand_chip *this, int state);
-	int		(*calculate_ecc)(struct mtd_info *mtd, const uint8_t *dat, uint8_t *ecc_code);
-	int		(*correct_data)(struct mtd_info *mtd, uint8_t *dat, uint8_t *read_ecc, uint8_t *calc_ecc);
-	void		(*enable_hwecc)(struct mtd_info *mtd, int mode);
 	void		(*erase_cmd)(struct mtd_info *mtd, int page);
 	int		(*scan_bbt)(struct mtd_info *mtd);
-	int		eccmode;
-	int		eccsize;
-	int		eccbytes;
-	int		eccsteps;
+	struct nand_ecc_ctrl ecc;
 	int		chip_delay;
 	wait_queue_head_t wq;
 	nand_state_t	state;
-- 
cgit v1.2.3


From 9a57d470fd4a77b9732fee97bed29c565c730af0 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@cruncher.tec.linutronix.de>
Date: Tue, 23 May 2006 15:58:23 +0200
Subject: [MTD] NAND ECC hwctl function has no return value

Fix the broken prototype

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/mtd/nand.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h
index 460525841a27..6931376ed68d 100644
--- a/include/linux/mtd/nand.h
+++ b/include/linux/mtd/nand.h
@@ -237,7 +237,7 @@ struct nand_ecc_ctrl {
 	int			steps;
 	int			size;
 	int			bytes;
-	int			(*hwctl)(struct mtd_info *mtd, int mode);
+	void			(*hwctl)(struct mtd_info *mtd, int mode);
 	int			(*calculate)(struct mtd_info *mtd,
 					     const uint8_t *dat,
 					     uint8_t *ecc_code);
-- 
cgit v1.2.3


From 9d8522df37f91621a70c5c0dbbf5bf2220b16798 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@cruncher.tec.linutronix.de>
Date: Tue, 23 May 2006 16:06:03 +0200
Subject: [MTD] Remove nand writev support

NAND writev(_ecc) support is not longer necessary. Remove it.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 drivers/mtd/devices/doc2000.c      |  64 -------------
 drivers/mtd/mtdconcat.c            |  20 +---
 drivers/mtd/mtdpart.c              |  23 +----
 drivers/mtd/nand/nand_base.c       | 188 -------------------------------------
 drivers/mtd/onenand/onenand_base.c | 140 ---------------------------
 include/linux/mtd/mtd.h            |   2 -
 6 files changed, 3 insertions(+), 434 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/devices/doc2000.c b/drivers/mtd/devices/doc2000.c
index 423a34f4638c..6f32942fdf77 100644
--- a/drivers/mtd/devices/doc2000.c
+++ b/drivers/mtd/devices/doc2000.c
@@ -59,9 +59,6 @@ static int doc_read_ecc(struct mtd_info *mtd, loff_t from, size_t len,
 			size_t *retlen, u_char *buf, u_char *eccbuf, struct nand_oobinfo *oobsel);
 static int doc_write_ecc(struct mtd_info *mtd, loff_t to, size_t len,
 			 size_t *retlen, const u_char *buf, u_char *eccbuf, struct nand_oobinfo *oobsel);
-static int doc_writev_ecc(struct mtd_info *mtd, const struct kvec *vecs,
-			  unsigned long count, loff_t to, size_t *retlen,
-			  u_char *eccbuf, struct nand_oobinfo *oobsel);
 static int doc_read_oob(struct mtd_info *mtd, loff_t ofs, size_t len,
 			size_t *retlen, u_char *buf);
 static int doc_write_oob(struct mtd_info *mtd, loff_t ofs, size_t len,
@@ -589,7 +586,6 @@ void DoC2k_init(struct mtd_info *mtd)
 	mtd->write = doc_write;
 	mtd->read_ecc = doc_read_ecc;
 	mtd->write_ecc = doc_write_ecc;
-	mtd->writev_ecc = doc_writev_ecc;
 	mtd->read_oob = doc_read_oob;
 	mtd->write_oob = doc_write_oob;
 	mtd->sync = NULL;
@@ -965,66 +961,6 @@ static int doc_write_ecc(struct mtd_info *mtd, loff_t to, size_t len,
 	return 0;
 }
 
-static int doc_writev_ecc(struct mtd_info *mtd, const struct kvec *vecs,
-			  unsigned long count, loff_t to, size_t *retlen,
-			  u_char *eccbuf, struct nand_oobinfo *oobsel)
-{
-	static char static_buf[512];
-	static DEFINE_MUTEX(writev_buf_mutex);
-
-	size_t totretlen = 0;
-	size_t thisvecofs = 0;
-	int ret= 0;
-
-	mutex_lock(&writev_buf_mutex);
-
-	while(count) {
-		size_t thislen, thisretlen;
-		unsigned char *buf;
-
-		buf = vecs->iov_base + thisvecofs;
-		thislen = vecs->iov_len - thisvecofs;
-
-
-		if (thislen >= 512) {
-			thislen = thislen & ~(512-1);
-			thisvecofs += thislen;
-		} else {
-			/* Not enough to fill a page. Copy into buf */
-			memcpy(static_buf, buf, thislen);
-			buf = &static_buf[thislen];
-
-			while(count && thislen < 512) {
-				vecs++;
-				count--;
-				thisvecofs = min((512-thislen), vecs->iov_len);
-				memcpy(buf, vecs->iov_base, thisvecofs);
-				thislen += thisvecofs;
-				buf += thisvecofs;
-			}
-			buf = static_buf;
-		}
-		if (count && thisvecofs == vecs->iov_len) {
-			thisvecofs = 0;
-			vecs++;
-			count--;
-		}
-		ret = doc_write_ecc(mtd, to, thislen, &thisretlen, buf, eccbuf, oobsel);
-
-		totretlen += thisretlen;
-
-		if (ret || thisretlen != thislen)
-			break;
-
-		to += thislen;
-	}
-
-	mutex_unlock(&writev_buf_mutex);
-	*retlen = totretlen;
-	return ret;
-}
-
-
 static int doc_read_oob(struct mtd_info *mtd, loff_t ofs, size_t len,
 			size_t * retlen, u_char * buf)
 {
diff --git a/drivers/mtd/mtdconcat.c b/drivers/mtd/mtdconcat.c
index a5e8373349a5..a6fcee2713b0 100644
--- a/drivers/mtd/mtdconcat.c
+++ b/drivers/mtd/mtdconcat.c
@@ -253,9 +253,8 @@ concat_write_ecc(struct mtd_info *mtd, loff_t to, size_t len,
 }
 
 static int
-concat_writev_ecc(struct mtd_info *mtd, const struct kvec *vecs,
-		unsigned long count, loff_t to, size_t * retlen,
-		u_char *eccbuf, struct nand_oobinfo *oobsel)
+concat_writev(struct mtd_info *mtd, const struct kvec *vecs,
+		unsigned long count, loff_t to, size_t * retlen)
 {
 	struct mtd_concat *concat = CONCAT(mtd);
 	struct kvec *vecs_copy;
@@ -315,10 +314,6 @@ concat_writev_ecc(struct mtd_info *mtd, const struct kvec *vecs,
 
 		if (!(subdev->flags & MTD_WRITEABLE))
 			err = -EROFS;
-		else if (eccbuf)
-			err = subdev->writev_ecc(subdev, &vecs_copy[entry_low],
-				entry_high - entry_low + 1, to, &retsize,
-				eccbuf, oobsel);
 		else
 			err = subdev->writev(subdev, &vecs_copy[entry_low],
 				entry_high - entry_low + 1, to, &retsize);
@@ -333,8 +328,6 @@ concat_writev_ecc(struct mtd_info *mtd, const struct kvec *vecs,
 
 		*retlen += retsize;
 		total_len -= wsize;
-		if (concat->mtd.type == MTD_NANDFLASH && eccbuf)
-			eccbuf += mtd->oobavail * (wsize / mtd->writesize);
 
 		if (total_len == 0)
 			break;
@@ -347,13 +340,6 @@ concat_writev_ecc(struct mtd_info *mtd, const struct kvec *vecs,
 	return err;
 }
 
-static int
-concat_writev(struct mtd_info *mtd, const struct kvec *vecs,
-		unsigned long count, loff_t to, size_t * retlen)
-{
-	return concat_writev_ecc(mtd, vecs, count, to, retlen, NULL, NULL);
-}
-
 static int
 concat_read_oob(struct mtd_info *mtd, loff_t from, size_t len,
 		size_t * retlen, u_char * buf)
@@ -843,8 +829,6 @@ struct mtd_info *mtd_concat_create(struct mtd_info *subdev[],	/* subdevices to c
 		concat->mtd.write_ecc = concat_write_ecc;
 	if (subdev[0]->writev)
 		concat->mtd.writev = concat_writev;
-	if (subdev[0]->writev_ecc)
-		concat->mtd.writev_ecc = concat_writev_ecc;
 	if (subdev[0]->read_oob)
 		concat->mtd.read_oob = concat_read_oob;
 	if (subdev[0]->write_oob)
diff --git a/drivers/mtd/mtdpart.c b/drivers/mtd/mtdpart.c
index 082662f90481..ae675608fa91 100644
--- a/drivers/mtd/mtdpart.c
+++ b/drivers/mtd/mtdpart.c
@@ -208,13 +208,8 @@ static int part_writev (struct mtd_info *mtd,  const struct kvec *vecs,
 	struct mtd_part *part = PART(mtd);
 	if (!(mtd->flags & MTD_WRITEABLE))
 		return -EROFS;
-	if (part->master->writev_ecc == NULL)
-		return part->master->writev (part->master, vecs, count,
+	return part->master->writev (part->master, vecs, count,
 					to + part->offset, retlen);
-	else
-		return part->master->writev_ecc (part->master, vecs, count,
-					to + part->offset, retlen,
-					NULL, &mtd->oobinfo);
 }
 
 static int part_readv (struct mtd_info *mtd,  struct kvec *vecs,
@@ -230,20 +225,6 @@ static int part_readv (struct mtd_info *mtd,  struct kvec *vecs,
 					NULL, &mtd->oobinfo);
 }
 
-static int part_writev_ecc (struct mtd_info *mtd,  const struct kvec *vecs,
-			 unsigned long count, loff_t to, size_t *retlen,
-			 u_char *eccbuf,  struct nand_oobinfo *oobsel)
-{
-	struct mtd_part *part = PART(mtd);
-	if (!(mtd->flags & MTD_WRITEABLE))
-		return -EROFS;
-	if (oobsel == NULL)
-		oobsel = &mtd->oobinfo;
-	return part->master->writev_ecc (part->master, vecs, count,
-					to + part->offset, retlen,
-					eccbuf, oobsel);
-}
-
 static int part_readv_ecc (struct mtd_info *mtd,  struct kvec *vecs,
 			 unsigned long count, loff_t from, size_t *retlen,
 			 u_char *eccbuf,  struct nand_oobinfo *oobsel)
@@ -446,8 +427,6 @@ int add_mtd_partitions(struct mtd_info *master,
 			slave->mtd.writev = part_writev;
 		if (master->readv)
 			slave->mtd.readv = part_readv;
-		if (master->writev_ecc)
-			slave->mtd.writev_ecc = part_writev_ecc;
 		if (master->readv_ecc)
 			slave->mtd.readv_ecc = part_readv_ecc;
 		if (master->lock)
diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c
index 778535006c83..9aaeb3aa9d4d 100644
--- a/drivers/mtd/nand/nand_base.c
+++ b/drivers/mtd/nand/nand_base.c
@@ -151,11 +151,6 @@ static int nand_write_ecc(struct mtd_info *mtd, loff_t to, size_t len,
 			  struct nand_oobinfo *oobsel);
 static int nand_write_oob(struct mtd_info *mtd, loff_t to, size_t len,
 			  size_t *retlen, const uint8_t *buf);
-static int nand_writev(struct mtd_info *mtd, const struct kvec *vecs,
-		       unsigned long count, loff_t to, size_t *retlen);
-static int nand_writev_ecc(struct mtd_info *mtd, const struct kvec *vecs,
-			   unsigned long count, loff_t to, size_t *retlen,
-			   uint8_t *eccbuf, struct nand_oobinfo *oobsel);
 static int nand_erase(struct mtd_info *mtd, struct erase_info *instr);
 static void nand_sync(struct mtd_info *mtd);
 
@@ -1856,187 +1851,6 @@ static int nand_write_oob(struct mtd_info *mtd, loff_t to, size_t len, size_t *r
 	return ret;
 }
 
-/**
- * nand_writev - [MTD Interface] compabilty function for nand_writev_ecc
- * @mtd:	MTD device structure
- * @vecs:	the iovectors to write
- * @count:	number of vectors
- * @to:		offset to write to
- * @retlen:	pointer to variable to store the number of written bytes
- *
- * NAND write with kvec. This just calls the ecc function
- */
-static int nand_writev(struct mtd_info *mtd, const struct kvec *vecs, unsigned long count,
-		       loff_t to, size_t *retlen)
-{
-	return (nand_writev_ecc(mtd, vecs, count, to, retlen, NULL, NULL));
-}
-
-/**
- * nand_writev_ecc - [MTD Interface] write with iovec with ecc
- * @mtd:	MTD device structure
- * @vecs:	the iovectors to write
- * @count:	number of vectors
- * @to:		offset to write to
- * @retlen:	pointer to variable to store the number of written bytes
- * @eccbuf:	filesystem supplied oob data buffer
- * @oobsel:	oob selection structure
- *
- * NAND write with iovec with ecc
- */
-static int nand_writev_ecc(struct mtd_info *mtd, const struct kvec *vecs, unsigned long count,
-			   loff_t to, size_t *retlen, uint8_t *eccbuf, struct nand_oobinfo *oobsel)
-{
-	int i, page, len, total_len, ret = -EIO, written = 0, chipnr;
-	int oob, numpages, autoplace = 0, startpage;
-	struct nand_chip *this = mtd->priv;
-	int ppblock = (1 << (this->phys_erase_shift - this->page_shift));
-	uint8_t *oobbuf, *bufstart;
-
-	/* Preset written len for early exit */
-	*retlen = 0;
-
-	/* Calculate total length of data */
-	total_len = 0;
-	for (i = 0; i < count; i++)
-		total_len += (int)vecs[i].iov_len;
-
-	DEBUG(MTD_DEBUG_LEVEL3, "nand_writev: to = 0x%08x, len = %i, count = %ld\n", (unsigned int)to, (unsigned int)total_len, count);
-
-	/* Do not allow write past end of page */
-	if ((to + total_len) > mtd->size) {
-		DEBUG(MTD_DEBUG_LEVEL0, "nand_writev: Attempted write past end of device\n");
-		return -EINVAL;
-	}
-
-	/* reject writes, which are not page aligned */
-	if (NOTALIGNED(to) || NOTALIGNED(total_len)) {
-		printk(KERN_NOTICE "nand_write_ecc: Attempt to write not page aligned data\n");
-		return -EINVAL;
-	}
-
-	/* Grab the lock and see if the device is available */
-	nand_get_device(this, mtd, FL_WRITING);
-
-	/* Get the current chip-nr */
-	chipnr = (int)(to >> this->chip_shift);
-	/* Select the NAND device */
-	this->select_chip(mtd, chipnr);
-
-	/* Check, if it is write protected */
-	if (nand_check_wp(mtd))
-		goto out;
-
-	/* if oobsel is NULL, use chip defaults */
-	if (oobsel == NULL)
-		oobsel = &mtd->oobinfo;
-
-	/* Autoplace of oob data ? Use the default placement scheme */
-	if (oobsel->useecc == MTD_NANDECC_AUTOPLACE) {
-		oobsel = this->autooob;
-		autoplace = 1;
-	}
-	if (oobsel->useecc == MTD_NANDECC_AUTOPL_USR)
-		autoplace = 1;
-
-	/* Setup start page */
-	page = (int)(to >> this->page_shift);
-	/* Invalidate the page cache, if we write to the cached page */
-	if (page <= this->pagebuf && this->pagebuf < ((to + total_len) >> this->page_shift))
-		this->pagebuf = -1;
-
-	startpage = page & this->pagemask;
-
-	/* Loop until all kvec' data has been written */
-	len = 0;
-	while (count) {
-		/* If the given tuple is >= pagesize then
-		 * write it out from the iov
-		 */
-		if ((vecs->iov_len - len) >= mtd->writesize) {
-			/* Calc number of pages we can write
-			 * out of this iov in one go */
-			numpages = (vecs->iov_len - len) >> this->page_shift;
-			/* Do not cross block boundaries */
-			numpages = min(ppblock - (startpage & (ppblock - 1)), numpages);
-			oobbuf = nand_prepare_oobbuf(mtd, NULL, oobsel, autoplace, numpages);
-			bufstart = (uint8_t *) vecs->iov_base;
-			bufstart += len;
-			this->data_poi = bufstart;
-			oob = 0;
-			for (i = 1; i <= numpages; i++) {
-				/* Write one page. If this is the last page to write
-				 * then use the real pageprogram command, else select
-				 * cached programming if supported by the chip.
-				 */
-				ret = nand_write_page(mtd, this, page & this->pagemask,
-						      &oobbuf[oob], oobsel, i != numpages);
-				if (ret)
-					goto out;
-				this->data_poi += mtd->writesize;
-				len += mtd->writesize;
-				oob += mtd->oobsize;
-				page++;
-			}
-			/* Check, if we have to switch to the next tuple */
-			if (len >= (int)vecs->iov_len) {
-				vecs++;
-				len = 0;
-				count--;
-			}
-		} else {
-			/* We must use the internal buffer, read data out of each
-			 * tuple until we have a full page to write
-			 */
-			int cnt = 0;
-			while (cnt < mtd->writesize) {
-				if (vecs->iov_base != NULL && vecs->iov_len)
-					this->data_buf[cnt++] = ((uint8_t *) vecs->iov_base)[len++];
-				/* Check, if we have to switch to the next tuple */
-				if (len >= (int)vecs->iov_len) {
-					vecs++;
-					len = 0;
-					count--;
-				}
-			}
-			this->pagebuf = page;
-			this->data_poi = this->data_buf;
-			bufstart = this->data_poi;
-			numpages = 1;
-			oobbuf = nand_prepare_oobbuf(mtd, NULL, oobsel, autoplace, numpages);
-			ret = nand_write_page(mtd, this, page & this->pagemask, oobbuf, oobsel, 0);
-			if (ret)
-				goto out;
-			page++;
-		}
-
-		this->data_poi = bufstart;
-		ret = nand_verify_pages(mtd, this, startpage, numpages, oobbuf, oobsel, chipnr, 0);
-		if (ret)
-			goto out;
-
-		written += mtd->writesize * numpages;
-		/* All done ? */
-		if (!count)
-			break;
-
-		startpage = page & this->pagemask;
-		/* Check, if we cross a chip boundary */
-		if (!startpage) {
-			chipnr++;
-			this->select_chip(mtd, -1);
-			this->select_chip(mtd, chipnr);
-		}
-	}
-	ret = 0;
- out:
-	/* Deselect and wake up anyone waiting on the device */
-	nand_release_device(mtd);
-
-	*retlen = written;
-	return ret;
-}
-
 /**
  * single_erease_cmd - [GENERIC] NAND standard block erase command function
  * @mtd:	MTD device structure
@@ -2718,8 +2532,6 @@ int nand_scan(struct mtd_info *mtd, int maxchips)
 	mtd->read_oob = nand_read_oob;
 	mtd->write_oob = nand_write_oob;
 	mtd->readv = NULL;
-	mtd->writev = nand_writev;
-	mtd->writev_ecc = nand_writev_ecc;
 	mtd->sync = nand_sync;
 	mtd->lock = NULL;
 	mtd->unlock = NULL;
diff --git a/drivers/mtd/onenand/onenand_base.c b/drivers/mtd/onenand/onenand_base.c
index 4c2c61d54b3a..8e875fa140a8 100644
--- a/drivers/mtd/onenand/onenand_base.c
+++ b/drivers/mtd/onenand/onenand_base.c
@@ -1013,144 +1013,6 @@ out:
 	return ret;
 }
 
-/**
- * onenand_writev_ecc - [MTD Interface] write with iovec with ecc
- * @param mtd		MTD device structure
- * @param vecs		the iovectors to write
- * @param count		number of vectors
- * @param to		offset to write to
- * @param retlen	pointer to variable to store the number of written bytes
- * @param eccbuf	filesystem supplied oob data buffer
- * @param oobsel	oob selection structure
- *
- * OneNAND write with iovec with ecc
- */
-static int onenand_writev_ecc(struct mtd_info *mtd, const struct kvec *vecs,
-	unsigned long count, loff_t to, size_t *retlen,
-	u_char *eccbuf, struct nand_oobinfo *oobsel)
-{
-	struct onenand_chip *this = mtd->priv;
-	unsigned char *pbuf;
-	size_t total_len, len;
-	int i, written = 0;
-	int ret = 0;
-
-	/* Preset written len for early exit */
-	*retlen = 0;
-
-	/* Calculate total length of data */
-	total_len = 0;
-	for (i = 0; i < count; i++)
-		total_len += vecs[i].iov_len;
-
-	DEBUG(MTD_DEBUG_LEVEL3, "onenand_writev_ecc: to = 0x%08x, len = %i, count = %ld\n", (unsigned int) to, (unsigned int) total_len, count);
-
-	/* Do not allow write past end of the device */
-	if (unlikely((to + total_len) > mtd->size)) {
-		DEBUG(MTD_DEBUG_LEVEL0, "onenand_writev_ecc: Attempted write past end of device\n");
-		return -EINVAL;
-	}
-
-	/* Reject writes, which are not page aligned */
-        if (unlikely(NOTALIGNED(to)) || unlikely(NOTALIGNED(total_len))) {
-                DEBUG(MTD_DEBUG_LEVEL0, "onenand_writev_ecc: Attempt to write not page aligned data\n");
-                return -EINVAL;
-        }
-
-	/* Grab the lock and see if the device is available */
-	onenand_get_device(mtd, FL_WRITING);
-
-	/* TODO handling oob */
-
-	/* Loop until all keve's data has been written */
-	len = 0;
-	while (count) {
-		pbuf = this->page_buf;
-		/*
-		 * If the given tuple is >= pagesize then
-		 * write it out from the iov
-		 */
-		if ((vecs->iov_len - len) >= mtd->writesize) {
-			pbuf = vecs->iov_base + len;
-
-			len += mtd->writesize;
-
-			/* Check, if we have to switch to the next tuple */
-			if (len >= (int) vecs->iov_len) {
-				vecs++;
-				len = 0;
-				count--;
-			}
-		} else {
-			int cnt = 0, thislen;
-			while (cnt < mtd->writesize) {
-				thislen = min_t(int, mtd->writesize - cnt, vecs->iov_len - len);
-				memcpy(this->page_buf + cnt, vecs->iov_base + len, thislen);
-				cnt += thislen;
-				len += thislen;
-
-				/* Check, if we have to switch to the next tuple */
-				if (len >= (int) vecs->iov_len) {
-					vecs++;
-					len = 0;
-					count--;
-				}
-			}
-		}
-
-		this->command(mtd, ONENAND_CMD_BUFFERRAM, to, mtd->writesize);
-
-		this->write_bufferram(mtd, ONENAND_DATARAM, pbuf, 0, mtd->writesize);
-		this->write_bufferram(mtd, ONENAND_SPARERAM, ffchars, 0, mtd->oobsize);
-
-		this->command(mtd, ONENAND_CMD_PROG, to, mtd->writesize);
-
-		onenand_update_bufferram(mtd, to, 1);
-
-		ret = this->wait(mtd, FL_WRITING);
-		if (ret) {
-			DEBUG(MTD_DEBUG_LEVEL0, "onenand_writev_ecc: write failed %d\n", ret);
-			goto out;
-		}
-
-
-		/* Only check verify write turn on */
-		ret = onenand_verify_page(mtd, (u_char *) pbuf, to);
-		if (ret) {
-			DEBUG(MTD_DEBUG_LEVEL0, "onenand_writev_ecc: verify failed %d\n", ret);
-			goto out;
-		}
-
-		written += mtd->writesize;
-
-		to += mtd->writesize;
-	}
-
-out:
-	/* Deselect and wakt up anyone waiting on the device */
-	onenand_release_device(mtd);
-
-	*retlen = written;
-
-	return 0;
-}
-
-/**
- * onenand_writev - [MTD Interface] compabilty function for onenand_writev_ecc
- * @param mtd		MTD device structure
- * @param vecs		the iovectors to write
- * @param count		number of vectors
- * @param to		offset to write to
- * @param retlen	pointer to variable to store the number of written bytes
- *
- * OneNAND write with kvec. This just calls the ecc function
- */
-static int onenand_writev(struct mtd_info *mtd, const struct kvec *vecs,
-	unsigned long count, loff_t to, size_t *retlen)
-{
-	return onenand_writev_ecc(mtd, vecs, count, to, retlen, NULL, NULL);
-}
-
 /**
  * onenand_block_checkbad - [GENERIC] Check if a block is marked bad
  * @param mtd		MTD device structure
@@ -1964,8 +1826,6 @@ int onenand_scan(struct mtd_info *mtd, int maxchips)
 #endif
 	mtd->readv = NULL;
 	mtd->readv_ecc = NULL;
-	mtd->writev = onenand_writev;
-	mtd->writev_ecc = onenand_writev_ecc;
 	mtd->sync = onenand_sync;
 	mtd->lock = NULL;
 	mtd->unlock = onenand_unlock;
diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h
index d48c7492392b..dba25da84aed 100644
--- a/include/linux/mtd/mtd.h
+++ b/include/linux/mtd/mtd.h
@@ -142,8 +142,6 @@ struct mtd_info {
 	int (*readv_ecc) (struct mtd_info *mtd, struct kvec *vecs, unsigned long count, loff_t from,
 		size_t *retlen, u_char *eccbuf, struct nand_oobinfo *oobsel);
 	int (*writev) (struct mtd_info *mtd, const struct kvec *vecs, unsigned long count, loff_t to, size_t *retlen);
-	int (*writev_ecc) (struct mtd_info *mtd, const struct kvec *vecs, unsigned long count, loff_t to,
-		size_t *retlen, u_char *eccbuf, struct nand_oobinfo *oobsel);
 
 	/* Sync */
 	void (*sync) (struct mtd_info *mtd);
-- 
cgit v1.2.3


From 2528e8cdf376d7da24647c442ec1e88c360d76ca Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@cruncher.tec.linutronix.de>
Date: Tue, 23 May 2006 16:10:00 +0200
Subject: [MTD] Remove readv/readv_ecc

These functions were never implemented and added only bloat to
partition and concat code.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 drivers/mtd/mtdpart.c              | 29 -----------------------------
 drivers/mtd/nand/nand_base.c       |  1 -
 drivers/mtd/onenand/onenand_base.c |  2 --
 include/linux/mtd/mtd.h            |  6 +-----
 4 files changed, 1 insertion(+), 37 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/mtdpart.c b/drivers/mtd/mtdpart.c
index ae675608fa91..f418920320d2 100644
--- a/drivers/mtd/mtdpart.c
+++ b/drivers/mtd/mtdpart.c
@@ -212,31 +212,6 @@ static int part_writev (struct mtd_info *mtd,  const struct kvec *vecs,
 					to + part->offset, retlen);
 }
 
-static int part_readv (struct mtd_info *mtd,  struct kvec *vecs,
-			 unsigned long count, loff_t from, size_t *retlen)
-{
-	struct mtd_part *part = PART(mtd);
-	if (part->master->readv_ecc == NULL)
-		return part->master->readv (part->master, vecs, count,
-					from + part->offset, retlen);
-	else
-		return part->master->readv_ecc (part->master, vecs, count,
-					from + part->offset, retlen,
-					NULL, &mtd->oobinfo);
-}
-
-static int part_readv_ecc (struct mtd_info *mtd,  struct kvec *vecs,
-			 unsigned long count, loff_t from, size_t *retlen,
-			 u_char *eccbuf,  struct nand_oobinfo *oobsel)
-{
-	struct mtd_part *part = PART(mtd);
-	if (oobsel == NULL)
-		oobsel = &mtd->oobinfo;
-	return part->master->readv_ecc (part->master, vecs, count,
-					from + part->offset, retlen,
-					eccbuf, oobsel);
-}
-
 static int part_erase (struct mtd_info *mtd, struct erase_info *instr)
 {
 	struct mtd_part *part = PART(mtd);
@@ -425,10 +400,6 @@ int add_mtd_partitions(struct mtd_info *master,
 		}
 		if (master->writev)
 			slave->mtd.writev = part_writev;
-		if (master->readv)
-			slave->mtd.readv = part_readv;
-		if (master->readv_ecc)
-			slave->mtd.readv_ecc = part_readv_ecc;
 		if (master->lock)
 			slave->mtd.lock = part_lock;
 		if (master->unlock)
diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c
index 9aaeb3aa9d4d..da2f4d16e506 100644
--- a/drivers/mtd/nand/nand_base.c
+++ b/drivers/mtd/nand/nand_base.c
@@ -2531,7 +2531,6 @@ int nand_scan(struct mtd_info *mtd, int maxchips)
 	mtd->write_ecc = nand_write_ecc;
 	mtd->read_oob = nand_read_oob;
 	mtd->write_oob = nand_write_oob;
-	mtd->readv = NULL;
 	mtd->sync = nand_sync;
 	mtd->lock = NULL;
 	mtd->unlock = NULL;
diff --git a/drivers/mtd/onenand/onenand_base.c b/drivers/mtd/onenand/onenand_base.c
index 8e875fa140a8..3a3fe1d8fcdd 100644
--- a/drivers/mtd/onenand/onenand_base.c
+++ b/drivers/mtd/onenand/onenand_base.c
@@ -1824,8 +1824,6 @@ int onenand_scan(struct mtd_info *mtd, int maxchips)
 	mtd->write_user_prot_reg = onenand_write_user_prot_reg;
 	mtd->lock_user_prot_reg = onenand_lock_user_prot_reg;
 #endif
-	mtd->readv = NULL;
-	mtd->readv_ecc = NULL;
 	mtd->sync = onenand_sync;
 	mtd->lock = NULL;
 	mtd->unlock = onenand_unlock;
diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h
index dba25da84aed..af89e529b8d2 100644
--- a/include/linux/mtd/mtd.h
+++ b/include/linux/mtd/mtd.h
@@ -133,14 +133,10 @@ struct mtd_info {
 	int (*write_user_prot_reg) (struct mtd_info *mtd, loff_t from, size_t len, size_t *retlen, u_char *buf);
 	int (*lock_user_prot_reg) (struct mtd_info *mtd, loff_t from, size_t len);
 
-	/* kvec-based read/write methods. We need these especially for NAND flash,
-	   with its limited number of write cycles per erase.
+	/* kvec-based read/write methods.
 	   NB: The 'count' parameter is the number of _vectors_, each of
 	   which contains an (ofs, len) tuple.
 	*/
-	int (*readv) (struct mtd_info *mtd, struct kvec *vecs, unsigned long count, loff_t from, size_t *retlen);
-	int (*readv_ecc) (struct mtd_info *mtd, struct kvec *vecs, unsigned long count, loff_t from,
-		size_t *retlen, u_char *eccbuf, struct nand_oobinfo *oobsel);
 	int (*writev) (struct mtd_info *mtd, const struct kvec *vecs, unsigned long count, loff_t to, size_t *retlen);
 
 	/* Sync */
-- 
cgit v1.2.3


From 9223a456da8ed357bf7e0b128c853e2c8bd54614 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@cruncher.tec.linutronix.de>
Date: Tue, 23 May 2006 17:21:03 +0200
Subject: [MTD] Remove read/write _ecc variants

MTD clients are agnostic of FLASH which needs ECC suppport.
Remove the functions and fixup the callers.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 drivers/mtd/devices/doc2000.c      |   2 -
 drivers/mtd/devices/doc2001.c      |   2 -
 drivers/mtd/devices/doc2001plus.c  |   2 -
 drivers/mtd/inftlcore.c            |  63 +++++++++----------
 drivers/mtd/inftlmount.c           |  12 ++--
 drivers/mtd/mtdconcat.c            | 116 -----------------------------------
 drivers/mtd/mtdpart.c              |  54 ++--------------
 drivers/mtd/nand/nand_base.c       | 122 ++++++++++++++++++-------------------
 drivers/mtd/nand/nand_bbt.c        | 104 ++++++++++++++++++-------------
 drivers/mtd/nftlcore.c             |  15 ++---
 drivers/mtd/nftlmount.c            |  12 ++--
 drivers/mtd/onenand/onenand_base.c |  75 ++++++-----------------
 fs/jffs2/wbuf.c                    |  28 +++------
 include/linux/mtd/mtd.h            |   3 -
 include/linux/mtd/nand.h           |   3 +
 15 files changed, 209 insertions(+), 404 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/devices/doc2000.c b/drivers/mtd/devices/doc2000.c
index 6f32942fdf77..d9ba1ee658f6 100644
--- a/drivers/mtd/devices/doc2000.c
+++ b/drivers/mtd/devices/doc2000.c
@@ -584,8 +584,6 @@ void DoC2k_init(struct mtd_info *mtd)
 	mtd->unpoint = NULL;
 	mtd->read = doc_read;
 	mtd->write = doc_write;
-	mtd->read_ecc = doc_read_ecc;
-	mtd->write_ecc = doc_write_ecc;
 	mtd->read_oob = doc_read_oob;
 	mtd->write_oob = doc_write_oob;
 	mtd->sync = NULL;
diff --git a/drivers/mtd/devices/doc2001.c b/drivers/mtd/devices/doc2001.c
index e6eaef28a2b0..579c0b570ae5 100644
--- a/drivers/mtd/devices/doc2001.c
+++ b/drivers/mtd/devices/doc2001.c
@@ -369,8 +369,6 @@ void DoCMil_init(struct mtd_info *mtd)
 	mtd->unpoint = NULL;
 	mtd->read = doc_read;
 	mtd->write = doc_write;
-	mtd->read_ecc = doc_read_ecc;
-	mtd->write_ecc = doc_write_ecc;
 	mtd->read_oob = doc_read_oob;
 	mtd->write_oob = doc_write_oob;
 	mtd->sync = NULL;
diff --git a/drivers/mtd/devices/doc2001plus.c b/drivers/mtd/devices/doc2001plus.c
index 8422c5e92d27..1ee0c0dcb53b 100644
--- a/drivers/mtd/devices/doc2001plus.c
+++ b/drivers/mtd/devices/doc2001plus.c
@@ -491,8 +491,6 @@ void DoCMilPlus_init(struct mtd_info *mtd)
 	mtd->unpoint = NULL;
 	mtd->read = doc_read;
 	mtd->write = doc_write;
-	mtd->read_ecc = doc_read_ecc;
-	mtd->write_ecc = doc_write_ecc;
 	mtd->read_oob = doc_read_oob;
 	mtd->write_oob = doc_write_oob;
 	mtd->sync = NULL;
diff --git a/drivers/mtd/inftlcore.c b/drivers/mtd/inftlcore.c
index a3b92479719d..ddd12993780d 100644
--- a/drivers/mtd/inftlcore.c
+++ b/drivers/mtd/inftlcore.c
@@ -36,6 +36,7 @@
 #include <linux/mtd/mtd.h>
 #include <linux/mtd/nftl.h>
 #include <linux/mtd/inftl.h>
+#include <linux/mtd/nand.h>
 #include <asm/uaccess.h>
 #include <asm/errno.h>
 #include <asm/io.h>
@@ -79,14 +80,12 @@ static void inftl_add_mtd(struct mtd_blktrans_ops *tr, struct mtd_info *mtd)
 	inftl->mbd.devnum = -1;
 	inftl->mbd.blksize = 512;
 	inftl->mbd.tr = tr;
-	memcpy(&inftl->oobinfo, &mtd->oobinfo, sizeof(struct nand_oobinfo));
-	inftl->oobinfo.useecc = MTD_NANDECC_PLACEONLY;
 
-        if (INFTL_mount(inftl) < 0) {
+	if (INFTL_mount(inftl) < 0) {
 		printk(KERN_WARNING "INFTL: could not mount device\n");
 		kfree(inftl);
 		return;
-        }
+	}
 
 	/* OK, it's a new one. Set up all the data structures. */
 
@@ -221,7 +220,7 @@ static u16 INFTL_foldchain(struct INFTLrecord *inftl, unsigned thisVUC, unsigned
 	 * Scan to find the Erase Unit which holds the actual data for each
 	 * 512-byte block within the Chain.
 	 */
-        silly = MAX_LOOPS;
+	silly = MAX_LOOPS;
 	while (thisEUN < inftl->nb_blocks) {
 		for (block = 0; block < inftl->EraseSize/SECTORSIZE; block ++) {
 			if ((BlockMap[block] != 0xffff) || BlockDeleted[block])
@@ -232,7 +231,7 @@ static u16 INFTL_foldchain(struct INFTLrecord *inftl, unsigned thisVUC, unsigned
 			     (char *)&oob) < 0)
 				status = SECTOR_IGNORE;
 			else
-                        	status = oob.b.Status | oob.b.Status1;
+				status = oob.b.Status | oob.b.Status1;
 
 			switch(status) {
 			case SECTOR_FREE:
@@ -282,29 +281,30 @@ static u16 INFTL_foldchain(struct INFTLrecord *inftl, unsigned thisVUC, unsigned
 			continue;
 		}
 
-                /*
+		/*
 		 * Copy only in non free block (free blocks can only
                  * happen in case of media errors or deleted blocks).
 		 */
-                if (BlockMap[block] == BLOCK_NIL)
-                        continue;
+		if (BlockMap[block] == BLOCK_NIL)
+			continue;
 
-                ret = MTD_READ(inftl->mbd.mtd, (inftl->EraseSize *
+		ret = MTD_READ(inftl->mbd.mtd, (inftl->EraseSize *
 			BlockMap[block]) + (block * SECTORSIZE), SECTORSIZE,
 			&retlen, movebuf);
-                if (ret < 0) {
+		if (ret < 0) {
 			ret = MTD_READ(inftl->mbd.mtd, (inftl->EraseSize *
 				BlockMap[block]) + (block * SECTORSIZE),
 				SECTORSIZE, &retlen, movebuf);
 			if (ret != -EIO)
-                        	DEBUG(MTD_DEBUG_LEVEL1, "INFTL: error went "
-					"away on retry?\n");
-                }
-                memset(&oob, 0xff, sizeof(struct inftl_oob));
-                oob.b.Status = oob.b.Status1 = SECTOR_USED;
-                MTD_WRITEECC(inftl->mbd.mtd, (inftl->EraseSize * targetEUN) +
-			(block * SECTORSIZE), SECTORSIZE, &retlen,
-			movebuf, (char *)&oob, &inftl->oobinfo);
+				DEBUG(MTD_DEBUG_LEVEL1, "INFTL: error went "
+				      "away on retry?\n");
+		}
+		memset(&oob, 0xff, sizeof(struct inftl_oob));
+		oob.b.Status = oob.b.Status1 = SECTOR_USED;
+
+		nand_write_raw(inftl->mbd.mtd, (inftl->EraseSize * targetEUN) +
+			       (block * SECTORSIZE), SECTORSIZE, &retlen,
+			       movebuf, (char *)&oob);
 	}
 
 	/*
@@ -329,17 +329,17 @@ static u16 INFTL_foldchain(struct INFTLrecord *inftl, unsigned thisVUC, unsigned
 		if (thisEUN == targetEUN)
 			break;
 
-                if (INFTL_formatblock(inftl, thisEUN) < 0) {
+		if (INFTL_formatblock(inftl, thisEUN) < 0) {
 			/*
 			 * Could not erase : mark block as reserved.
 			 */
 			inftl->PUtable[thisEUN] = BLOCK_RESERVED;
-                } else {
+		} else {
 			/* Correctly erased : mark it as free */
 			inftl->PUtable[thisEUN] = BLOCK_FREE;
 			inftl->PUtable[prevEUN] = BLOCK_NIL;
 			inftl->numfreeEUNs++;
-                }
+		}
 	}
 
 	return targetEUN;
@@ -437,7 +437,7 @@ static inline u16 INFTL_findwriteunit(struct INFTLrecord *inftl, unsigned block)
 			MTD_READOOB(inftl->mbd.mtd, (thisEUN * inftl->EraseSize) +
 				blockofs, 8, &retlen, (char *)&bci);
 
-                        status = bci.Status | bci.Status1;
+			status = bci.Status | bci.Status1;
 			DEBUG(MTD_DEBUG_LEVEL3, "INFTL: status of block %d in "
 				"EUN %d is %x\n", block , writeEUN, status);
 
@@ -670,12 +670,12 @@ static void INFTL_trydeletechain(struct INFTLrecord *inftl, unsigned thisVUC)
 		DEBUG(MTD_DEBUG_LEVEL3, "Deleting EUN %d from VUC %d\n",
 		      thisEUN, thisVUC);
 
-                if (INFTL_formatblock(inftl, thisEUN) < 0) {
+		if (INFTL_formatblock(inftl, thisEUN) < 0) {
 			/*
 			 * Could not erase : mark block as reserved.
 			 */
 			inftl->PUtable[thisEUN] = BLOCK_RESERVED;
-                } else {
+		} else {
 			/* Correctly erased : mark it as free */
 			inftl->PUtable[thisEUN] = BLOCK_FREE;
 			inftl->numfreeEUNs++;
@@ -784,9 +784,10 @@ static int inftl_writeblock(struct mtd_blktrans_dev *mbd, unsigned long block,
 
 		memset(&oob, 0xff, sizeof(struct inftl_oob));
 		oob.b.Status = oob.b.Status1 = SECTOR_USED;
-		MTD_WRITEECC(inftl->mbd.mtd, (writeEUN * inftl->EraseSize) +
-			blockofs, SECTORSIZE, &retlen, (char *)buffer,
-			(char *)&oob, &inftl->oobinfo);
+
+		nand_write_raw(inftl->mbd.mtd, (writeEUN * inftl->EraseSize) +
+			       blockofs, SECTORSIZE, &retlen, (char *)buffer,
+			       (char *)&oob);
 		/*
 		 * need to write SECTOR_USED flags since they are not written
 		 * in mtd_writeecc
@@ -804,9 +805,9 @@ static int inftl_readblock(struct mtd_blktrans_dev *mbd, unsigned long block,
 	struct INFTLrecord *inftl = (void *)mbd;
 	unsigned int thisEUN = inftl->VUtable[block / (inftl->EraseSize / SECTORSIZE)];
 	unsigned long blockofs = (block * SECTORSIZE) & (inftl->EraseSize - 1);
-        unsigned int status;
+	unsigned int status;
 	int silly = MAX_LOOPS;
-        struct inftl_bci bci;
+	struct inftl_bci bci;
 	size_t retlen;
 
 	DEBUG(MTD_DEBUG_LEVEL3, "INFTL: inftl_readblock(inftl=%p,block=%ld,"
@@ -850,7 +851,7 @@ foundit:
 		/* The requested block is not on the media, return all 0x00 */
 		memset(buffer, 0, SECTORSIZE);
 	} else {
-        	size_t retlen;
+		size_t retlen;
 		loff_t ptr = (thisEUN * inftl->EraseSize) + blockofs;
 		if (MTD_READ(inftl->mbd.mtd, ptr, SECTORSIZE, &retlen,
 		    buffer))
diff --git a/drivers/mtd/inftlmount.c b/drivers/mtd/inftlmount.c
index 43fdc9433882..f89a03795e76 100644
--- a/drivers/mtd/inftlmount.c
+++ b/drivers/mtd/inftlmount.c
@@ -350,21 +350,21 @@ static int check_free_sectors(struct INFTLrecord *inftl, unsigned int address,
 	int len, int check_oob)
 {
 	u8 buf[SECTORSIZE + inftl->mbd.mtd->oobsize];
+	struct mtd_info *mtd = inftl->mbd.mtd;
 	size_t retlen;
 	int i;
 
-	DEBUG(MTD_DEBUG_LEVEL3, "INFTL: check_free_sectors(inftl=%p,"
-		"address=0x%x,len=%d,check_oob=%d)\n", inftl,
-		address, len, check_oob);
-
 	for (i = 0; i < len; i += SECTORSIZE) {
-		if (MTD_READECC(inftl->mbd.mtd, address, SECTORSIZE, &retlen, buf, &buf[SECTORSIZE], &inftl->oobinfo) < 0)
+		if (mtd->read(mtd, address, SECTORSIZE, &retlen, buf))
 			return -1;
 		if (memcmpb(buf, 0xff, SECTORSIZE) != 0)
 			return -1;
 
 		if (check_oob) {
-			if (memcmpb(buf + SECTORSIZE, 0xff, inftl->mbd.mtd->oobsize) != 0)
+			if(mtd->read_oob(mtd, address, mtd->oobsize,
+					 &retlen, &buf[SECTORSIZE]) < 0)
+				return -1;
+			if (memcmpb(buf + SECTORSIZE, 0xff, mtd->oobsize) != 0)
 				return -1;
 		}
 		address += SECTORSIZE;
diff --git a/drivers/mtd/mtdconcat.c b/drivers/mtd/mtdconcat.c
index a6fcee2713b0..6d52137988fa 100644
--- a/drivers/mtd/mtdconcat.c
+++ b/drivers/mtd/mtdconcat.c
@@ -142,116 +142,6 @@ concat_write(struct mtd_info *mtd, loff_t to, size_t len,
 	return err;
 }
 
-static int
-concat_read_ecc(struct mtd_info *mtd, loff_t from, size_t len,
-		size_t * retlen, u_char * buf, u_char * eccbuf,
-		struct nand_oobinfo *oobsel)
-{
-	struct mtd_concat *concat = CONCAT(mtd);
-	int err = -EINVAL;
-	int i;
-
-	*retlen = 0;
-
-	for (i = 0; i < concat->num_subdev; i++) {
-		struct mtd_info *subdev = concat->subdev[i];
-		size_t size, retsize;
-
-		if (from >= subdev->size) {
-			/* Not destined for this subdev */
-			size = 0;
-			from -= subdev->size;
-			continue;
-		}
-
-		if (from + len > subdev->size)
-			/* First part goes into this subdev */
-			size = subdev->size - from;
-		else
-			/* Entire transaction goes into this subdev */
-			size = len;
-
-		if (subdev->read_ecc)
-			err = subdev->read_ecc(subdev, from, size,
-					       &retsize, buf, eccbuf, oobsel);
-		else
-			err = -EINVAL;
-
-		if (err)
-			break;
-
-		*retlen += retsize;
-		len -= size;
-		if (len == 0)
-			break;
-
-		err = -EINVAL;
-		buf += size;
-		if (eccbuf) {
-			eccbuf += subdev->oobsize;
-			/* in nand.c at least, eccbufs are
-			   tagged with 2 (int)eccstatus'; we
-			   must account for these */
-			eccbuf += 2 * (sizeof (int));
-		}
-		from = 0;
-	}
-	return err;
-}
-
-static int
-concat_write_ecc(struct mtd_info *mtd, loff_t to, size_t len,
-		 size_t * retlen, const u_char * buf, u_char * eccbuf,
-		 struct nand_oobinfo *oobsel)
-{
-	struct mtd_concat *concat = CONCAT(mtd);
-	int err = -EINVAL;
-	int i;
-
-	if (!(mtd->flags & MTD_WRITEABLE))
-		return -EROFS;
-
-	*retlen = 0;
-
-	for (i = 0; i < concat->num_subdev; i++) {
-		struct mtd_info *subdev = concat->subdev[i];
-		size_t size, retsize;
-
-		if (to >= subdev->size) {
-			size = 0;
-			to -= subdev->size;
-			continue;
-		}
-		if (to + len > subdev->size)
-			size = subdev->size - to;
-		else
-			size = len;
-
-		if (!(subdev->flags & MTD_WRITEABLE))
-			err = -EROFS;
-		else if (subdev->write_ecc)
-			err = subdev->write_ecc(subdev, to, size,
-						&retsize, buf, eccbuf, oobsel);
-		else
-			err = -EINVAL;
-
-		if (err)
-			break;
-
-		*retlen += retsize;
-		len -= size;
-		if (len == 0)
-			break;
-
-		err = -EINVAL;
-		buf += size;
-		if (eccbuf)
-			eccbuf += subdev->oobsize;
-		to = 0;
-	}
-	return err;
-}
-
 static int
 concat_writev(struct mtd_info *mtd, const struct kvec *vecs,
 		unsigned long count, loff_t to, size_t * retlen)
@@ -823,10 +713,6 @@ struct mtd_info *mtd_concat_create(struct mtd_info *subdev[],	/* subdevices to c
 	concat->mtd.oobsize = subdev[0]->oobsize;
 	concat->mtd.ecctype = subdev[0]->ecctype;
 	concat->mtd.eccsize = subdev[0]->eccsize;
-	if (subdev[0]->read_ecc)
-		concat->mtd.read_ecc = concat_read_ecc;
-	if (subdev[0]->write_ecc)
-		concat->mtd.write_ecc = concat_write_ecc;
 	if (subdev[0]->writev)
 		concat->mtd.writev = concat_writev;
 	if (subdev[0]->read_oob)
@@ -869,8 +755,6 @@ struct mtd_info *mtd_concat_create(struct mtd_info *subdev[],	/* subdevices to c
 		    concat->mtd.oobsize    !=  subdev[i]->oobsize ||
 		    concat->mtd.ecctype    !=  subdev[i]->ecctype ||
 		    concat->mtd.eccsize    !=  subdev[i]->eccsize ||
-		    !concat->mtd.read_ecc  != !subdev[i]->read_ecc ||
-		    !concat->mtd.write_ecc != !subdev[i]->write_ecc ||
 		    !concat->mtd.read_oob  != !subdev[i]->read_oob ||
 		    !concat->mtd.write_oob != !subdev[i]->write_oob) {
 			kfree(concat);
diff --git a/drivers/mtd/mtdpart.c b/drivers/mtd/mtdpart.c
index f418920320d2..a93550ce7978 100644
--- a/drivers/mtd/mtdpart.c
+++ b/drivers/mtd/mtdpart.c
@@ -55,12 +55,8 @@ static int part_read (struct mtd_info *mtd, loff_t from, size_t len,
 		len = 0;
 	else if (from + len > mtd->size)
 		len = mtd->size - from;
-	if (part->master->read_ecc == NULL)
-		return part->master->read (part->master, from + part->offset,
-					len, retlen, buf);
-	else
-		return part->master->read_ecc (part->master, from + part->offset,
-					len, retlen, buf, NULL, &mtd->oobinfo);
+	return part->master->read (part->master, from + part->offset,
+				   len, retlen, buf);
 }
 
 static int part_point (struct mtd_info *mtd, loff_t from, size_t len,
@@ -74,6 +70,7 @@ static int part_point (struct mtd_info *mtd, loff_t from, size_t len,
 	return part->master->point (part->master, from + part->offset,
 				    len, retlen, buf);
 }
+
 static void part_unpoint (struct mtd_info *mtd, u_char *addr, loff_t from, size_t len)
 {
 	struct mtd_part *part = PART(mtd);
@@ -81,21 +78,6 @@ static void part_unpoint (struct mtd_info *mtd, u_char *addr, loff_t from, size_
 	part->master->unpoint (part->master, addr, from + part->offset, len);
 }
 
-
-static int part_read_ecc (struct mtd_info *mtd, loff_t from, size_t len,
-			size_t *retlen, u_char *buf, u_char *eccbuf, struct nand_oobinfo *oobsel)
-{
-	struct mtd_part *part = PART(mtd);
-	if (oobsel == NULL)
-		oobsel = &mtd->oobinfo;
-	if (from >= mtd->size)
-		len = 0;
-	else if (from + len > mtd->size)
-		len = mtd->size - from;
-	return part->master->read_ecc (part->master, from + part->offset,
-					len, retlen, buf, eccbuf, oobsel);
-}
-
 static int part_read_oob (struct mtd_info *mtd, loff_t from, size_t len,
 			size_t *retlen, u_char *buf)
 {
@@ -148,30 +130,8 @@ static int part_write (struct mtd_info *mtd, loff_t to, size_t len,
 		len = 0;
 	else if (to + len > mtd->size)
 		len = mtd->size - to;
-	if (part->master->write_ecc == NULL)
-		return part->master->write (part->master, to + part->offset,
-					len, retlen, buf);
-	else
-		return part->master->write_ecc (part->master, to + part->offset,
-					len, retlen, buf, NULL, &mtd->oobinfo);
-
-}
-
-static int part_write_ecc (struct mtd_info *mtd, loff_t to, size_t len,
-			size_t *retlen, const u_char *buf,
-			 u_char *eccbuf, struct nand_oobinfo *oobsel)
-{
-	struct mtd_part *part = PART(mtd);
-	if (!(mtd->flags & MTD_WRITEABLE))
-		return -EROFS;
-	if (oobsel == NULL)
-		oobsel = &mtd->oobinfo;
-	if (to >= mtd->size)
-		len = 0;
-	else if (to + len > mtd->size)
-		len = mtd->size - to;
-	return part->master->write_ecc (part->master, to + part->offset,
-					len, retlen, buf, eccbuf, oobsel);
+	return part->master->write (part->master, to + part->offset,
+				    len, retlen, buf);
 }
 
 static int part_write_oob (struct mtd_info *mtd, loff_t to, size_t len,
@@ -372,10 +332,6 @@ int add_mtd_partitions(struct mtd_info *master,
 			slave->mtd.unpoint = part_unpoint;
 		}
 
-		if (master->read_ecc)
-			slave->mtd.read_ecc = part_read_ecc;
-		if (master->write_ecc)
-			slave->mtd.write_ecc = part_write_ecc;
 		if (master->read_oob)
 			slave->mtd.read_oob = part_read_oob;
 		if (master->write_oob)
diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c
index da2f4d16e506..d796eb508b4f 100644
--- a/drivers/mtd/nand/nand_base.c
+++ b/drivers/mtd/nand/nand_base.c
@@ -139,16 +139,10 @@ static int nand_verify_buf(struct mtd_info *mtd, const uint8_t *buf, int len);
 
 static int nand_read(struct mtd_info *mtd, loff_t from, size_t len,
 		     size_t *retlen, uint8_t *buf);
-static int nand_read_ecc(struct mtd_info *mtd, loff_t from, size_t len,
-			 size_t *retlen, uint8_t *buf, uint8_t *eccbuf,
-			 struct nand_oobinfo *oobsel);
 static int nand_read_oob(struct mtd_info *mtd, loff_t from, size_t len,
 			 size_t *retlen, uint8_t *buf);
 static int nand_write(struct mtd_info *mtd, loff_t to, size_t len,
 		      size_t *retlen, const uint8_t *buf);
-static int nand_write_ecc(struct mtd_info *mtd, loff_t to, size_t len,
-			  size_t *retlen, const uint8_t *buf, uint8_t *eccbuf,
-			  struct nand_oobinfo *oobsel);
 static int nand_write_oob(struct mtd_info *mtd, loff_t to, size_t len,
 			  size_t *retlen, const uint8_t *buf);
 static int nand_erase(struct mtd_info *mtd, struct erase_info *instr);
@@ -1079,27 +1073,6 @@ static int nand_read(struct mtd_info *mtd, loff_t from, size_t len, size_t *retl
 	return nand_do_read_ecc(mtd, from, len, retlen, buf, NULL, &mtd->oobinfo, 0xff);
 }
 
-/**
- * nand_read_ecc - [MTD Interface] MTD compability function for nand_do_read_ecc
- * @mtd:	MTD device structure
- * @from:	offset to read from
- * @len:	number of bytes to read
- * @retlen:	pointer to variable to store the number of read bytes
- * @buf:	the databuffer to put data
- * @oob_buf:	filesystem supplied oob data buffer
- * @oobsel:	oob selection structure
- *
- * This function simply calls nand_do_read_ecc with flags = 0xff
- */
-static int nand_read_ecc(struct mtd_info *mtd, loff_t from, size_t len,
-			 size_t *retlen, uint8_t *buf, uint8_t *oob_buf, struct nand_oobinfo *oobsel)
-{
-	/* use userspace supplied oobinfo, if zero */
-	if (oobsel == NULL)
-		oobsel = &mtd->oobinfo;
-	return nand_do_read_ecc(mtd, from, len, retlen, buf, oob_buf, oobsel, 0xff);
-}
-
 /**
  * nand_do_read_ecc - [MTD Interface] Read data with ECC
  * @mtd:	MTD device structure
@@ -1523,6 +1496,55 @@ int nand_read_raw(struct mtd_info *mtd, uint8_t *buf, loff_t from, size_t len, s
 	return 0;
 }
 
+/**
+ * nand_write_raw - [GENERIC] Write raw data including oob
+ * @mtd:	MTD device structure
+ * @buf:	source buffer
+ * @to:		offset to write to
+ * @len:	number of bytes to write
+ * @buf:	source buffer
+ * @oob:	oob buffer
+ *
+ * Write raw data including oob
+ */
+int nand_write_raw(struct mtd_info *mtd, loff_t to, size_t len, size_t *retlen,
+		   uint8_t *buf, uint8_t *oob)
+{
+	struct nand_chip *this = mtd->priv;
+	int page = (int)(to >> this->page_shift);
+	int chip = (int)(to >> this->chip_shift);
+	int ret;
+
+	*retlen = 0;
+
+	/* Do not allow writes past end of device */
+	if ((to + len) > mtd->size) {
+		DEBUG(MTD_DEBUG_LEVEL0, "nand_read_raw: Attempt write "
+		      "beyond end of device\n");
+		return -EINVAL;
+	}
+
+	/* Grab the lock and see if the device is available */
+	nand_get_device(this, mtd, FL_WRITING);
+
+	this->select_chip(mtd, chip);
+	this->data_poi = buf;
+
+	while (len != *retlen) {
+		ret = nand_write_page(mtd, this, page, oob, &mtd->oobinfo, 0);
+		if (ret)
+			return ret;
+		page++;
+		*retlen += mtd->writesize;
+		this->data_poi += mtd->writesize;
+		oob += mtd->oobsize;
+	}
+
+	/* Deselect and wake up anyone waiting on the device */
+	nand_release_device(mtd);
+	return 0;
+}
+
 /**
  * nand_prepare_oobbuf - [GENERIC] Prepare the out of band buffer
  * @mtd:	MTD device structure
@@ -1585,57 +1607,39 @@ static uint8_t *nand_prepare_oobbuf(struct mtd_info *mtd, uint8_t *fsbuf, struct
 #define NOTALIGNED(x) (x & (mtd->writesize-1)) != 0
 
 /**
- * nand_write - [MTD Interface] compability function for nand_write_ecc
- * @mtd:	MTD device structure
- * @to:		offset to write to
- * @len:	number of bytes to write
- * @retlen:	pointer to variable to store the number of written bytes
- * @buf:	the data to write
- *
- * This function simply calls nand_write_ecc with oob buffer and oobsel = NULL
- *
-*/
-static int nand_write(struct mtd_info *mtd, loff_t to, size_t len, size_t *retlen, const uint8_t *buf)
-{
-	return (nand_write_ecc(mtd, to, len, retlen, buf, NULL, NULL));
-}
-
-/**
- * nand_write_ecc - [MTD Interface] NAND write with ECC
+ * nand_write - [MTD Interface] NAND write with ECC
  * @mtd:	MTD device structure
  * @to:		offset to write to
  * @len:	number of bytes to write
  * @retlen:	pointer to variable to store the number of written bytes
  * @buf:	the data to write
- * @eccbuf:	filesystem supplied oob data buffer
- * @oobsel:	oob selection structure
  *
  * NAND write with ECC
  */
-static int nand_write_ecc(struct mtd_info *mtd, loff_t to, size_t len,
-			  size_t *retlen, const uint8_t *buf, uint8_t *eccbuf,
-			  struct nand_oobinfo *oobsel)
+static int nand_write(struct mtd_info *mtd, loff_t to, size_t len,
+			  size_t *retlen, const uint8_t *buf)
 {
 	int startpage, page, ret = -EIO, oob = 0, written = 0, chipnr;
 	int autoplace = 0, numpages, totalpages;
 	struct nand_chip *this = mtd->priv;
-	uint8_t *oobbuf, *bufstart;
+	uint8_t *oobbuf, *bufstart, *eccbuf = NULL;
 	int ppblock = (1 << (this->phys_erase_shift - this->page_shift));
+	struct nand_oobinfo *oobsel = &mtd->oobinfo;
 
-	DEBUG(MTD_DEBUG_LEVEL3, "nand_write_ecc: to = 0x%08x, len = %i\n", (unsigned int)to, (int)len);
+	DEBUG(MTD_DEBUG_LEVEL3, "nand_write: to = 0x%08x, len = %i\n", (unsigned int)to, (int)len);
 
 	/* Initialize retlen, in case of early exit */
 	*retlen = 0;
 
 	/* Do not allow write past end of device */
 	if ((to + len) > mtd->size) {
-		DEBUG(MTD_DEBUG_LEVEL0, "nand_write_ecc: Attempt to write past end of page\n");
+		DEBUG(MTD_DEBUG_LEVEL0, "nand_write: Attempt to write past end of page\n");
 		return -EINVAL;
 	}
 
 	/* reject writes, which are not page aligned */
 	if (NOTALIGNED(to) || NOTALIGNED(len)) {
-		printk(KERN_NOTICE "nand_write_ecc: Attempt to write not page aligned data\n");
+		printk(KERN_NOTICE "nand_write: Attempt to write not page aligned data\n");
 		return -EINVAL;
 	}
 
@@ -1651,10 +1655,6 @@ static int nand_write_ecc(struct mtd_info *mtd, loff_t to, size_t len,
 	if (nand_check_wp(mtd))
 		goto out;
 
-	/* if oobsel is NULL, use chip defaults */
-	if (oobsel == NULL)
-		oobsel = &mtd->oobinfo;
-
 	/* Autoplace of oob data ? Use the default placement scheme */
 	if (oobsel->useecc == MTD_NANDECC_AUTOPLACE) {
 		oobsel = this->autooob;
@@ -1689,7 +1689,7 @@ static int nand_write_ecc(struct mtd_info *mtd, loff_t to, size_t len,
 		 */
 		ret = nand_write_page(mtd, this, page, &oobbuf[oob], oobsel, (--numpages > 0));
 		if (ret) {
-			DEBUG(MTD_DEBUG_LEVEL0, "nand_write_ecc: write_page failed %d\n", ret);
+			DEBUG(MTD_DEBUG_LEVEL0, "nand_write: write_page failed %d\n", ret);
 			goto out;
 		}
 		/* Next oob page */
@@ -1712,7 +1712,7 @@ static int nand_write_ecc(struct mtd_info *mtd, loff_t to, size_t len,
 			ret = nand_verify_pages(mtd, this, startpage, page - startpage,
 						oobbuf, oobsel, chipnr, (eccbuf != NULL));
 			if (ret) {
-				DEBUG(MTD_DEBUG_LEVEL0, "nand_write_ecc: verify_pages failed %d\n", ret);
+				DEBUG(MTD_DEBUG_LEVEL0, "nand_write: verify_pages failed %d\n", ret);
 				goto out;
 			}
 			*retlen = written;
@@ -1741,7 +1741,7 @@ static int nand_write_ecc(struct mtd_info *mtd, loff_t to, size_t len,
 	if (!ret)
 		*retlen = written;
 	else
-		DEBUG(MTD_DEBUG_LEVEL0, "nand_write_ecc: verify_pages failed %d\n", ret);
+		DEBUG(MTD_DEBUG_LEVEL0, "nand_write: verify_pages failed %d\n", ret);
 
  out:
 	/* Deselect and wake up anyone waiting on the device */
@@ -2527,8 +2527,6 @@ int nand_scan(struct mtd_info *mtd, int maxchips)
 	mtd->unpoint = NULL;
 	mtd->read = nand_read;
 	mtd->write = nand_write;
-	mtd->read_ecc = nand_read_ecc;
-	mtd->write_ecc = nand_write_ecc;
 	mtd->read_oob = nand_read_oob;
 	mtd->write_oob = nand_write_oob;
 	mtd->sync = nand_sync;
diff --git a/drivers/mtd/nand/nand_bbt.c b/drivers/mtd/nand/nand_bbt.c
index fbccb2a25186..ecaaca18d1e0 100644
--- a/drivers/mtd/nand/nand_bbt.c
+++ b/drivers/mtd/nand/nand_bbt.c
@@ -156,7 +156,7 @@ static int read_bbt(struct mtd_info *mtd, uint8_t *buf, int page, int num,
 
 	while (totlen) {
 		len = min(totlen, (size_t) (1 << this->bbt_erase_shift));
-		res = mtd->read_ecc(mtd, from, len, &retlen, buf, NULL, this->autooob);
+		res = mtd->read(mtd, from, len, &retlen, buf);
 		if (res < 0) {
 			if (retlen != len) {
 				printk(KERN_INFO "nand_bbt: Error reading bad block table\n");
@@ -471,17 +471,17 @@ static int search_read_bbts(struct mtd_info *mtd, uint8_t * buf, struct nand_bbt
  *
 */
 static int write_bbt(struct mtd_info *mtd, uint8_t *buf,
-		     struct nand_bbt_descr *td, struct nand_bbt_descr *md, int chipsel)
+		     struct nand_bbt_descr *td, struct nand_bbt_descr *md,
+		     int chipsel)
 {
 	struct nand_chip *this = mtd->priv;
-	struct nand_oobinfo oobinfo;
 	struct erase_info einfo;
 	int i, j, res, chip = 0;
 	int bits, startblock, dir, page, offs, numblocks, sft, sftmsk;
-	int nrchips, bbtoffs, pageoffs;
+	int nrchips, bbtoffs, pageoffs, ooboffs;
 	uint8_t msk[4];
 	uint8_t rcode = td->reserved_block_code;
-	size_t retlen, len = 0;
+	size_t retlen, len = 0, ooblen;
 	loff_t to;
 
 	if (!rcode)
@@ -526,12 +526,14 @@ static int write_bbt(struct mtd_info *mtd, uint8_t *buf,
 		for (i = 0; i < td->maxblocks; i++) {
 			int block = startblock + dir * i;
 			/* Check, if the block is bad */
-			switch ((this->bbt[block >> 2] >> (2 * (block & 0x03))) & 0x03) {
+			switch ((this->bbt[block >> 2] >>
+				 (2 * (block & 0x03))) & 0x03) {
 			case 0x01:
 			case 0x03:
 				continue;
 			}
-			page = block << (this->bbt_erase_shift - this->page_shift);
+			page = block <<
+				(this->bbt_erase_shift - this->page_shift);
 			/* Check, if the block is used by the mirror table */
 			if (!md || md->pages[chip] != page)
 				goto write;
@@ -542,11 +544,20 @@ static int write_bbt(struct mtd_info *mtd, uint8_t *buf,
 
 		/* Set up shift count and masks for the flash table */
 		bits = td->options & NAND_BBT_NRBITS_MSK;
+		msk[2] = ~rcode;
 		switch (bits) {
-		case 1: sft = 3; sftmsk = 0x07; msk[0] = 0x00; msk[1] = 0x01; msk[2] = ~rcode; msk[3] = 0x01; break;
-		case 2: sft = 2; sftmsk = 0x06; msk[0] = 0x00; msk[1] = 0x01; msk[2] = ~rcode; msk[3] = 0x03; break;
-		case 4: sft = 1; sftmsk = 0x04; msk[0] = 0x00; msk[1] = 0x0C; msk[2] = ~rcode; msk[3] = 0x0f; break;
-		case 8: sft = 0; sftmsk = 0x00; msk[0] = 0x00; msk[1] = 0x0F; msk[2] = ~rcode; msk[3] = 0xff; break;
+		case 1: sft = 3; sftmsk = 0x07; msk[0] = 0x00; msk[1] = 0x01;
+			msk[3] = 0x01;
+			break;
+		case 2: sft = 2; sftmsk = 0x06; msk[0] = 0x00; msk[1] = 0x01;
+			msk[3] = 0x03;
+			break;
+		case 4: sft = 1; sftmsk = 0x04; msk[0] = 0x00; msk[1] = 0x0C;
+			msk[3] = 0x0f;
+			break;
+		case 8: sft = 0; sftmsk = 0x00; msk[0] = 0x00; msk[1] = 0x0F;
+			msk[3] = 0xff;
+			break;
 		default: return -EINVAL;
 		}
 
@@ -554,49 +565,55 @@ static int write_bbt(struct mtd_info *mtd, uint8_t *buf,
 
 		to = ((loff_t) page) << this->page_shift;
 
-		memcpy(&oobinfo, this->autooob, sizeof(oobinfo));
-		oobinfo.useecc = MTD_NANDECC_PLACEONLY;
-
 		/* Must we save the block contents ? */
 		if (td->options & NAND_BBT_SAVECONTENT) {
 			/* Make it block aligned */
 			to &= ~((loff_t) ((1 << this->bbt_erase_shift) - 1));
 			len = 1 << this->bbt_erase_shift;
-			res = mtd->read_ecc(mtd, to, len, &retlen, buf, &buf[len], &oobinfo);
+			res = mtd->read(mtd, to, len, &retlen, buf);
 			if (res < 0) {
 				if (retlen != len) {
-					printk(KERN_INFO
-					       "nand_bbt: Error reading block for writing the bad block table\n");
+					printk(KERN_INFO "nand_bbt: Error "
+					       "reading block for writing "
+					       "the bad block table\n");
 					return res;
 				}
-				printk(KERN_WARNING "nand_bbt: ECC error while reading block for writing bad block table\n");
+				printk(KERN_WARNING "nand_bbt: ECC error "
+				       "while reading block for writing "
+				       "bad block table\n");
 			}
+			/* Read oob data */
+			ooblen = (len >> this->page_shift) * mtd->oobsize;
+			res = mtd->read_oob(mtd, to + mtd->writesize, ooblen,
+					    &retlen, &buf[len]);
+			if (res < 0 || retlen != ooblen)
+				goto outerr;
+
 			/* Calc the byte offset in the buffer */
 			pageoffs = page - (int)(to >> this->page_shift);
 			offs = pageoffs << this->page_shift;
 			/* Preset the bbt area with 0xff */
 			memset(&buf[offs], 0xff, (size_t) (numblocks >> sft));
-			/* Preset the bbt's oob area with 0xff */
-			memset(&buf[len + pageoffs * mtd->oobsize], 0xff,
-			       ((len >> this->page_shift) - pageoffs) * mtd->oobsize);
-			if (td->options & NAND_BBT_VERSION) {
-				buf[len + (pageoffs * mtd->oobsize) + td->veroffs] = td->version[chip];
-			}
+			ooboffs = len + (pageoffs * mtd->oobsize);
+
 		} else {
 			/* Calc length */
 			len = (size_t) (numblocks >> sft);
 			/* Make it page aligned ! */
-			len = (len + (mtd->writesize - 1)) & ~(mtd->writesize - 1);
+			len = (len + (mtd->writesize - 1)) &
+				~(mtd->writesize - 1);
 			/* Preset the buffer with 0xff */
-			memset(buf, 0xff, len + (len >> this->page_shift) * mtd->oobsize);
+			memset(buf, 0xff, len +
+			       (len >> this->page_shift)* mtd->oobsize);
 			offs = 0;
+			ooboffs = len;
 			/* Pattern is located in oob area of first page */
-			memcpy(&buf[len + td->offs], td->pattern, td->len);
-			if (td->options & NAND_BBT_VERSION) {
-				buf[len + td->veroffs] = td->version[chip];
-			}
+			memcpy(&buf[ooboffs + td->offs], td->pattern, td->len);
 		}
 
+		if (td->options & NAND_BBT_VERSION)
+			buf[ooboffs + td->veroffs] = td->version[chip];
+
 		/* walk through the memory table */
 		for (i = 0; i < numblocks;) {
 			uint8_t dat;
@@ -604,7 +621,8 @@ static int write_bbt(struct mtd_info *mtd, uint8_t *buf,
 			for (j = 0; j < 4; j++, i++) {
 				int sftcnt = (i << (3 - sft)) & sftmsk;
 				/* Do not store the reserved bbt blocks ! */
-				buf[offs + (i >> sft)] &= ~(msk[dat & 0x03] << sftcnt);
+				buf[offs + (i >> sft)] &=
+					~(msk[dat & 0x03] << sftcnt);
 				dat >>= 2;
 			}
 		}
@@ -614,23 +632,25 @@ static int write_bbt(struct mtd_info *mtd, uint8_t *buf,
 		einfo.addr = (unsigned long)to;
 		einfo.len = 1 << this->bbt_erase_shift;
 		res = nand_erase_nand(mtd, &einfo, 1);
-		if (res < 0) {
-			printk(KERN_WARNING "nand_bbt: Error during block erase: %d\n", res);
-			return res;
-		}
+		if (res < 0)
+			goto outerr;
 
-		res = mtd->write_ecc(mtd, to, len, &retlen, buf, &buf[len], &oobinfo);
-		if (res < 0) {
-			printk(KERN_WARNING "nand_bbt: Error while writing bad block table %d\n", res);
-			return res;
-		}
-		printk(KERN_DEBUG "Bad block table written to 0x%08x, version 0x%02X\n",
-		       (unsigned int)to, td->version[chip]);
+		res = nand_write_raw(mtd, to, len, &retlen, buf, &buf[len]);
+		if (res < 0)
+			goto outerr;
+
+		printk(KERN_DEBUG "Bad block table written to 0x%08x, version "
+		       "0x%02X\n", (unsigned int)to, td->version[chip]);
 
 		/* Mark it as used */
 		td->pages[chip] = page;
 	}
 	return 0;
+
+ outerr:
+	printk(KERN_WARNING
+	       "nand_bbt: Error while writing bad block table %d\n", res);
+	return res;
 }
 
 /**
diff --git a/drivers/mtd/nftlcore.c b/drivers/mtd/nftlcore.c
index d7cd5fa16ba4..dd03349946c2 100644
--- a/drivers/mtd/nftlcore.c
+++ b/drivers/mtd/nftlcore.c
@@ -70,8 +70,6 @@ static void nftl_add_mtd(struct mtd_blktrans_ops *tr, struct mtd_info *mtd)
 	nftl->mbd.devnum = -1;
 	nftl->mbd.blksize = 512;
 	nftl->mbd.tr = tr;
-	memcpy(&nftl->oobinfo, &mtd->oobinfo, sizeof(struct nand_oobinfo));
-	nftl->oobinfo.useecc = MTD_NANDECC_PLACEONLY;
 
         if (NFTL_mount(nftl) < 0) {
 		printk(KERN_WARNING "NFTL: could not mount device\n");
@@ -369,8 +367,11 @@ static u16 NFTL_foldchain (struct NFTLrecord *nftl, unsigned thisVUC, unsigned p
                 }
 		memset(&oob, 0xff, sizeof(struct nftl_oob));
 		oob.b.Status = oob.b.Status1 = SECTOR_USED;
-                MTD_WRITEECC(nftl->mbd.mtd, (nftl->EraseSize * targetEUN) + (block * 512),
-                             512, &retlen, movebuf, (char *)&oob, &nftl->oobinfo);
+
+		nand_write_raw(nftl->mbd.mtd, (nftl->EraseSize * targetEUN) +
+			       (block * 512), 512, &retlen, movebuf,
+			       (char *)&oob);
+
 	}
 
         /* add the header so that it is now a valid chain */
@@ -639,10 +640,10 @@ static int nftl_writeblock(struct mtd_blktrans_dev *mbd, unsigned long block,
 
 	memset(&oob, 0xff, sizeof(struct nftl_oob));
 	oob.b.Status = oob.b.Status1 = SECTOR_USED;
-	MTD_WRITEECC(nftl->mbd.mtd, (writeEUN * nftl->EraseSize) + blockofs,
-		     512, &retlen, (char *)buffer, (char *)&oob, &nftl->oobinfo);
-        /* need to write SECTOR_USED flags since they are not written in mtd_writeecc */
 
+	nand_write_raw(nftl->mbd.mtd, (writeEUN * nftl->EraseSize) +
+		       blockofs, 512, &retlen, (char *)buffer,
+		       (char *)&oob);
 	return 0;
 }
 #endif /* CONFIG_NFTL_RW */
diff --git a/drivers/mtd/nftlmount.c b/drivers/mtd/nftlmount.c
index 3b104ebb219a..90e5e7e97fdc 100644
--- a/drivers/mtd/nftlmount.c
+++ b/drivers/mtd/nftlmount.c
@@ -268,18 +268,22 @@ static int memcmpb(void *a, int c, int n)
 static int check_free_sectors(struct NFTLrecord *nftl, unsigned int address, int len,
 			      int check_oob)
 {
-	int i;
-	size_t retlen;
 	u8 buf[SECTORSIZE + nftl->mbd.mtd->oobsize];
+	struct mtd_info *mtd = nftl->mbd.mtd;
+	size_t retlen;
+	int i;
 
 	for (i = 0; i < len; i += SECTORSIZE) {
-		if (MTD_READECC(nftl->mbd.mtd, address, SECTORSIZE, &retlen, buf, &buf[SECTORSIZE], &nftl->oobinfo) < 0)
+		if (mtd->read(mtd, address, SECTORSIZE, &retlen, buf))
 			return -1;
 		if (memcmpb(buf, 0xff, SECTORSIZE) != 0)
 			return -1;
 
 		if (check_oob) {
-			if (memcmpb(buf + SECTORSIZE, 0xff, nftl->mbd.mtd->oobsize) != 0)
+			if(mtd->read_oob(mtd, address, mtd->oobsize,
+					 &retlen, &buf[SECTORSIZE]) < 0)
+				return -1;
+			if (memcmpb(buf + SECTORSIZE, 0xff, mtd->oobsize) != 0)
 				return -1;
 		}
 		address += SECTORSIZE;
diff --git a/drivers/mtd/onenand/onenand_base.c b/drivers/mtd/onenand/onenand_base.c
index 3a3fe1d8fcdd..7a2419186ff4 100644
--- a/drivers/mtd/onenand/onenand_base.c
+++ b/drivers/mtd/onenand/onenand_base.c
@@ -597,31 +597,28 @@ static void onenand_release_device(struct mtd_info *mtd)
 }
 
 /**
- * onenand_read_ecc - [MTD Interface] Read data with ECC
+ * onenand_read - [MTD Interface] Read data from flash
  * @param mtd		MTD device structure
  * @param from		offset to read from
  * @param len		number of bytes to read
  * @param retlen	pointer to variable to store the number of read bytes
  * @param buf		the databuffer to put data
- * @param oob_buf	filesystem supplied oob data buffer
- * @param oobsel	oob selection structure
  *
- * OneNAND read with ECC
- */
-static int onenand_read_ecc(struct mtd_info *mtd, loff_t from, size_t len,
-	size_t *retlen, u_char *buf,
-	u_char *oob_buf, struct nand_oobinfo *oobsel)
+ * Read with ecc
+*/
+static int onenand_read(struct mtd_info *mtd, loff_t from, size_t len,
+	size_t *retlen, u_char *buf)
 {
 	struct onenand_chip *this = mtd->priv;
 	int read = 0, column;
 	int thislen;
 	int ret = 0;
 
-	DEBUG(MTD_DEBUG_LEVEL3, "onenand_read_ecc: from = 0x%08x, len = %i\n", (unsigned int) from, (int) len);
+	DEBUG(MTD_DEBUG_LEVEL3, "onenand_read: from = 0x%08x, len = %i\n", (unsigned int) from, (int) len);
 
 	/* Do not allow reads past end of device */
 	if ((from + len) > mtd->size) {
-		DEBUG(MTD_DEBUG_LEVEL0, "onenand_read_ecc: Attempt read beyond end of device\n");
+		DEBUG(MTD_DEBUG_LEVEL0, "onenand_read: Attempt read beyond end of device\n");
 		*retlen = 0;
 		return -EINVAL;
 	}
@@ -654,7 +651,7 @@ static int onenand_read_ecc(struct mtd_info *mtd, loff_t from, size_t len,
 			break;
 
 		if (ret) {
-			DEBUG(MTD_DEBUG_LEVEL0, "onenand_read_ecc: read failed = %d\n", ret);
+			DEBUG(MTD_DEBUG_LEVEL0, "onenand_read: read failed = %d\n", ret);
 			goto out;
 		}
 
@@ -675,22 +672,6 @@ out:
 	return ret;
 }
 
-/**
- * onenand_read - [MTD Interface] MTD compability function for onenand_read_ecc
- * @param mtd		MTD device structure
- * @param from		offset to read from
- * @param len		number of bytes to read
- * @param retlen	pointer to variable to store the number of read bytes
- * @param buf		the databuffer to put data
- *
- * This function simply calls onenand_read_ecc with oob buffer and oobsel = NULL
-*/
-static int onenand_read(struct mtd_info *mtd, loff_t from, size_t len,
-	size_t *retlen, u_char *buf)
-{
-	return onenand_read_ecc(mtd, from, len, retlen, buf, NULL, NULL);
-}
-
 /**
  * onenand_read_oob - [MTD Interface] OneNAND read out-of-band
  * @param mtd		MTD device structure
@@ -834,39 +815,36 @@ static int onenand_verify_page(struct mtd_info *mtd, u_char *buf, loff_t addr)
 #define NOTALIGNED(x)	((x & (mtd->writesize - 1)) != 0)
 
 /**
- * onenand_write_ecc - [MTD Interface] OneNAND write with ECC
+ * onenand_write - [MTD Interface] write buffer to FLASH
  * @param mtd		MTD device structure
  * @param to		offset to write to
  * @param len		number of bytes to write
  * @param retlen	pointer to variable to store the number of written bytes
  * @param buf		the data to write
- * @param eccbuf	filesystem supplied oob data buffer
- * @param oobsel	oob selection structure
  *
- * OneNAND write with ECC
+ * Write with ECC
  */
-static int onenand_write_ecc(struct mtd_info *mtd, loff_t to, size_t len,
-	size_t *retlen, const u_char *buf,
-	u_char *eccbuf, struct nand_oobinfo *oobsel)
+static int onenand_write(struct mtd_info *mtd, loff_t to, size_t len,
+	size_t *retlen, const u_char *buf)
 {
 	struct onenand_chip *this = mtd->priv;
 	int written = 0;
 	int ret = 0;
 
-	DEBUG(MTD_DEBUG_LEVEL3, "onenand_write_ecc: to = 0x%08x, len = %i\n", (unsigned int) to, (int) len);
+	DEBUG(MTD_DEBUG_LEVEL3, "onenand_write: to = 0x%08x, len = %i\n", (unsigned int) to, (int) len);
 
 	/* Initialize retlen, in case of early exit */
 	*retlen = 0;
 
 	/* Do not allow writes past end of device */
 	if (unlikely((to + len) > mtd->size)) {
-		DEBUG(MTD_DEBUG_LEVEL0, "onenand_write_ecc: Attempt write to past end of device\n");
+		DEBUG(MTD_DEBUG_LEVEL0, "onenand_write: Attempt write to past end of device\n");
 		return -EINVAL;
 	}
 
 	/* Reject writes, which are not page aligned */
         if (unlikely(NOTALIGNED(to)) || unlikely(NOTALIGNED(len))) {
-                DEBUG(MTD_DEBUG_LEVEL0, "onenand_write_ecc: Attempt to write not page aligned data\n");
+                DEBUG(MTD_DEBUG_LEVEL0, "onenand_write: Attempt to write not page aligned data\n");
                 return -EINVAL;
         }
 
@@ -888,7 +866,7 @@ static int onenand_write_ecc(struct mtd_info *mtd, loff_t to, size_t len,
 
 		ret = this->wait(mtd, FL_WRITING);
 		if (ret) {
-			DEBUG(MTD_DEBUG_LEVEL0, "onenand_write_ecc: write filaed %d\n", ret);
+			DEBUG(MTD_DEBUG_LEVEL0, "onenand_write: write filaed %d\n", ret);
 			goto out;
 		}
 
@@ -897,7 +875,7 @@ static int onenand_write_ecc(struct mtd_info *mtd, loff_t to, size_t len,
 		/* Only check verify write turn on */
 		ret = onenand_verify_page(mtd, (u_char *) buf, to);
 		if (ret) {
-			DEBUG(MTD_DEBUG_LEVEL0, "onenand_write_ecc: verify failed %d\n", ret);
+			DEBUG(MTD_DEBUG_LEVEL0, "onenand_write: verify failed %d\n", ret);
 			goto out;
 		}
 
@@ -917,23 +895,6 @@ out:
 	return ret;
 }
 
-/**
- * onenand_write - [MTD Interface] compability function for onenand_write_ecc
- * @param mtd		MTD device structure
- * @param to		offset to write to
- * @param len		number of bytes to write
- * @param retlen	pointer to variable to store the number of written bytes
- * @param buf		the data to write
- *
- * This function simply calls onenand_write_ecc
- * with oob buffer and oobsel = NULL
- */
-static int onenand_write(struct mtd_info *mtd, loff_t to, size_t len,
-	size_t *retlen, const u_char *buf)
-{
-	return onenand_write_ecc(mtd, to, len, retlen, buf, NULL, NULL);
-}
-
 /**
  * onenand_write_oob - [MTD Interface] OneNAND write out-of-band
  * @param mtd		MTD device structure
@@ -1812,8 +1773,6 @@ int onenand_scan(struct mtd_info *mtd, int maxchips)
 	mtd->unpoint = NULL;
 	mtd->read = onenand_read;
 	mtd->write = onenand_write;
-	mtd->read_ecc = onenand_read_ecc;
-	mtd->write_ecc = onenand_write_ecc;
 	mtd->read_oob = onenand_read_oob;
 	mtd->write_oob = onenand_write_oob;
 #ifdef CONFIG_MTD_ONENAND_OTP
diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c
index 916c87d3393b..76d4c361ef1f 100644
--- a/fs/jffs2/wbuf.c
+++ b/fs/jffs2/wbuf.c
@@ -233,10 +233,7 @@ static void jffs2_wbuf_recover(struct jffs2_sb_info *c)
 		}
 
 		/* Do the read... */
-		if (jffs2_cleanmarker_oob(c))
-			ret = c->mtd->read_ecc(c->mtd, start, c->wbuf_ofs - start, &retlen, buf, NULL, c->oobinfo);
-		else
-			ret = c->mtd->read(c->mtd, start, c->wbuf_ofs - start, &retlen, buf);
+		ret = c->mtd->read(c->mtd, start, c->wbuf_ofs - start, &retlen, buf);
 
 		if (ret == -EBADMSG && retlen == c->wbuf_ofs - start) {
 			/* ECC recovered */
@@ -290,16 +287,13 @@ static void jffs2_wbuf_recover(struct jffs2_sb_info *c)
 		if (breakme++ == 20) {
 			printk(KERN_NOTICE "Faking write error at 0x%08x\n", ofs);
 			breakme = 0;
-			c->mtd->write_ecc(c->mtd, ofs, towrite, &retlen,
-					  brokenbuf, NULL, c->oobinfo);
+			c->mtd->write(c->mtd, ofs, towrite, &retlen,
+				      brokenbuf);
 			ret = -EIO;
 		} else
 #endif
-		if (jffs2_cleanmarker_oob(c))
-			ret = c->mtd->write_ecc(c->mtd, ofs, towrite, &retlen,
-						rewrite_buf, NULL, c->oobinfo);
-		else
-			ret = c->mtd->write(c->mtd, ofs, towrite, &retlen, rewrite_buf);
+			ret = c->mtd->write(c->mtd, ofs, towrite, &retlen,
+					    rewrite_buf);
 
 		if (ret || retlen != towrite) {
 			/* Argh. We tried. Really we did. */
@@ -457,15 +451,12 @@ static int __jffs2_flush_wbuf(struct jffs2_sb_info *c, int pad)
 	if (breakme++ == 20) {
 		printk(KERN_NOTICE "Faking write error at 0x%08x\n", c->wbuf_ofs);
 		breakme = 0;
-		c->mtd->write_ecc(c->mtd, c->wbuf_ofs, c->wbuf_pagesize,
-					&retlen, brokenbuf, NULL, c->oobinfo);
+		c->mtd->write(c->mtd, c->wbuf_ofs, c->wbuf_pagesize, &retlen,
+			      brokenbuf);
 		ret = -EIO;
 	} else
 #endif
 
-	if (jffs2_cleanmarker_oob(c))
-		ret = c->mtd->write_ecc(c->mtd, c->wbuf_ofs, c->wbuf_pagesize, &retlen, c->wbuf, NULL, c->oobinfo);
-	else
 		ret = c->mtd->write(c->mtd, c->wbuf_ofs, c->wbuf_pagesize, &retlen, c->wbuf);
 
 	if (ret || retlen != c->wbuf_pagesize) {
@@ -800,10 +791,7 @@ int jffs2_flash_read(struct jffs2_sb_info *c, loff_t ofs, size_t len, size_t *re
 
 	/* Read flash */
 	down_read(&c->wbuf_sem);
-	if (jffs2_cleanmarker_oob(c))
-		ret = c->mtd->read_ecc(c->mtd, ofs, len, retlen, buf, NULL, c->oobinfo);
-	else
-		ret = c->mtd->read(c->mtd, ofs, len, retlen, buf);
+	ret = c->mtd->read(c->mtd, ofs, len, retlen, buf);
 
 	if ( (ret == -EBADMSG) && (*retlen == len) ) {
 		printk(KERN_WARNING "mtd->read(0x%zx bytes from 0x%llx) returned ECC error\n",
diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h
index af89e529b8d2..b8ad634391db 100644
--- a/include/linux/mtd/mtd.h
+++ b/include/linux/mtd/mtd.h
@@ -115,9 +115,6 @@ struct mtd_info {
 	int (*read) (struct mtd_info *mtd, loff_t from, size_t len, size_t *retlen, u_char *buf);
 	int (*write) (struct mtd_info *mtd, loff_t to, size_t len, size_t *retlen, const u_char *buf);
 
-	int (*read_ecc) (struct mtd_info *mtd, loff_t from, size_t len, size_t *retlen, u_char *buf, u_char *eccbuf, struct nand_oobinfo *oobsel);
-	int (*write_ecc) (struct mtd_info *mtd, loff_t to, size_t len, size_t *retlen, const u_char *buf, u_char *eccbuf, struct nand_oobinfo *oobsel);
-
 	int (*read_oob) (struct mtd_info *mtd, loff_t from, size_t len, size_t *retlen, u_char *buf);
 	int (*write_oob) (struct mtd_info *mtd, loff_t to, size_t len, size_t *retlen, const u_char *buf);
 
diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h
index 6931376ed68d..8362b466df3a 100644
--- a/include/linux/mtd/nand.h
+++ b/include/linux/mtd/nand.h
@@ -36,6 +36,9 @@ extern int nand_read_raw (struct mtd_info *mtd, uint8_t *buf, loff_t from,
 			  size_t len, size_t ooblen);
 
 
+extern int nand_write_raw(struct mtd_info *mtd, loff_t to, size_t len,
+			  size_t *retlen, uint8_t *buf, uint8_t *oob);
+
 /* The maximum number of NAND chips in an array */
 #define NAND_MAX_CHIPS		8
 
-- 
cgit v1.2.3


From 7abd3ef9875eb2afcdcd4f450680298a2983a55e Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@cruncher.tec.linutronix.de>
Date: Tue, 23 May 2006 23:25:53 +0200
Subject: [MTD] Refactor NAND hwcontrol to cmd_ctrl

The hwcontrol function enforced a step by step state machine
for any kind of hardware chip access. Let the hardware driver
know which control bits are set and inform it about a change
of the control lines. Let the hardware driver write out the
command and address bytes directly. This gives a peformance
advantage for address bus controlled chips and simplifies the
quirks in the hardware drivers.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 drivers/mtd/nand/ams-delta.c      |  56 +++++++++----------
 drivers/mtd/nand/au1550nd.c       |  25 ++++++---
 drivers/mtd/nand/autcpu12.c       |  77 +++++++++++++++----------
 drivers/mtd/nand/cs553x_nand.c    |  32 +++--------
 drivers/mtd/nand/diskonchip.c     |  77 ++++++++++---------------
 drivers/mtd/nand/edb7312.c        |  42 ++++++--------
 drivers/mtd/nand/h1910.c          |  40 ++++---------
 drivers/mtd/nand/nand_base.c      | 115 ++++++++++++++++++--------------------
 drivers/mtd/nand/nandsim.c        |  76 +++++--------------------
 drivers/mtd/nand/ndfc.c           |  23 +++-----
 drivers/mtd/nand/ppchameleonevb.c | 102 ++++++++++++++++++---------------
 drivers/mtd/nand/rtc_from4.c      |  34 ++++-------
 drivers/mtd/nand/s3c2410.c        |  64 +++++++--------------
 drivers/mtd/nand/sharpsl.c        |  41 ++++++--------
 drivers/mtd/nand/spia.c           |  27 +++++----
 drivers/mtd/nand/toto.c           |  65 ++++++++++-----------
 drivers/mtd/nand/ts7250.c         |  44 +++++++--------
 include/linux/mtd/nand.h          |  33 +++++------
 18 files changed, 430 insertions(+), 543 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/nand/ams-delta.c b/drivers/mtd/nand/ams-delta.c
index aeaf2dece095..c0e96860686e 100644
--- a/drivers/mtd/nand/ams-delta.c
+++ b/drivers/mtd/nand/ams-delta.c
@@ -34,13 +34,6 @@ static struct mtd_info *ams_delta_mtd = NULL;
 
 #define NAND_MASK (AMS_DELTA_LATCH2_NAND_NRE | AMS_DELTA_LATCH2_NAND_NWE | AMS_DELTA_LATCH2_NAND_CLE | AMS_DELTA_LATCH2_NAND_ALE | AMS_DELTA_LATCH2_NAND_NCE | AMS_DELTA_LATCH2_NAND_NWP)
 
-#define T_NAND_CTL_CLRALE(iob) ams_delta_latch2_write(AMS_DELTA_LATCH2_NAND_ALE, 0)
-#define T_NAND_CTL_SETALE(iob) ams_delta_latch2_write(AMS_DELTA_LATCH2_NAND_ALE, AMS_DELTA_LATCH2_NAND_ALE)
-#define T_NAND_CTL_CLRCLE(iob) ams_delta_latch2_write(AMS_DELTA_LATCH2_NAND_CLE, 0)
-#define T_NAND_CTL_SETCLE(iob) ams_delta_latch2_write(AMS_DELTA_LATCH2_NAND_CLE, AMS_DELTA_LATCH2_NAND_CLE)
-#define T_NAND_CTL_SETNCE(iob) ams_delta_latch2_write(AMS_DELTA_LATCH2_NAND_NCE, 0)
-#define T_NAND_CTL_CLRNCE(iob) ams_delta_latch2_write(AMS_DELTA_LATCH2_NAND_NCE, AMS_DELTA_LATCH2_NAND_NCE)
-
 /*
  * Define partitions for flash devices
  */
@@ -66,25 +59,6 @@ static struct mtd_partition partition_info[] = {
 	  .size		=  3 * SZ_256K },
 };
 
-/*
- *	hardware specific access to control-lines
-*/
-
-static void ams_delta_hwcontrol(struct mtd_info *mtd, int cmd)
-{
-	switch (cmd) {
-
-		case NAND_CTL_SETCLE: T_NAND_CTL_SETCLE(cmd); break;
-		case NAND_CTL_CLRCLE: T_NAND_CTL_CLRCLE(cmd); break;
-
-		case NAND_CTL_SETALE: T_NAND_CTL_SETALE(cmd); break;
-		case NAND_CTL_CLRALE: T_NAND_CTL_CLRALE(cmd); break;
-
-		case NAND_CTL_SETNCE: T_NAND_CTL_SETNCE(cmd); break;
-		case NAND_CTL_CLRNCE: T_NAND_CTL_CLRNCE(cmd); break;
-	}
-}
-
 static void ams_delta_write_byte(struct mtd_info *mtd, u_char byte)
 {
 	struct nand_chip *this = mtd->priv;
@@ -141,6 +115,32 @@ static int ams_delta_verify_buf(struct mtd_info *mtd, const u_char *buf,
 	return 0;
 }
 
+/*
+ * Command control function
+ *
+ * ctrl:
+ * NAND_NCE: bit 0 -> bit 2
+ * NAND_CLE: bit 1 -> bit 7
+ * NAND_ALE: bit 2 -> bit 6
+ */
+static void ams_delta_hwcontrol(struct mtd_info *mtd, int cmd,
+				unsigned int ctrl)
+{
+
+	if (ctrl & NAND_CTRL_CHANGE) {
+		unsigned long bits;
+
+		bits = (~ctrl & NAND_NCE) << 2;
+		bits |= (ctrl & NAND_CLE) << 7;
+		bits |= (ctrl & NAND_ALE) << 6;
+
+		ams_delta_latch2_write(0xC2, bits);
+	}
+
+	if (cmd != NAND_CMD_NONE)
+		ams_delta_write_byte(mtd, cmd);
+}
+
 static int ams_delta_nand_ready(struct mtd_info *mtd)
 {
 	return omap_get_gpio_datain(AMS_DELTA_GPIO_PIN_NAND_RB);
@@ -183,7 +183,7 @@ static int __init ams_delta_init(void)
 	this->write_buf = ams_delta_write_buf;
 	this->read_buf = ams_delta_read_buf;
 	this->verify_buf = ams_delta_verify_buf;
-	this->hwcontrol = ams_delta_hwcontrol;
+	this->cmd_ctrl = ams_delta_hwcontrol;
 	if (!omap_request_gpio(AMS_DELTA_GPIO_PIN_NAND_RB)) {
 		this->dev_ready = ams_delta_nand_ready;
 	} else {
@@ -200,7 +200,7 @@ static int __init ams_delta_init(void)
 					  AMS_DELTA_LATCH2_NAND_NCE |
 					  AMS_DELTA_LATCH2_NAND_NWP);
 
-        /* Scan to find existance of the device */
+	/* Scan to find existance of the device */
 	if (nand_scan(ams_delta_mtd, 1)) {
 		err = -ENXIO;
 		goto out_mtd;
diff --git a/drivers/mtd/nand/au1550nd.c b/drivers/mtd/nand/au1550nd.c
index 29dde7dcafa1..275453ea7a71 100644
--- a/drivers/mtd/nand/au1550nd.c
+++ b/drivers/mtd/nand/au1550nd.c
@@ -269,6 +269,18 @@ static int au_verify_buf16(struct mtd_info *mtd, const u_char *buf, int len)
 	return 0;
 }
 
+/* Select the chip by setting nCE to low */
+#define NAND_CTL_SETNCE		1
+/* Deselect the chip by setting nCE to high */
+#define NAND_CTL_CLRNCE		2
+/* Select the command latch by setting CLE to high */
+#define NAND_CTL_SETCLE		3
+/* Deselect the command latch by setting CLE to low */
+#define NAND_CTL_CLRCLE		4
+/* Select the address latch by setting ALE to high */
+#define NAND_CTL_SETALE		5
+/* Deselect the address latch by setting ALE to low */
+#define NAND_CTL_CLRALE		6
 
 static void au1550_hwcontrol(struct mtd_info *mtd, int cmd)
 {
@@ -349,7 +361,7 @@ static void au1550_command(struct mtd_info *mtd, unsigned command, int column, i
 	ulong flags;
 
 	/* Begin command latch cycle */
-	this->hwcontrol(mtd, NAND_CTL_SETCLE);
+	au1550_hwcontrol(mtd, NAND_CTL_SETCLE);
 	/*
 	 * Write out the command to the device.
 	 */
@@ -372,10 +384,10 @@ static void au1550_command(struct mtd_info *mtd, unsigned command, int column, i
 	this->write_byte(mtd, command);
 
 	/* Set ALE and clear CLE to start address cycle */
-	this->hwcontrol(mtd, NAND_CTL_CLRCLE);
+	au1550_hwcontrol(mtd, NAND_CTL_CLRCLE);
 
 	if (column != -1 || page_addr != -1) {
-		this->hwcontrol(mtd, NAND_CTL_SETALE);
+		au1550_hwcontrol(mtd, NAND_CTL_SETALE);
 
 		/* Serially input address */
 		if (column != -1) {
@@ -400,7 +412,7 @@ static void au1550_command(struct mtd_info *mtd, unsigned command, int column, i
 				 */
 				ce_override = 1;
 				local_irq_save(flags);
-				this->hwcontrol(mtd, NAND_CTL_SETNCE);
+				au1550_hwcontrol(mtd, NAND_CTL_SETNCE);
 			}
 
 			this->write_byte(mtd, (u8)(page_addr >> 8));
@@ -410,7 +422,7 @@ static void au1550_command(struct mtd_info *mtd, unsigned command, int column, i
 				this->write_byte(mtd, (u8)((page_addr >> 16) & 0x0f));
 		}
 		/* Latch in address */
-		this->hwcontrol(mtd, NAND_CTL_CLRALE);
+		au1550_hwcontrol(mtd, NAND_CTL_CLRALE);
 	}
 
 	/*
@@ -443,7 +455,7 @@ static void au1550_command(struct mtd_info *mtd, unsigned command, int column, i
 			udelay(1);
 
 		/* Release -CE and re-enable interrupts. */
-		this->hwcontrol(mtd, NAND_CTL_CLRNCE);
+		au1550_hwcontrol(mtd, NAND_CTL_CLRNCE);
 		local_irq_restore(flags);
 		return;
 	}
@@ -571,7 +583,6 @@ static int __init au1xxx_nand_init(void)
 		nand_width = au_readl(MEM_STCFG3) & (1 << 22);
 
 	/* Set address of hardware control function */
-	this->hwcontrol = au1550_hwcontrol;
 	this->dev_ready = au1550_device_ready;
 	this->select_chip = au1550_select_chip;
 	this->cmdfunc = au1550_command;
diff --git a/drivers/mtd/nand/autcpu12.c b/drivers/mtd/nand/autcpu12.c
index dbb1b6267ade..fe94ae9ae1f2 100644
--- a/drivers/mtd/nand/autcpu12.c
+++ b/drivers/mtd/nand/autcpu12.c
@@ -4,7 +4,7 @@
  *  Copyright (c) 2002 Thomas Gleixner <tgxl@linutronix.de>
  *
  *  Derived from drivers/mtd/spia.c
- * 	 Copyright (C) 2000 Steven J. Hill (sjhill@realitydiluted.com)
+ *	 Copyright (C) 2000 Steven J. Hill (sjhill@realitydiluted.com)
  *
  * $Id: autcpu12.c,v 1.23 2005/11/07 11:14:30 gleixner Exp $
  *
@@ -42,11 +42,6 @@
  * MTD structure for AUTCPU12 board
  */
 static struct mtd_info *autcpu12_mtd = NULL;
-
-static int autcpu12_io_base = CS89712_VIRT_BASE;
-static int autcpu12_fio_pbase = AUTCPU12_PHYS_SMC;
-static int autcpu12_fio_ctrl = AUTCPU12_SMC_SELECT_OFFSET;
-static int autcpu12_pedr = AUTCPU12_SMC_PORT_OFFSET;
 static void __iomem *autcpu12_fio_base;
 
 /*
@@ -94,31 +89,42 @@ static struct mtd_partition partition_info128k[] = {
 #define NUM_PARTITIONS128K 2
 /*
  *	hardware specific access to control-lines
-*/
-
-static void autcpu12_hwcontrol(struct mtd_info *mtd, int cmd)
+ *
+ *	ALE bit 4 autcpu12_pedr
+ *	CLE bit 5 autcpu12_pedr
+ *	NCE bit 0 fio_ctrl
+ *
+ */
+static void autcpu12_hwcontrol(struct mtd_info *mtd, int cmd,
+			       unsigned int ctrl)
 {
-	switch (cmd) {
+	struct nand_chip *chip = mtd->priv;
 
-		case NAND_CTL_SETCLE: (*(volatile unsigned char *) (autcpu12_io_base + autcpu12_pedr)) |=  AUTCPU12_SMC_CLE; break;
-		case NAND_CTL_CLRCLE: (*(volatile unsigned char *) (autcpu12_io_base + autcpu12_pedr)) &= ~AUTCPU12_SMC_CLE; break;
+	if (ctrl & NAND_CTRL_CHANGE) {
+		void __iomem *addr
+		unsigned char bits;
 
-		case NAND_CTL_SETALE: (*(volatile unsigned char *) (autcpu12_io_base + autcpu12_pedr)) |=  AUTCPU12_SMC_ALE; break;
-		case NAND_CTL_CLRALE: (*(volatile unsigned char *) (autcpu12_io_base + autcpu12_pedr)) &= ~AUTCPU12_SMC_ALE; break;
+		addr = CS89712_VIRT_BASE + AUTCPU12_SMC_PORT_OFFSET;
+		bits = (ctrl & NAND_CLE) << 4;
+		bits |= (ctrl & NAND_ALE) << 2;
+		writeb((readb(addr) & ~0x30) | bits, addr);
 
-		case NAND_CTL_SETNCE: (*(volatile unsigned char *) (autcpu12_fio_base + autcpu12_fio_ctrl)) = 0x01; break;
-		case NAND_CTL_CLRNCE: (*(volatile unsigned char *) (autcpu12_fio_base + autcpu12_fio_ctrl)) = 0x00; break;
+		addr = autcpu12_fio_base + AUTCPU12_SMC_SELECT_OFFSET;
+		writeb((readb(addr) & ~0x1) | (ctrl & NAND_NCE), addr);
 	}
+
+	if (cmd != NAND_CMD_NONE)
+		writeb(cmd, chip->IO_ADDR_W);
 }
 
 /*
-*	read device ready pin
-*/
+ *	read device ready pin
+ */
 int autcpu12_device_ready(struct mtd_info *mtd)
 {
+	void __iomem *addr = CS89712_VIRT_BASE + AUTCPU12_SMC_PORT_OFFSET;
 
-	return ((*(volatile unsigned char *)(autcpu12_io_base + autcpu12_pedr)) & AUTCPU12_SMC_RDY) ? 1 : 0;
-
+	return readb(addr) & AUTCPU12_SMC_RDY;
 }
 
 /*
@@ -130,7 +136,8 @@ static int __init autcpu12_init(void)
 	int err = 0;
 
 	/* Allocate memory for MTD device structure and private data */
-	autcpu12_mtd = kmalloc(sizeof(struct mtd_info) + sizeof(struct nand_chip), GFP_KERNEL);
+	autcpu12_mtd = kmalloc(sizeof(struct mtd_info) + sizeof(struct nand_chip),
+			       GFP_KERNEL);
 	if (!autcpu12_mtd) {
 		printk("Unable to allocate AUTCPU12 NAND MTD device structure.\n");
 		err = -ENOMEM;
@@ -138,7 +145,7 @@ static int __init autcpu12_init(void)
 	}
 
 	/* map physical adress */
-	autcpu12_fio_base = ioremap(autcpu12_fio_pbase, SZ_1K);
+	autcpu12_fio_base = ioremap(AUTCPU12_PHYS_SMC, SZ_1K);
 	if (!autcpu12_fio_base) {
 		printk("Ioremap autcpu12 SmartMedia Card failed\n");
 		err = -EIO;
@@ -159,7 +166,7 @@ static int __init autcpu12_init(void)
 	/* Set address of NAND IO lines */
 	this->IO_ADDR_R = autcpu12_fio_base;
 	this->IO_ADDR_W = autcpu12_fio_base;
-	this->hwcontrol = autcpu12_hwcontrol;
+	this->cmd_ctrl = autcpu12_hwcontrol;
 	this->dev_ready = autcpu12_device_ready;
 	/* 20 us command delay time */
 	this->chip_delay = 20;
@@ -179,10 +186,22 @@ static int __init autcpu12_init(void)
 
 	/* Register the partitions */
 	switch (autcpu12_mtd->size) {
-		case SZ_16M: add_mtd_partitions(autcpu12_mtd, partition_info16k, NUM_PARTITIONS16K); break;
-		case SZ_32M: add_mtd_partitions(autcpu12_mtd, partition_info32k, NUM_PARTITIONS32K); break;
-		case SZ_64M: add_mtd_partitions(autcpu12_mtd, partition_info64k, NUM_PARTITIONS64K); break;
-		case SZ_128M: add_mtd_partitions(autcpu12_mtd, partition_info128k, NUM_PARTITIONS128K); break;
+		case SZ_16M:
+			add_mtd_partitions(autcpu12_mtd, partition_info16k,
+					   NUM_PARTITIONS16K);
+			break;
+		case SZ_32M:
+			add_mtd_partitions(autcpu12_mtd, partition_info32k,
+					   NUM_PARTITIONS32K);
+			break;
+		case SZ_64M:
+			add_mtd_partitions(autcpu12_mtd, partition_info64k,
+					   NUM_PARTITIONS64K);
+			break;
+		case SZ_128M:
+			add_mtd_partitions(autcpu12_mtd, partition_info128k,
+					   NUM_PARTITIONS128K);
+			break;
 		default:
 			printk("Unsupported SmartMedia device\n");
 			err = -ENXIO;
@@ -191,7 +210,7 @@ static int __init autcpu12_init(void)
 	goto out;
 
  out_ior:
-	iounmap((void *)autcpu12_fio_base);
+	iounmap(autcpu12_fio_base);
  out_mtd:
 	kfree(autcpu12_mtd);
  out:
@@ -209,7 +228,7 @@ static void __exit autcpu12_cleanup(void)
 	nand_release(autcpu12_mtd);
 
 	/* unmap physical adress */
-	iounmap((void *)autcpu12_fio_base);
+	iounmap(autcpu12_fio_base);
 
 	/* Free the MTD device structure */
 	kfree(autcpu12_mtd);
diff --git a/drivers/mtd/nand/cs553x_nand.c b/drivers/mtd/nand/cs553x_nand.c
index 064f3feadf53..cd3d7eb132f9 100644
--- a/drivers/mtd/nand/cs553x_nand.c
+++ b/drivers/mtd/nand/cs553x_nand.c
@@ -131,33 +131,17 @@ static void cs553x_write_byte(struct mtd_info *mtd, u_char byte)
 	writeb(byte, this->IO_ADDR_W + 0x801);
 }
 
-static void cs553x_hwcontrol(struct mtd_info *mtd, int cmd)
+static void cs553x_hwcontrol(struct mtd_info *mtd, int cmd,
+			     unsigned int ctrl)
 {
 	struct nand_chip *this = mtd->priv;
 	void __iomem *mmio_base = this->IO_ADDR_R;
-	unsigned char ctl;
-
-	switch (cmd) {
-	case NAND_CTL_SETCLE:
-		ctl = CS_NAND_CTL_CLE;
-		break;
-
-	case NAND_CTL_CLRCLE:
-	case NAND_CTL_CLRALE:
-	case NAND_CTL_SETNCE:
-		ctl = 0;
-		break;
-
-	case NAND_CTL_SETALE:
-		ctl = CS_NAND_CTL_ALE;
-		break;
-
-	default:
-	case NAND_CTL_CLRNCE:
-		ctl = CS_NAND_CTL_CE;
-		break;
+	if (ctrl & NAND_CTRL_CHANGE) {
+		unsigned char ctl = (ctrl & ~NAND_CTRL_CHANGE ) ^ 0x01;
+		writeb(ctl, mmio_base + MM_NAND_CTL);
 	}
-	writeb(ctl, mmio_base + MM_NAND_CTL);
+	if (cmd != NAND_CMD_NONE)
+		cs553x_write_byte(mtd, cmd);
 }
 
 static int cs553x_device_ready(struct mtd_info *mtd)
@@ -233,7 +217,7 @@ static int __init cs553x_init_one(int cs, int mmio, unsigned long adr)
 		goto out_mtd;
 	}
 
-	this->hwcontrol = cs553x_hwcontrol;
+	this->cmd_ctrl = cs553x_hwcontrol;
 	this->dev_ready = cs553x_device_ready;
 	this->read_byte = cs553x_read_byte;
 	this->write_byte = cs553x_write_byte;
diff --git a/drivers/mtd/nand/diskonchip.c b/drivers/mtd/nand/diskonchip.c
index f77298f3af60..e4bb6b429f87 100644
--- a/drivers/mtd/nand/diskonchip.c
+++ b/drivers/mtd/nand/diskonchip.c
@@ -95,7 +95,8 @@ static u_char empty_write_ecc[6] = { 0x4b, 0x00, 0xe2, 0x0e, 0x93, 0xf7 };
 #define DoC_is_Millennium(doc) ((doc)->ChipID == DOC_ChipID_DocMil)
 #define DoC_is_2000(doc) ((doc)->ChipID == DOC_ChipID_Doc2k)
 
-static void doc200x_hwcontrol(struct mtd_info *mtd, int cmd);
+static void doc200x_hwcontrol(struct mtd_info *mtd, int cmd,
+			      unsigned int bitmask);
 static void doc200x_select_chip(struct mtd_info *mtd, int chip);
 
 static int debug = 0;
@@ -402,12 +403,10 @@ static uint16_t __init doc200x_ident_chip(struct mtd_info *mtd, int nr)
 	uint16_t ret;
 
 	doc200x_select_chip(mtd, nr);
-	doc200x_hwcontrol(mtd, NAND_CTL_SETCLE);
-	this->write_byte(mtd, NAND_CMD_READID);
-	doc200x_hwcontrol(mtd, NAND_CTL_CLRCLE);
-	doc200x_hwcontrol(mtd, NAND_CTL_SETALE);
-	this->write_byte(mtd, 0);
-	doc200x_hwcontrol(mtd, NAND_CTL_CLRALE);
+	doc200x_hwcontrol(mtd, NAND_CMD_READID,
+			  NAND_CTRL_CLE | NAND_CTRL_CHANGE);
+	doc200x_hwcontrol(mtd, 0, NAND_CTRL_ALE | NAND_CTRL_CHANGE);
+	doc200x_hwcontrol(mtd, NAND_CMD_NONE, NAND_NCE | NAND_CTRL_CHANGE);
 
 	/* We cant' use dev_ready here, but at least we wait for the
 	 * command to complete
@@ -425,12 +424,11 @@ static uint16_t __init doc200x_ident_chip(struct mtd_info *mtd, int nr)
 		} ident;
 		void __iomem *docptr = doc->virtadr;
 
-		doc200x_hwcontrol(mtd, NAND_CTL_SETCLE);
-		doc2000_write_byte(mtd, NAND_CMD_READID);
-		doc200x_hwcontrol(mtd, NAND_CTL_CLRCLE);
-		doc200x_hwcontrol(mtd, NAND_CTL_SETALE);
-		doc2000_write_byte(mtd, 0);
-		doc200x_hwcontrol(mtd, NAND_CTL_CLRALE);
+		doc200x_hwcontrol(mtd, NAND_CMD_READID,
+				  NAND_CTRL_CLE | NAND_CTRL_CHANGE);
+		doc200x_hwcontrol(mtd, 0, NAND_CTRL_ALE | NAND_CTRL_CHANGE);
+		doc200x_hwcontrol(mtd, NAND_CMD_NONE,
+				  NAND_NCE | NAND_CTRL_CHANGE);
 
 		udelay(50);
 
@@ -690,54 +688,37 @@ static void doc200x_select_chip(struct mtd_info *mtd, int chip)
 	chip -= (floor * doc->chips_per_floor);
 
 	/* 11.4.4 -- deassert CE before changing chip */
-	doc200x_hwcontrol(mtd, NAND_CTL_CLRNCE);
+	doc200x_hwcontrol(mtd, NAND_CMD_NONE, 0 | NAND_CTRL_CHANGE);
 
 	WriteDOC(floor, docptr, FloorSelect);
 	WriteDOC(chip, docptr, CDSNDeviceSelect);
 
-	doc200x_hwcontrol(mtd, NAND_CTL_SETNCE);
+	doc200x_hwcontrol(mtd, NAND_CMD_NONE, NAND_NCE | NAND_CTRL_CHANGE);
 
 	doc->curchip = chip;
 	doc->curfloor = floor;
 }
 
-static void doc200x_hwcontrol(struct mtd_info *mtd, int cmd)
+#define CDSN_CTRL_MSK (CDSN_CTRL_CE | CDSN_CTRL_CLE | CDSN_CTRL_ALE)
+
+static void doc200x_hwcontrol(struct mtd_info *mtd, int cmd,
+			      unsigned int ctrl)
 {
 	struct nand_chip *this = mtd->priv;
 	struct doc_priv *doc = this->priv;
 	void __iomem *docptr = doc->virtadr;
 
-	switch (cmd) {
-	case NAND_CTL_SETNCE:
-		doc->CDSNControl |= CDSN_CTRL_CE;
-		break;
-	case NAND_CTL_CLRNCE:
-		doc->CDSNControl &= ~CDSN_CTRL_CE;
-		break;
-	case NAND_CTL_SETCLE:
-		doc->CDSNControl |= CDSN_CTRL_CLE;
-		break;
-	case NAND_CTL_CLRCLE:
-		doc->CDSNControl &= ~CDSN_CTRL_CLE;
-		break;
-	case NAND_CTL_SETALE:
-		doc->CDSNControl |= CDSN_CTRL_ALE;
-		break;
-	case NAND_CTL_CLRALE:
-		doc->CDSNControl &= ~CDSN_CTRL_ALE;
-		break;
-	case NAND_CTL_SETWP:
-		doc->CDSNControl |= CDSN_CTRL_WP;
-		break;
-	case NAND_CTL_CLRWP:
-		doc->CDSNControl &= ~CDSN_CTRL_WP;
-		break;
+	if (ctrl & NAND_CTRL_CHANGE) {
+		doc->CDSNControl &= ~CDSN_CTRL_MSK;
+		doc->CDSNControl |= ctrl & CDSN_CTRL_MSK;
+		if (debug)
+			printk("hwcontrol(%d): %02x\n", cmd, doc->CDSNControl);
+		WriteDOC(doc->CDSNControl, docptr, CDSNControl);
+		/* 11.4.3 -- 4 NOPs after CSDNControl write */
+		DoC_Delay(doc, 4);
 	}
-	if (debug)
-		printk("hwcontrol(%d): %02x\n", cmd, doc->CDSNControl);
-	WriteDOC(doc->CDSNControl, docptr, CDSNControl);
-	/* 11.4.3 -- 4 NOPs after CSDNControl write */
-	DoC_Delay(doc, 4);
+	if (cmd != NAND_CMD_NONE)
+		this->write_byte(mtd, cmd);
 }
 
 static void doc2001plus_command(struct mtd_info *mtd, unsigned command, int column, int page_addr)
@@ -1510,7 +1491,7 @@ static inline int __init doc2001plus_init(struct mtd_info *mtd)
 	this->read_buf = doc2001plus_readbuf;
 	this->verify_buf = doc2001plus_verifybuf;
 	this->scan_bbt = inftl_scan_bbt;
-	this->hwcontrol = NULL;
+	this->cmd_ctrl = NULL;
 	this->select_chip = doc2001plus_select_chip;
 	this->cmdfunc = doc2001plus_command;
 	this->ecc.hwctl = doc2001plus_enable_hwecc;
@@ -1670,7 +1651,7 @@ static int __init doc_probe(unsigned long physadr)
 
 	nand->priv		= doc;
 	nand->select_chip	= doc200x_select_chip;
-	nand->hwcontrol		= doc200x_hwcontrol;
+	nand->cmd_ctrl		= doc200x_hwcontrol;
 	nand->dev_ready		= doc200x_dev_ready;
 	nand->waitfunc		= doc200x_wait;
 	nand->block_bad		= doc200x_block_bad;
diff --git a/drivers/mtd/nand/edb7312.c b/drivers/mtd/nand/edb7312.c
index 8e56570af91f..ba5a2174a408 100644
--- a/drivers/mtd/nand/edb7312.c
+++ b/drivers/mtd/nand/edb7312.c
@@ -73,32 +73,26 @@ static struct mtd_partition partition_info[] = {
 
 /*
  *	hardware specific access to control-lines
+ *
+ *	NAND_NCE: bit 0 -> bit 7
+ *	NAND_CLE: bit 1 -> bit 4
+ *	NAND_ALE: bit 2 -> bit 5
  */
-static void ep7312_hwcontrol(struct mtd_info *mtd, int cmd)
+static void ep7312_hwcontrol(struct mtd_info *mtd, int cmd, unsigned int ctrl)
 {
-	switch (cmd) {
-
-	case NAND_CTL_SETCLE:
-		clps_writeb(clps_readb(ep7312_pxdr) | 0x10, ep7312_pxdr);
-		break;
-	case NAND_CTL_CLRCLE:
-		clps_writeb(clps_readb(ep7312_pxdr) & ~0x10, ep7312_pxdr);
-		break;
-
-	case NAND_CTL_SETALE:
-		clps_writeb(clps_readb(ep7312_pxdr) | 0x20, ep7312_pxdr);
-		break;
-	case NAND_CTL_CLRALE:
-		clps_writeb(clps_readb(ep7312_pxdr) & ~0x20, ep7312_pxdr);
-		break;
-
-	case NAND_CTL_SETNCE:
-		clps_writeb((clps_readb(ep7312_pxdr) | 0x80) & ~0x40, ep7312_pxdr);
-		break;
-	case NAND_CTL_CLRNCE:
-		clps_writeb((clps_readb(ep7312_pxdr) | 0x80) | 0x40, ep7312_pxdr);
-		break;
+	struct nand_chip *chip = mtd->priv;
+
+	if (ctrl & NAND_CTRL_CHANGE) {
+		unsigned char bits;
+
+		bits = (ctrl & (NAND_CLE | NAND_ALE)) << 3;
+		bits = (ctrl & NAND_NCE) << 7;
+
+		clps_writeb((clps_readb(ep7312_pxdr)  & 0xB0) | 0x10,
+			    ep7312_pxdr);
 	}
+	if (cmd != NAND_CMD_NONE)
+		writeb(cmd, chip->IO_ADDR_W);
 }
 
 /*
@@ -159,7 +153,7 @@ static int __init ep7312_init(void)
 	/* insert callbacks */
 	this->IO_ADDR_R = ep7312_fio_base;
 	this->IO_ADDR_W = ep7312_fio_base;
-	this->hwcontrol = ep7312_hwcontrol;
+	this->cmd_ctrl = ep7312_hwcontrol;
 	this->dev_ready = ep7312_device_ready;
 	/* 15 us command delay time */
 	this->chip_delay = 15;
diff --git a/drivers/mtd/nand/h1910.c b/drivers/mtd/nand/h1910.c
index 06e91fa11b34..2d585d2d090c 100644
--- a/drivers/mtd/nand/h1910.c
+++ b/drivers/mtd/nand/h1910.c
@@ -56,36 +56,18 @@ static struct mtd_partition partition_info[] = {
 
 /*
  *	hardware specific access to control-lines
+ *
+ *	NAND_NCE: bit 0 - don't care
+ *	NAND_CLE: bit 1 - address bit 2
+ *	NAND_ALE: bit 2 - address bit 3
  */
-static void h1910_hwcontrol(struct mtd_info *mtd, int cmd)
+static void h1910_hwcontrol(struct mtd_info *mtd, int cmd,
+			    unsigned int ctrl)
 {
-	struct nand_chip *this = (struct nand_chip *)(mtd->priv);
-
-	switch (cmd) {
-
-	case NAND_CTL_SETCLE:
-		this->IO_ADDR_R |= (1 << 2);
-		this->IO_ADDR_W |= (1 << 2);
-		break;
-	case NAND_CTL_CLRCLE:
-		this->IO_ADDR_R &= ~(1 << 2);
-		this->IO_ADDR_W &= ~(1 << 2);
-		break;
-
-	case NAND_CTL_SETALE:
-		this->IO_ADDR_R |= (1 << 3);
-		this->IO_ADDR_W |= (1 << 3);
-		break;
-	case NAND_CTL_CLRALE:
-		this->IO_ADDR_R &= ~(1 << 3);
-		this->IO_ADDR_W &= ~(1 << 3);
-		break;
-
-	case NAND_CTL_SETNCE:
-		break;
-	case NAND_CTL_CLRNCE:
-		break;
-	}
+	struct nand_chip *chip = mtd->priv;
+
+	if (cmd != NAND_CMD_NONE)
+		writeb(cmd, chip->IO_ADDR_W | ((ctrl & 0x6) << 1));
 }
 
 /*
@@ -145,7 +127,7 @@ static int __init h1910_init(void)
 	/* insert callbacks */
 	this->IO_ADDR_R = nandaddr;
 	this->IO_ADDR_W = nandaddr;
-	this->hwcontrol = h1910_hwcontrol;
+	this->cmd_ctrl = h1910_hwcontrol;
 	this->dev_ready = NULL;	/* unknown whether that was correct or not so we will just do it like this */
 	/* 15 us command delay time */
 	this->chip_delay = 50;
diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c
index aa2e14538bf4..f6997fb77b91 100644
--- a/drivers/mtd/nand/nand_base.c
+++ b/drivers/mtd/nand/nand_base.c
@@ -276,10 +276,10 @@ static void nand_select_chip(struct mtd_info *mtd, int chip)
 	struct nand_chip *this = mtd->priv;
 	switch (chip) {
 	case -1:
-		this->hwcontrol(mtd, NAND_CTL_CLRNCE);
+		this->cmd_ctrl(mtd, NAND_CMD_NONE, 0 | NAND_CTRL_CHANGE);
 		break;
 	case 0:
-		this->hwcontrol(mtd, NAND_CTL_SETNCE);
+		this->cmd_ctrl(mtd, NAND_CMD_NONE, NAND_NCE | NAND_CTRL_CHANGE);
 		break;
 
 	default:
@@ -548,13 +548,12 @@ static void nand_wait_ready(struct mtd_info *mtd)
  * Send command to NAND device. This function is used for small page
  * devices (256/512 Bytes per page)
  */
-static void nand_command(struct mtd_info *mtd, unsigned command, int column,
-			 int page_addr)
+static void nand_command(struct mtd_info *mtd, unsigned int command,
+			 int column, int page_addr)
 {
 	register struct nand_chip *this = mtd->priv;
+	int ctrl = NAND_CTRL_CLE | NAND_CTRL_CHANGE;
 
-	/* Begin command latch cycle */
-	this->hwcontrol(mtd, NAND_CTL_SETCLE);
 	/*
 	 * Write out the command to the device.
 	 */
@@ -572,33 +571,32 @@ static void nand_command(struct mtd_info *mtd, unsigned command, int column,
 			column -= 256;
 			readcmd = NAND_CMD_READ1;
 		}
-		this->write_byte(mtd, readcmd);
+		this->cmd_ctrl(mtd, readcmd, ctrl);
+		ctrl &= ~NAND_CTRL_CHANGE;
 	}
-	this->write_byte(mtd, command);
+	this->cmd_ctrl(mtd, command, ctrl);
 
-	/* Set ALE and clear CLE to start address cycle */
-	this->hwcontrol(mtd, NAND_CTL_CLRCLE);
-
-	if (column != -1 || page_addr != -1) {
-		this->hwcontrol(mtd, NAND_CTL_SETALE);
-
-		/* Serially input address */
-		if (column != -1) {
-			/* Adjust columns for 16 bit buswidth */
-			if (this->options & NAND_BUSWIDTH_16)
-				column >>= 1;
-			this->write_byte(mtd, column);
-		}
-		if (page_addr != -1) {
-			this->write_byte(mtd, (uint8_t)(page_addr & 0xff));
-			this->write_byte(mtd, (uint8_t)((page_addr >> 8) & 0xff));
-			/* One more address cycle for devices > 32MiB */
-			if (this->chipsize > (32 << 20))
-				this->write_byte(mtd, (uint8_t)((page_addr >> 16) & 0x0f));
-		}
-		/* Latch in address */
-		this->hwcontrol(mtd, NAND_CTL_CLRALE);
+	/*
+	 * Address cycle, when necessary
+	 */
+	ctrl = NAND_CTRL_ALE | NAND_CTRL_CHANGE;
+	/* Serially input address */
+	if (column != -1) {
+		/* Adjust columns for 16 bit buswidth */
+		if (this->options & NAND_BUSWIDTH_16)
+			column >>= 1;
+		this->cmd_ctrl(mtd, column, ctrl);
+		ctrl &= ~NAND_CTRL_CHANGE;
+	}
+	if (page_addr != -1) {
+		this->cmd_ctrl(mtd, page_addr, ctrl);
+		ctrl &= ~NAND_CTRL_CHANGE;
+		this->cmd_ctrl(mtd, page_addr >> 8, ctrl);
+		/* One more address cycle for devices > 32MiB */
+		if (this->chipsize > (32 << 20))
+			this->cmd_ctrl(mtd, page_addr >> 16, ctrl);
 	}
+	this->cmd_ctrl(mtd, NAND_CMD_NONE, NAND_NCE | NAND_CTRL_CHANGE);
 
 	/*
 	 * program and erase have their own busy handlers
@@ -611,15 +609,16 @@ static void nand_command(struct mtd_info *mtd, unsigned command, int column,
 	case NAND_CMD_ERASE2:
 	case NAND_CMD_SEQIN:
 	case NAND_CMD_STATUS:
+		this->cmd_ctrl(mtd, NAND_CMD_NONE, NAND_NCE);
 		return;
 
 	case NAND_CMD_RESET:
 		if (this->dev_ready)
 			break;
 		udelay(this->chip_delay);
-		this->hwcontrol(mtd, NAND_CTL_SETCLE);
-		this->write_byte(mtd, NAND_CMD_STATUS);
-		this->hwcontrol(mtd, NAND_CTL_CLRCLE);
+		this->cmd_ctrl(mtd, NAND_CMD_STATUS,
+			       NAND_CTRL_CLE | NAND_CTRL_CHANGE);
+		this->cmd_ctrl(mtd, NAND_CMD_NONE, NAND_NCE);
 		while (!(this->read_byte(mtd) & NAND_STATUS_READY)) ;
 		return;
 
@@ -648,12 +647,13 @@ static void nand_command(struct mtd_info *mtd, unsigned command, int column,
  * @column:	the column address for this command, -1 if none
  * @page_addr:	the page address for this command, -1 if none
  *
- * Send command to NAND device. This is the version for the new large page devices
- * We dont have the separate regions as we have in the small page devices.
- * We must emulate NAND_CMD_READOOB to keep the code compatible.
+ * Send command to NAND device. This is the version for the new large page
+ * devices We dont have the separate regions as we have in the small page
+ * devices.  We must emulate NAND_CMD_READOOB to keep the code compatible.
  *
  */
-static void nand_command_lp(struct mtd_info *mtd, unsigned command, int column, int page_addr)
+static void nand_command_lp(struct mtd_info *mtd, unsigned int command,
+			    int column, int page_addr)
 {
 	register struct nand_chip *this = mtd->priv;
 
@@ -663,34 +663,33 @@ static void nand_command_lp(struct mtd_info *mtd, unsigned command, int column,
 		command = NAND_CMD_READ0;
 	}
 
-	/* Begin command latch cycle */
-	this->hwcontrol(mtd, NAND_CTL_SETCLE);
-	/* Write out the command to the device. */
-	this->write_byte(mtd, (command & 0xff));
-	/* End command latch cycle */
-	this->hwcontrol(mtd, NAND_CTL_CLRCLE);
+	/* Command latch cycle */
+	this->cmd_ctrl(mtd, command & 0xff,
+		       NAND_NCE | NAND_CLE | NAND_CTRL_CHANGE);
 
 	if (column != -1 || page_addr != -1) {
-		this->hwcontrol(mtd, NAND_CTL_SETALE);
+		int ctrl = NAND_CTRL_CHANGE | NAND_NCE | NAND_ALE;
 
 		/* Serially input address */
 		if (column != -1) {
 			/* Adjust columns for 16 bit buswidth */
 			if (this->options & NAND_BUSWIDTH_16)
 				column >>= 1;
-			this->write_byte(mtd, column & 0xff);
-			this->write_byte(mtd, column >> 8);
+			this->cmd_ctrl(mtd, column, ctrl);
+			ctrl &= ~NAND_CTRL_CHANGE;
+			this->cmd_ctrl(mtd, column >> 8, ctrl);
 		}
 		if (page_addr != -1) {
-			this->write_byte(mtd, (uint8_t)(page_addr & 0xff));
-			this->write_byte(mtd, (uint8_t)((page_addr >> 8) & 0xff));
+			this->cmd_ctrl(mtd, page_addr, ctrl);
+			this->cmd_ctrl(mtd, page_addr >> 8,
+				       NAND_NCE | NAND_ALE);
 			/* One more address cycle for devices > 128MiB */
 			if (this->chipsize > (128 << 20))
-				this->write_byte(mtd, (uint8_t)((page_addr >> 16) & 0xff));
+				this->cmd_ctrl(mtd, page_addr >> 16,
+					       NAND_NCE | NAND_ALE);
 		}
-		/* Latch in address */
-		this->hwcontrol(mtd, NAND_CTL_CLRALE);
 	}
+	this->cmd_ctrl(mtd, NAND_CMD_NONE, NAND_NCE | NAND_CTRL_CHANGE);
 
 	/*
 	 * program and erase have their own busy handlers
@@ -722,20 +721,14 @@ static void nand_command_lp(struct mtd_info *mtd, unsigned command, int column,
 		if (this->dev_ready)
 			break;
 		udelay(this->chip_delay);
-		this->hwcontrol(mtd, NAND_CTL_SETCLE);
-		this->write_byte(mtd, NAND_CMD_STATUS);
-		this->hwcontrol(mtd, NAND_CTL_CLRCLE);
+		this->cmd_ctrl(mtd, NAND_CMD_STATUS, NAND_NCE | NAND_CLE);
+		this->cmd_ctrl(mtd, NAND_CMD_NONE, NAND_NCE);
 		while (!(this->read_byte(mtd) & NAND_STATUS_READY)) ;
 		return;
 
 	case NAND_CMD_READ0:
-		/* Begin command latch cycle */
-		this->hwcontrol(mtd, NAND_CTL_SETCLE);
-		/* Write out the start read command */
-		this->write_byte(mtd, NAND_CMD_READSTART);
-		/* End command latch cycle */
-		this->hwcontrol(mtd, NAND_CTL_CLRCLE);
-		/* Fall through into ready check */
+		this->cmd_ctrl(mtd, NAND_CMD_READSTART, NAND_NCE | NAND_CLE);
+		this->cmd_ctrl(mtd, NAND_CMD_NONE, NAND_NCE);
 
 		/* This applies to read commands */
 	default:
diff --git a/drivers/mtd/nand/nandsim.c b/drivers/mtd/nand/nandsim.c
index 22af9b29d2bf..ecf727b32dec 100644
--- a/drivers/mtd/nand/nandsim.c
+++ b/drivers/mtd/nand/nandsim.c
@@ -1071,68 +1071,6 @@ switch_state(struct nandsim *ns)
 	}
 }
 
-static void
-ns_hwcontrol(struct mtd_info *mtd, int cmd)
-{
-	struct nandsim *ns = (struct nandsim *)((struct nand_chip *)mtd->priv)->priv;
-
-	switch (cmd) {
-
-	/* set CLE line high */
-	case NAND_CTL_SETCLE:
-		NS_DBG("ns_hwcontrol: start command latch cycles\n");
-		ns->lines.cle  = 1;
-		break;
-
-	/* set CLE line low */
-	case NAND_CTL_CLRCLE:
-		NS_DBG("ns_hwcontrol: stop command latch cycles\n");
-		ns->lines.cle  = 0;
-		break;
-
-	/* set ALE line high */
-	case NAND_CTL_SETALE:
-		NS_DBG("ns_hwcontrol: start address latch cycles\n");
-		ns->lines.ale   = 1;
-		break;
-
-	/* set ALE line low */
-	case NAND_CTL_CLRALE:
-		NS_DBG("ns_hwcontrol: stop address latch cycles\n");
-		ns->lines.ale  = 0;
-		break;
-
-	/* set WP line high */
-	case NAND_CTL_SETWP:
-		NS_DBG("ns_hwcontrol: enable write protection\n");
-		ns->lines.wp = 1;
-		break;
-
-	/* set WP line low */
-	case NAND_CTL_CLRWP:
-		NS_DBG("ns_hwcontrol: disable write protection\n");
-		ns->lines.wp = 0;
-		break;
-
-	/* set CE line low */
-	case NAND_CTL_SETNCE:
-		NS_DBG("ns_hwcontrol: enable chip\n");
-		ns->lines.ce = 1;
-		break;
-
-	/* set CE line high */
-	case NAND_CTL_CLRNCE:
-		NS_DBG("ns_hwcontrol: disable chip\n");
-		ns->lines.ce = 0;
-		break;
-
-	default:
-		NS_ERR("hwcontrol: unknown command\n");
-        }
-
-	return;
-}
-
 static u_char
 ns_nand_read_byte(struct mtd_info *mtd)
 {
@@ -1359,6 +1297,18 @@ ns_nand_write_byte(struct mtd_info *mtd, u_char byte)
 	return;
 }
 
+static void ns_hwcontrol(struct mtd_info *mtd, int cmd, unsigned int bitmask)
+{
+	struct nandsim *ns = ((struct nand_chip *)mtd->priv)->priv;
+
+	ns->lines.cle = bitmask & NAND_CLE ? 1 : 0;
+	ns->lines.ale = bitmask & NAND_ALE ? 1 : 0;
+	ns->lines.ce = bitmask & NAND_NCE ? 1 : 0;
+
+	if (cmd != NAND_CMD_NONE)
+		ns_nand_write_byte(mtd, cmd);
+}
+
 static int
 ns_device_ready(struct mtd_info *mtd)
 {
@@ -1514,7 +1464,7 @@ static int __init ns_init_module(void)
 	/*
 	 * Register simulator's callbacks.
 	 */
-	chip->hwcontrol  = ns_hwcontrol;
+	chip->cmd_ctrl	 = ns_hwcontrol;
 	chip->read_byte  = ns_nand_read_byte;
 	chip->dev_ready  = ns_device_ready;
 	chip->write_byte = ns_nand_write_byte;
diff --git a/drivers/mtd/nand/ndfc.c b/drivers/mtd/nand/ndfc.c
index e2dc81de106a..481541a683ca 100644
--- a/drivers/mtd/nand/ndfc.c
+++ b/drivers/mtd/nand/ndfc.c
@@ -60,22 +60,17 @@ static void ndfc_select_chip(struct mtd_info *mtd, int chip)
 	writel(ccr, ndfc->ndfcbase + NDFC_CCR);
 }
 
-static void ndfc_hwcontrol(struct mtd_info *mtd, int cmd)
+static void ndfc_hwcontrol(struct mtd_info *mtd, int cmd, unsigned int ctrl)
 {
-	struct ndfc_controller *ndfc = &ndfc_ctrl;
 	struct nand_chip *chip = mtd->priv;
 
-	switch (cmd) {
-	case NAND_CTL_SETCLE:
-		chip->IO_ADDR_W = ndfc->ndfcbase + NDFC_CMD;
-		break;
-	case NAND_CTL_SETALE:
-		chip->IO_ADDR_W = ndfc->ndfcbase + NDFC_ALE;
-		break;
-	default:
-		chip->IO_ADDR_W = ndfc->ndfcbase + NDFC_DATA;
-		break;
-	}
+	if (cmd == NAND_CMD_NONE)
+		return;
+
+	if (ctrl & NAND_CLE)
+		writel(cmd & 0xFF, chip->IO_ADDR_W + NDFC_CMD);
+	else
+		writel(cmd & 0xFF, chip->IO_ADDR_W + NDFC_ALE);
 }
 
 static int ndfc_ready(struct mtd_info *mtd)
@@ -158,7 +153,7 @@ static void ndfc_chip_init(struct ndfc_nand_mtd *mtd)
 
 	chip->IO_ADDR_R = ndfc->ndfcbase + NDFC_DATA;
 	chip->IO_ADDR_W = ndfc->ndfcbase + NDFC_DATA;
-	chip->hwcontrol = ndfc_hwcontrol;
+	chip->cmd_ctrl = ndfc_hwcontrol;
 	chip->dev_ready = ndfc_ready;
 	chip->select_chip = ndfc_select_chip;
 	chip->chip_delay = 50;
diff --git a/drivers/mtd/nand/ppchameleonevb.c b/drivers/mtd/nand/ppchameleonevb.c
index 9fab0998524d..22fa65c12ab9 100644
--- a/drivers/mtd/nand/ppchameleonevb.c
+++ b/drivers/mtd/nand/ppchameleonevb.c
@@ -108,54 +108,68 @@ extern int parse_cmdline_partitions(struct mtd_info *master, struct mtd_partitio
 /*
  *	hardware specific access to control-lines
  */
-static void ppchameleon_hwcontrol(struct mtd_info *mtdinfo, int cmd)
+static void ppchameleon_hwcontrol(struct mtd_info *mtdinfo, int cmd,
+				  unsigned int ctrl)
 {
-	switch (cmd) {
-
-	case NAND_CTL_SETCLE:
-		MACRO_NAND_CTL_SETCLE((unsigned long)CFG_NAND0_PADDR);
-		break;
-	case NAND_CTL_CLRCLE:
-		MACRO_NAND_CTL_CLRCLE((unsigned long)CFG_NAND0_PADDR);
-		break;
-	case NAND_CTL_SETALE:
-		MACRO_NAND_CTL_SETALE((unsigned long)CFG_NAND0_PADDR);
-		break;
-	case NAND_CTL_CLRALE:
-		MACRO_NAND_CTL_CLRALE((unsigned long)CFG_NAND0_PADDR);
-		break;
-	case NAND_CTL_SETNCE:
-		MACRO_NAND_ENABLE_CE((unsigned long)CFG_NAND0_PADDR);
-		break;
-	case NAND_CTL_CLRNCE:
-		MACRO_NAND_DISABLE_CE((unsigned long)CFG_NAND0_PADDR);
-		break;
+	struct nand_chip *chip = mtd->priv;
+
+	if (ctrl & NAND_CTRL_CHANGE) {
+#error Missing headerfiles. No way to fix this. -tglx
+		switch (cmd) {
+		case NAND_CTL_SETCLE:
+			MACRO_NAND_CTL_SETCLE((unsigned long)CFG_NAND0_PADDR);
+			break;
+		case NAND_CTL_CLRCLE:
+			MACRO_NAND_CTL_CLRCLE((unsigned long)CFG_NAND0_PADDR);
+			break;
+		case NAND_CTL_SETALE:
+			MACRO_NAND_CTL_SETALE((unsigned long)CFG_NAND0_PADDR);
+			break;
+		case NAND_CTL_CLRALE:
+			MACRO_NAND_CTL_CLRALE((unsigned long)CFG_NAND0_PADDR);
+			break;
+		case NAND_CTL_SETNCE:
+			MACRO_NAND_ENABLE_CE((unsigned long)CFG_NAND0_PADDR);
+			break;
+		case NAND_CTL_CLRNCE:
+			MACRO_NAND_DISABLE_CE((unsigned long)CFG_NAND0_PADDR);
+			break;
+		}
 	}
+	if (cmd != NAND_CMD_NONE)
+		writeb(cmd, chip->IO_ADDR_W);
 }
 
-static void ppchameleonevb_hwcontrol(struct mtd_info *mtdinfo, int cmd)
+static void ppchameleonevb_hwcontrol(struct mtd_info *mtdinfo, int cmd,
+				     unsigned int ctrl)
 {
-	switch (cmd) {
-
-	case NAND_CTL_SETCLE:
-		MACRO_NAND_CTL_SETCLE((unsigned long)CFG_NAND1_PADDR);
-		break;
-	case NAND_CTL_CLRCLE:
-		MACRO_NAND_CTL_CLRCLE((unsigned long)CFG_NAND1_PADDR);
-		break;
-	case NAND_CTL_SETALE:
-		MACRO_NAND_CTL_SETALE((unsigned long)CFG_NAND1_PADDR);
-		break;
-	case NAND_CTL_CLRALE:
-		MACRO_NAND_CTL_CLRALE((unsigned long)CFG_NAND1_PADDR);
-		break;
-	case NAND_CTL_SETNCE:
-		MACRO_NAND_ENABLE_CE((unsigned long)CFG_NAND1_PADDR);
-		break;
-	case NAND_CTL_CLRNCE:
-		MACRO_NAND_DISABLE_CE((unsigned long)CFG_NAND1_PADDR);
-		break;
+	struct nand_chip *chip = mtd->priv;
+
+	if (ctrl & NAND_CTRL_CHANGE) {
+#error Missing headerfiles. No way to fix this. -tglx
+		switch (cmd) {
+		case NAND_CTL_SETCLE:
+			MACRO_NAND_CTL_SETCLE((unsigned long)CFG_NAND1_PADDR);
+			break;
+		case NAND_CTL_CLRCLE:
+			MACRO_NAND_CTL_CLRCLE((unsigned long)CFG_NAND1_PADDR);
+			break;
+		case NAND_CTL_SETALE:
+			MACRO_NAND_CTL_SETALE((unsigned long)CFG_NAND1_PADDR);
+			break;
+		case NAND_CTL_CLRALE:
+			MACRO_NAND_CTL_CLRALE((unsigned long)CFG_NAND1_PADDR);
+			break;
+		case NAND_CTL_SETNCE:
+			MACRO_NAND_ENABLE_CE((unsigned long)CFG_NAND1_PADDR);
+			break;
+		case NAND_CTL_CLRNCE:
+			MACRO_NAND_DISABLE_CE((unsigned long)CFG_NAND1_PADDR);
+			break;
+		}
 	}
+	if (cmd != NAND_CMD_NONE)
+		writeb(cmd, chip->IO_ADDR_W);
 }
 
 #ifdef USE_READY_BUSY_PIN
@@ -251,7 +265,7 @@ static int __init ppchameleonevb_init(void)
 	/* insert callbacks */
 	this->IO_ADDR_R = ppchameleon_fio_base;
 	this->IO_ADDR_W = ppchameleon_fio_base;
-	this->hwcontrol = ppchameleon_hwcontrol;
+	this->cmd_ctrl = ppchameleon_hwcontrol;
 #ifdef USE_READY_BUSY_PIN
 	this->dev_ready = ppchameleon_device_ready;
 #endif
@@ -351,7 +365,7 @@ static int __init ppchameleonevb_init(void)
 	/* insert callbacks */
 	this->IO_ADDR_R = ppchameleonevb_fio_base;
 	this->IO_ADDR_W = ppchameleonevb_fio_base;
-	this->hwcontrol = ppchameleonevb_hwcontrol;
+	this->cmd_ctrl = ppchameleonevb_hwcontrol;
 #ifdef USE_READY_BUSY_PIN
 	this->dev_ready = ppchameleonevb_device_ready;
 #endif
diff --git a/drivers/mtd/nand/rtc_from4.c b/drivers/mtd/nand/rtc_from4.c
index f8e631c89a60..6c97bfaea19a 100644
--- a/drivers/mtd/nand/rtc_from4.c
+++ b/drivers/mtd/nand/rtc_from4.c
@@ -208,32 +208,18 @@ static uint8_t revbits[256] = {
  * Address lines (A24-A22), so no action is required here.
  *
  */
-static void rtc_from4_hwcontrol(struct mtd_info *mtd, int cmd)
+static void rtc_from4_hwcontrol(struct mtd_info *mtd, int cmd,
+				unsigned int ctrl)
 {
-	struct nand_chip *this = (struct nand_chip *)(mtd->priv);
+	struct nand_chip *chip = (mtd->priv);
 
-	switch (cmd) {
+	if (cmd == NAND_CMD_NONE)
+		return;
 
-	case NAND_CTL_SETCLE:
-		this->IO_ADDR_W = (void __iomem *)((unsigned long)this->IO_ADDR_W | RTC_FROM4_CLE);
-		break;
-	case NAND_CTL_CLRCLE:
-		this->IO_ADDR_W = (void __iomem *)((unsigned long)this->IO_ADDR_W & ~RTC_FROM4_CLE);
-		break;
-
-	case NAND_CTL_SETALE:
-		this->IO_ADDR_W = (void __iomem *)((unsigned long)this->IO_ADDR_W | RTC_FROM4_ALE);
-		break;
-	case NAND_CTL_CLRALE:
-		this->IO_ADDR_W = (void __iomem *)((unsigned long)this->IO_ADDR_W & ~RTC_FROM4_ALE);
-		break;
-
-	case NAND_CTL_SETNCE:
-		break;
-	case NAND_CTL_CLRNCE:
-		break;
-
-	}
+	if (ctrl & NAND_CLE)
+		writeb(cmd, chip->IO_ADDR_W | RTC_FROM4_CLE);
+	else
+		writeb(cmd, chip->IO_ADDR_W | RTC_FROM4_ALE);
 }
 
 /*
@@ -559,7 +545,7 @@ static int __init rtc_from4_init(void)
 	this->IO_ADDR_R = rtc_from4_fio_base;
 	this->IO_ADDR_W = rtc_from4_fio_base;
 	/* Set address of hardware control function */
-	this->hwcontrol = rtc_from4_hwcontrol;
+	this->cmd_ctrl = rtc_from4_hwcontrol;
 	/* Set address of chip select function */
 	this->select_chip = rtc_from4_nand_select_chip;
 	/* command delay time (in us) */
diff --git a/drivers/mtd/nand/s3c2410.c b/drivers/mtd/nand/s3c2410.c
index 608340a25278..215227d1a65c 100644
--- a/drivers/mtd/nand/s3c2410.c
+++ b/drivers/mtd/nand/s3c2410.c
@@ -256,60 +256,36 @@ static void s3c2410_nand_select_chip(struct mtd_info *mtd, int chip)
  *
 */
 
-static void s3c2410_nand_hwcontrol(struct mtd_info *mtd, int cmd)
+static void s3c2410_nand_hwcontrol(struct mtd_info *mtd, int cmd,
+				   unsigend int ctrl)
 {
 	struct s3c2410_nand_info *info = s3c2410_nand_mtd_toinfo(mtd);
 	struct nand_chip *chip = mtd->priv;
 
-	switch (cmd) {
-	case NAND_CTL_SETNCE:
-	case NAND_CTL_CLRNCE:
-		printk(KERN_ERR "%s: called for NCE\n", __FUNCTION__);
-		break;
-
-	case NAND_CTL_SETCLE:
-		chip->IO_ADDR_W = info->regs + S3C2410_NFCMD;
-		break;
-
-	case NAND_CTL_SETALE:
-		chip->IO_ADDR_W = info->regs + S3C2410_NFADDR;
-		break;
-
-		/* NAND_CTL_CLRCLE: */
-		/* NAND_CTL_CLRALE: */
-	default:
-		chip->IO_ADDR_W = info->regs + S3C2410_NFDATA;
-		break;
-	}
+	if (cmd == NAND_CMD_NONE)
+		return;
+
+	if (cmd & NAND_CLE)
+		writeb(cmd, info->regs + S3C2410_NFCMD);
+	else
+		writeb(cmd, info->regs + S3C2410_NFADDR);
 }
 
 /* command and control functions */
 
-static void s3c2440_nand_hwcontrol(struct mtd_info *mtd, int cmd)
+static void s3c2410_nand_hwcontrol(struct mtd_info *mtd, int cmd,
+				   unsigend int ctrl)
 {
 	struct s3c2410_nand_info *info = s3c2410_nand_mtd_toinfo(mtd);
 	struct nand_chip *chip = mtd->priv;
 
-	switch (cmd) {
-	case NAND_CTL_SETNCE:
-	case NAND_CTL_CLRNCE:
-		printk(KERN_ERR "%s: called for NCE\n", __FUNCTION__);
-		break;
-
-	case NAND_CTL_SETCLE:
-		chip->IO_ADDR_W = info->regs + S3C2440_NFCMD;
-		break;
-
-	case NAND_CTL_SETALE:
-		chip->IO_ADDR_W = info->regs + S3C2440_NFADDR;
-		break;
-
-		/* NAND_CTL_CLRCLE: */
-		/* NAND_CTL_CLRALE: */
-	default:
-		chip->IO_ADDR_W = info->regs + S3C2440_NFDATA;
-		break;
-	}
+	if (cmd == NAND_CMD_NONE)
+		return;
+
+	if (cmd & NAND_CLE)
+		writeb(cmd, info->regs + S3C2440_NFCMD);
+	else
+		writeb(cmd, info->regs + S3C2440_NFADDR);
 }
 
 /* s3c2410_nand_devready()
@@ -498,7 +474,7 @@ static void s3c2410_nand_init_chip(struct s3c2410_nand_info *info,
 
 	chip->IO_ADDR_R	   = info->regs + S3C2410_NFDATA;
 	chip->IO_ADDR_W    = info->regs + S3C2410_NFDATA;
-	chip->hwcontrol    = s3c2410_nand_hwcontrol;
+	chip->cmd_ctrl     = s3c2410_nand_hwcontrol;
 	chip->dev_ready    = s3c2410_nand_devready;
 	chip->write_buf    = s3c2410_nand_write_buf;
 	chip->read_buf     = s3c2410_nand_read_buf;
@@ -511,7 +487,7 @@ static void s3c2410_nand_init_chip(struct s3c2410_nand_info *info,
 	if (info->is_s3c2440) {
 		chip->IO_ADDR_R	 = info->regs + S3C2440_NFDATA;
 		chip->IO_ADDR_W  = info->regs + S3C2440_NFDATA;
-		chip->hwcontrol  = s3c2440_nand_hwcontrol;
+		chip->cmd_ctrl   = s3c2440_nand_hwcontrol;
 	}
 
 	nmtd->info	   = info;
diff --git a/drivers/mtd/nand/sharpsl.c b/drivers/mtd/nand/sharpsl.c
index 5554d0b97c8c..45a1da724bff 100644
--- a/drivers/mtd/nand/sharpsl.c
+++ b/drivers/mtd/nand/sharpsl.c
@@ -77,31 +77,26 @@ static struct mtd_partition sharpsl_nand_default_partition_info[] = {
 
 /*
  *	hardware specific access to control-lines
+ *	ctrl:
+ *	NAND_CNE: bit 0 -> bit 0 & 4
+ *	NAND_CLE: bit 1 -> bit 1
+ *	NAND_ALE: bit 2 -> bit 2
+ *
  */
-static void sharpsl_nand_hwcontrol(struct mtd_info *mtd, int cmd)
+static void sharpsl_nand_hwcontrol(struct mtd_info *mtd, int cmd,
+				   unsigned int ctrl)
 {
-	switch (cmd) {
-	case NAND_CTL_SETCLE:
-		writeb(readb(FLASHCTL) | FLCLE, FLASHCTL);
-		break;
-	case NAND_CTL_CLRCLE:
-		writeb(readb(FLASHCTL) & ~FLCLE, FLASHCTL);
-		break;
-
-	case NAND_CTL_SETALE:
-		writeb(readb(FLASHCTL) | FLALE, FLASHCTL);
-		break;
-	case NAND_CTL_CLRALE:
-		writeb(readb(FLASHCTL) & ~FLALE, FLASHCTL);
-		break;
-
-	case NAND_CTL_SETNCE:
-		writeb(readb(FLASHCTL) & ~(FLCE0 | FLCE1), FLASHCTL);
-		break;
-	case NAND_CTL_CLRNCE:
-		writeb(readb(FLASHCTL) | (FLCE0 | FLCE1), FLASHCTL);
-		break;
+	struct nand_chip *chip = mtd->priv;
+
+	if (ctrl & NAND_CTRL_CHANGE) {
+		unsigned char bits = ctrl & 0x07;
+
+		bits |= (ctrl & 0x01) << 4;
+		writeb((readb(FLASHCTL) & 0x17) | bits, FLASHCTL);
 	}
+
+	if (cmd != NAND_CMD_NONE)
+		writeb(cmd, chip->IO_ADDR_W);
 }
 
 static uint8_t scan_ff_pattern[] = { 0xff, 0xff };
@@ -196,7 +191,7 @@ static int __init sharpsl_nand_init(void)
 	this->IO_ADDR_R = FLASHIO;
 	this->IO_ADDR_W = FLASHIO;
 	/* Set address of hardware control function */
-	this->hwcontrol = sharpsl_nand_hwcontrol;
+	this->cmd_ctrl = sharpsl_nand_hwcontrol;
 	this->dev_ready = sharpsl_nand_dev_ready;
 	/* 15 us command delay time */
 	this->chip_delay = 15;
diff --git a/drivers/mtd/nand/spia.c b/drivers/mtd/nand/spia.c
index 9737f1d67c3c..1f6d429b1583 100644
--- a/drivers/mtd/nand/spia.c
+++ b/drivers/mtd/nand/spia.c
@@ -82,20 +82,27 @@ static const struct mtd_partition partition_info[] = {
 
 /*
  *	hardware specific access to control-lines
-*/
+ *
+ *	ctrl:
+ *	NAND_CNE: bit 0 -> bit 2
+ *	NAND_CLE: bit 1 -> bit 0
+ *	NAND_ALE: bit 2 -> bit 1
+ */
 static void spia_hwcontrol(struct mtd_info *mtd, int cmd)
 {
-	switch (cmd) {
+	struct nand_chip *chip = mtd->priv;
 
-	case NAND_CTL_SETCLE: (*(volatile unsigned char *) (spia_io_base + spia_pedr)) |=  0x01; break;
-	case NAND_CTL_CLRCLE: (*(volatile unsigned char *) (spia_io_base + spia_pedr)) &= ~0x01; break;
+	if (ctrl & NAND_CTRL_CHANGE) {
+		void __iomem *addr = spia_io_base + spia_pedr;
+		unsigned char bits;
 
-	case NAND_CTL_SETALE: (*(volatile unsigned char *) (spia_io_base + spia_pedr)) |=  0x02; break;
-	case NAND_CTL_CLRALE: (*(volatile unsigned char *) (spia_io_base + spia_pedr)) &= ~0x02; break;
-
-	case NAND_CTL_SETNCE: (*(volatile unsigned char *) (spia_io_base + spia_pedr)) &= ~0x04; break;
-	case NAND_CTL_CLRNCE: (*(volatile unsigned char *) (spia_io_base + spia_pedr)) |=  0x04; break;
+		bits = (ctrl & NAND_CNE) << 2;
+		bits |= (ctrl & NAND_CLE | NAND_ALE) >> 1;
+		writeb((readb(addr) & ~0x7) | bits, addr);
 	}
+
+	if (cmd != NAND_CMD_NONE)
+		writeb(cmd, chip->IO_ADDR_W);
 }
 
 /*
@@ -133,7 +140,7 @@ static int __init spia_init(void)
 	this->IO_ADDR_R = (void __iomem *)spia_fio_base;
 	this->IO_ADDR_W = (void __iomem *)spia_fio_base;
 	/* Set address of hardware control function */
-	this->hwcontrol = spia_hwcontrol;
+	this->cmd_ctrl = spia_hwcontrol;
 	/* 15 us command delay time */
 	this->chip_delay = 15;
 
diff --git a/drivers/mtd/nand/toto.c b/drivers/mtd/nand/toto.c
index 50aa6a46911f..a9cf0190c27a 100644
--- a/drivers/mtd/nand/toto.c
+++ b/drivers/mtd/nand/toto.c
@@ -32,6 +32,8 @@
 #include <asm/arch-omap1510/hardware.h>
 #include <asm/arch/gpio.h>
 
+#define CONFIG_NAND_WORKAROUND 1
+
 /*
  * MTD structure for TOTO board
  */
@@ -39,25 +41,6 @@ static struct mtd_info *toto_mtd = NULL;
 
 static unsigned long toto_io_base = OMAP_FLASH_1_BASE;
 
-#define CONFIG_NAND_WORKAROUND 1
-
-#define NAND_NCE 0x4000
-#define NAND_CLE 0x1000
-#define NAND_ALE 0x0002
-#define NAND_MASK (NAND_CLE | NAND_ALE | NAND_NCE)
-
-#define T_NAND_CTL_CLRALE(iob)  gpiosetout(NAND_ALE, 0)
-#define T_NAND_CTL_SETALE(iob)  gpiosetout(NAND_ALE, NAND_ALE)
-#ifdef CONFIG_NAND_WORKAROUND	/* "some" dev boards busted, blue wired to rts2 :( */
-#define T_NAND_CTL_CLRCLE(iob)  gpiosetout(NAND_CLE, 0); rts2setout(2, 2)
-#define T_NAND_CTL_SETCLE(iob)  gpiosetout(NAND_CLE, NAND_CLE); rts2setout(2, 0)
-#else
-#define T_NAND_CTL_CLRCLE(iob)  gpiosetout(NAND_CLE, 0)
-#define T_NAND_CTL_SETCLE(iob)  gpiosetout(NAND_CLE, NAND_CLE)
-#endif
-#define T_NAND_CTL_SETNCE(iob)  gpiosetout(NAND_NCE, 0)
-#define T_NAND_CTL_CLRNCE(iob)  gpiosetout(NAND_NCE, NAND_NCE)
-
 /*
  * Define partitions for flash devices
  */
@@ -91,25 +74,43 @@ static struct mtd_partition partition_info32M[] = {
 
 #define NUM_PARTITIONS32M 3
 #define NUM_PARTITIONS64M 4
+
 /*
  *	hardware specific access to control-lines
-*/
-
-static void toto_hwcontrol(struct mtd_info *mtd, int cmd)
+ *
+ *	ctrl:
+ *	NAND_NCE: bit 0 -> bit 14 (0x4000)
+ *	NAND_CLE: bit 1 -> bit 12 (0x1000)
+ *	NAND_ALE: bit 2 -> bit 1  (0x0002)
+ */
+static void toto_hwcontrol(struct mtd_info *mtd, int cmd,
+			   unsigned int ctrl)
 {
+	struct nand_chip *chip = mtd->priv;
+
+	if (ctrl & NAND_CTRL_CHANGE) {
+		unsigned long bits;
 
-	udelay(1);		/* hopefully enough time for tc make proceding write to clear */
-	switch (cmd) {
-		case NAND_CTL_SETCLE: T_NAND_CTL_SETCLE(cmd); break;
-		case NAND_CTL_CLRCLE: T_NAND_CTL_CLRCLE(cmd); break;
+		/* hopefully enough time for tc make proceding write to clear */
+		udelay(1);
 
-		case NAND_CTL_SETALE: T_NAND_CTL_SETALE(cmd); break;
-		case NAND_CTL_CLRALE: T_NAND_CTL_CLRALE(cmd); break;
+		bits = (~ctrl & NAND_NCE) << 14;
+		bits |= (ctrl & NAND_CLE) << 12;
+		bits |= (ctrl & NAND_ALE) >> 1;
 
-		case NAND_CTL_SETNCE: T_NAND_CTL_SETNCE(cmd); break;
-		case NAND_CTL_CLRNCE: T_NAND_CTL_CLRNCE(cmd); break;
+#warning Wild guess as gpiosetout() is nowhere defined in the kernel source - tglx
+		gpiosetout(0x5002, bits);
+
+#ifdef CONFIG_NAND_WORKAROUND
+		/* "some" dev boards busted, blue wired to rts2 :( */
+		rts2setout(2, (ctrl & NAND_CLE) << 1);
+#endif
+		/* allow time to ensure gpio state to over take memory write */
+		udelay(1);
 	}
-	udelay(1);		/* allow time to ensure gpio state to over take memory write */
+
+	if (cmd != NAND_CMD_NONE)
+		writeb(cmd, chip->IO_ADDR_W);
 }
 
 /*
@@ -142,7 +143,7 @@ static int __init toto_init(void)
 	/* Set address of NAND IO lines */
 	this->IO_ADDR_R = toto_io_base;
 	this->IO_ADDR_W = toto_io_base;
-	this->hwcontrol = toto_hwcontrol;
+	this->cmd_ctrl = toto_hwcontrol;
 	this->dev_ready = NULL;
 	/* 25 us command delay time */
 	this->chip_delay = 30;
diff --git a/drivers/mtd/nand/ts7250.c b/drivers/mtd/nand/ts7250.c
index 70bce1b0326c..a0b4b1edcb0d 100644
--- a/drivers/mtd/nand/ts7250.c
+++ b/drivers/mtd/nand/ts7250.c
@@ -83,31 +83,29 @@ static struct mtd_partition partition_info128[] = {
 
 /*
  *	hardware specific access to control-lines
+ *
+ *	ctrl:
+ *	NAND_NCE: bit 0 -> bit 2
+ *	NAND_CLE: bit 1 -> bit 1
+ *	NAND_ALE: bit 2 -> bit 0
  */
-static void ts7250_hwcontrol(struct mtd_info *mtd, int cmd)
+static void ts7250_hwcontrol(struct mtd_info *mtd, int cmd, unsigned int ctrl)
 {
-	unsigned long ctrl = TS72XX_NAND_CONTROL_VIRT_BASE;
-
-	switch (cmd) {
-	case NAND_CTL_SETCLE:
-		__raw_writeb(__raw_readb(ctrl) | 0x2, ctrl);
-		break;
-	case NAND_CTL_CLRCLE:
-		__raw_writeb(__raw_readb(ctrl) & ~0x2, ctrl);
-		break;
-	case NAND_CTL_SETALE:
-		__raw_writeb(__raw_readb(ctrl) | 0x1, ctrl);
-		break;
-	case NAND_CTL_CLRALE:
-		__raw_writeb(__raw_readb(ctrl) & ~0x1, ctrl);
-		break;
-	case NAND_CTL_SETNCE:
-		__raw_writeb(__raw_readb(ctrl) | 0x4, ctrl);
-		break;
-	case NAND_CTL_CLRNCE:
-		__raw_writeb(__raw_readb(ctrl) & ~0x4, ctrl);
-		break;
+	struct nand_chip *chip = mtd->priv;
+
+	if (ctrl & NAND_CTRL_CHANGE) {
+		unsigned long addr = TS72XX_NAND_CONTROL_VIRT_BASE;
+		unsigned char bits;
+
+		bits = (ctrl & NAND_CNE) << 2;
+		bits |= ctrl & NAND_CLE;
+		bits |= (ctrl & NAND_ALE) >> 2;
+
+		__raw_writeb((__raw_readb(addr) & ~0x7) | bits, addr);
 	}
+
+	if (cmd != NAND_CMD_NONE)
+		writeb(cmd, chip->IO_ADDR_W);
 }
 
 /*
@@ -152,7 +150,7 @@ static int __init ts7250_init(void)
 	/* insert callbacks */
 	this->IO_ADDR_R = (void *)TS72XX_NAND_DATA_VIRT_BASE;
 	this->IO_ADDR_W = (void *)TS72XX_NAND_DATA_VIRT_BASE;
-	this->hwcontrol = ts7250_hwcontrol;
+	this->cmd_ctrl = ts7250_hwcontrol;
 	this->dev_ready = ts7250_device_ready;
 	this->chip_delay = 15;
 	this->ecc.mode = NAND_ECC_SOFT;
diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h
index 8362b466df3a..e9a935263151 100644
--- a/include/linux/mtd/nand.h
+++ b/include/linux/mtd/nand.h
@@ -50,23 +50,20 @@ extern int nand_write_raw(struct mtd_info *mtd, loff_t to, size_t len,
 
 /*
  * Constants for hardware specific CLE/ALE/NCE function
-*/
+ *
+ * These are bits which can be or'ed to set/clear multiple
+ * bits in one go.
+ */
 /* Select the chip by setting nCE to low */
-#define NAND_CTL_SETNCE		1
-/* Deselect the chip by setting nCE to high */
-#define NAND_CTL_CLRNCE		2
+#define NAND_NCE		0x01
 /* Select the command latch by setting CLE to high */
-#define NAND_CTL_SETCLE		3
-/* Deselect the command latch by setting CLE to low */
-#define NAND_CTL_CLRCLE		4
+#define NAND_CLE		0x02
 /* Select the address latch by setting ALE to high */
-#define NAND_CTL_SETALE		5
-/* Deselect the address latch by setting ALE to low */
-#define NAND_CTL_CLRALE		6
-/* Set write protection by setting WP to high. Not used! */
-#define NAND_CTL_SETWP		7
-/* Clear write protection by setting WP to low. Not used! */
-#define NAND_CTL_CLRWP		8
+#define NAND_ALE		0x04
+
+#define NAND_CTRL_CLE		(NAND_NCE | NAND_CLE)
+#define NAND_CTRL_ALE		(NAND_NCE | NAND_ALE)
+#define NAND_CTRL_CHANGE	0x80
 
 /*
  * Standard NAND flash commands
@@ -106,6 +103,8 @@ extern int nand_write_raw(struct mtd_info *mtd, loff_t to, size_t len,
 #define NAND_CMD_STATUS_RESET	0x7f
 #define NAND_CMD_STATUS_CLEAR	0xff
 
+#define NAND_CMD_NONE		-1
+
 /* Status bits */
 #define NAND_STATUS_FAIL	0x01
 #define NAND_STATUS_FAIL_N1	0x02
@@ -263,7 +262,8 @@ struct nand_ecc_ctrl {
  * @select_chip:	[REPLACEABLE] select chip nr
  * @block_bad:		[REPLACEABLE] check, if the block is bad
  * @block_markbad:	[REPLACEABLE] mark the block bad
- * @hwcontrol:		[BOARDSPECIFIC] hardwarespecific function for accesing control-lines
+ * @cmd_ctrl:		[BOARDSPECIFIC] hardwarespecific funtion for controlling
+ *			ALE/CLE/nCE. Also used to write command and address
  * @dev_ready:		[BOARDSPECIFIC] hardwarespecific function for accesing device ready/busy line
  *			If set to NULL no access to ready/busy is available and the ready/busy information
  *			is read from the chip status register
@@ -317,7 +317,8 @@ struct nand_chip {
 	void		(*select_chip)(struct mtd_info *mtd, int chip);
 	int		(*block_bad)(struct mtd_info *mtd, loff_t ofs, int getchip);
 	int		(*block_markbad)(struct mtd_info *mtd, loff_t ofs);
-	void		(*hwcontrol)(struct mtd_info *mtd, int cmd);
+	void		(*cmd_ctrl)(struct mtd_info *mtd, int dat,
+				    unsigned int ctrl);
 	int		(*dev_ready)(struct mtd_info *mtd);
 	void		(*cmdfunc)(struct mtd_info *mtd, unsigned command, int column, int page_addr);
 	int		(*waitfunc)(struct mtd_info *mtd, struct nand_chip *this, int state);
-- 
cgit v1.2.3


From cad74f2c380411ae7bee997f3ba18834cfe313a2 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@cruncher.tec.linutronix.de>
Date: Tue, 23 May 2006 23:28:48 +0200
Subject: [MTD] NAND remove write_byte/word function from nand_chip

The previous change of the command / hardware control allows to
remove the write_byte/word functions completely, as their only
user were nand_command and nand_command_lp.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 drivers/mtd/nand/ams-delta.c   |  1 -
 drivers/mtd/nand/au1550nd.c    | 31 ++++++++---------------------
 drivers/mtd/nand/cs553x_nand.c |  1 -
 drivers/mtd/nand/diskonchip.c  | 11 ++++++-----
 drivers/mtd/nand/nand_base.c   | 45 ------------------------------------------
 drivers/mtd/nand/nandsim.c     | 13 ------------
 include/linux/mtd/nand.h       |  5 -----
 7 files changed, 14 insertions(+), 93 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/nand/ams-delta.c b/drivers/mtd/nand/ams-delta.c
index c0e96860686e..d7897dc6b3c8 100644
--- a/drivers/mtd/nand/ams-delta.c
+++ b/drivers/mtd/nand/ams-delta.c
@@ -179,7 +179,6 @@ static int __init ams_delta_init(void)
 	this->IO_ADDR_R = (OMAP_MPUIO_BASE + OMAP_MPUIO_INPUT_LATCH);
 	this->IO_ADDR_W = (OMAP_MPUIO_BASE + OMAP_MPUIO_OUTPUT);
 	this->read_byte = ams_delta_read_byte;
-	this->write_byte = ams_delta_write_byte;
 	this->write_buf = ams_delta_write_buf;
 	this->read_buf = ams_delta_read_buf;
 	this->verify_buf = ams_delta_verify_buf;
diff --git a/drivers/mtd/nand/au1550nd.c b/drivers/mtd/nand/au1550nd.c
index 275453ea7a71..31228334da12 100644
--- a/drivers/mtd/nand/au1550nd.c
+++ b/drivers/mtd/nand/au1550nd.c
@@ -40,6 +40,7 @@
 static struct mtd_info *au1550_mtd = NULL;
 static void __iomem *p_nand;
 static int nand_width = 1;	/* default x8 */
+static void (*au1550_write_byte)(struct mtd_info *, u_char);
 
 /*
  * Define partitions for flash device
@@ -128,21 +129,6 @@ static u16 au_read_word(struct mtd_info *mtd)
 	return ret;
 }
 
-/**
- * au_write_word -  write one word to the chip
- * @mtd:	MTD device structure
- * @word:	data word to write
- *
- *  write function for 16bit buswith without
- * endianess conversion
- */
-static void au_write_word(struct mtd_info *mtd, u16 word)
-{
-	struct nand_chip *this = mtd->priv;
-	writew(word, this->IO_ADDR_W);
-	au_sync();
-}
-
 /**
  * au_write_buf -  write buffer to chip
  * @mtd:	MTD device structure
@@ -379,9 +365,9 @@ static void au1550_command(struct mtd_info *mtd, unsigned command, int column, i
 			column -= 256;
 			readcmd = NAND_CMD_READ1;
 		}
-		this->write_byte(mtd, readcmd);
+		au1550_write_byte(mtd, readcmd);
 	}
-	this->write_byte(mtd, command);
+	au1550_write_byte(mtd, command);
 
 	/* Set ALE and clear CLE to start address cycle */
 	au1550_hwcontrol(mtd, NAND_CTL_CLRCLE);
@@ -394,10 +380,10 @@ static void au1550_command(struct mtd_info *mtd, unsigned command, int column, i
 			/* Adjust columns for 16 bit buswidth */
 			if (this->options & NAND_BUSWIDTH_16)
 				column >>= 1;
-			this->write_byte(mtd, column);
+			au1550_write_byte(mtd, column);
 		}
 		if (page_addr != -1) {
-			this->write_byte(mtd, (u8)(page_addr & 0xff));
+			au1550_write_byte(mtd, (u8)(page_addr & 0xff));
 
 			if (command == NAND_CMD_READ0 ||
 			    command == NAND_CMD_READ1 ||
@@ -415,11 +401,11 @@ static void au1550_command(struct mtd_info *mtd, unsigned command, int column, i
 				au1550_hwcontrol(mtd, NAND_CTL_SETNCE);
 			}
 
-			this->write_byte(mtd, (u8)(page_addr >> 8));
+			au1550_write_byte(mtd, (u8)(page_addr >> 8));
 
 			/* One more address cycle for devices > 32MiB */
 			if (this->chipsize > (32 << 20))
-				this->write_byte(mtd, (u8)((page_addr >> 16) & 0x0f));
+				au1550_write_byte(mtd, (u8)((page_addr >> 16) & 0x0f));
 		}
 		/* Latch in address */
 		au1550_hwcontrol(mtd, NAND_CTL_CLRALE);
@@ -597,8 +583,7 @@ static int __init au1xxx_nand_init(void)
 		this->options |= NAND_BUSWIDTH_16;
 
 	this->read_byte = (!nand_width) ? au_read_byte16 : au_read_byte;
-	this->write_byte = (!nand_width) ? au_write_byte16 : au_write_byte;
-	this->write_word = au_write_word;
+	au1550_write_byte = (!nand_width) ? au_write_byte16 : au_write_byte;
 	this->read_word = au_read_word;
 	this->write_buf = (!nand_width) ? au_write_buf16 : au_write_buf;
 	this->read_buf = (!nand_width) ? au_read_buf16 : au_read_buf;
diff --git a/drivers/mtd/nand/cs553x_nand.c b/drivers/mtd/nand/cs553x_nand.c
index cd3d7eb132f9..1e0348ae325f 100644
--- a/drivers/mtd/nand/cs553x_nand.c
+++ b/drivers/mtd/nand/cs553x_nand.c
@@ -220,7 +220,6 @@ static int __init cs553x_init_one(int cs, int mmio, unsigned long adr)
 	this->cmd_ctrl = cs553x_hwcontrol;
 	this->dev_ready = cs553x_device_ready;
 	this->read_byte = cs553x_read_byte;
-	this->write_byte = cs553x_write_byte;
 	this->read_buf = cs553x_read_buf;
 	this->write_buf = cs553x_write_buf;
 
diff --git a/drivers/mtd/nand/diskonchip.c b/drivers/mtd/nand/diskonchip.c
index e4bb6b429f87..2ec9080e2b14 100644
--- a/drivers/mtd/nand/diskonchip.c
+++ b/drivers/mtd/nand/diskonchip.c
@@ -717,8 +717,12 @@ static void doc200x_hwcontrol(struct mtd_info *mtd, int cmd,
 		/* 11.4.3 -- 4 NOPs after CSDNControl write */
 		DoC_Delay(doc, 4);
 	}
-	if (cmd != NAND_CMD_NONE)
-		this->write_byte(mtd, cmd);
+	if (cmd != NAND_CMD_NONE) {
+		if (DoC_is_2000(doc))
+			doc2000_write_byte(mtd, cmd);
+		else
+			doc2001_write_byte(mtd, cmd);
+	}
 }
 
 static void doc2001plus_command(struct mtd_info *mtd, unsigned command, int column, int page_addr)
@@ -1435,7 +1439,6 @@ static inline int __init doc2000_init(struct mtd_info *mtd)
 	struct nand_chip *this = mtd->priv;
 	struct doc_priv *doc = this->priv;
 
-	this->write_byte = doc2000_write_byte;
 	this->read_byte = doc2000_read_byte;
 	this->write_buf = doc2000_writebuf;
 	this->read_buf = doc2000_readbuf;
@@ -1453,7 +1456,6 @@ static inline int __init doc2001_init(struct mtd_info *mtd)
 	struct nand_chip *this = mtd->priv;
 	struct doc_priv *doc = this->priv;
 
-	this->write_byte = doc2001_write_byte;
 	this->read_byte = doc2001_read_byte;
 	this->write_buf = doc2001_writebuf;
 	this->read_buf = doc2001_readbuf;
@@ -1485,7 +1487,6 @@ static inline int __init doc2001plus_init(struct mtd_info *mtd)
 	struct nand_chip *this = mtd->priv;
 	struct doc_priv *doc = this->priv;
 
-	this->write_byte = NULL;
 	this->read_byte = doc2001plus_read_byte;
 	this->write_buf = doc2001plus_writebuf;
 	this->read_buf = doc2001plus_readbuf;
diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c
index f6997fb77b91..4f387c8388d7 100644
--- a/drivers/mtd/nand/nand_base.c
+++ b/drivers/mtd/nand/nand_base.c
@@ -197,19 +197,6 @@ static uint8_t nand_read_byte(struct mtd_info *mtd)
 	return readb(this->IO_ADDR_R);
 }
 
-/**
- * nand_write_byte - [DEFAULT] write one byte to the chip
- * @mtd:	MTD device structure
- * @byte:	pointer to data byte to write
- *
- * Default write function for 8it buswith
- */
-static void nand_write_byte(struct mtd_info *mtd, uint8_t byte)
-{
-	struct nand_chip *this = mtd->priv;
-	writeb(byte, this->IO_ADDR_W);
-}
-
 /**
  * nand_read_byte16 - [DEFAULT] read one byte endianess aware from the chip
  * @mtd:	MTD device structure
@@ -223,20 +210,6 @@ static uint8_t nand_read_byte16(struct mtd_info *mtd)
 	return (uint8_t) cpu_to_le16(readw(this->IO_ADDR_R));
 }
 
-/**
- * nand_write_byte16 - [DEFAULT] write one byte endianess aware to the chip
- * @mtd:	MTD device structure
- * @byte:	pointer to data byte to write
- *
- * Default write function for 16bit buswith with
- * endianess conversion
- */
-static void nand_write_byte16(struct mtd_info *mtd, uint8_t byte)
-{
-	struct nand_chip *this = mtd->priv;
-	writew(le16_to_cpu((u16) byte), this->IO_ADDR_W);
-}
-
 /**
  * nand_read_word - [DEFAULT] read one word from the chip
  * @mtd:	MTD device structure
@@ -250,20 +223,6 @@ static u16 nand_read_word(struct mtd_info *mtd)
 	return readw(this->IO_ADDR_R);
 }
 
-/**
- * nand_write_word - [DEFAULT] write one word to the chip
- * @mtd:	MTD device structure
- * @word:	data word to write
- *
- * Default write function for 16bit buswith without
- * endianess conversion
- */
-static void nand_write_word(struct mtd_info *mtd, u16 word)
-{
-	struct nand_chip *this = mtd->priv;
-	writew(word, this->IO_ADDR_W);
-}
-
 /**
  * nand_select_chip - [DEFAULT] control CE line
  * @mtd:	MTD device structure
@@ -2200,12 +2159,8 @@ static void nand_set_defaults(struct nand_chip *this, int busw)
 
 	if (!this->select_chip)
 		this->select_chip = nand_select_chip;
-	if (!this->write_byte)
-		this->write_byte = busw ? nand_write_byte16 : nand_write_byte;
 	if (!this->read_byte)
 		this->read_byte = busw ? nand_read_byte16 : nand_read_byte;
-	if (!this->write_word)
-		this->write_word = nand_write_word;
 	if (!this->read_word)
 		this->read_word = nand_read_word;
 	if (!this->block_bad)
diff --git a/drivers/mtd/nand/nandsim.c b/drivers/mtd/nand/nandsim.c
index ecf727b32dec..ebd64abc8be8 100644
--- a/drivers/mtd/nand/nandsim.c
+++ b/drivers/mtd/nand/nandsim.c
@@ -1326,17 +1326,6 @@ ns_nand_read_word(struct mtd_info *mtd)
 	return chip->read_byte(mtd) | (chip->read_byte(mtd) << 8);
 }
 
-static void
-ns_nand_write_word(struct mtd_info *mtd, uint16_t word)
-{
-	struct nand_chip *chip = (struct nand_chip *)mtd->priv;
-
-	NS_DBG("write_word\n");
-
-	chip->write_byte(mtd, word & 0xFF);
-	chip->write_byte(mtd, word >> 8);
-}
-
 static void
 ns_nand_write_buf(struct mtd_info *mtd, const u_char *buf, int len)
 {
@@ -1467,11 +1456,9 @@ static int __init ns_init_module(void)
 	chip->cmd_ctrl	 = ns_hwcontrol;
 	chip->read_byte  = ns_nand_read_byte;
 	chip->dev_ready  = ns_device_ready;
-	chip->write_byte = ns_nand_write_byte;
 	chip->write_buf  = ns_nand_write_buf;
 	chip->read_buf   = ns_nand_read_buf;
 	chip->verify_buf = ns_nand_verify_buf;
-	chip->write_word = ns_nand_write_word;
 	chip->read_word  = ns_nand_read_word;
 	chip->ecc.mode   = NAND_ECC_SOFT;
 	chip->options   |= NAND_SKIP_BBTSCAN;
diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h
index e9a935263151..2c0fb6380461 100644
--- a/include/linux/mtd/nand.h
+++ b/include/linux/mtd/nand.h
@@ -253,9 +253,7 @@ struct nand_ecc_ctrl {
  * @IO_ADDR_R:		[BOARDSPECIFIC] address to read the 8 I/O lines of the flash device
  * @IO_ADDR_W:		[BOARDSPECIFIC] address to write the 8 I/O lines of the flash device
  * @read_byte:		[REPLACEABLE] read one byte from the chip
- * @write_byte:		[REPLACEABLE] write one byte to the chip
  * @read_word:		[REPLACEABLE] read one word from the chip
- * @write_word:		[REPLACEABLE] write one word to the chip
  * @write_buf:		[REPLACEABLE] write data from the buffer to the chip
  * @read_buf:		[REPLACEABLE] read data from the chip into the buffer
  * @verify_buf:		[REPLACEABLE] verify buffer contents against the chip data
@@ -307,10 +305,7 @@ struct nand_chip {
 	void  __iomem	*IO_ADDR_W;
 
 	uint8_t		(*read_byte)(struct mtd_info *mtd);
-	void		(*write_byte)(struct mtd_info *mtd, uint8_t byte);
 	u16		(*read_word)(struct mtd_info *mtd);
-	void		(*write_word)(struct mtd_info *mtd, u16 word);
-
 	void		(*write_buf)(struct mtd_info *mtd, const uint8_t *buf, int len);
 	void		(*read_buf)(struct mtd_info *mtd, uint8_t *buf, int len);
 	int		(*verify_buf)(struct mtd_info *mtd, const uint8_t *buf, int len);
-- 
cgit v1.2.3


From 4c5c81613b0eb0dba97a8f312a2f1162f39fd47b Mon Sep 17 00:00:00 2001
From: Andrew Chew <achew@nvidia.com>
Date: Thu, 20 Apr 2006 15:54:26 -0700
Subject: [PATCH] sata_nv: Add MCP61 support

Added MCP61 SATA support to sata_nv.

Signed-off-by: Jeff Garzik <jeff@garzik.org>
---
 drivers/ide/pci/amd74xx.c | 7 +++++--
 drivers/scsi/sata_nv.c    | 6 ++++++
 include/linux/pci_ids.h   | 4 ++++
 3 files changed, 15 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ide/pci/amd74xx.c b/drivers/ide/pci/amd74xx.c
index b22ee5462318..6e9dbf4d8077 100644
--- a/drivers/ide/pci/amd74xx.c
+++ b/drivers/ide/pci/amd74xx.c
@@ -74,6 +74,7 @@ static struct amd_ide_chip {
 	{ PCI_DEVICE_ID_NVIDIA_NFORCE_MCP04_IDE,	0x50, AMD_UDMA_133 },
 	{ PCI_DEVICE_ID_NVIDIA_NFORCE_MCP51_IDE,	0x50, AMD_UDMA_133 },
 	{ PCI_DEVICE_ID_NVIDIA_NFORCE_MCP55_IDE,	0x50, AMD_UDMA_133 },
+	{ PCI_DEVICE_ID_NVIDIA_NFORCE_MCP61_IDE,	0x50, AMD_UDMA_133 },
 	{ PCI_DEVICE_ID_AMD_CS5536_IDE,			0x40, AMD_UDMA_100 },
 	{ 0 }
 };
@@ -488,7 +489,8 @@ static ide_pci_device_t amd74xx_chipsets[] __devinitdata = {
 	/* 14 */ DECLARE_NV_DEV("NFORCE-MCP04"),
 	/* 15 */ DECLARE_NV_DEV("NFORCE-MCP51"),
 	/* 16 */ DECLARE_NV_DEV("NFORCE-MCP55"),
-	/* 17 */ DECLARE_AMD_DEV("AMD5536"),
+	/* 17 */ DECLARE_NV_DEV("NFORCE-MCP61"),
+	/* 18 */ DECLARE_AMD_DEV("AMD5536"),
 };
 
 static int __devinit amd74xx_probe(struct pci_dev *dev, const struct pci_device_id *id)
@@ -525,7 +527,8 @@ static struct pci_device_id amd74xx_pci_tbl[] = {
 	{ PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_NFORCE_MCP04_IDE,	PCI_ANY_ID, PCI_ANY_ID, 0, 0, 14 },
 	{ PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_NFORCE_MCP51_IDE,	PCI_ANY_ID, PCI_ANY_ID, 0, 0, 15 },
 	{ PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_NFORCE_MCP55_IDE,	PCI_ANY_ID, PCI_ANY_ID, 0, 0, 16 },
-	{ PCI_VENDOR_ID_AMD,	PCI_DEVICE_ID_AMD_CS5536_IDE,		PCI_ANY_ID, PCI_ANY_ID, 0, 0, 17 },
+	{ PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_NFORCE_MCP61_IDE,	PCI_ANY_ID, PCI_ANY_ID, 0, 0, 17 },
+	{ PCI_VENDOR_ID_AMD,	PCI_DEVICE_ID_AMD_CS5536_IDE,		PCI_ANY_ID, PCI_ANY_ID, 0, 0, 18 },
 	{ 0, },
 };
 MODULE_DEVICE_TABLE(pci, amd74xx_pci_tbl);
diff --git a/drivers/scsi/sata_nv.c b/drivers/scsi/sata_nv.c
index 70c51088d371..043ff4fd9ba7 100644
--- a/drivers/scsi/sata_nv.c
+++ b/drivers/scsi/sata_nv.c
@@ -140,6 +140,12 @@ static const struct pci_device_id nv_pci_tbl[] = {
 		PCI_ANY_ID, PCI_ANY_ID, 0, 0, GENERIC },
 	{ PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_NFORCE_MCP55_SATA2,
 		PCI_ANY_ID, PCI_ANY_ID, 0, 0, GENERIC },
+	{ PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_NFORCE_MCP61_SATA,
+		PCI_ANY_ID, PCI_ANY_ID, 0, 0, GENERIC },
+	{ PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_NFORCE_MCP61_SATA2,
+		PCI_ANY_ID, PCI_ANY_ID, 0, 0, GENERIC },
+	{ PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_NFORCE_MCP61_SATA3,
+		PCI_ANY_ID, PCI_ANY_ID, 0, 0, GENERIC },
 	{ PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID,
 		PCI_ANY_ID, PCI_ANY_ID,
 		PCI_CLASS_STORAGE_IDE<<8, 0xffff00, GENERIC },
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index d6fe048376ab..233f60741c82 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -1182,6 +1182,10 @@
 #define PCI_DEVICE_ID_NVIDIA_QUADRO_FX_1100         0x034E
 #define PCI_DEVICE_ID_NVIDIA_NVENET_14              0x0372
 #define PCI_DEVICE_ID_NVIDIA_NVENET_15              0x0373
+#define PCI_DEVICE_ID_NVIDIA_NFORCE_MCP61_SATA      0x03E7
+#define PCI_DEVICE_ID_NVIDIA_NFORCE_MCP61_IDE       0x03EC
+#define PCI_DEVICE_ID_NVIDIA_NFORCE_MCP61_SATA2     0x03F6
+#define PCI_DEVICE_ID_NVIDIA_NFORCE_MCP61_SATA3     0x03F7
 
 #define PCI_VENDOR_ID_IMS		0x10e0
 #define PCI_DEVICE_ID_IMS_TT128		0x9128
-- 
cgit v1.2.3


From a6b2c5d4754dc539a560fdf0d3fb78a14174394a Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@lxorguk.ukuu.org.uk>
Date: Mon, 22 May 2006 16:59:59 +0100
Subject: [PATCH] PATCH: libata. Add ->data_xfer method

We need to pass the device in order to do per device checks such as
32bit I/O enables. With the changes to include dev->ap we now don't have
to add parameters however just clean them up. Also add data_xfer methods
to the existing drivers except ata_piix (which is in the other block of
patches). If you reject the piix one just add a data_xfer to it...

Signed-off-by: Alan Cox <alan@redhat.com>
Signed-off-by: Jeff Garzik <jeff@garzik.org>
---
 drivers/scsi/libata-core.c  | 63 +++++++++++++--------------------------------
 drivers/scsi/sata_mv.c      |  2 ++
 drivers/scsi/sata_nv.c      |  1 +
 drivers/scsi/sata_promise.c |  2 ++
 drivers/scsi/sata_qstor.c   |  1 +
 drivers/scsi/sata_sil.c     |  1 +
 drivers/scsi/sata_sis.c     |  1 +
 drivers/scsi/sata_svw.c     |  1 +
 drivers/scsi/sata_sx4.c     |  1 +
 drivers/scsi/sata_uli.c     |  1 +
 drivers/scsi/sata_via.c     |  1 +
 drivers/scsi/sata_vsc.c     |  1 +
 include/linux/libata.h      |  6 +++++
 13 files changed, 37 insertions(+), 45 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c
index 45b6b29bc10f..074a46e5bbdd 100644
--- a/drivers/scsi/libata-core.c
+++ b/drivers/scsi/libata-core.c
@@ -3527,7 +3527,7 @@ void swap_buf_le16(u16 *buf, unsigned int buf_words)
 
 /**
  *	ata_mmio_data_xfer - Transfer data by MMIO
- *	@ap: port to read/write
+ *	@dev: device for this I/O
  *	@buf: data buffer
  *	@buflen: buffer length
  *	@write_data: read/write
@@ -3538,9 +3538,10 @@ void swap_buf_le16(u16 *buf, unsigned int buf_words)
  *	Inherited from caller.
  */
 
-static void ata_mmio_data_xfer(struct ata_port *ap, unsigned char *buf,
-			       unsigned int buflen, int write_data)
+void ata_mmio_data_xfer(struct ata_device *adev, unsigned char *buf, 
+			unsigned int buflen, int write_data)
 {
+	struct ata_port *ap = adev->ap;
 	unsigned int i;
 	unsigned int words = buflen >> 1;
 	u16 *buf16 = (u16 *) buf;
@@ -3572,7 +3573,7 @@ static void ata_mmio_data_xfer(struct ata_port *ap, unsigned char *buf,
 
 /**
  *	ata_pio_data_xfer - Transfer data by PIO
- *	@ap: port to read/write
+ *	@adev: device to target
  *	@buf: data buffer
  *	@buflen: buffer length
  *	@write_data: read/write
@@ -3583,9 +3584,10 @@ static void ata_mmio_data_xfer(struct ata_port *ap, unsigned char *buf,
  *	Inherited from caller.
  */
 
-static void ata_pio_data_xfer(struct ata_port *ap, unsigned char *buf,
-			      unsigned int buflen, int write_data)
+void ata_pio_data_xfer(struct ata_device *adev, unsigned char *buf, 
+		       unsigned int buflen, int write_data)
 {
+	struct ata_port *ap = adev->ap;
 	unsigned int words = buflen >> 1;
 
 	/* Transfer multiple of 2 bytes */
@@ -3609,39 +3611,6 @@ static void ata_pio_data_xfer(struct ata_port *ap, unsigned char *buf,
 	}
 }
 
-/**
- *	ata_data_xfer - Transfer data from/to the data register.
- *	@ap: port to read/write
- *	@buf: data buffer
- *	@buflen: buffer length
- *	@do_write: read/write
- *
- *	Transfer data from/to the device data register.
- *
- *	LOCKING:
- *	Inherited from caller.
- */
-
-static void ata_data_xfer(struct ata_port *ap, unsigned char *buf,
-			  unsigned int buflen, int do_write)
-{
-	/* Make the crap hardware pay the costs not the good stuff */
-	if (unlikely(ap->flags & ATA_FLAG_IRQ_MASK)) {
-		unsigned long flags;
-		local_irq_save(flags);
-		if (ap->flags & ATA_FLAG_MMIO)
-			ata_mmio_data_xfer(ap, buf, buflen, do_write);
-		else
-			ata_pio_data_xfer(ap, buf, buflen, do_write);
-		local_irq_restore(flags);
-	} else {
-		if (ap->flags & ATA_FLAG_MMIO)
-			ata_mmio_data_xfer(ap, buf, buflen, do_write);
-		else
-			ata_pio_data_xfer(ap, buf, buflen, do_write);
-	}
-}
-
 /**
  *	ata_pio_sector - Transfer ATA_SECT_SIZE (512 bytes) of data.
  *	@qc: Command on going
@@ -3676,17 +3645,18 @@ static void ata_pio_sector(struct ata_queued_cmd *qc)
 	if (PageHighMem(page)) {
 		unsigned long flags;
 
+		/* FIXME: use a bounce buffer */
 		local_irq_save(flags);
 		buf = kmap_atomic(page, KM_IRQ0);
 
 		/* do the actual data transfer */
-		ata_data_xfer(ap, buf + offset, ATA_SECT_SIZE, do_write);
+		ap->ops->data_xfer(qc->dev, buf + offset, ATA_SECT_SIZE, do_write);
 
 		kunmap_atomic(buf, KM_IRQ0);
 		local_irq_restore(flags);
 	} else {
 		buf = page_address(page);
-		ata_data_xfer(ap, buf + offset, ATA_SECT_SIZE, do_write);
+		ap->ops->data_xfer(qc->dev, buf + offset, ATA_SECT_SIZE, do_write);
 	}
 
 	qc->cursect++;
@@ -3742,7 +3712,7 @@ static void atapi_send_cdb(struct ata_port *ap, struct ata_queued_cmd *qc)
 	DPRINTK("send cdb\n");
 	WARN_ON(qc->dev->cdb_len < 12);
 
-	ata_data_xfer(ap, qc->cdb, qc->dev->cdb_len, 1);
+	ap->ops->data_xfer(qc->dev, qc->cdb, qc->dev->cdb_len, 1);
 	ata_altstatus(ap); /* flush */
 
 	switch (qc->tf.protocol) {
@@ -3802,7 +3772,7 @@ next_sg:
 				       "%u bytes trailing data\n", bytes);
 
 		for (i = 0; i < words; i++)
-			ata_data_xfer(ap, (unsigned char*)pad_buf, 2, do_write);
+			ap->ops->data_xfer(qc->dev, (unsigned char*)pad_buf, 2, do_write);
 
 		ap->hsm_task_state = HSM_ST_LAST;
 		return;
@@ -3828,17 +3798,18 @@ next_sg:
 	if (PageHighMem(page)) {
 		unsigned long flags;
 
+		/* FIXME: use bounce buffer */
 		local_irq_save(flags);
 		buf = kmap_atomic(page, KM_IRQ0);
 
 		/* do the actual data transfer */
-		ata_data_xfer(ap, buf + offset, count, do_write);
+		ap->ops->data_xfer(qc->dev,  buf + offset, count, do_write);
 
 		kunmap_atomic(buf, KM_IRQ0);
 		local_irq_restore(flags);
 	} else {
 		buf = page_address(page);
-		ata_data_xfer(ap, buf + offset, count, do_write);
+		ap->ops->data_xfer(qc->dev,  buf + offset, count, do_write);
 	}
 
 	bytes -= count;
@@ -5702,6 +5673,8 @@ EXPORT_SYMBOL_GPL(ata_port_start);
 EXPORT_SYMBOL_GPL(ata_port_stop);
 EXPORT_SYMBOL_GPL(ata_host_stop);
 EXPORT_SYMBOL_GPL(ata_interrupt);
+EXPORT_SYMBOL_GPL(ata_mmio_data_xfer);
+EXPORT_SYMBOL_GPL(ata_pio_data_xfer);
 EXPORT_SYMBOL_GPL(ata_qc_prep);
 EXPORT_SYMBOL_GPL(ata_noop_qc_prep);
 EXPORT_SYMBOL_GPL(ata_bmdma_setup);
diff --git a/drivers/scsi/sata_mv.c b/drivers/scsi/sata_mv.c
index 691c115ac8e1..bf3529fdea9c 100644
--- a/drivers/scsi/sata_mv.c
+++ b/drivers/scsi/sata_mv.c
@@ -406,6 +406,7 @@ static const struct ata_port_operations mv5_ops = {
 
 	.qc_prep		= mv_qc_prep,
 	.qc_issue		= mv_qc_issue,
+	.data_xfer		= ata_mmio_data_xfer,
 
 	.eng_timeout		= mv_eng_timeout,
 
@@ -433,6 +434,7 @@ static const struct ata_port_operations mv6_ops = {
 
 	.qc_prep		= mv_qc_prep,
 	.qc_issue		= mv_qc_issue,
+	.data_xfer		= ata_mmio_data_xfer,
 
 	.eng_timeout		= mv_eng_timeout,
 
diff --git a/drivers/scsi/sata_nv.c b/drivers/scsi/sata_nv.c
index 043ff4fd9ba7..d93513ef7412 100644
--- a/drivers/scsi/sata_nv.c
+++ b/drivers/scsi/sata_nv.c
@@ -234,6 +234,7 @@ static const struct ata_port_operations nv_ops = {
 	.qc_prep		= ata_qc_prep,
 	.qc_issue		= ata_qc_issue_prot,
 	.eng_timeout		= ata_eng_timeout,
+	.data_xfer		= ata_pio_data_xfer,
 	.irq_handler		= nv_interrupt,
 	.irq_clear		= ata_bmdma_irq_clear,
 	.scr_read		= nv_scr_read,
diff --git a/drivers/scsi/sata_promise.c b/drivers/scsi/sata_promise.c
index 285ab0263d91..01111594d09c 100644
--- a/drivers/scsi/sata_promise.c
+++ b/drivers/scsi/sata_promise.c
@@ -137,6 +137,7 @@ static const struct ata_port_operations pdc_sata_ops = {
 	.qc_prep		= pdc_qc_prep,
 	.qc_issue		= pdc_qc_issue_prot,
 	.eng_timeout		= pdc_eng_timeout,
+	.data_xfer		= ata_mmio_data_xfer,
 	.irq_handler		= pdc_interrupt,
 	.irq_clear		= pdc_irq_clear,
 
@@ -159,6 +160,7 @@ static const struct ata_port_operations pdc_pata_ops = {
 
 	.qc_prep		= pdc_qc_prep,
 	.qc_issue		= pdc_qc_issue_prot,
+	.data_xfer		= ata_mmio_data_xfer,
 	.eng_timeout		= pdc_eng_timeout,
 	.irq_handler		= pdc_interrupt,
 	.irq_clear		= pdc_irq_clear,
diff --git a/drivers/scsi/sata_qstor.c b/drivers/scsi/sata_qstor.c
index 54283e06070e..68737cadd2d4 100644
--- a/drivers/scsi/sata_qstor.c
+++ b/drivers/scsi/sata_qstor.c
@@ -156,6 +156,7 @@ static const struct ata_port_operations qs_ata_ops = {
 	.phy_reset		= qs_phy_reset,
 	.qc_prep		= qs_qc_prep,
 	.qc_issue		= qs_qc_issue,
+	.data_xfer		= ata_mmio_data_xfer,
 	.eng_timeout		= qs_eng_timeout,
 	.irq_handler		= qs_intr,
 	.irq_clear		= qs_irq_clear,
diff --git a/drivers/scsi/sata_sil.c b/drivers/scsi/sata_sil.c
index aa63044eed2e..3bd807738698 100644
--- a/drivers/scsi/sata_sil.c
+++ b/drivers/scsi/sata_sil.c
@@ -176,6 +176,7 @@ static const struct ata_port_operations sil_ops = {
 	.bmdma_status		= ata_bmdma_status,
 	.qc_prep		= ata_qc_prep,
 	.qc_issue		= ata_qc_issue_prot,
+	.data_xfer		= ata_mmio_data_xfer,
 	.freeze			= sil_freeze,
 	.thaw			= sil_thaw,
 	.error_handler		= ata_bmdma_error_handler,
diff --git a/drivers/scsi/sata_sis.c b/drivers/scsi/sata_sis.c
index 3097821688dc..82a07bff7e91 100644
--- a/drivers/scsi/sata_sis.c
+++ b/drivers/scsi/sata_sis.c
@@ -113,6 +113,7 @@ static const struct ata_port_operations sis_ops = {
 	.bmdma_status		= ata_bmdma_status,
 	.qc_prep		= ata_qc_prep,
 	.qc_issue		= ata_qc_issue_prot,
+	.data_xfer		= ata_pio_data_xfer,
 	.eng_timeout		= ata_eng_timeout,
 	.irq_handler		= ata_interrupt,
 	.irq_clear		= ata_bmdma_irq_clear,
diff --git a/drivers/scsi/sata_svw.c b/drivers/scsi/sata_svw.c
index d5eb5375e265..7a4703bfa12a 100644
--- a/drivers/scsi/sata_svw.c
+++ b/drivers/scsi/sata_svw.c
@@ -320,6 +320,7 @@ static const struct ata_port_operations k2_sata_ops = {
 	.bmdma_status		= ata_bmdma_status,
 	.qc_prep		= ata_qc_prep,
 	.qc_issue		= ata_qc_issue_prot,
+	.data_xfer		= ata_mmio_data_xfer,
 	.eng_timeout		= ata_eng_timeout,
 	.irq_handler		= ata_interrupt,
 	.irq_clear		= ata_bmdma_irq_clear,
diff --git a/drivers/scsi/sata_sx4.c b/drivers/scsi/sata_sx4.c
index e799ef35e9db..c4db6bf14a25 100644
--- a/drivers/scsi/sata_sx4.c
+++ b/drivers/scsi/sata_sx4.c
@@ -204,6 +204,7 @@ static const struct ata_port_operations pdc_20621_ops = {
 	.phy_reset		= pdc_20621_phy_reset,
 	.qc_prep		= pdc20621_qc_prep,
 	.qc_issue		= pdc20621_qc_issue_prot,
+	.data_xfer		= ata_mmio_data_xfer,
 	.eng_timeout		= pdc_eng_timeout,
 	.irq_handler		= pdc20621_interrupt,
 	.irq_clear		= pdc20621_irq_clear,
diff --git a/drivers/scsi/sata_uli.c b/drivers/scsi/sata_uli.c
index 15f81bfc30f0..7fae3e06e461 100644
--- a/drivers/scsi/sata_uli.c
+++ b/drivers/scsi/sata_uli.c
@@ -110,6 +110,7 @@ static const struct ata_port_operations uli_ops = {
 	.bmdma_status		= ata_bmdma_status,
 	.qc_prep		= ata_qc_prep,
 	.qc_issue		= ata_qc_issue_prot,
+	.data_xfer		= ata_pio_data_xfer,
 
 	.eng_timeout		= ata_eng_timeout,
 
diff --git a/drivers/scsi/sata_via.c b/drivers/scsi/sata_via.c
index 17aefab5f42f..1c9e2f36805a 100644
--- a/drivers/scsi/sata_via.c
+++ b/drivers/scsi/sata_via.c
@@ -124,6 +124,7 @@ static const struct ata_port_operations svia_sata_ops = {
 
 	.qc_prep		= ata_qc_prep,
 	.qc_issue		= ata_qc_issue_prot,
+	.data_xfer		= ata_pio_data_xfer,
 
 	.eng_timeout		= ata_eng_timeout,
 
diff --git a/drivers/scsi/sata_vsc.c b/drivers/scsi/sata_vsc.c
index 0372be7ff1c9..438e7c6a0f8f 100644
--- a/drivers/scsi/sata_vsc.c
+++ b/drivers/scsi/sata_vsc.c
@@ -297,6 +297,7 @@ static const struct ata_port_operations vsc_sata_ops = {
 	.bmdma_status		= ata_bmdma_status,
 	.qc_prep		= ata_qc_prep,
 	.qc_issue		= ata_qc_issue_prot,
+	.data_xfer		= ata_pio_data_xfer,
 	.eng_timeout		= ata_eng_timeout,
 	.irq_handler		= vsc_sata_interrupt,
 	.irq_clear		= ata_bmdma_irq_clear,
diff --git a/include/linux/libata.h b/include/linux/libata.h
index c51502c047a4..25a6bf181599 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -525,6 +525,8 @@ struct ata_port_operations {
 	void (*bmdma_setup) (struct ata_queued_cmd *qc);
 	void (*bmdma_start) (struct ata_queued_cmd *qc);
 
+	void (*data_xfer) (struct ata_device *, unsigned char *, unsigned int, int);
+
 	void (*qc_prep) (struct ata_queued_cmd *qc);
 	unsigned int (*qc_issue) (struct ata_queued_cmd *qc);
 
@@ -646,6 +648,10 @@ extern int ata_port_start (struct ata_port *ap);
 extern void ata_port_stop (struct ata_port *ap);
 extern void ata_host_stop (struct ata_host_set *host_set);
 extern irqreturn_t ata_interrupt (int irq, void *dev_instance, struct pt_regs *regs);
+extern void ata_mmio_data_xfer(struct ata_device *adev, unsigned char *buf,
+			       unsigned int buflen, int write_data);
+extern void ata_pio_data_xfer(struct ata_device *adev, unsigned char *buf,
+			      unsigned int buflen, int write_data);
 extern void ata_qc_prep(struct ata_queued_cmd *qc);
 extern void ata_noop_qc_prep(struct ata_queued_cmd *qc);
 extern unsigned int ata_qc_issue_prot(struct ata_queued_cmd *qc);
-- 
cgit v1.2.3


From 957d2df1801865eb1e63864bc63b970aa9c460ba Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@lxorguk.ukuu.org.uk>
Date: Tue, 23 May 2006 13:18:57 +0100
Subject: [PATCH] libata: Remove obsolete flag

ATA_FLAG_IRQ_MASK was added when I did the original data transfer with
IRQ masked bits for PIO. It has since been replaced by ->pio_data_xfer
methods so should be removed so nobody uses it by mistake thinking it
still works.

Signed-off-by: Alan Cox <alan@redhat.com>
Signed-off-by: Jeff Garzik <jeff@garzik.org>
---
 include/linux/libata.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/libata.h b/include/linux/libata.h
index 25a6bf181599..9c60b4a4e2fd 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -149,7 +149,6 @@ enum {
 	ATA_FLAG_NO_ATAPI	= (1 << 6), /* No ATAPI support */
 	ATA_FLAG_PIO_DMA	= (1 << 7), /* PIO cmds via DMA */
 	ATA_FLAG_PIO_LBA48	= (1 << 8), /* Host DMA engine is LBA28 only */
-	ATA_FLAG_IRQ_MASK	= (1 << 9), /* Mask IRQ in PIO xfers */
 	ATA_FLAG_PIO_POLLING	= (1 << 10), /* use polling PIO if LLD
 					      * doesn't handle PIO interrupts */
 	ATA_FLAG_NCQ		= (1 << 11), /* host supports NCQ */
-- 
cgit v1.2.3


From 7a30601b3ac7b02440ffa629fd3d2cca71c1bcd8 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@cruncher.tec.linutronix.de>
Date: Thu, 25 May 2006 09:50:16 +0200
Subject: [MTD] NAND Introduce NAND_NO_READRDY option

The nand driver has a superflous read ready / command
delay in the read functions. This was added to handle
chips which have an automatic read forward. Newer
chips do not have this functionality anymore. Add this
option to avoid the delay / I/O operation. Mark all
large page chips with the new option flag.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 drivers/mtd/nand/nand_ids.c | 165 +++++++++++++++++++++++---------------------
 include/linux/mtd/nand.h    |   4 ++
 2 files changed, 92 insertions(+), 77 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/nand/nand_ids.c b/drivers/mtd/nand/nand_ids.c
index a9d52fc6e5d7..2e2cdf2fc91d 100644
--- a/drivers/mtd/nand/nand_ids.c
+++ b/drivers/mtd/nand/nand_ids.c
@@ -18,99 +18,110 @@
 *	Name. ID code, pagesize, chipsize in MegaByte, eraseblock size,
 *	options
 *
-* 	Pagesize; 0, 256, 512
-*	0 	get this information from the extended chip ID
+*	Pagesize; 0, 256, 512
+*	0	get this information from the extended chip ID
 +	256	256 Byte page size
 *	512	512 Byte page size
 */
 struct nand_flash_dev nand_flash_ids[] = {
-	{"NAND 1MiB 5V 8-bit", 		0x6e, 256, 1, 0x1000, 0},
-	{"NAND 2MiB 5V 8-bit", 		0x64, 256, 2, 0x1000, 0},
-	{"NAND 4MiB 5V 8-bit", 		0x6b, 512, 4, 0x2000, 0},
-	{"NAND 1MiB 3,3V 8-bit", 	0xe8, 256, 1, 0x1000, 0},
-	{"NAND 1MiB 3,3V 8-bit", 	0xec, 256, 1, 0x1000, 0},
-	{"NAND 2MiB 3,3V 8-bit", 	0xea, 256, 2, 0x1000, 0},
-	{"NAND 4MiB 3,3V 8-bit", 	0xd5, 512, 4, 0x2000, 0},
-	{"NAND 4MiB 3,3V 8-bit", 	0xe3, 512, 4, 0x2000, 0},
-	{"NAND 4MiB 3,3V 8-bit", 	0xe5, 512, 4, 0x2000, 0},
-	{"NAND 8MiB 3,3V 8-bit", 	0xd6, 512, 8, 0x2000, 0},
-
-	{"NAND 8MiB 1,8V 8-bit", 	0x39, 512, 8, 0x2000, 0},
-	{"NAND 8MiB 3,3V 8-bit", 	0xe6, 512, 8, 0x2000, 0},
-	{"NAND 8MiB 1,8V 16-bit", 	0x49, 512, 8, 0x2000, NAND_BUSWIDTH_16},
-	{"NAND 8MiB 3,3V 16-bit", 	0x59, 512, 8, 0x2000, NAND_BUSWIDTH_16},
-
-	{"NAND 16MiB 1,8V 8-bit", 	0x33, 512, 16, 0x4000, 0},
-	{"NAND 16MiB 3,3V 8-bit", 	0x73, 512, 16, 0x4000, 0},
-	{"NAND 16MiB 1,8V 16-bit", 	0x43, 512, 16, 0x4000, NAND_BUSWIDTH_16},
-	{"NAND 16MiB 3,3V 16-bit", 	0x53, 512, 16, 0x4000, NAND_BUSWIDTH_16},
-
-	{"NAND 32MiB 1,8V 8-bit", 	0x35, 512, 32, 0x4000, 0},
-	{"NAND 32MiB 3,3V 8-bit", 	0x75, 512, 32, 0x4000, 0},
-	{"NAND 32MiB 1,8V 16-bit", 	0x45, 512, 32, 0x4000, NAND_BUSWIDTH_16},
-	{"NAND 32MiB 3,3V 16-bit", 	0x55, 512, 32, 0x4000, NAND_BUSWIDTH_16},
-
-	{"NAND 64MiB 1,8V 8-bit", 	0x36, 512, 64, 0x4000, 0},
-	{"NAND 64MiB 3,3V 8-bit", 	0x76, 512, 64, 0x4000, 0},
-	{"NAND 64MiB 1,8V 16-bit", 	0x46, 512, 64, 0x4000, NAND_BUSWIDTH_16},
-	{"NAND 64MiB 3,3V 16-bit", 	0x56, 512, 64, 0x4000, NAND_BUSWIDTH_16},
-
-	{"NAND 128MiB 1,8V 8-bit", 	0x78, 512, 128, 0x4000, 0},
-	{"NAND 128MiB 1,8V 8-bit", 	0x39, 512, 128, 0x4000, 0},
-	{"NAND 128MiB 3,3V 8-bit", 	0x79, 512, 128, 0x4000, 0},
-	{"NAND 128MiB 1,8V 16-bit", 	0x72, 512, 128, 0x4000, NAND_BUSWIDTH_16},
-	{"NAND 128MiB 1,8V 16-bit", 	0x49, 512, 128, 0x4000, NAND_BUSWIDTH_16},
-	{"NAND 128MiB 3,3V 16-bit", 	0x74, 512, 128, 0x4000, NAND_BUSWIDTH_16},
-	{"NAND 128MiB 3,3V 16-bit", 	0x59, 512, 128, 0x4000, NAND_BUSWIDTH_16},
-
-	{"NAND 256MiB 3,3V 8-bit", 	0x71, 512, 256, 0x4000, 0},
-
-	/* These are the new chips with large page size. The pagesize
-	* and the erasesize is determined from the extended id bytes
-	*/
+	{"NAND 1MiB 5V 8-bit",		0x6e, 256, 1, 0x1000, 0},
+	{"NAND 2MiB 5V 8-bit",		0x64, 256, 2, 0x1000, 0},
+	{"NAND 4MiB 5V 8-bit",		0x6b, 512, 4, 0x2000, 0},
+	{"NAND 1MiB 3,3V 8-bit",	0xe8, 256, 1, 0x1000, 0},
+	{"NAND 1MiB 3,3V 8-bit",	0xec, 256, 1, 0x1000, 0},
+	{"NAND 2MiB 3,3V 8-bit",	0xea, 256, 2, 0x1000, 0},
+	{"NAND 4MiB 3,3V 8-bit",	0xd5, 512, 4, 0x2000, 0},
+	{"NAND 4MiB 3,3V 8-bit",	0xe3, 512, 4, 0x2000, 0},
+	{"NAND 4MiB 3,3V 8-bit",	0xe5, 512, 4, 0x2000, 0},
+	{"NAND 8MiB 3,3V 8-bit",	0xd6, 512, 8, 0x2000, 0},
+
+	{"NAND 8MiB 1,8V 8-bit",	0x39, 512, 8, 0x2000, 0},
+	{"NAND 8MiB 3,3V 8-bit",	0xe6, 512, 8, 0x2000, 0},
+	{"NAND 8MiB 1,8V 16-bit",	0x49, 512, 8, 0x2000, NAND_BUSWIDTH_16},
+	{"NAND 8MiB 3,3V 16-bit",	0x59, 512, 8, 0x2000, NAND_BUSWIDTH_16},
+
+	{"NAND 16MiB 1,8V 8-bit",	0x33, 512, 16, 0x4000, 0},
+	{"NAND 16MiB 3,3V 8-bit",	0x73, 512, 16, 0x4000, 0},
+	{"NAND 16MiB 1,8V 16-bit",	0x43, 512, 16, 0x4000, NAND_BUSWIDTH_16},
+	{"NAND 16MiB 3,3V 16-bit",	0x53, 512, 16, 0x4000, NAND_BUSWIDTH_16},
+
+	{"NAND 32MiB 1,8V 8-bit",	0x35, 512, 32, 0x4000, 0},
+	{"NAND 32MiB 3,3V 8-bit",	0x75, 512, 32, 0x4000, 0},
+	{"NAND 32MiB 1,8V 16-bit",	0x45, 512, 32, 0x4000, NAND_BUSWIDTH_16},
+	{"NAND 32MiB 3,3V 16-bit",	0x55, 512, 32, 0x4000, NAND_BUSWIDTH_16},
+
+	{"NAND 64MiB 1,8V 8-bit",	0x36, 512, 64, 0x4000, 0},
+	{"NAND 64MiB 3,3V 8-bit",	0x76, 512, 64, 0x4000, 0},
+	{"NAND 64MiB 1,8V 16-bit",	0x46, 512, 64, 0x4000, NAND_BUSWIDTH_16},
+	{"NAND 64MiB 3,3V 16-bit",	0x56, 512, 64, 0x4000, NAND_BUSWIDTH_16},
+
+	{"NAND 128MiB 1,8V 8-bit",	0x78, 512, 128, 0x4000, 0},
+	{"NAND 128MiB 1,8V 8-bit",	0x39, 512, 128, 0x4000, 0},
+	{"NAND 128MiB 3,3V 8-bit",	0x79, 512, 128, 0x4000, 0},
+	{"NAND 128MiB 1,8V 16-bit",	0x72, 512, 128, 0x4000, NAND_BUSWIDTH_16},
+	{"NAND 128MiB 1,8V 16-bit",	0x49, 512, 128, 0x4000, NAND_BUSWIDTH_16},
+	{"NAND 128MiB 3,3V 16-bit",	0x74, 512, 128, 0x4000, NAND_BUSWIDTH_16},
+	{"NAND 128MiB 3,3V 16-bit",	0x59, 512, 128, 0x4000, NAND_BUSWIDTH_16},
+
+	{"NAND 256MiB 3,3V 8-bit",	0x71, 512, 256, 0x4000, 0},
+
+	/*
+	 * These are the new chips with large page size. The pagesize and the
+	 * erasesize is determined from the extended id bytes
+	 */
+#define LP_OPTIONS (NAND_SAMSUNG_LP_OPTIONS | NAND_NO_READRDY | NAND_NO_AUTOINCR)
+#define LP_OPTIONS16 (LP_OPTIONS | NAND_BUSWIDTH_16)
+
 	/*512 Megabit */
-	{"NAND 64MiB 1,8V 8-bit", 	0xA2, 0,  64, 0, NAND_SAMSUNG_LP_OPTIONS | NAND_NO_AUTOINCR},
-	{"NAND 64MiB 3,3V 8-bit", 	0xF2, 0,  64, 0, NAND_SAMSUNG_LP_OPTIONS | NAND_NO_AUTOINCR},
-	{"NAND 64MiB 1,8V 16-bit", 	0xB2, 0,  64, 0, NAND_SAMSUNG_LP_OPTIONS | NAND_BUSWIDTH_16 | NAND_NO_AUTOINCR},
-	{"NAND 64MiB 3,3V 16-bit", 	0xC2, 0,  64, 0, NAND_SAMSUNG_LP_OPTIONS | NAND_BUSWIDTH_16 | NAND_NO_AUTOINCR},
+	{"NAND 64MiB 1,8V 8-bit",	0xA2, 0,  64, 0, LP_OPTIONS},
+	{"NAND 64MiB 3,3V 8-bit",	0xF2, 0,  64, 0, LP_OPTIONS},
+	{"NAND 64MiB 1,8V 16-bit",	0xB2, 0,  64, 0, LP_OPTIONS16},
+	{"NAND 64MiB 3,3V 16-bit",	0xC2, 0,  64, 0, LP_OPTIONS16},
 
 	/* 1 Gigabit */
-	{"NAND 128MiB 1,8V 8-bit", 	0xA1, 0, 128, 0, NAND_SAMSUNG_LP_OPTIONS | NAND_NO_AUTOINCR},
-	{"NAND 128MiB 3,3V 8-bit", 	0xF1, 0, 128, 0, NAND_SAMSUNG_LP_OPTIONS | NAND_NO_AUTOINCR},
-	{"NAND 128MiB 1,8V 16-bit", 	0xB1, 0, 128, 0, NAND_SAMSUNG_LP_OPTIONS | NAND_BUSWIDTH_16 | NAND_NO_AUTOINCR},
-	{"NAND 128MiB 3,3V 16-bit", 	0xC1, 0, 128, 0, NAND_SAMSUNG_LP_OPTIONS | NAND_BUSWIDTH_16 | NAND_NO_AUTOINCR},
+	{"NAND 128MiB 1,8V 8-bit",	0xA1, 0, 128, 0, LP_OPTIONS},
+	{"NAND 128MiB 3,3V 8-bit",	0xF1, 0, 128, 0, LP_OPTIONS},
+	{"NAND 128MiB 1,8V 16-bit",	0xB1, 0, 128, 0, LP_OPTIONS16},
+	{"NAND 128MiB 3,3V 16-bit",	0xC1, 0, 128, 0, LP_OPTIONS16},
 
 	/* 2 Gigabit */
-	{"NAND 256MiB 1,8V 8-bit", 	0xAA, 0, 256, 0, NAND_SAMSUNG_LP_OPTIONS | NAND_NO_AUTOINCR},
-	{"NAND 256MiB 3,3V 8-bit", 	0xDA, 0, 256, 0, NAND_SAMSUNG_LP_OPTIONS | NAND_NO_AUTOINCR},
-	{"NAND 256MiB 1,8V 16-bit", 	0xBA, 0, 256, 0, NAND_SAMSUNG_LP_OPTIONS | NAND_BUSWIDTH_16 | NAND_NO_AUTOINCR},
-	{"NAND 256MiB 3,3V 16-bit", 	0xCA, 0, 256, 0, NAND_SAMSUNG_LP_OPTIONS | NAND_BUSWIDTH_16 | NAND_NO_AUTOINCR},
+	{"NAND 256MiB 1,8V 8-bit",	0xAA, 0, 256, 0, LP_OPTIONS},
+	{"NAND 256MiB 3,3V 8-bit",	0xDA, 0, 256, 0, LP_OPTIONS},
+	{"NAND 256MiB 1,8V 16-bit",	0xBA, 0, 256, 0, LP_OPTIONS16},
+	{"NAND 256MiB 3,3V 16-bit",	0xCA, 0, 256, 0, LP_OPTIONS16},
 
 	/* 4 Gigabit */
-	{"NAND 512MiB 1,8V 8-bit", 	0xAC, 0, 512, 0, NAND_SAMSUNG_LP_OPTIONS | NAND_NO_AUTOINCR},
-	{"NAND 512MiB 3,3V 8-bit", 	0xDC, 0, 512, 0, NAND_SAMSUNG_LP_OPTIONS | NAND_NO_AUTOINCR},
-	{"NAND 512MiB 1,8V 16-bit", 	0xBC, 0, 512, 0, NAND_SAMSUNG_LP_OPTIONS | NAND_BUSWIDTH_16 | NAND_NO_AUTOINCR},
-	{"NAND 512MiB 3,3V 16-bit", 	0xCC, 0, 512, 0, NAND_SAMSUNG_LP_OPTIONS | NAND_BUSWIDTH_16 | NAND_NO_AUTOINCR},
+	{"NAND 512MiB 1,8V 8-bit",	0xAC, 0, 512, 0, LP_OPTIONS},
+	{"NAND 512MiB 3,3V 8-bit",	0xDC, 0, 512, 0, LP_OPTIONS},
+	{"NAND 512MiB 1,8V 16-bit",	0xBC, 0, 512, 0, LP_OPTIONS16},
+	{"NAND 512MiB 3,3V 16-bit",	0xCC, 0, 512, 0, LP_OPTIONS16},
 
 	/* 8 Gigabit */
-	{"NAND 1GiB 1,8V 8-bit", 	0xA3, 0, 1024, 0, NAND_SAMSUNG_LP_OPTIONS | NAND_NO_AUTOINCR},
-	{"NAND 1GiB 3,3V 8-bit", 	0xD3, 0, 1024, 0, NAND_SAMSUNG_LP_OPTIONS | NAND_NO_AUTOINCR},
-	{"NAND 1GiB 1,8V 16-bit", 	0xB3, 0, 1024, 0, NAND_SAMSUNG_LP_OPTIONS | NAND_BUSWIDTH_16 | NAND_NO_AUTOINCR},
-	{"NAND 1GiB 3,3V 16-bit", 	0xC3, 0, 1024, 0, NAND_SAMSUNG_LP_OPTIONS | NAND_BUSWIDTH_16 | NAND_NO_AUTOINCR},
+	{"NAND 1GiB 1,8V 8-bit",	0xA3, 0, 1024, 0, LP_OPTIONS},
+	{"NAND 1GiB 3,3V 8-bit",	0xD3, 0, 1024, 0, LP_OPTIONS},
+	{"NAND 1GiB 1,8V 16-bit",	0xB3, 0, 1024, 0, LP_OPTIONS16},
+	{"NAND 1GiB 3,3V 16-bit",	0xC3, 0, 1024, 0, LP_OPTIONS16},
 
 	/* 16 Gigabit */
-	{"NAND 2GiB 1,8V 8-bit", 	0xA5, 0, 2048, 0, NAND_SAMSUNG_LP_OPTIONS | NAND_NO_AUTOINCR},
-	{"NAND 2GiB 3,3V 8-bit", 	0xD5, 0, 2048, 0, NAND_SAMSUNG_LP_OPTIONS | NAND_NO_AUTOINCR},
-	{"NAND 2GiB 1,8V 16-bit", 	0xB5, 0, 2048, 0, NAND_SAMSUNG_LP_OPTIONS | NAND_BUSWIDTH_16 | NAND_NO_AUTOINCR},
-	{"NAND 2GiB 3,3V 16-bit", 	0xC5, 0, 2048, 0, NAND_SAMSUNG_LP_OPTIONS | NAND_BUSWIDTH_16 | NAND_NO_AUTOINCR},
-
-	/* Renesas AND 1 Gigabit. Those chips do not support extended id and have a strange page/block layout !
-	 * The chosen minimum erasesize is 4 * 2 * 2048 = 16384 Byte, as those chips have an array of 4 page planes
-	 * 1 block = 2 pages, but due to plane arrangement the blocks 0-3 consists of page 0 + 4,1 + 5, 2 + 6, 3 + 7
-	 * Anyway JFFS2 would increase the eraseblock size so we chose a combined one which can be erased in one go
-	 * There are more speed improvements for reads and writes possible, but not implemented now
+	{"NAND 2GiB 1,8V 8-bit",	0xA5, 0, 2048, 0, LP_OPTIONS},
+	{"NAND 2GiB 3,3V 8-bit",	0xD5, 0, 2048, 0, LP_OPTIONS},
+	{"NAND 2GiB 1,8V 16-bit",	0xB5, 0, 2048, 0, LP_OPTIONS16},
+	{"NAND 2GiB 3,3V 16-bit",	0xC5, 0, 2048, 0, LP_OPTIONS16},
+
+	/*
+	 * Renesas AND 1 Gigabit. Those chips do not support extended id and
+	 * have a strange page/block layout !  The chosen minimum erasesize is
+	 * 4 * 2 * 2048 = 16384 Byte, as those chips have an array of 4 page
+	 * planes 1 block = 2 pages, but due to plane arrangement the blocks
+	 * 0-3 consists of page 0 + 4,1 + 5, 2 + 6, 3 + 7 Anyway JFFS2 would
+	 * increase the eraseblock size so we chose a combined one which can be
+	 * erased in one go There are more speed improvements for reads and
+	 * writes possible, but not implemented now
 	 */
-	{"AND 128MiB 3,3V 8-bit",	0x01, 2048, 128, 0x4000, NAND_IS_AND | NAND_NO_AUTOINCR | NAND_4PAGE_ARRAY | BBT_AUTO_REFRESH},
+	{"AND 128MiB 3,3V 8-bit",	0x01, 2048, 128, 0x4000,
+	 NAND_IS_AND | NAND_NO_AUTOINCR |NAND_NO_READRDY | NAND_4PAGE_ARRAY |
+	 BBT_AUTO_REFRESH
+	},
 
 	{NULL,}
 };
diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h
index 2c0fb6380461..2fd85d55803d 100644
--- a/include/linux/mtd/nand.h
+++ b/include/linux/mtd/nand.h
@@ -159,6 +159,10 @@ typedef enum {
  * bits from adjacent blocks from 'leaking' in altering data.
  * This happens with the Renesas AG-AND chips, possibly others.  */
 #define BBT_AUTO_REFRESH	0x00000080
+/* Chip does not require ready check on read. True
+ * for all large page devices, as they do not support
+ * autoincrement.*/
+#define NAND_NO_READRDY		0x00000100
 
 /* Options valid for Samsung large page devices */
 #define NAND_SAMSUNG_LP_OPTIONS \
-- 
cgit v1.2.3


From 7fac464868ec5d80019fa549b8b4516dd1dc9d5c Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@cruncher.tec.linutronix.de>
Date: Thu, 25 May 2006 09:57:31 +0200
Subject: [MTD] Add ECC statistics to struct mtd_info

FLASH - especially NAND FLASH - will become less reliable
and bit flips more likely. Add an ECC statistics struct
to struct mtd_info to keep track of this.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/mtd/mtd.h | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h
index b8ad634391db..41a984dcb139 100644
--- a/include/linux/mtd/mtd.h
+++ b/include/linux/mtd/mtd.h
@@ -56,6 +56,17 @@ struct mtd_erase_region_info {
 	u_int32_t numblocks;		/* Number of blocks of erasesize in this region */
 };
 
+/**
+ * struct mtd_ecc_stats - error correction status
+ *
+ * @corrected:	number of corrected bits
+ * @failed:	number of uncorrectable errors
+ */
+struct mtd_ecc_stats {
+	unsigned long corrected;
+	unsigned long failed;
+};
+
 struct mtd_info {
 	u_char type;
 	u_int32_t flags;
@@ -153,6 +164,9 @@ struct mtd_info {
 
 	struct notifier_block reboot_notifier;  /* default mode before reboot */
 
+	/* ECC status information */
+	struct mtd_ecc_stats ecc_stats;
+
 	void *priv;
 
 	struct module *owner;
-- 
cgit v1.2.3


From 9577f44a899cf4acb9e381c8946307b72153cd15 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@cruncher.tec.linutronix.de>
Date: Thu, 25 May 2006 10:04:31 +0200
Subject: [MTD] NAND Add read/write function pointers to struct nand_ecc_ctrl

Add read/write function pointers to struct nand_ecc_ctrl to
prepare the modulaization of nand_read/write functions. The
current implementation handles every type of ecc mode
software/hardware and all kinds of strange ecc placement
schemes in one switch/if construct. Thats too complex to
maintain and too inflexible to expand. Modularization will
also shorten the code pathes of the read/write functions.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/mtd/nand.h | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h
index 2fd85d55803d..daacde5132fe 100644
--- a/include/linux/mtd/nand.h
+++ b/include/linux/mtd/nand.h
@@ -233,16 +233,23 @@ struct nand_hw_control {
  * @steps:	number of ecc steps per page
  * @size:	data bytes per ecc step
  * @bytes:	ecc bytes per step
+ * @total:	total number of ecc bytes per page
+ * @prepad:	padding information for syndrome based ecc generators
+ * @postpad:	padding information for syndrome based ecc generators
  * @hwctl:	function to control hardware ecc generator. Must only
  *		be provided if an hardware ECC is available
  * @calculate:	function for ecc calculation or readback from ecc hardware
  * @correct:	function for ecc correction, matching to ecc generator (sw/hw)
+ * @write_page:	function to write a page according to the ecc generator requirements
  */
 struct nand_ecc_ctrl {
 	nand_ecc_modes_t	mode;
 	int			steps;
 	int			size;
 	int			bytes;
+	int			total;
+	int			prepad;
+	int			postpad;
 	void			(*hwctl)(struct mtd_info *mtd, int mode);
 	int			(*calculate)(struct mtd_info *mtd,
 					     const uint8_t *dat,
@@ -250,6 +257,12 @@ struct nand_ecc_ctrl {
 	int			(*correct)(struct mtd_info *mtd, uint8_t *dat,
 					   uint8_t *read_ecc,
 					   uint8_t *calc_ecc);
+	int			(*read_page)(struct mtd_info *mtd,
+					     struct nand_chip *chip,
+					     uint8_t *buf);
+	int			(*write_page)(struct mtd_info *mtd,
+					      struct nand_chip *chip,
+					      uint8_t *buf, int cached);
 };
 
 /**
-- 
cgit v1.2.3


From f5bbdacc41939f89d8ccb18dd79cd9b21c0cb75d Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@cruncher.tec.linutronix.de>
Date: Thu, 25 May 2006 10:07:16 +0200
Subject: [MTD] NAND Modularize read function

Split the core of the read function out and implement
seperate handling functions for software and hardware
ECC.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 drivers/mtd/nand/diskonchip.c |   4 +-
 drivers/mtd/nand/nand_base.c  | 448 +++++++++++++++++++++---------------------
 drivers/mtd/nand/rtc_from4.c  |  62 +++---
 include/linux/mtd/nand.h      |  16 +-
 4 files changed, 275 insertions(+), 255 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/nand/diskonchip.c b/drivers/mtd/nand/diskonchip.c
index 2ec9080e2b14..83af6f05cd00 100644
--- a/drivers/mtd/nand/diskonchip.c
+++ b/drivers/mtd/nand/diskonchip.c
@@ -968,12 +968,14 @@ static int doc200x_calculate_ecc(struct mtd_info *mtd, const u_char *dat, unsign
 	return 0;
 }
 
-static int doc200x_correct_data(struct mtd_info *mtd, u_char *dat, u_char *read_ecc, u_char *calc_ecc)
+static int doc200x_correct_data(struct mtd_info *mtd, u_char *dat,
+				u_char *read_ecc, u_char *isnull)
 {
 	int i, ret = 0;
 	struct nand_chip *this = mtd->priv;
 	struct doc_priv *doc = this->priv;
 	void __iomem *docptr = doc->virtadr;
+	uint8_t calc_ecc[6];
 	volatile u_char dummy;
 	int emptymatch = 1;
 
diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c
index 49bca242610b..21fce2bce4b2 100644
--- a/drivers/mtd/nand/nand_base.c
+++ b/drivers/mtd/nand/nand_base.c
@@ -976,256 +976,224 @@ static int nand_verify_pages(struct mtd_info *mtd, struct nand_chip *chip, int p
 #endif
 
 /**
- * nand_read - [MTD Interface] MTD compability function for nand_do_read_ecc
- * @mtd:	MTD device structure
- * @from:	offset to read from
- * @len:	number of bytes to read
- * @retlen:	pointer to variable to store the number of read bytes
- * @buf:	the databuffer to put data
- *
- * This function simply calls nand_do_read_ecc with oob buffer and oobsel = NULL
- * and flags = 0xff
+ * nand_read_page_swecc - {REPLACABLE] software ecc based page read function
+ * @mtd:	mtd info structure
+ * @chip:	nand chip info structure
+ * @buf:	buffer to store read data
  */
-static int nand_read(struct mtd_info *mtd, loff_t from, size_t len, size_t *retlen, uint8_t *buf)
+static int nand_read_page_swecc(struct mtd_info *mtd, struct nand_chip *chip,
+				uint8_t *buf)
 {
-	return nand_do_read_ecc(mtd, from, len, retlen, buf, NULL, &mtd->oobinfo, 0xff);
+	int i, eccsize = chip->ecc.size;
+	int eccbytes = chip->ecc.bytes;
+	int eccsteps = chip->ecc.steps;
+	uint8_t *p = buf;
+	uint8_t *ecc_calc = chip->oob_buf + mtd->oobsize;
+	uint8_t *ecc_code = ecc_calc + mtd->oobsize;
+	int *eccpos = chip->autooob->eccpos;
+
+	chip->read_buf(mtd, buf, mtd->writesize);
+	chip->read_buf(mtd, chip->oob_buf, mtd->oobsize);
+
+	if (chip->ecc.mode == NAND_ECC_NONE)
+		return 0;
+
+	for (i = 0; eccsteps; eccsteps--, i += eccbytes, p += eccsize)
+		chip->ecc.calculate(mtd, p, &ecc_calc[i]);
+
+	for (i = 0; i < chip->ecc.total; i++)
+		ecc_code[i] = chip->oob_buf[eccpos[i]];
+
+	eccsteps = chip->ecc.steps;
+	p = buf;
+
+	for (i = 0 ; eccsteps; eccsteps--, i += eccbytes, p += eccsize) {
+		int stat;
+
+		stat = chip->ecc.correct(mtd, p, &ecc_code[i], &ecc_calc[i]);
+		if (stat == -1)
+			mtd->ecc_stats.failed++;
+		else
+			mtd->ecc_stats.corrected += stat;
+	}
+	return 0;
 }
 
 /**
- * nand_do_read_ecc - [MTD Interface] Read data with ECC
- * @mtd:	MTD device structure
- * @from:	offset to read from
- * @len:	number of bytes to read
- * @retlen:	pointer to variable to store the number of read bytes
- * @buf:	the databuffer to put data
- * @oob_buf:	filesystem supplied oob data buffer (can be NULL)
- * @oobsel:	oob selection structure
- * @flags:	flag to indicate if nand_get_device/nand_release_device should be preformed
- *		and how many corrected error bits are acceptable:
- *		  bits 0..7 - number of tolerable errors
- *		  bit  8    - 0 == do not get/release chip, 1 == get/release chip
+ * nand_read_page_hwecc - {REPLACABLE] hardware ecc based page read function
+ * @mtd:	mtd info structure
+ * @chip:	nand chip info structure
+ * @buf:	buffer to store read data
  *
- * NAND read with ECC
+ * Not for syndrome calculating ecc controllers which need a special oob layout
  */
-int nand_do_read_ecc(struct mtd_info *mtd, loff_t from, size_t len,
-		     size_t *retlen, uint8_t *buf, uint8_t *oob_buf, struct nand_oobinfo *oobsel, int flags)
+static int nand_read_page_hwecc(struct mtd_info *mtd, struct nand_chip *chip,
+				uint8_t *buf)
 {
-
-	int i, j, col, realpage, page, end, ecc, chipnr, sndcmd = 1;
-	int read = 0, oob = 0, ecc_status = 0, ecc_failed = 0;
-	struct nand_chip *chip = mtd->priv;
-	uint8_t *data_poi, *oob_data = oob_buf;
-	uint8_t ecc_calc[32];
-	uint8_t ecc_code[32];
-	int eccmode, eccsteps;
-	int *oob_config, datidx;
-	int blockcheck = (1 << (chip->phys_erase_shift - chip->page_shift)) - 1;
-	int eccbytes;
-	int compareecc = 1;
-	int oobreadlen;
-
-	DEBUG(MTD_DEBUG_LEVEL3, "nand_read_ecc: from = 0x%08x, len = %i\n", (unsigned int)from, (int)len);
-
-	/* Do not allow reads past end of device */
-	if ((from + len) > mtd->size) {
-		DEBUG(MTD_DEBUG_LEVEL0, "nand_read_ecc: Attempt read beyond end of device\n");
-		*retlen = 0;
-		return -EINVAL;
+	int i, eccsize = chip->ecc.size;
+	int eccbytes = chip->ecc.bytes;
+	int eccsteps = chip->ecc.steps;
+	uint8_t *p = buf;
+	uint8_t *ecc_calc = chip->oob_buf + mtd->oobsize;
+	uint8_t *ecc_code = ecc_calc + mtd->oobsize;
+	int *eccpos = chip->autooob->eccpos;
+
+	for (i = 0; eccsteps; eccsteps--, i += eccbytes, p += eccsize) {
+		chip->ecc.hwctl(mtd, NAND_ECC_READ);
+		chip->read_buf(mtd, p, eccsize);
+		chip->ecc.calculate(mtd, p, &ecc_calc[i]);
 	}
+	chip->read_buf(mtd, chip->oob_buf, mtd->oobsize);
 
-	/* Grab the lock and see if the device is available */
-	if (flags & NAND_GET_DEVICE)
-		nand_get_device(chip, mtd, FL_READING);
+	for (i = 0; i < chip->ecc.total; i++)
+		ecc_code[i] = chip->oob_buf[eccpos[i]];
 
-	/* Autoplace of oob data ? Use the default placement scheme */
-	if (oobsel->useecc == MTD_NANDECC_AUTOPLACE)
-		oobsel = chip->autooob;
+	eccsteps = chip->ecc.steps;
+	p = buf;
 
-	eccmode = oobsel->useecc ? chip->ecc.mode : NAND_ECC_NONE;
-	oob_config = oobsel->eccpos;
+	for (i = 0 ; eccsteps; eccsteps--, i += eccbytes, p += eccsize) {
+		int stat;
 
-	/* Select the NAND device */
-	chipnr = (int)(from >> chip->chip_shift);
-	chip->select_chip(mtd, chipnr);
-
-	/* First we calculate the starting page */
-	realpage = (int)(from >> chip->page_shift);
-	page = realpage & chip->pagemask;
+		stat = chip->ecc.correct(mtd, p, &ecc_code[i], &ecc_calc[i]);
+		if (stat == -1)
+			mtd->ecc_stats.failed++;
+		else
+			mtd->ecc_stats.corrected += stat;
+	}
+	return 0;
+}
 
-	/* Get raw starting column */
-	col = from & (mtd->writesize - 1);
+/**
+ * nand_read_page_syndrome - {REPLACABLE] hardware ecc syndrom based page read
+ * @mtd:	mtd info structure
+ * @chip:	nand chip info structure
+ * @buf:	buffer to store read data
+ *
+ * The hw generator calculates the error syndrome automatically. Therefor
+ * we need a special oob layout and .
+ */
+static int nand_read_page_syndrome(struct mtd_info *mtd, struct nand_chip *chip,
+				   uint8_t *buf)
+{
+	int i, eccsize = chip->ecc.size;
+	int eccbytes = chip->ecc.bytes;
+	int eccsteps = chip->ecc.steps;
+	uint8_t *p = buf;
+	uint8_t *oob = chip->oob_buf;
 
-	end = mtd->writesize;
-	ecc = chip->ecc.size;
-	eccbytes = chip->ecc.bytes;
+	for (i = 0; eccsteps; eccsteps--, i += eccbytes, p += eccsize) {
+		int stat;
 
-	if ((eccmode == NAND_ECC_NONE) || (chip->options & NAND_HWECC_SYNDROME))
-		compareecc = 0;
+		chip->ecc.hwctl(mtd, NAND_ECC_READ);
+		chip->read_buf(mtd, p, eccsize);
 
-	oobreadlen = mtd->oobsize;
-	if (chip->options & NAND_HWECC_SYNDROME)
-		oobreadlen -= oobsel->eccbytes;
+		if (chip->ecc.prepad) {
+			chip->read_buf(mtd, oob, chip->ecc.prepad);
+			oob += chip->ecc.prepad;
+		}
 
-	/* Loop until all data read */
-	while (read < len) {
+		chip->ecc.hwctl(mtd, NAND_ECC_READSYN);
+		chip->read_buf(mtd, oob, eccbytes);
+		stat = chip->ecc.correct(mtd, p, oob, NULL);
 
-		int aligned = (!col && (len - read) >= end);
-		/*
-		 * If the read is not page aligned, we have to read into data buffer
-		 * due to ecc, else we read into return buffer direct
-		 */
-		if (aligned)
-			data_poi = &buf[read];
+		if (stat == -1)
+			mtd->ecc_stats.failed++;
 		else
-			data_poi = chip->data_buf;
+			mtd->ecc_stats.corrected += stat;
 
-		/* Check, if we have this page in the buffer
-		 *
-		 * FIXME: Make it work when we must provide oob data too,
-		 * check the usage of data_buf oob field
-		 */
-		if (realpage == chip->pagebuf && !oob_buf) {
-			/* aligned read ? */
-			if (aligned)
-				memcpy(data_poi, chip->data_buf, end);
-			goto readdata;
-		}
+		oob += eccbytes;
 
-		/* Check, if we must send the read command */
-		if (sndcmd) {
-			chip->cmdfunc(mtd, NAND_CMD_READ0, 0x00, page);
-			sndcmd = 0;
+		if (chip->ecc.postpad) {
+			chip->read_buf(mtd, oob, chip->ecc.postpad);
+			oob += chip->ecc.postpad;
 		}
+	}
 
-		/* get oob area, if we have no oob buffer from fs-driver */
-		if (!oob_buf || oobsel->useecc == MTD_NANDECC_AUTOPLACE ||
-			oobsel->useecc == MTD_NANDECC_AUTOPL_USR)
-			oob_data = &chip->data_buf[end];
-
-		eccsteps = chip->ecc.steps;
-
-		switch (eccmode) {
-		case NAND_ECC_NONE:{
-				/* No ECC, Read in a page */
-				static unsigned long lastwhinge = 0;
-				if ((lastwhinge / HZ) != (jiffies / HZ)) {
-					printk(KERN_WARNING
-					       "Reading data from NAND FLASH without ECC is not recommended\n");
-					lastwhinge = jiffies;
-				}
-				chip->read_buf(mtd, data_poi, end);
-				break;
-			}
+	/* Calculate remaining oob bytes */
+	i = oob - chip->oob_buf;
+	if (i)
+		chip->read_buf(mtd, oob, i);
 
-		case NAND_ECC_SOFT:	/* Software ECC 3/256: Read in a page + oob data */
-			chip->read_buf(mtd, data_poi, end);
-			for (i = 0, datidx = 0; eccsteps; eccsteps--, i += 3, datidx += ecc)
-				chip->ecc.calculate(mtd, &data_poi[datidx], &ecc_calc[i]);
-			break;
+	return 0;
+}
 
-		default:
-			for (i = 0, datidx = 0; eccsteps; eccsteps--, i += eccbytes, datidx += ecc) {
-				chip->ecc.hwctl(mtd, NAND_ECC_READ);
-				chip->read_buf(mtd, &data_poi[datidx], ecc);
-
-				/* HW ecc with syndrome calculation must read the
-				 * syndrome from flash immidiately after the data */
-				if (!compareecc) {
-					/* Some hw ecc generators need to know when the
-					 * syndrome is read from flash */
-					chip->ecc.hwctl(mtd, NAND_ECC_READSYN);
-					chip->read_buf(mtd, &oob_data[i], eccbytes);
-					/* We calc error correction directly, it checks the hw
-					 * generator for an error, reads back the syndrome and
-					 * does the error correction on the fly */
-					ecc_status = chip->ecc.correct(mtd, &data_poi[datidx], &oob_data[i], &ecc_code[i]);
-					if ((ecc_status == -1) || (ecc_status > (flags && 0xff))) {
-						DEBUG(MTD_DEBUG_LEVEL0, "nand_read_ecc: "
-						      "Failed ECC read, page 0x%08x on chip %d\n", page, chipnr);
-						ecc_failed++;
-					}
-				} else {
-					chip->ecc.calculate(mtd, &data_poi[datidx], &ecc_calc[i]);
-				}
-			}
-			break;
-		}
+/**
+ * nand_do_read - [Internal] Read data with ECC
+ *
+ * @mtd:	MTD device structure
+ * @from:	offset to read from
+ * @len:	number of bytes to read
+ * @retlen:	pointer to variable to store the number of read bytes
+ * @buf:	the databuffer to put data
+ *
+ * Internal function. Called with chip held.
+ */
+int nand_do_read(struct mtd_info *mtd, loff_t from, size_t len,
+		 size_t *retlen, uint8_t *buf)
+{
+	int chipnr, page, realpage, col, bytes, aligned;
+	struct nand_chip *chip = mtd->priv;
+	struct mtd_ecc_stats stats;
+	int blkcheck = (1 << (chip->phys_erase_shift - chip->page_shift)) - 1;
+	int sndcmd = 1;
+	int ret = 0;
+	uint32_t readlen = len;
+	uint8_t *bufpoi;
 
-		/* read oobdata */
-		chip->read_buf(mtd, &oob_data[mtd->oobsize - oobreadlen], oobreadlen);
+	stats = mtd->ecc_stats;
 
-		/* Skip ECC check, if not requested (ECC_NONE or HW_ECC with syndromes) */
-		if (!compareecc)
-			goto readoob;
+	chipnr = (int)(from >> chip->chip_shift);
+	chip->select_chip(mtd, chipnr);
 
-		/* Pick the ECC bytes out of the oob data */
-		for (j = 0; j < oobsel->eccbytes; j++)
-			ecc_code[j] = oob_data[oob_config[j]];
+	realpage = (int)(from >> chip->page_shift);
+	page = realpage & chip->pagemask;
 
-		/* correct data, if necessary */
-		for (i = 0, j = 0, datidx = 0; i < chip->ecc.steps; i++, datidx += ecc) {
-			ecc_status = chip->ecc.correct(mtd, &data_poi[datidx], &ecc_code[j], &ecc_calc[j]);
+	col = (int)(from & (mtd->writesize - 1));
 
-			/* Get next chunk of ecc bytes */
-			j += eccbytes;
+	while(1) {
+		bytes = min(mtd->writesize - col, readlen);
+		aligned = (bytes == mtd->writesize);
 
-			/* Check, if we have a fs supplied oob-buffer,
-			 * This is the legacy mode. Used by YAFFS1
-			 * Should go away some day
-			 */
-			if (oob_buf && oobsel->useecc == MTD_NANDECC_PLACE) {
-				int *p = (int *)(&oob_data[mtd->oobsize]);
-				p[i] = ecc_status;
-			}
+		/* Is the current page in the buffer ? */
+		if (realpage != chip->pagebuf) {
+			bufpoi = aligned ? buf : chip->data_buf;
 
-			if ((ecc_status == -1) || (ecc_status > (flags && 0xff))) {
-				DEBUG(MTD_DEBUG_LEVEL0, "nand_read_ecc: " "Failed ECC read, page 0x%08x\n", page);
-				ecc_failed++;
+			if (likely(sndcmd)) {
+				chip->cmdfunc(mtd, NAND_CMD_READ0, 0x00, page);
+				sndcmd = 0;
 			}
-		}
 
-	      readoob:
-		/* check, if we have a fs supplied oob-buffer */
-		if (oob_buf) {
-			/* without autoplace. Legacy mode used by YAFFS1 */
-			switch (oobsel->useecc) {
-			case MTD_NANDECC_AUTOPLACE:
-			case MTD_NANDECC_AUTOPL_USR:
-				/* Walk through the autoplace chunks */
-				for (i = 0; oobsel->oobfree[i][1]; i++) {
-					int from = oobsel->oobfree[i][0];
-					int num = oobsel->oobfree[i][1];
-					memcpy(&oob_buf[oob], &oob_data[from], num);
-					oob += num;
-				}
+			/* Now read the page into the buffer */
+			ret = chip->ecc.read_page(mtd, chip, bufpoi);
+			if (ret < 0)
 				break;
-			case MTD_NANDECC_PLACE:
-				/* YAFFS1 legacy mode */
-				oob_data += chip->ecc.steps * sizeof(int);
-			default:
-				oob_data += mtd->oobsize;
+
+			/* Transfer not aligned data */
+			if (!aligned) {
+				chip->pagebuf = realpage;
+				memcpy(buf, chip->data_buf + col, bytes);
+			}
+
+			if (!(chip->options & NAND_NO_READRDY)) {
+				/*
+				 * Apply delay or wait for ready/busy pin. Do
+				 * this before the AUTOINCR check, so no
+				 * problems arise if a chip which does auto
+				 * increment is marked as NOAUTOINCR by the
+				 * board driver.
+				 */
+				if (!chip->dev_ready)
+					udelay(chip->chip_delay);
+				else
+					nand_wait_ready(mtd);
 			}
-		}
-	readdata:
-		/* Partial page read, transfer data into fs buffer */
-		if (!aligned) {
-			for (j = col; j < end && read < len; j++)
-				buf[read++] = data_poi[j];
-			chip->pagebuf = realpage;
 		} else
-			read += mtd->writesize;
+			memcpy(buf, chip->data_buf + col, bytes);
 
-		/* Apply delay or wait for ready/busy pin
-		 * Do this before the AUTOINCR check, so no problems
-		 * arise if a chip which does auto increment
-		 * is marked as NOAUTOINCR by the board driver.
-		 */
-		if (!chip->dev_ready)
-			udelay(chip->chip_delay);
-		else
-			nand_wait_ready(mtd);
+		buf += bytes;
+		readlen -= bytes;
 
-		if (read == len)
+		if (!readlen)
 			break;
 
 		/* For subsequent reads align to page boundary. */
@@ -1240,24 +1208,51 @@ int nand_do_read_ecc(struct mtd_info *mtd, loff_t from, size_t len,
 			chip->select_chip(mtd, -1);
 			chip->select_chip(mtd, chipnr);
 		}
+
 		/* Check, if the chip supports auto page increment
 		 * or if we have hit a block boundary.
 		 */
-		if (!NAND_CANAUTOINCR(chip) || !(page & blockcheck))
+		if (!NAND_CANAUTOINCR(chip) || !(page & blkcheck))
 			sndcmd = 1;
 	}
 
-	/* Deselect and wake up anyone waiting on the device */
-	if (flags & NAND_GET_DEVICE)
-		nand_release_device(mtd);
+	*retlen = len - (size_t) readlen;
 
-	/*
-	 * Return success, if no ECC failures, else -EBADMSG
-	 * fs driver will take care of that, because
-	 * retlen == desired len and result == -EBADMSG
-	 */
-	*retlen = read;
-	return ecc_failed ? -EBADMSG : 0;
+	if (ret)
+		return ret;
+
+	return mtd->ecc_stats.failed - stats.failed ? -EBADMSG : 0;
+}
+
+/**
+ * nand_read - [MTD Interface] MTD compability function for nand_do_read_ecc
+ * @mtd:	MTD device structure
+ * @from:	offset to read from
+ * @len:	number of bytes to read
+ * @retlen:	pointer to variable to store the number of read bytes
+ * @buf:	the databuffer to put data
+ *
+ * Get hold of the chip and call nand_do_read
+ */
+static int nand_read(struct mtd_info *mtd, loff_t from, size_t len,
+		     size_t *retlen, uint8_t *buf)
+{
+	int ret;
+
+	*retlen = 0;
+	/* Do not allow reads past end of device */
+	if ((from + len) > mtd->size)
+		return -EINVAL;
+	if (!len)
+		return 0;
+
+	nand_get_device(mtd->priv, mtd, FL_READING);
+
+	ret = nand_do_read(mtd, from, len, retlen, buf);
+
+	nand_release_device(mtd);
+
+	return ret;
 }
 
 /**
@@ -2417,6 +2412,10 @@ int nand_scan(struct mtd_info *mtd, int maxchips)
 	 */
 	switch (chip->ecc.mode) {
 	case NAND_ECC_HW:
+		/* Use standard hwecc read page function ? */
+		if (!chip->ecc.read_page)
+			chip->ecc.read_page = nand_read_page_hwecc;
+
 	case NAND_ECC_HW_SYNDROME:
 		if (!chip->ecc.calculate || !chip->ecc.correct ||
 		    !chip->ecc.hwctl) {
@@ -2424,6 +2423,10 @@ int nand_scan(struct mtd_info *mtd, int maxchips)
 			       "Hardware ECC not possible\n");
 			BUG();
 		}
+		/* Use standard syndrome read page function ? */
+		if (!chip->ecc.read_page)
+			chip->ecc.read_page = nand_read_page_syndrome;
+
 		if (mtd->writesize >= chip->ecc.size)
 			break;
 		printk(KERN_WARNING "%d byte HW ECC not possible on "
@@ -2434,6 +2437,7 @@ int nand_scan(struct mtd_info *mtd, int maxchips)
 	case NAND_ECC_SOFT:
 		chip->ecc.calculate = nand_calculate_ecc;
 		chip->ecc.correct = nand_correct_data;
+		chip->ecc.read_page = nand_read_page_swecc;
 		chip->ecc.size = 256;
 		chip->ecc.bytes = 3;
 		break;
@@ -2441,6 +2445,7 @@ int nand_scan(struct mtd_info *mtd, int maxchips)
 	case NAND_ECC_NONE:
 		printk(KERN_WARNING "NAND_ECC_NONE selected by board driver. "
 		       "This is not recommended !!\n");
+		chip->ecc.read_page = nand_read_page_swecc;
 		chip->ecc.size = mtd->writesize;
 		chip->ecc.bytes = 0;
 		break;
@@ -2459,6 +2464,7 @@ int nand_scan(struct mtd_info *mtd, int maxchips)
 		printk(KERN_WARNING "Invalid ecc parameters\n");
 		BUG();
 	}
+	chip->ecc.total = chip->ecc.steps * chip->ecc.bytes;
 
 	/* Initialize state */
 	chip->state = FL_READY;
diff --git a/drivers/mtd/nand/rtc_from4.c b/drivers/mtd/nand/rtc_from4.c
index 6c97bfaea19a..b7083104a05b 100644
--- a/drivers/mtd/nand/rtc_from4.c
+++ b/drivers/mtd/nand/rtc_from4.c
@@ -444,7 +444,8 @@ static int rtc_from4_correct_data(struct mtd_info *mtd, const u_char *buf, u_cha
  * note: see pages 34..37 of data sheet for details.
  *
  */
-static int rtc_from4_errstat(struct mtd_info *mtd, struct nand_chip *this, int state, int status, int page)
+static int rtc_from4_errstat(struct mtd_info *mtd, struct nand_chip *this,
+			     int state, int status, int page)
 {
 	int er_stat = 0;
 	int rtn, retlen;
@@ -455,39 +456,50 @@ static int rtc_from4_errstat(struct mtd_info *mtd, struct nand_chip *this, int s
 	this->cmdfunc(mtd, NAND_CMD_STATUS_CLEAR, -1, -1);
 
 	if (state == FL_ERASING) {
+
 		for (i = 0; i < 4; i++) {
-			if (status & 1 << (i + 1)) {
-				this->cmdfunc(mtd, (NAND_CMD_STATUS_ERROR + i + 1), -1, -1);
-				rtn = this->read_byte(mtd);
-				this->cmdfunc(mtd, NAND_CMD_STATUS_RESET, -1, -1);
-				if (!(rtn & ERR_STAT_ECC_AVAILABLE)) {
-					er_stat |= 1 << (i + 1);	/* err_ecc_not_avail */
-				}
-			}
+			if (!(status & 1 << (i + 1)))
+				continue;
+			this->cmdfunc(mtd, (NAND_CMD_STATUS_ERROR + i + 1),
+				      -1, -1);
+			rtn = this->read_byte(mtd);
+			this->cmdfunc(mtd, NAND_CMD_STATUS_RESET, -1, -1);
+
+			/* err_ecc_not_avail */
+			if (!(rtn & ERR_STAT_ECC_AVAILABLE))
+				er_stat |= 1 << (i + 1);
 		}
+
 	} else if (state == FL_WRITING) {
+
+		unsigned long corrected = mtd->ecc_stats.corrected;
+
 		/* single bank write logic */
 		this->cmdfunc(mtd, NAND_CMD_STATUS_ERROR, -1, -1);
 		rtn = this->read_byte(mtd);
 		this->cmdfunc(mtd, NAND_CMD_STATUS_RESET, -1, -1);
+
 		if (!(rtn & ERR_STAT_ECC_AVAILABLE)) {
-			er_stat |= 1 << 1;	/* err_ecc_not_avail */
-		} else {
-			len = mtd->writesize;
-			buf = kmalloc(len, GFP_KERNEL);
-			if (!buf) {
-				printk(KERN_ERR "rtc_from4_errstat: Out of memory!\n");
-				er_stat = 1;	/* if we can't check, assume failed */
-			} else {
-				/* recovery read */
-				/* page read */
-				rtn = nand_do_read_ecc(mtd, page, len, &retlen, buf, NULL, this->autooob, 1);
-				if (rtn) {	/* if read failed or > 1-bit error corrected */
-					er_stat |= 1 << 1;	/* ECC read failed */
-				}
-				kfree(buf);
-			}
+			/* err_ecc_not_avail */
+			er_stat |= 1 << 1;
+			goto out;
 		}
+
+		len = mtd->writesize;
+		buf = kmalloc(len, GFP_KERNEL);
+		if (!buf) {
+			printk(KERN_ERR "rtc_from4_errstat: Out of memory!\n");
+			er_stat = 1;
+			goto out;
+		}
+
+		/* recovery read */
+		rtn = nand_do_read(mtd, page, len, &retlen, buf);
+
+		/* if read failed or > 1-bit error corrected */
+		if (rtn || (mtd->ecc_stats.corrected - corrected) > 1) {
+			er_stat |= 1 << 1;
+		kfree(buf);
 	}
 
 	rtn = status;
diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h
index daacde5132fe..00916498ea55 100644
--- a/include/linux/mtd/nand.h
+++ b/include/linux/mtd/nand.h
@@ -479,14 +479,14 @@ struct nand_bbt_descr {
 /* The maximum number of blocks to scan for a bbt */
 #define NAND_BBT_SCAN_MAXBLOCKS	4
 
-extern int nand_scan_bbt (struct mtd_info *mtd, struct nand_bbt_descr *bd);
-extern int nand_update_bbt (struct mtd_info *mtd, loff_t offs);
-extern int nand_default_bbt (struct mtd_info *mtd);
-extern int nand_isbad_bbt (struct mtd_info *mtd, loff_t offs, int allowbbt);
-extern int nand_erase_nand (struct mtd_info *mtd, struct erase_info *instr, int allowbbt);
-extern int nand_do_read_ecc (struct mtd_info *mtd, loff_t from, size_t len,
-			     size_t * retlen, uint8_t * buf, uint8_t * oob_buf,
-			     struct nand_oobinfo *oobsel, int flags);
+extern int nand_scan_bbt(struct mtd_info *mtd, struct nand_bbt_descr *bd);
+extern int nand_update_bbt(struct mtd_info *mtd, loff_t offs);
+extern int nand_default_bbt(struct mtd_info *mtd);
+extern int nand_isbad_bbt(struct mtd_info *mtd, loff_t offs, int allowbbt);
+extern int nand_erase_nand(struct mtd_info *mtd, struct erase_info *instr,
+			   int allowbbt);
+extern int nand_do_read(struct mtd_info *mtd, loff_t from, size_t len,
+			size_t * retlen, uint8_t * buf);
 
 /*
 * Constants for oob configuration
-- 
cgit v1.2.3


From f75e5097ef298c5a0aa106faa211d1afdc92dc3d Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@cruncher.tec.linutronix.de>
Date: Fri, 26 May 2006 18:52:08 +0200
Subject: [MTD] NAND modularize write function

Modularize the write function and reorganaize the internal buffer
management. Remove obsolete chip options and fixup all affected
users.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 drivers/mtd/nand/diskonchip.c |   2 +-
 drivers/mtd/nand/edb7312.c    |   3 -
 drivers/mtd/nand/nand_base.c  | 775 ++++++++++++++----------------------------
 drivers/mtd/nand/nand_bbt.c   |   2 +-
 drivers/mtd/nand/rtc_from4.c  |   1 -
 drivers/mtd/nand/toto.c       |   2 -
 include/linux/mtd/nand.h      |  71 ++--
 7 files changed, 308 insertions(+), 548 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/nand/diskonchip.c b/drivers/mtd/nand/diskonchip.c
index 83af6f05cd00..82262a4a4208 100644
--- a/drivers/mtd/nand/diskonchip.c
+++ b/drivers/mtd/nand/diskonchip.c
@@ -1666,7 +1666,7 @@ static int __init doc_probe(unsigned long physadr)
 	nand->ecc.mode		= NAND_ECC_HW_SYNDROME;
 	nand->ecc.size		= 512;
 	nand->ecc.bytes		= 6;
-	nand->options		= NAND_USE_FLASH_BBT | NAND_HWECC_SYNDROME;
+	nand->options		= NAND_USE_FLASH_BBT;
 
 	doc->physadr		= physadr;
 	doc->virtadr		= virtadr;
diff --git a/drivers/mtd/nand/edb7312.c b/drivers/mtd/nand/edb7312.c
index ba5a2174a408..516c0e5e564c 100644
--- a/drivers/mtd/nand/edb7312.c
+++ b/drivers/mtd/nand/edb7312.c
@@ -198,9 +198,6 @@ static void __exit ep7312_cleanup(void)
 	/* Release resources, unregister device */
 	nand_release(ap7312_mtd);
 
-	/* Free internal data buffer */
-	kfree(this->data_buf);
-
 	/* Free the MTD device structure */
 	kfree(ep7312_mtd);
 }
diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c
index 2b29b47e2af4..cead9fc4f99f 100644
--- a/drivers/mtd/nand/nand_base.c
+++ b/drivers/mtd/nand/nand_base.c
@@ -88,37 +88,8 @@ static uint8_t ffchars[] = {
 	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 };
 
-/*
- * NAND low-level MTD interface functions
- */
-static void nand_write_buf(struct mtd_info *mtd, const uint8_t *buf, int len);
-static void nand_read_buf(struct mtd_info *mtd, uint8_t *buf, int len);
-static int nand_verify_buf(struct mtd_info *mtd, const uint8_t *buf, int len);
-
-static int nand_read(struct mtd_info *mtd, loff_t from, size_t len,
-		     size_t *retlen, uint8_t *buf);
-static int nand_read_oob(struct mtd_info *mtd, loff_t from, size_t len,
-			 size_t *retlen, uint8_t *buf);
-static int nand_write(struct mtd_info *mtd, loff_t to, size_t len,
-		      size_t *retlen, const uint8_t *buf);
 static int nand_write_oob(struct mtd_info *mtd, loff_t to, size_t len,
 			  size_t *retlen, const uint8_t *buf);
-static int nand_erase(struct mtd_info *mtd, struct erase_info *instr);
-static void nand_sync(struct mtd_info *mtd);
-
-/* Some internal functions */
-static int nand_write_page(struct mtd_info *mtd, struct nand_chip *chip,
-			   int page, uint8_t * oob_buf,
-			   struct nand_oobinfo *oobsel, int mode);
-#ifdef CONFIG_MTD_NAND_VERIFY_WRITE
-static int nand_verify_pages(struct mtd_info *mtd, struct nand_chip *chip,
-			     int page, int numpages, uint8_t *oob_buf,
-			     struct nand_oobinfo *oobsel, int chipnr,
-			     int oobmode);
-#else
-#define nand_verify_pages(...) (0)
-#endif
-
 static int nand_get_device(struct nand_chip *chip, struct mtd_info *mtd,
 			   int new_state);
 
@@ -262,7 +233,6 @@ static int nand_verify_buf(struct mtd_info *mtd, const uint8_t *buf, int len)
 	for (i = 0; i < len; i++)
 		if (buf[i] != readb(chip->IO_ADDR_R))
 			return -EFAULT;
-
 	return 0;
 }
 
@@ -766,215 +736,6 @@ static int nand_wait(struct mtd_info *mtd, struct nand_chip *chip, int state)
 	return status;
 }
 
-/**
- * nand_write_page - [GENERIC] write one page
- * @mtd:	MTD device structure
- * @this:	NAND chip structure
- * @page: 	startpage inside the chip, must be called with (page & chip->pagemask)
- * @oob_buf:	out of band data buffer
- * @oobsel:	out of band selecttion structre
- * @cached:	1 = enable cached programming if supported by chip
- *
- * Nand_page_program function is used for write and writev !
- * This function will always program a full page of data
- * If you call it with a non page aligned buffer, you're lost :)
- *
- * Cached programming is not supported yet.
- */
-static int nand_write_page(struct mtd_info *mtd, struct nand_chip *chip, int page,
-			   uint8_t *oob_buf, struct nand_oobinfo *oobsel, int cached)
-{
-	int i, status;
-	uint8_t ecc_code[32];
-	int eccmode = oobsel->useecc ? chip->ecc.mode : NAND_ECC_NONE;
-	int *oob_config = oobsel->eccpos;
-	int datidx = 0, eccidx = 0, eccsteps = chip->ecc.steps;
-	int eccbytes = 0;
-
-	/* FIXME: Enable cached programming */
-	cached = 0;
-
-	/* Send command to begin auto page programming */
-	chip->cmdfunc(mtd, NAND_CMD_SEQIN, 0x00, page);
-
-	/* Write out complete page of data, take care of eccmode */
-	switch (eccmode) {
-		/* No ecc, write all */
-	case NAND_ECC_NONE:
-		printk(KERN_WARNING "Writing data without ECC to NAND-FLASH is not recommended\n");
-		chip->write_buf(mtd, chip->data_poi, mtd->writesize);
-		break;
-
-		/* Software ecc 3/256, write all */
-	case NAND_ECC_SOFT:
-		for (; eccsteps; eccsteps--) {
-			chip->ecc.calculate(mtd, &chip->data_poi[datidx], ecc_code);
-			for (i = 0; i < 3; i++, eccidx++)
-				oob_buf[oob_config[eccidx]] = ecc_code[i];
-			datidx += chip->ecc.size;
-		}
-		chip->write_buf(mtd, chip->data_poi, mtd->writesize);
-		break;
-	default:
-		eccbytes = chip->ecc.bytes;
-		for (; eccsteps; eccsteps--) {
-			/* enable hardware ecc logic for write */
-			chip->ecc.hwctl(mtd, NAND_ECC_WRITE);
-			chip->write_buf(mtd, &chip->data_poi[datidx], chip->ecc.size);
-			chip->ecc.calculate(mtd, &chip->data_poi[datidx], ecc_code);
-			for (i = 0; i < eccbytes; i++, eccidx++)
-				oob_buf[oob_config[eccidx]] = ecc_code[i];
-			/* If the hardware ecc provides syndromes then
-			 * the ecc code must be written immidiately after
-			 * the data bytes (words) */
-			if (chip->options & NAND_HWECC_SYNDROME)
-				chip->write_buf(mtd, ecc_code, eccbytes);
-			datidx += chip->ecc.size;
-		}
-		break;
-	}
-
-	/* Write out OOB data */
-	if (chip->options & NAND_HWECC_SYNDROME)
-		chip->write_buf(mtd, &oob_buf[oobsel->eccbytes], mtd->oobsize - oobsel->eccbytes);
-	else
-		chip->write_buf(mtd, oob_buf, mtd->oobsize);
-
-	/* Send command to actually program the data */
-	chip->cmdfunc(mtd, cached ? NAND_CMD_CACHEDPROG : NAND_CMD_PAGEPROG, -1, -1);
-
-	if (!cached) {
-		/* call wait ready function */
-		status = chip->waitfunc(mtd, chip, FL_WRITING);
-
-		/* See if operation failed and additional status checks are available */
-		if ((status & NAND_STATUS_FAIL) && (chip->errstat)) {
-			status = chip->errstat(mtd, chip, FL_WRITING, status, page);
-		}
-
-		/* See if device thinks it succeeded */
-		if (status & NAND_STATUS_FAIL) {
-			DEBUG(MTD_DEBUG_LEVEL0, "%s: " "Failed write, page 0x%08x, ", __FUNCTION__, page);
-			return -EIO;
-		}
-	} else {
-		/* FIXME: Implement cached programming ! */
-		/* wait until cache is ready */
-		// status = chip->waitfunc (mtd, this, FL_CACHEDRPG);
-	}
-	return 0;
-}
-
-#ifdef CONFIG_MTD_NAND_VERIFY_WRITE
-/**
- * nand_verify_pages - [GENERIC] verify the chip contents after a write
- * @mtd:	MTD device structure
- * @this:	NAND chip structure
- * @page:	startpage inside the chip, must be called with (page & chip->pagemask)
- * @numpages:	number of pages to verify
- * @oob_buf:	out of band data buffer
- * @oobsel:	out of band selecttion structre
- * @chipnr:	number of the current chip
- * @oobmode:	1 = full buffer verify, 0 = ecc only
- *
- * The NAND device assumes that it is always writing to a cleanly erased page.
- * Hence, it performs its internal write verification only on bits that
- * transitioned from 1 to 0. The device does NOT verify the whole page on a
- * byte by byte basis. It is possible that the page was not completely erased
- * or the page is becoming unusable due to wear. The read with ECC would catch
- * the error later when the ECC page check fails, but we would rather catch
- * it early in the page write stage. Better to write no data than invalid data.
- */
-static int nand_verify_pages(struct mtd_info *mtd, struct nand_chip *chip, int page, int numpages,
-			     uint8_t *oob_buf, struct nand_oobinfo *oobsel, int chipnr, int oobmode)
-{
-	int i, j, datidx = 0, oobofs = 0, res = -EIO;
-	int eccsteps = chip->ecc.steps;
-	int hweccbytes;
-	uint8_t oobdata[64];
-
-	hweccbytes = (chip->options & NAND_HWECC_SYNDROME) ? (oobsel->eccbytes / eccsteps) : 0;
-
-	/* Send command to read back the first page */
-	chip->cmdfunc(mtd, NAND_CMD_READ0, 0, page);
-
-	for (;;) {
-		for (j = 0; j < eccsteps; j++) {
-			/* Loop through and verify the data */
-			if (chip->verify_buf(mtd, &chip->data_poi[datidx], mtd->eccsize)) {
-				DEBUG(MTD_DEBUG_LEVEL0, "%s: " "Failed write verify, page 0x%08x ", __FUNCTION__, page);
-				goto out;
-			}
-			datidx += mtd->eccsize;
-			/* Have we a hw generator layout ? */
-			if (!hweccbytes)
-				continue;
-			if (chip->verify_buf(mtd, &chip->oob_buf[oobofs], hweccbytes)) {
-				DEBUG(MTD_DEBUG_LEVEL0, "%s: " "Failed write verify, page 0x%08x ", __FUNCTION__, page);
-				goto out;
-			}
-			oobofs += hweccbytes;
-		}
-
-		/* check, if we must compare all data or if we just have to
-		 * compare the ecc bytes
-		 */
-		if (oobmode) {
-			if (chip->verify_buf(mtd, &oob_buf[oobofs], mtd->oobsize - hweccbytes * eccsteps)) {
-				DEBUG(MTD_DEBUG_LEVEL0, "%s: " "Failed write verify, page 0x%08x ", __FUNCTION__, page);
-				goto out;
-			}
-		} else {
-			/* Read always, else autoincrement fails */
-			chip->read_buf(mtd, oobdata, mtd->oobsize - hweccbytes * eccsteps);
-
-			if (oobsel->useecc != MTD_NANDECC_OFF && !hweccbytes) {
-				int ecccnt = oobsel->eccbytes;
-
-				for (i = 0; i < ecccnt; i++) {
-					int idx = oobsel->eccpos[i];
-					if (oobdata[idx] != oob_buf[oobofs + idx]) {
-						DEBUG(MTD_DEBUG_LEVEL0, "%s: Failed ECC write verify, page 0x%08x, %6i bytes were succesful\n",
-						      __FUNCTION__, page, i);
-						goto out;
-					}
-				}
-			}
-		}
-		oobofs += mtd->oobsize - hweccbytes * eccsteps;
-		page++;
-		numpages--;
-
-		/* Apply delay or wait for ready/busy pin
-		 * Do this before the AUTOINCR check, so no problems
-		 * arise if a chip which does auto increment
-		 * is marked as NOAUTOINCR by the board driver.
-		 * Do this also before returning, so the chip is
-		 * ready for the next command.
-		 */
-		if (!chip->dev_ready)
-			udelay(chip->chip_delay);
-		else
-			nand_wait_ready(mtd);
-
-		/* All done, return happy */
-		if (!numpages)
-			return 0;
-
-		/* Check, if the chip supports auto page increment */
-		if (!NAND_CANAUTOINCR(chip))
-			chip->cmdfunc(mtd, NAND_CMD_READ0, 0x00, page);
-	}
-	/*
-	 * Terminate the read command. We come here in case of an error
-	 * So we must issue a reset command.
-	 */
- out:
-	chip->cmdfunc(mtd, NAND_CMD_RESET, -1, -1);
-	return res;
-}
-#endif
-
 /**
  * nand_read_page_swecc - {REPLACABLE] software ecc based page read function
  * @mtd:	mtd info structure
@@ -988,12 +749,12 @@ static int nand_read_page_swecc(struct mtd_info *mtd, struct nand_chip *chip,
 	int eccbytes = chip->ecc.bytes;
 	int eccsteps = chip->ecc.steps;
 	uint8_t *p = buf;
-	uint8_t *ecc_calc = chip->oob_buf + mtd->oobsize;
-	uint8_t *ecc_code = ecc_calc + mtd->oobsize;
+	uint8_t *ecc_calc = chip->buffers.ecccalc;
+	uint8_t *ecc_code = chip->buffers.ecccode;
 	int *eccpos = chip->autooob->eccpos;
 
 	chip->read_buf(mtd, buf, mtd->writesize);
-	chip->read_buf(mtd, chip->oob_buf, mtd->oobsize);
+	chip->read_buf(mtd, chip->oob_poi, mtd->oobsize);
 
 	if (chip->ecc.mode == NAND_ECC_NONE)
 		return 0;
@@ -1002,7 +763,7 @@ static int nand_read_page_swecc(struct mtd_info *mtd, struct nand_chip *chip,
 		chip->ecc.calculate(mtd, p, &ecc_calc[i]);
 
 	for (i = 0; i < chip->ecc.total; i++)
-		ecc_code[i] = chip->oob_buf[eccpos[i]];
+		ecc_code[i] = chip->oob_poi[eccpos[i]];
 
 	eccsteps = chip->ecc.steps;
 	p = buf;
@@ -1034,8 +795,8 @@ static int nand_read_page_hwecc(struct mtd_info *mtd, struct nand_chip *chip,
 	int eccbytes = chip->ecc.bytes;
 	int eccsteps = chip->ecc.steps;
 	uint8_t *p = buf;
-	uint8_t *ecc_calc = chip->oob_buf + mtd->oobsize;
-	uint8_t *ecc_code = ecc_calc + mtd->oobsize;
+	uint8_t *ecc_calc = chip->buffers.ecccalc;
+	uint8_t *ecc_code = chip->buffers.ecccode;
 	int *eccpos = chip->autooob->eccpos;
 
 	for (i = 0; eccsteps; eccsteps--, i += eccbytes, p += eccsize) {
@@ -1043,10 +804,10 @@ static int nand_read_page_hwecc(struct mtd_info *mtd, struct nand_chip *chip,
 		chip->read_buf(mtd, p, eccsize);
 		chip->ecc.calculate(mtd, p, &ecc_calc[i]);
 	}
-	chip->read_buf(mtd, chip->oob_buf, mtd->oobsize);
+	chip->read_buf(mtd, chip->oob_poi, mtd->oobsize);
 
 	for (i = 0; i < chip->ecc.total; i++)
-		ecc_code[i] = chip->oob_buf[eccpos[i]];
+		ecc_code[i] = chip->oob_poi[eccpos[i]];
 
 	eccsteps = chip->ecc.steps;
 	p = buf;
@@ -1070,7 +831,7 @@ static int nand_read_page_hwecc(struct mtd_info *mtd, struct nand_chip *chip,
  * @buf:	buffer to store read data
  *
  * The hw generator calculates the error syndrome automatically. Therefor
- * we need a special oob layout and .
+ * we need a special oob layout and handling.
  */
 static int nand_read_page_syndrome(struct mtd_info *mtd, struct nand_chip *chip,
 				   uint8_t *buf)
@@ -1079,7 +840,7 @@ static int nand_read_page_syndrome(struct mtd_info *mtd, struct nand_chip *chip,
 	int eccbytes = chip->ecc.bytes;
 	int eccsteps = chip->ecc.steps;
 	uint8_t *p = buf;
-	uint8_t *oob = chip->oob_buf;
+	uint8_t *oob = chip->oob_poi;
 
 	for (i = 0; eccsteps; eccsteps--, i += eccbytes, p += eccsize) {
 		int stat;
@@ -1110,7 +871,7 @@ static int nand_read_page_syndrome(struct mtd_info *mtd, struct nand_chip *chip,
 	}
 
 	/* Calculate remaining oob bytes */
-	i = oob - chip->oob_buf;
+	i = oob - chip->oob_poi;
 	if (i)
 		chip->read_buf(mtd, oob, i);
 
@@ -1149,6 +910,7 @@ int nand_do_read(struct mtd_info *mtd, loff_t from, size_t len,
 	page = realpage & chip->pagemask;
 
 	col = (int)(from & (mtd->writesize - 1));
+	chip->oob_poi = chip->buffers.oobrbuf;
 
 	while(1) {
 		bytes = min(mtd->writesize - col, readlen);
@@ -1156,7 +918,7 @@ int nand_do_read(struct mtd_info *mtd, loff_t from, size_t len,
 
 		/* Is the current page in the buffer ? */
 		if (realpage != chip->pagebuf) {
-			bufpoi = aligned ? buf : chip->data_buf;
+			bufpoi = aligned ? buf : chip->buffers.databuf;
 
 			if (likely(sndcmd)) {
 				chip->cmdfunc(mtd, NAND_CMD_READ0, 0x00, page);
@@ -1171,7 +933,7 @@ int nand_do_read(struct mtd_info *mtd, loff_t from, size_t len,
 			/* Transfer not aligned data */
 			if (!aligned) {
 				chip->pagebuf = realpage;
-				memcpy(buf, chip->data_buf + col, bytes);
+				memcpy(buf, chip->buffers.databuf + col, bytes);
 			}
 
 			if (!(chip->options & NAND_NO_READRDY)) {
@@ -1188,7 +950,7 @@ int nand_do_read(struct mtd_info *mtd, loff_t from, size_t len,
 					nand_wait_ready(mtd);
 			}
 		} else
-			memcpy(buf, chip->data_buf + col, bytes);
+			memcpy(buf, chip->buffers.databuf + col, bytes);
 
 		buf += bytes;
 		readlen -= bytes;
@@ -1392,10 +1154,11 @@ int nand_read_raw(struct mtd_info *mtd, uint8_t *buf, loff_t from, size_t len,
 	blockcheck = (1 << (chip->phys_erase_shift - chip->page_shift)) - 1;
 
 	while (len) {
-		if (sndcmd)
+		if (likely(sndcmd)) {
 			chip->cmdfunc(mtd, NAND_CMD_READ0, 0,
 				      page & chip->pagemask);
-		sndcmd = 0;
+			sndcmd = 0;
+		}
 
 		chip->read_buf(mtd, &buf[cnt], pagesize);
 
@@ -1403,10 +1166,12 @@ int nand_read_raw(struct mtd_info *mtd, uint8_t *buf, loff_t from, size_t len,
 		cnt += pagesize;
 		page++;
 
-		if (!chip->dev_ready)
-			udelay(chip->chip_delay);
-		else
-			nand_wait_ready(mtd);
+		if (!(chip->options & NAND_NO_READRDY)) {
+			if (!chip->dev_ready)
+				udelay(chip->chip_delay);
+			else
+				nand_wait_ready(mtd);
+		}
 
 		/*
 		 * Check, if the chip supports auto page increment or if we
@@ -1422,112 +1187,156 @@ int nand_read_raw(struct mtd_info *mtd, uint8_t *buf, loff_t from, size_t len,
 }
 
 /**
- * nand_write_raw - [GENERIC] Write raw data including oob
- * @mtd:	MTD device structure
- * @buf:	source buffer
- * @to:		offset to write to
- * @len:	number of bytes to write
- * @buf:	source buffer
- * @oob:	oob buffer
- *
- * Write raw data including oob
+ * nand_write_page_swecc - {REPLACABLE] software ecc based page write function
+ * @mtd:	mtd info structure
+ * @chip:	nand chip info structure
+ * @buf:	data buffer
  */
-int nand_write_raw(struct mtd_info *mtd, loff_t to, size_t len, size_t *retlen,
-		   uint8_t *buf, uint8_t *oob)
+static void nand_write_page_swecc(struct mtd_info *mtd, struct nand_chip *chip,
+				  const uint8_t *buf)
 {
-	struct nand_chip *chip = mtd->priv;
-	int page = (int)(to >> chip->page_shift);
-	int chipnr = (int)(to >> chip->chip_shift);
-	int ret;
+	int i, eccsize = chip->ecc.size;
+	int eccbytes = chip->ecc.bytes;
+	int eccsteps = chip->ecc.steps;
+	uint8_t *ecc_calc = chip->buffers.ecccalc;
+	const uint8_t *p = buf;
+	int *eccpos = chip->autooob->eccpos;
 
-	*retlen = 0;
+	if (chip->ecc.mode != NAND_ECC_NONE) {
+		/* Software ecc calculation */
+		for (i = 0; eccsteps; eccsteps--, i += eccbytes, p += eccsize)
+			chip->ecc.calculate(mtd, p, &ecc_calc[i]);
 
-	/* Do not allow writes past end of device */
-	if ((to + len) > mtd->size) {
-		DEBUG(MTD_DEBUG_LEVEL0, "nand_read_raw: Attempt write "
-		      "beyond end of device\n");
-		return -EINVAL;
+		for (i = 0; i < chip->ecc.total; i++)
+			chip->oob_poi[eccpos[i]] = ecc_calc[i];
 	}
 
-	/* Grab the lock and see if the device is available */
-	nand_get_device(chip, mtd, FL_WRITING);
+	chip->write_buf(mtd, buf, mtd->writesize);
+	chip->write_buf(mtd, chip->oob_poi, mtd->oobsize);
+}
 
-	chip->select_chip(mtd, chipnr);
-	chip->data_poi = buf;
+/**
+ * nand_write_page_hwecc - {REPLACABLE] hardware ecc based page write function
+ * @mtd:	mtd info structure
+ * @chip:	nand chip info structure
+ * @buf:	data buffer
+ */
+static void nand_write_page_hwecc(struct mtd_info *mtd, struct nand_chip *chip,
+				  const uint8_t *buf)
+{
+	int i, eccsize = chip->ecc.size;
+	int eccbytes = chip->ecc.bytes;
+	int eccsteps = chip->ecc.steps;
+	uint8_t *ecc_calc = chip->buffers.ecccalc;
+	const uint8_t *p = buf;
+	int *eccpos = chip->autooob->eccpos;
 
-	while (len != *retlen) {
-		ret = nand_write_page(mtd, chip, page, oob, &mtd->oobinfo, 0);
-		if (ret)
-			return ret;
-		page++;
-		*retlen += mtd->writesize;
-		chip->data_poi += mtd->writesize;
-		oob += mtd->oobsize;
+	for (i = 0; eccsteps; eccsteps--, i += eccbytes, p += eccsize) {
+		chip->ecc.hwctl(mtd, NAND_ECC_WRITE);
+		chip->write_buf(mtd, p, mtd->writesize);
+		chip->ecc.calculate(mtd, p, &ecc_calc[i]);
 	}
 
-	/* Deselect and wake up anyone waiting on the device */
-	nand_release_device(mtd);
-	return 0;
+	for (i = 0; i < chip->ecc.total; i++)
+		chip->oob_poi[eccpos[i]] = ecc_calc[i];
+
+	chip->write_buf(mtd, chip->oob_poi, mtd->oobsize);
 }
-EXPORT_SYMBOL_GPL(nand_write_raw);
 
 /**
- * nand_prepare_oobbuf - [GENERIC] Prepare the out of band buffer
- * @mtd:	MTD device structure
- * @fsbuf:	buffer given by fs driver
- * @oobsel:	out of band selection structre
- * @autoplace:	1 = place given buffer into the oob bytes
- * @numpages:	number of pages to prepare
- *
- * Return:
- * 1. Filesystem buffer available and autoplacement is off,
- *    return filesystem buffer
- * 2. No filesystem buffer or autoplace is off, return internal
- *    buffer
- * 3. Filesystem buffer is given and autoplace selected
- *    put data from fs buffer into internal buffer and
- *    retrun internal buffer
- *
- * Note: The internal buffer is filled with 0xff. This must
- * be done only once, when no autoplacement happens
- * Autoplacement sets the buffer dirty flag, which
- * forces the 0xff fill before using the buffer again.
+ * nand_write_page_syndrome - {REPLACABLE] hardware ecc syndrom based page write
+ * @mtd:	mtd info structure
+ * @chip:	nand chip info structure
+ * @buf:	data buffer
  *
-*/
-static uint8_t *nand_prepare_oobbuf(struct mtd_info *mtd, uint8_t *fsbuf, struct nand_oobinfo *oobsel,
-				   int autoplace, int numpages)
+ * The hw generator calculates the error syndrome automatically. Therefor
+ * we need a special oob layout and handling.
+ */
+static void nand_write_page_syndrome(struct mtd_info *mtd,
+				    struct nand_chip *chip, const uint8_t *buf)
 {
-	struct nand_chip *chip = mtd->priv;
-	int i, len, ofs;
+	int i, eccsize = chip->ecc.size;
+	int eccbytes = chip->ecc.bytes;
+	int eccsteps = chip->ecc.steps;
+	const uint8_t *p = buf;
+	uint8_t *oob = chip->oob_poi;
 
-	/* Zero copy fs supplied buffer */
-	if (fsbuf && !autoplace)
-		return fsbuf;
+	for (i = 0; eccsteps; eccsteps--, i += eccbytes, p += eccsize) {
 
-	/* Check, if the buffer must be filled with ff again */
-	if (chip->oobdirty) {
-		memset(chip->oob_buf, 0xff, mtd->oobsize << (chip->phys_erase_shift - chip->page_shift));
-		chip->oobdirty = 0;
-	}
+		chip->ecc.hwctl(mtd, NAND_ECC_WRITE);
+		chip->write_buf(mtd, p, eccsize);
 
-	/* If we have no autoplacement or no fs buffer use the internal one */
-	if (!autoplace || !fsbuf)
-		return chip->oob_buf;
-
-	/* Walk through the pages and place the data */
-	chip->oobdirty = 1;
-	ofs = 0;
-	while (numpages--) {
-		for (i = 0, len = 0; len < mtd->oobavail; i++) {
-			int to = ofs + oobsel->oobfree[i][0];
-			int num = oobsel->oobfree[i][1];
-			memcpy(&chip->oob_buf[to], fsbuf, num);
-			len += num;
-			fsbuf += num;
+		if (chip->ecc.prepad) {
+			chip->write_buf(mtd, oob, chip->ecc.prepad);
+			oob += chip->ecc.prepad;
+		}
+
+		chip->ecc.calculate(mtd, p, oob);
+		chip->write_buf(mtd, oob, eccbytes);
+		oob += eccbytes;
+
+		if (chip->ecc.postpad) {
+			chip->write_buf(mtd, oob, chip->ecc.postpad);
+			oob += chip->ecc.postpad;
 		}
-		ofs += mtd->oobavail;
 	}
-	return chip->oob_buf;
+
+	/* Calculate remaining oob bytes */
+	i = oob - chip->oob_poi;
+	if (i)
+		chip->write_buf(mtd, oob, i);
+}
+
+/**
+ * nand_write_page - [INTERNAL] write one page
+ * @mtd:	MTD device structure
+ * @chip:	NAND chip descriptor
+ * @buf:	the data to write
+ * @page:	page number to write
+ * @cached:	cached programming
+ */
+static int nand_write_page(struct mtd_info *mtd, struct nand_chip *chip,
+			   const uint8_t *buf, int page, int cached)
+{
+	int status;
+
+	chip->cmdfunc(mtd, NAND_CMD_SEQIN, 0x00, page);
+
+	chip->ecc.write_page(mtd, chip, buf);
+
+	/*
+	 * Cached progamming disabled for now, Not sure if its worth the
+	 * trouble. The speed gain is not very impressive. (2.3->2.6Mib/s)
+	 */
+	cached = 0;
+
+	if (!cached || !(chip->options & NAND_CACHEPRG)) {
+
+		chip->cmdfunc(mtd, NAND_CMD_PAGEPROG, -1, -1);
+		status = chip->waitfunc(mtd, chip, FL_WRITING);
+		/*
+		 * See if operation failed and additional status checks are
+		 * available
+		 */
+		if ((status & NAND_STATUS_FAIL) && (chip->errstat))
+			status = chip->errstat(mtd, chip, FL_WRITING, status,
+					       page);
+
+		if (status & NAND_STATUS_FAIL)
+			return -EIO;
+	} else {
+		chip->cmdfunc(mtd, NAND_CMD_CACHEDPROG, -1, -1);
+		status = chip->waitfunc(mtd, chip, FL_WRITING);
+	}
+
+#ifdef CONFIG_MTD_NAND_VERIFY_WRITE
+	/* Send command to read back the data */
+	chip->cmdfunc(mtd, NAND_CMD_READ0, 0, page);
+
+	if (chip->verify_buf(mtd, buf, mtd->writesize))
+		return -EIO;
+#endif
+	return 0;
 }
 
 #define NOTALIGNED(x) (x & (mtd->writesize-1)) != 0
@@ -1545,137 +1354,128 @@ static uint8_t *nand_prepare_oobbuf(struct mtd_info *mtd, uint8_t *fsbuf, struct
 static int nand_write(struct mtd_info *mtd, loff_t to, size_t len,
 			  size_t *retlen, const uint8_t *buf)
 {
-	int startpage, page, ret = -EIO, oob = 0, written = 0, chipnr;
-	int autoplace = 0, numpages, totalpages;
+	int chipnr, realpage, page, blockmask;
 	struct nand_chip *chip = mtd->priv;
-	uint8_t *oobbuf, *bufstart, *eccbuf = NULL;
-	int ppblock = (1 << (chip->phys_erase_shift - chip->page_shift));
-	struct nand_oobinfo *oobsel = &mtd->oobinfo;
-
-	DEBUG(MTD_DEBUG_LEVEL3, "nand_write: to = 0x%08x, len = %i\n", (unsigned int)to, (int)len);
+	uint32_t writelen = len;
+	int bytes = mtd->writesize;
+	int ret = -EIO;
 
-	/* Initialize retlen, in case of early exit */
 	*retlen = 0;
 
 	/* Do not allow write past end of device */
 	if ((to + len) > mtd->size) {
-		DEBUG(MTD_DEBUG_LEVEL0, "nand_write: Attempt to write past end of page\n");
+		DEBUG(MTD_DEBUG_LEVEL0, "nand_write: "
+		      "Attempt to write past end of page\n");
 		return -EINVAL;
 	}
 
 	/* reject writes, which are not page aligned */
 	if (NOTALIGNED(to) || NOTALIGNED(len)) {
-		printk(KERN_NOTICE "nand_write: Attempt to write not page aligned data\n");
+		printk(KERN_NOTICE "nand_write: "
+		       "Attempt to write not page aligned data\n");
 		return -EINVAL;
 	}
 
-	/* Grab the lock and see if the device is available */
-	nand_get_device(chip, mtd, FL_WRITING);
+	if (!len)
+		return 0;
 
-	/* Calculate chipnr */
-	chipnr = (int)(to >> chip->chip_shift);
-	/* Select the NAND device */
-	chip->select_chip(mtd, chipnr);
+	nand_get_device(chip, mtd, FL_WRITING);
 
 	/* Check, if it is write protected */
 	if (nand_check_wp(mtd))
 		goto out;
 
-	/* Autoplace of oob data ? Use the default placement scheme */
-	if (oobsel->useecc == MTD_NANDECC_AUTOPLACE) {
-		oobsel = chip->autooob;
-		autoplace = 1;
-	}
-	if (oobsel->useecc == MTD_NANDECC_AUTOPL_USR)
-		autoplace = 1;
+	chipnr = (int)(to >> chip->chip_shift);
+	chip->select_chip(mtd, chipnr);
 
-	/* Setup variables and oob buffer */
-	totalpages = len >> chip->page_shift;
-	page = (int)(to >> chip->page_shift);
-	/* Invalidate the page cache, if we write to the cached page */
-	if (page <= chip->pagebuf && chip->pagebuf < (page + totalpages))
+	realpage = (int)(to >> chip->page_shift);
+	page = realpage & chip->pagemask;
+	blockmask = (1 << (chip->phys_erase_shift - chip->page_shift)) - 1;
+
+	/* Invalidate the page cache, when we write to the cached page */
+	if (to <= (chip->pagebuf << chip->page_shift) &&
+	    (chip->pagebuf << chip->page_shift) < (to + len))
 		chip->pagebuf = -1;
 
-	/* Set it relative to chip */
-	page &= chip->pagemask;
-	startpage = page;
-	/* Calc number of pages we can write in one go */
-	numpages = min(ppblock - (startpage & (ppblock - 1)), totalpages);
-	oobbuf = nand_prepare_oobbuf(mtd, eccbuf, oobsel, autoplace, numpages);
-	bufstart = (uint8_t *) buf;
-
-	/* Loop until all data is written */
-	while (written < len) {
-
-		chip->data_poi = (uint8_t *) &buf[written];
-		/* Write one page. If this is the last page to write
-		 * or the last page in this block, then use the
-		 * real pageprogram command, else select cached programming
-		 * if supported by the chip.
-		 */
-		ret = nand_write_page(mtd, chip, page, &oobbuf[oob], oobsel, (--numpages > 0));
-		if (ret) {
-			DEBUG(MTD_DEBUG_LEVEL0, "nand_write: write_page failed %d\n", ret);
-			goto out;
-		}
-		/* Next oob page */
-		oob += mtd->oobsize;
-		/* Update written bytes count */
-		written += mtd->writesize;
-		if (written == len)
-			goto cmp;
+	chip->oob_poi = chip->buffers.oobwbuf;
 
-		/* Increment page address */
-		page++;
+	while(1) {
+		int cached = writelen > bytes && page != blockmask;
 
-		/* Have we hit a block boundary ? Then we have to verify and
-		 * if verify is ok, we have to setup the oob buffer for
-		 * the next pages.
-		 */
-		if (!(page & (ppblock - 1))) {
-			int ofs;
-			chip->data_poi = bufstart;
-			ret = nand_verify_pages(mtd, chip, startpage, page - startpage,
-						oobbuf, oobsel, chipnr, (eccbuf != NULL));
-			if (ret) {
-				DEBUG(MTD_DEBUG_LEVEL0, "nand_write: verify_pages failed %d\n", ret);
-				goto out;
-			}
-			*retlen = written;
-
-			ofs = autoplace ? mtd->oobavail : mtd->oobsize;
-			if (eccbuf)
-				eccbuf += (page - startpage) * ofs;
-			totalpages -= page - startpage;
-			numpages = min(totalpages, ppblock);
-			page &= chip->pagemask;
-			startpage = page;
-			oobbuf = nand_prepare_oobbuf(mtd, eccbuf, oobsel, autoplace, numpages);
-			oob = 0;
-			/* Check, if we cross a chip boundary */
-			if (!page) {
-				chipnr++;
-				chip->select_chip(mtd, -1);
-				chip->select_chip(mtd, chipnr);
-			}
+		ret = nand_write_page(mtd, chip, buf, page, cached);
+		if (ret)
+			break;
+
+		writelen -= bytes;
+		if (!writelen)
+			break;
+
+		buf += bytes;
+		realpage++;
+
+		page = realpage & chip->pagemask;
+		/* Check, if we cross a chip boundary */
+		if (!page) {
+			chipnr++;
+			chip->select_chip(mtd, -1);
+			chip->select_chip(mtd, chipnr);
 		}
 	}
-	/* Verify the remaining pages */
- cmp:
-	chip->data_poi = bufstart;
-	ret = nand_verify_pages(mtd, chip, startpage, totalpages, oobbuf, oobsel, chipnr, (eccbuf != NULL));
-	if (!ret)
-		*retlen = written;
-	else
-		DEBUG(MTD_DEBUG_LEVEL0, "nand_write: verify_pages failed %d\n", ret);
-
  out:
-	/* Deselect and wake up anyone waiting on the device */
+	*retlen = len - writelen;
 	nand_release_device(mtd);
-
 	return ret;
 }
 
+/**
+ * nand_write_raw - [GENERIC] Write raw data including oob
+ * @mtd:	MTD device structure
+ * @buf:	source buffer
+ * @to:		offset to write to
+ * @len:	number of bytes to write
+ * @buf:	source buffer
+ * @oob:	oob buffer
+ *
+ * Write raw data including oob
+ */
+int nand_write_raw(struct mtd_info *mtd, loff_t to, size_t len, size_t *retlen,
+		   const uint8_t *buf, uint8_t *oob)
+{
+	struct nand_chip *chip = mtd->priv;
+	int page = (int)(to >> chip->page_shift);
+	int chipnr = (int)(to >> chip->chip_shift);
+	int ret;
+
+	*retlen = 0;
+
+	/* Do not allow writes past end of device */
+	if ((to + len) > mtd->size) {
+		DEBUG(MTD_DEBUG_LEVEL0, "nand_read_raw: Attempt write "
+		      "beyond end of device\n");
+		return -EINVAL;
+	}
+
+	/* Grab the lock and see if the device is available */
+	nand_get_device(chip, mtd, FL_WRITING);
+
+	chip->select_chip(mtd, chipnr);
+	chip->oob_poi = oob;
+
+	while (len != *retlen) {
+		ret = nand_write_page(mtd, chip, buf, page, 0);
+		if (ret)
+			return ret;
+		page++;
+		*retlen += mtd->writesize;
+		buf += mtd->writesize;
+		chip->oob_poi += mtd->oobsize;
+	}
+
+	/* Deselect and wake up anyone waiting on the device */
+	nand_release_device(mtd);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nand_write_raw);
 
 /**
  * nand_write_oob - [MTD Interface] NAND write out-of-band
@@ -2081,64 +1881,6 @@ static void nand_resume(struct mtd_info *mtd)
 		       "in suspended state\n");
 }
 
-/*
- * Free allocated data structures
- */
-static void nand_free_kmem(struct nand_chip *chip)
-{
-	/* Buffer allocated by nand_scan ? */
-	if (chip->options & NAND_OOBBUF_ALLOC)
-		kfree(chip->oob_buf);
-	/* Buffer allocated by nand_scan ? */
-	if (chip->options & NAND_DATABUF_ALLOC)
-		kfree(chip->data_buf);
-	/* Controller allocated by nand_scan ? */
-	if (chip->options & NAND_CONTROLLER_ALLOC)
-		kfree(chip->controller);
-}
-
-/*
- * Allocate buffers and data structures
- */
-static int nand_allocate_kmem(struct mtd_info *mtd, struct nand_chip *chip)
-{
-	size_t len;
-
-	if (!chip->oob_buf) {
-		len = mtd->oobsize <<
-			(chip->phys_erase_shift - chip->page_shift);
-		chip->oob_buf = kmalloc(len, GFP_KERNEL);
-		if (!chip->oob_buf)
-			goto outerr;
-		chip->options |= NAND_OOBBUF_ALLOC;
-	}
-
-	if (!chip->data_buf) {
-		len = mtd->writesize + mtd->oobsize;
-		chip->data_buf = kmalloc(len, GFP_KERNEL);
-		if (!chip->data_buf)
-			goto outerr;
-		chip->options |= NAND_DATABUF_ALLOC;
-	}
-
-	if (!chip->controller) {
-		chip->controller = kzalloc(sizeof(struct nand_hw_control),
-					   GFP_KERNEL);
-		if (!chip->controller)
-			goto outerr;
-
-		spin_lock_init(&chip->controller->lock);
-		init_waitqueue_head(&chip->controller->wq);
-		chip->options |= NAND_CONTROLLER_ALLOC;
-	}
-	return 0;
-
- outerr:
-	printk(KERN_ERR "nand_scan(): Cannot allocate buffers\n");
-	nand_free_kmem(chip);
-	return -ENOMEM;
-}
-
 /*
  * Set default functions
  */
@@ -2174,6 +1916,13 @@ static void nand_set_defaults(struct nand_chip *chip, int busw)
 		chip->verify_buf = busw ? nand_verify_buf16 : nand_verify_buf;
 	if (!chip->scan_bbt)
 		chip->scan_bbt = nand_default_bbt;
+
+	if (!chip->controller) {
+		chip->controller = &chip->hwcontrol;
+		spin_lock_init(&chip->controller->lock);
+		init_waitqueue_head(&chip->controller->wq);
+	}
+
 }
 
 /*
@@ -2321,8 +2070,7 @@ static struct nand_flash_dev *nand_get_flash_type(struct mtd_info *mtd,
  * This fills out all the uninitialized function pointers
  * with the defaults.
  * The flash ID is read and the mtd/chip structures are
- * filled with the appropriate values. Buffers are allocated if
- * they are not provided by the board driver
+ * filled with the appropriate values.
  * The mtd->owner field must be set to the module of the caller
  *
  */
@@ -2369,13 +2117,8 @@ int nand_scan(struct mtd_info *mtd, int maxchips)
 	chip->numchips = i;
 	mtd->size = i * chip->chipsize;
 
-	/* Allocate buffers and data structures */
-	if (nand_allocate_kmem(mtd, chip))
-		return -ENOMEM;
-
-	/* Preset the internal oob buffer */
-	memset(chip->oob_buf, 0xff,
-	       mtd->oobsize << (chip->phys_erase_shift - chip->page_shift));
+	/* Preset the internal oob write buffer */
+	memset(chip->buffers.oobwbuf, 0xff, mtd->oobsize);
 
 	/*
 	 * If no default placement scheme is given, select an appropriate one
@@ -2415,6 +2158,8 @@ int nand_scan(struct mtd_info *mtd, int maxchips)
 		/* Use standard hwecc read page function ? */
 		if (!chip->ecc.read_page)
 			chip->ecc.read_page = nand_read_page_hwecc;
+		if (!chip->ecc.write_page)
+			chip->ecc.write_page = nand_write_page_hwecc;
 
 	case NAND_ECC_HW_SYNDROME:
 		if (!chip->ecc.calculate || !chip->ecc.correct ||
@@ -2423,9 +2168,11 @@ int nand_scan(struct mtd_info *mtd, int maxchips)
 			       "Hardware ECC not possible\n");
 			BUG();
 		}
-		/* Use standard syndrome read page function ? */
+		/* Use standard syndrome read/write page function ? */
 		if (!chip->ecc.read_page)
 			chip->ecc.read_page = nand_read_page_syndrome;
+		if (!chip->ecc.write_page)
+			chip->ecc.write_page = nand_write_page_syndrome;
 
 		if (mtd->writesize >= chip->ecc.size)
 			break;
@@ -2438,6 +2185,7 @@ int nand_scan(struct mtd_info *mtd, int maxchips)
 		chip->ecc.calculate = nand_calculate_ecc;
 		chip->ecc.correct = nand_correct_data;
 		chip->ecc.read_page = nand_read_page_swecc;
+		chip->ecc.write_page = nand_write_page_swecc;
 		chip->ecc.size = 256;
 		chip->ecc.bytes = 3;
 		break;
@@ -2446,6 +2194,7 @@ int nand_scan(struct mtd_info *mtd, int maxchips)
 		printk(KERN_WARNING "NAND_ECC_NONE selected by board driver. "
 		       "This is not recommended !!\n");
 		chip->ecc.read_page = nand_read_page_swecc;
+		chip->ecc.write_page = nand_write_page_swecc;
 		chip->ecc.size = mtd->writesize;
 		chip->ecc.bytes = 0;
 		break;
@@ -2522,8 +2271,6 @@ void nand_release(struct mtd_info *mtd)
 
 	/* Free bad block table memory */
 	kfree(chip->bbt);
-	/* Free buffers */
-	nand_free_kmem(chip);
 }
 
 EXPORT_SYMBOL_GPL(nand_scan);
diff --git a/drivers/mtd/nand/nand_bbt.c b/drivers/mtd/nand/nand_bbt.c
index ecaaca18d1e0..40f99304df76 100644
--- a/drivers/mtd/nand/nand_bbt.c
+++ b/drivers/mtd/nand/nand_bbt.c
@@ -666,7 +666,7 @@ static inline int nand_memory_bbt(struct mtd_info *mtd, struct nand_bbt_descr *b
 	struct nand_chip *this = mtd->priv;
 
 	bd->options &= ~NAND_BBT_SCANEMPTY;
-	return create_bbt(mtd, this->data_buf, bd, -1);
+	return create_bbt(mtd, this->buffers.databuf, bd, -1);
 }
 
 /**
diff --git a/drivers/mtd/nand/rtc_from4.c b/drivers/mtd/nand/rtc_from4.c
index b7083104a05b..de6de91fbad9 100644
--- a/drivers/mtd/nand/rtc_from4.c
+++ b/drivers/mtd/nand/rtc_from4.c
@@ -571,7 +571,6 @@ static int __init rtc_from4_init(void)
 	this->ecc.mode = NAND_ECC_HW_SYNDROME;
 	this->ecc.size = 512;
 	this->ecc.bytes = 8;
-	this->options |= NAND_HWECC_SYNDROME;
 	/* return the status of extra status and ECC checks */
 	this->errstat = rtc_from4_errstat;
 	/* set the nand_oobinfo to support FPGA H/W error detection */
diff --git a/drivers/mtd/nand/toto.c b/drivers/mtd/nand/toto.c
index a9cf0190c27a..f9e2d4a0ab8c 100644
--- a/drivers/mtd/nand/toto.c
+++ b/drivers/mtd/nand/toto.c
@@ -175,8 +175,6 @@ static int __init toto_init(void)
 
 	goto out;
 
- out_buf:
-	kfree(this->data_buf);
  out_mtd:
 	kfree(toto_mtd);
  out:
diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h
index 00916498ea55..1a749ba6130f 100644
--- a/include/linux/mtd/nand.h
+++ b/include/linux/mtd/nand.h
@@ -37,7 +37,7 @@ extern int nand_read_raw (struct mtd_info *mtd, uint8_t *buf, loff_t from,
 
 
 extern int nand_write_raw(struct mtd_info *mtd, loff_t to, size_t len,
-			  size_t *retlen, uint8_t *buf, uint8_t *oob);
+			  size_t *retlen, const uint8_t *buf, uint8_t *oob);
 
 /* The maximum number of NAND chips in an array */
 #define NAND_MAX_CHIPS		8
@@ -47,6 +47,7 @@ extern int nand_write_raw(struct mtd_info *mtd, loff_t to, size_t len,
  * adjust this accordingly.
  */
 #define NAND_MAX_OOBSIZE	64
+#define NAND_MAX_PAGESIZE	2048
 
 /*
  * Constants for hardware specific CLE/ALE/NCE function
@@ -181,20 +182,12 @@ typedef enum {
 /* Use a flash based bad block table. This option is passed to the
  * default bad block table function. */
 #define NAND_USE_FLASH_BBT	0x00010000
-/* The hw ecc generator provides a syndrome instead a ecc value on read
- * This can only work if we have the ecc bytes directly behind the
- * data bytes. Applies for DOC and AG-AND Renesas HW Reed Solomon generators */
-#define NAND_HWECC_SYNDROME	0x00020000
 /* This option skips the bbt scan during initialization. */
-#define NAND_SKIP_BBTSCAN	0x00040000
+#define NAND_SKIP_BBTSCAN	0x00020000
 
 /* Options set by nand scan */
 /* Nand scan has allocated controller struct */
-#define NAND_CONTROLLER_ALLOC	0x20000000
-/* Nand scan has allocated oob_buf */
-#define NAND_OOBBUF_ALLOC	0x40000000
-/* Nand scan has allocated data_buf */
-#define NAND_DATABUF_ALLOC	0x80000000
+#define NAND_CONTROLLER_ALLOC	0x80000000
 
 
 /*
@@ -240,6 +233,7 @@ struct nand_hw_control {
  *		be provided if an hardware ECC is available
  * @calculate:	function for ecc calculation or readback from ecc hardware
  * @correct:	function for ecc correction, matching to ecc generator (sw/hw)
+ * @read_page:	function to read a page according to the ecc generator requirements
  * @write_page:	function to write a page according to the ecc generator requirements
  */
 struct nand_ecc_ctrl {
@@ -260,9 +254,28 @@ struct nand_ecc_ctrl {
 	int			(*read_page)(struct mtd_info *mtd,
 					     struct nand_chip *chip,
 					     uint8_t *buf);
-	int			(*write_page)(struct mtd_info *mtd,
+	void			(*write_page)(struct mtd_info *mtd,
 					      struct nand_chip *chip,
-					      uint8_t *buf, int cached);
+					      const uint8_t *buf);
+};
+
+/**
+ * struct nand_buffers - buffer structure for read/write
+ * @ecccalc:	buffer for calculated ecc
+ * @ecccode:	buffer for ecc read from flash
+ * @oobwbuf:	buffer for write oob data
+ * @databuf:	buffer for data - dynamically sized
+ * @oobrbuf:	buffer to read oob data
+ *
+ * Do not change the order of buffers. databuf and oobrbuf must be in
+ * consecutive order.
+ */
+struct nand_buffers {
+	uint8_t	ecccalc[NAND_MAX_OOBSIZE];
+	uint8_t	ecccode[NAND_MAX_OOBSIZE];
+	uint8_t	oobwbuf[NAND_MAX_OOBSIZE];
+	uint8_t databuf[NAND_MAX_PAGESIZE];
+	uint8_t	oobrbuf[NAND_MAX_OOBSIZE];
 };
 
 /**
@@ -294,8 +307,8 @@ struct nand_ecc_ctrl {
  * @phys_erase_shift:	[INTERN] number of address bits in a physical eraseblock
  * @bbt_erase_shift:	[INTERN] number of address bits in a bbt entry
  * @chip_shift:		[INTERN] number of address bits in one chip
- * @data_buf:		[INTERN] internal buffer for one page + oob
- * @oob_buf:		[INTERN] oob buffer for one eraseblock
+ * @datbuf:		[INTERN] internal buffer for one page + oob
+ * @oobbuf:		[INTERN] oob buffer for one eraseblock
  * @oobdirty:		[INTERN] indicates that oob_buf must be reinitialized
  * @data_poi:		[INTERN] pointer to a data buffer
  * @options:		[BOARDSPECIFIC] various chip options. They can partly be set to inform nand_scan about
@@ -336,32 +349,38 @@ struct nand_chip {
 	int		(*waitfunc)(struct mtd_info *mtd, struct nand_chip *this, int state);
 	void		(*erase_cmd)(struct mtd_info *mtd, int page);
 	int		(*scan_bbt)(struct mtd_info *mtd);
-	struct nand_ecc_ctrl ecc;
+	int		(*errstat)(struct mtd_info *mtd, struct nand_chip *this, int state, int status, int page);
+
 	int		chip_delay;
-	wait_queue_head_t wq;
-	nand_state_t	state;
+	unsigned int	options;
+
 	int		page_shift;
 	int		phys_erase_shift;
 	int		bbt_erase_shift;
 	int		chip_shift;
-	uint8_t		*data_buf;
-	uint8_t		*oob_buf;
-	int		oobdirty;
-	uint8_t		*data_poi;
-	unsigned int	options;
-	int		badblockpos;
 	int		numchips;
 	unsigned long	chipsize;
 	int		pagemask;
 	int		pagebuf;
+	int		badblockpos;
+
+	nand_state_t	state;
+
+	uint8_t		*oob_poi;
+	struct nand_hw_control  *controller;
 	struct nand_oobinfo	*autooob;
+
+	struct nand_ecc_ctrl ecc;
+	struct nand_buffers buffers;
+	struct nand_hw_control hwcontrol;
+
 	uint8_t		*bbt;
 	struct nand_bbt_descr	*bbt_td;
 	struct nand_bbt_descr	*bbt_md;
+
 	struct nand_bbt_descr	*badblock_pattern;
-	struct nand_hw_control  *controller;
+
 	void		*priv;
-	int		(*errstat)(struct mtd_info *mtd, struct nand_chip *this, int state, int status, int page);
 };
 
 /*
-- 
cgit v1.2.3


From 622b20fcb8b42aa4c3c87c0a036f2ad0927b64bc Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@lxorguk.ukuu.org.uk>
Date: Wed, 24 May 2006 14:06:11 +0100
Subject: [PATCH] PCI identifiers for the pata_via update

These IDs are also used by the drivers/ide/pci changes submitted by
VIA.

Signed-off-by: Alan Cox <alan@redhat.com>
Signed-off-by: Jeff Garzik <jeff@garzik.org>
---
 include/linux/pci_ids.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 233f60741c82..e75727954a12 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -1238,6 +1238,7 @@
 #define PCI_DEVICE_ID_VIA_PX8X0_0	0x0259
 #define PCI_DEVICE_ID_VIA_3269_0	0x0269
 #define PCI_DEVICE_ID_VIA_K8T800PRO_0	0x0282
+#define PCI_DEVICE_ID_VIA_3296_0	0x0296
 #define PCI_DEVICE_ID_VIA_8363_0	0x0305
 #define PCI_DEVICE_ID_VIA_P4M800CE	0x0314
 #define PCI_DEVICE_ID_VIA_8371_0	0x0391
@@ -1245,6 +1246,7 @@
 #define PCI_DEVICE_ID_VIA_82C561	0x0561
 #define PCI_DEVICE_ID_VIA_82C586_1	0x0571
 #define PCI_DEVICE_ID_VIA_82C576	0x0576
+#define PCI_DEVICE_ID_VIA_SATA_EIDE	0x0581
 #define PCI_DEVICE_ID_VIA_82C586_0	0x0586
 #define PCI_DEVICE_ID_VIA_82C596	0x0596
 #define PCI_DEVICE_ID_VIA_82C597_0	0x0597
@@ -1285,10 +1287,11 @@
 #define PCI_DEVICE_ID_VIA_8783_0	0x3208
 #define PCI_DEVICE_ID_VIA_8237		0x3227
 #define PCI_DEVICE_ID_VIA_8251		0x3287
-#define PCI_DEVICE_ID_VIA_3296_0	0x0296
+#define PCI_DEVICE_ID_VIA_8237A		0x3337
 #define PCI_DEVICE_ID_VIA_8231		0x8231
 #define PCI_DEVICE_ID_VIA_8231_4	0x8235
 #define PCI_DEVICE_ID_VIA_8365_1	0x8305
+#define PCI_DEVICE_ID_VIA_CX700		0x8324
 #define PCI_DEVICE_ID_VIA_8371_1	0x8391
 #define PCI_DEVICE_ID_VIA_82C598_1	0x8598
 #define PCI_DEVICE_ID_VIA_838X_1	0xB188
-- 
cgit v1.2.3


From 75e995855f45a83afdae34d50c0b3ee14fb23b7a Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@lxorguk.ukuu.org.uk>
Date: Wed, 24 May 2006 14:14:41 +0100
Subject: [PATCH] libata: add pio_data_xfer_noirq

Signed-off-by: Alan Cox <alan@redhat.com>
Signed-off-by: Jeff Garzik <jeff@garzik.org>
---
 drivers/scsi/libata-core.c | 25 +++++++++++++++++++++++++
 include/linux/libata.h     |  2 ++
 2 files changed, 27 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c
index 074a46e5bbdd..f19fe662f0ca 100644
--- a/drivers/scsi/libata-core.c
+++ b/drivers/scsi/libata-core.c
@@ -3611,6 +3611,30 @@ void ata_pio_data_xfer(struct ata_device *adev, unsigned char *buf,
 	}
 }
 
+/**
+ *	ata_pio_data_xfer_noirq - Transfer data by PIO
+ *	@adev: device to target
+ *	@buf: data buffer
+ *	@buflen: buffer length
+ *	@write_data: read/write
+ *
+ *	Transfer data from/to the device data register by PIO. Do the 
+ *	transfer with interrupts disabled.
+ *
+ *	LOCKING:
+ *	Inherited from caller.
+ */
+
+void ata_pio_data_xfer_noirq(struct ata_device *adev, unsigned char *buf,
+				    unsigned int buflen, int write_data)
+{
+	unsigned long flags;
+	local_irq_save(flags);
+	ata_pio_data_xfer(adev, buf, buflen, write_data);
+	local_irq_restore(flags);
+}
+
+
 /**
  *	ata_pio_sector - Transfer ATA_SECT_SIZE (512 bytes) of data.
  *	@qc: Command on going
@@ -5675,6 +5699,7 @@ EXPORT_SYMBOL_GPL(ata_host_stop);
 EXPORT_SYMBOL_GPL(ata_interrupt);
 EXPORT_SYMBOL_GPL(ata_mmio_data_xfer);
 EXPORT_SYMBOL_GPL(ata_pio_data_xfer);
+EXPORT_SYMBOL_GPL(ata_pio_data_xfer_noirq);
 EXPORT_SYMBOL_GPL(ata_qc_prep);
 EXPORT_SYMBOL_GPL(ata_noop_qc_prep);
 EXPORT_SYMBOL_GPL(ata_bmdma_setup);
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 9c60b4a4e2fd..b0ee1c1437d6 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -651,6 +651,8 @@ extern void ata_mmio_data_xfer(struct ata_device *adev, unsigned char *buf,
 			       unsigned int buflen, int write_data);
 extern void ata_pio_data_xfer(struct ata_device *adev, unsigned char *buf,
 			      unsigned int buflen, int write_data);
+extern void ata_pio_data_xfer_noirq(struct ata_device *adev, unsigned char *buf,
+			      unsigned int buflen, int write_data);
 extern void ata_qc_prep(struct ata_queued_cmd *qc);
 extern void ata_noop_qc_prep(struct ata_queued_cmd *qc);
 extern unsigned int ata_qc_issue_prot(struct ata_queued_cmd *qc);
-- 
cgit v1.2.3


From 8be834f76291fdcc0614cb84926c6910b9f2ecbc Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@cruncher.tec.linutronix.de>
Date: Sat, 27 May 2006 20:05:26 +0200
Subject: [MTD] NAND Fix platform structure and NDFC driver

The platform structure was lacking an oobinfo field.
The NDFC driver had some remains from another tree.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 drivers/mtd/nand/ndfc.c  | 6 ++----
 include/linux/mtd/nand.h | 4 +++-
 include/linux/mtd/ndfc.h | 3 ++-
 3 files changed, 7 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/nand/ndfc.c b/drivers/mtd/nand/ndfc.c
index 4d70dd16cf5d..5790d630faed 100644
--- a/drivers/mtd/nand/ndfc.c
+++ b/drivers/mtd/nand/ndfc.c
@@ -20,7 +20,6 @@
 #include <linux/mtd/nand_ecc.h>
 #include <linux/mtd/partitions.h>
 #include <linux/mtd/ndfc.h>
-#include <linux/mtd/ubi.h>
 #include <linux/mtd/mtd.h>
 #include <linux/platform_device.h>
 
@@ -169,14 +168,13 @@ static void ndfc_chip_init(struct ndfc_nand_mtd *mtd)
 	chip->ecc.mode = NAND_ECC_HW;
 	chip->ecc.size = 256;
 	chip->ecc.bytes = 3;
-	chip->autooob = mtd->pl_chip->autooob;
+	chip->autooob = mtd->pl_chip->oobinfo;
 	mtd->mtd.priv = chip;
 	mtd->mtd.owner = THIS_MODULE;
 }
 
 static int ndfc_chip_probe(struct platform_device *pdev)
 {
-	int rc;
 	struct platform_nand_chip *nc = pdev->dev.platform_data;
 	struct ndfc_chip_settings *settings = nc->priv;
 	struct ndfc_controller *ndfc = &ndfc_ctrl;
@@ -235,7 +233,7 @@ static int ndfc_nand_probe(struct platform_device *pdev)
 	struct ndfc_controller_settings *settings = nc->priv;
 	struct resource *res = pdev->resource;
 	struct ndfc_controller *ndfc = &ndfc_ctrl;
-	unsigned long long phys = NDFC_PHYSADDR_OFFS | res->start;
+	unsigned long long phys = setting->erpn | res->start;
 
 	ndfc->ndfcbase = ioremap64(phys, res->end - res->start + 1);
 	if (!ndfc->ndfcbase) {
diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h
index 1a749ba6130f..fd46bcf52281 100644
--- a/include/linux/mtd/nand.h
+++ b/include/linux/mtd/nand.h
@@ -518,10 +518,11 @@ extern int nand_do_read(struct mtd_info *mtd, loff_t from, size_t len,
  *
  * @nr_chips:		max. number of chips to scan for
  * @chip_offs:		chip number offset
- * @nr_partitions:	number of partitions pointed to be partitoons (or zero)
+ * @nr_partitions:	number of partitions pointed to by partitions (or zero)
  * @partitions:		mtd partition list
  * @chip_delay:		R/B delay value in us
  * @options:		Option flags, e.g. 16bit buswidth
+ * @oobinfo:		oob info structure (ecc placement)
  * @priv:		hardware controller specific settings
  */
 struct platform_nand_chip {
@@ -529,6 +530,7 @@ struct platform_nand_chip {
 	int			chip_offset;
 	int			nr_partitions;
 	struct mtd_partition	*partitions;
+	struct nand_oobinfo	*oobinfo;
 	int			chip_delay;
 	unsigned int		options;
 	void			*priv;
diff --git a/include/linux/mtd/ndfc.h b/include/linux/mtd/ndfc.h
index 31d61f07d768..d0558a982628 100644
--- a/include/linux/mtd/ndfc.h
+++ b/include/linux/mtd/ndfc.h
@@ -56,7 +56,8 @@
 #define NDFC_MAX_BANKS		4
 
 struct ndfc_controller_settings {
-	uint32_t		ccr_settings;
+	uint32_t	ccr_settings;
+	uint64_t	ndfc_erpn;
 };
 
 struct ndfc_chip_settings {
-- 
cgit v1.2.3


From ff268fb8791cf18df536113355d7184007c269d9 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@cruncher.tec.linutronix.de>
Date: Sat, 27 May 2006 20:36:12 +0200
Subject: [MTD] NAND Consolidate oobinfo handling

The info structure for out of band data was copied into
the mtd structure. Make it a pointer and remove the ability
to set it from userspace. The position of ecc bytes is
defined by the hardware and should not be changed by software.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 drivers/mtd/mtdchar.c              | 10 ++--------
 drivers/mtd/mtdconcat.c            |  4 +---
 drivers/mtd/mtdpart.c              |  4 +---
 drivers/mtd/nand/nand_base.c       | 10 +---------
 drivers/mtd/onenand/onenand_base.c |  2 +-
 fs/jffs2/wbuf.c                    |  2 +-
 include/linux/mtd/mtd.h            |  5 ++---
 7 files changed, 9 insertions(+), 28 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/mtdchar.c b/drivers/mtd/mtdchar.c
index 7a7df851c993..608f7af679cb 100644
--- a/drivers/mtd/mtdchar.c
+++ b/drivers/mtd/mtdchar.c
@@ -512,16 +512,10 @@ static int mtd_ioctl(struct inode *inode, struct file *file,
 		break;
 	}
 
-	case MEMSETOOBSEL:
-	{
-		if (copy_from_user(&mtd->oobinfo, argp, sizeof(struct nand_oobinfo)))
-			return -EFAULT;
-		break;
-	}
-
 	case MEMGETOOBSEL:
 	{
-		if (copy_to_user(argp, &(mtd->oobinfo), sizeof(struct nand_oobinfo)))
+		if (copy_to_user(argp, mtd->oobinfo,
+				 sizeof(struct nand_oobinfo)))
 			return -EFAULT;
 		break;
 	}
diff --git a/drivers/mtd/mtdconcat.c b/drivers/mtd/mtdconcat.c
index 6d52137988fa..699fce7770de 100644
--- a/drivers/mtd/mtdconcat.c
+++ b/drivers/mtd/mtdconcat.c
@@ -766,9 +766,7 @@ struct mtd_info *mtd_concat_create(struct mtd_info *subdev[],	/* subdevices to c
 
 	}
 
-	if(concat->mtd.type == MTD_NANDFLASH)
-		memcpy(&concat->mtd.oobinfo, &subdev[0]->oobinfo,
-			sizeof(struct nand_oobinfo));
+	concat->mtd.oobinfo = subdev[0]->oobinfo;
 
 	concat->num_subdev = num_devs;
 	concat->mtd.name = name;
diff --git a/drivers/mtd/mtdpart.c b/drivers/mtd/mtdpart.c
index a93550ce7978..b6b218952d49 100644
--- a/drivers/mtd/mtdpart.c
+++ b/drivers/mtd/mtdpart.c
@@ -316,7 +316,6 @@ int add_mtd_partitions(struct mtd_info *master,
 		slave->mtd.size = parts[i].size;
 		slave->mtd.writesize = master->writesize;
 		slave->mtd.oobsize = master->oobsize;
-		slave->mtd.oobavail = master->oobavail;
 		slave->mtd.ecctype = master->ecctype;
 		slave->mtd.eccsize = master->eccsize;
 
@@ -435,8 +434,7 @@ int add_mtd_partitions(struct mtd_info *master,
 				parts[i].name);
 		}
 
-		/* copy oobinfo from master */
-		memcpy(&slave->mtd.oobinfo, &master->oobinfo, sizeof(slave->mtd.oobinfo));
+		slave->mtd.oobinfo = master->oobinfo;
 
 		if(parts[i].mtdp)
 		{	/* store the object pointer (caller may or may not register it */
diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c
index 023224dd12eb..20f79fec73b5 100644
--- a/drivers/mtd/nand/nand_base.c
+++ b/drivers/mtd/nand/nand_base.c
@@ -2142,14 +2142,6 @@ int nand_scan(struct mtd_info *mtd, int maxchips)
 		}
 	}
 
-	/*
-	 * The number of bytes available for the filesystem to place fs
-	 * dependend oob data
-	 */
-	mtd->oobavail = 0;
-	for (i = 0; chip->autooob->oobfree[i][1]; i++)
-		mtd->oobavail += chip->autooob->oobfree[i][1];
-
 	/*
 	 * check ECC mode, default to software if 3byte/512byte hardware ECC is
 	 * selected and we have 256 byte pagesize fallback to software ECC
@@ -2245,7 +2237,7 @@ int nand_scan(struct mtd_info *mtd, int maxchips)
 	mtd->block_markbad = nand_block_markbad;
 
 	/* and make the autooob the default one */
-	memcpy(&mtd->oobinfo, chip->autooob, sizeof(mtd->oobinfo));
+	mtd->oobinfo = chip->autooob;
 
 	/* Check, if we should skip the bad block table scan */
 	if (chip->options & NAND_SKIP_BBTSCAN)
diff --git a/drivers/mtd/onenand/onenand_base.c b/drivers/mtd/onenand/onenand_base.c
index 7a2419186ff4..b24bfa6e202c 100644
--- a/drivers/mtd/onenand/onenand_base.c
+++ b/drivers/mtd/onenand/onenand_base.c
@@ -1762,7 +1762,7 @@ int onenand_scan(struct mtd_info *mtd, int maxchips)
 		break;
 	}
 
-	memcpy(&mtd->oobinfo, this->autooob, sizeof(mtd->oobinfo));
+	mtd->oobinfo = this->autooob;
 
 	/* Fill in remaining MTD driver data */
 	mtd->type = MTD_NANDFLASH;
diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c
index 717fa2f52ac2..dc275cedfe4a 100644
--- a/fs/jffs2/wbuf.c
+++ b/fs/jffs2/wbuf.c
@@ -1151,7 +1151,7 @@ static struct nand_oobinfo jffs2_oobinfo_docecc = {
 
 static int jffs2_nand_set_oobinfo(struct jffs2_sb_info *c)
 {
-	struct nand_oobinfo *oinfo = &c->mtd->oobinfo;
+	struct nand_oobinfo *oinfo = c->mtd->oobinfo;
 
 	/* Do this only, if we have an oob buffer */
 	if (!c->mtd->oobsize)
diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h
index 41a984dcb139..8429da51bb09 100644
--- a/include/linux/mtd/mtd.h
+++ b/include/linux/mtd/mtd.h
@@ -101,9 +101,8 @@ struct mtd_info {
 	char *name;
 	int index;
 
-	// oobinfo is a nand_oobinfo structure, which can be set by iotcl (MEMSETOOBINFO)
-	struct nand_oobinfo oobinfo;
-	u_int32_t oobavail;  // Number of bytes in OOB area available for fs
+	/* oobinfo structure pointer - read only ! */
+	struct nand_oobinfo *oobinfo;
 
 	/* Data for variable erase regions. If numeraseregions is zero,
 	 * it means that the whole device has erasesize as given above.
-- 
cgit v1.2.3


From 5bd34c091a044d130601370c370f84b1c59f1627 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@cruncher.tec.linutronix.de>
Date: Sat, 27 May 2006 22:16:10 +0200
Subject: [MTD] NAND Replace oobinfo by ecclayout

The nand_oobinfo structure is not fitting the newer error correction
demands anymore. Replace it by struct nand_ecclayout and fixup the users
all over the place. Keep the nand_oobinfo based ioctl for user space
compability reasons.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 drivers/mtd/mtdchar.c              | 26 +++++++++++++++++--
 drivers/mtd/mtdconcat.c            |  2 +-
 drivers/mtd/mtdpart.c              |  2 +-
 drivers/mtd/nand/diskonchip.c      |  5 ++--
 drivers/mtd/nand/nand_base.c       | 52 ++++++++++++++++++++++++--------------
 drivers/mtd/nand/ndfc.c            |  2 +-
 drivers/mtd/nand/rtc_from4.c       |  5 ++--
 drivers/mtd/nand/s3c2410.c         |  5 ++--
 drivers/mtd/nand/sharpsl.c         |  5 ++--
 drivers/mtd/onenand/onenand_base.c | 14 +++++-----
 fs/jffs2/jffs2_fs_sb.h             |  2 +-
 fs/jffs2/wbuf.c                    | 51 ++++++++++++-------------------------
 include/linux/mtd/inftl.h          |  2 +-
 include/linux/mtd/mtd.h            |  4 +--
 include/linux/mtd/nand.h           |  9 ++++---
 include/linux/mtd/nftl.h           |  2 +-
 include/linux/mtd/onenand.h        |  6 ++---
 include/linux/mtd/partitions.h     |  2 +-
 include/mtd/mtd-abi.h              | 36 +++++++++++++++++++++-----
 include/mtd/mtd-user.h             |  1 +
 20 files changed, 134 insertions(+), 99 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/mtdchar.c b/drivers/mtd/mtdchar.c
index 608f7af679cb..e75ec5fe7760 100644
--- a/drivers/mtd/mtdchar.c
+++ b/drivers/mtd/mtdchar.c
@@ -512,14 +512,36 @@ static int mtd_ioctl(struct inode *inode, struct file *file,
 		break;
 	}
 
+	/* Legacy interface */
 	case MEMGETOOBSEL:
 	{
-		if (copy_to_user(argp, mtd->oobinfo,
-				 sizeof(struct nand_oobinfo)))
+		struct nand_oobinfo oi;
+
+		if (!mtd->ecclayout)
+			return -EOPNOTSUPP;
+		if (mtd->ecclayout->eccbytes > ARRAY_SIZE(oi.eccpos))
+			return -EINVAL;
+
+		oi.useecc = MTD_NANDECC_AUTOPLACE;
+		memcpy(&oi.eccpos, mtd->ecclayout->eccpos, sizeof(oi.eccpos));
+		memcpy(&oi.oobfree, mtd->ecclayout->oobfree,
+		       sizeof(oi.oobfree));
+
+		if (copy_to_user(argp, &oi, sizeof(struct nand_oobinfo)))
 			return -EFAULT;
 		break;
 	}
 
+	case ECCGETLAYOUT:
+
+		if (!mtd->ecclayout)
+			return -EOPNOTSUPP;
+
+		if (copy_to_user(argp, &mtd->ecclayout,
+				 sizeof(struct nand_ecclayout)))
+			return -EFAULT;
+		break;
+
 	case MEMGETBADBLOCK:
 	{
 		loff_t offs;
diff --git a/drivers/mtd/mtdconcat.c b/drivers/mtd/mtdconcat.c
index 699fce7770de..ec15abcdbdfa 100644
--- a/drivers/mtd/mtdconcat.c
+++ b/drivers/mtd/mtdconcat.c
@@ -766,7 +766,7 @@ struct mtd_info *mtd_concat_create(struct mtd_info *subdev[],	/* subdevices to c
 
 	}
 
-	concat->mtd.oobinfo = subdev[0]->oobinfo;
+	concat->mtd.ecclayout = subdev[0]->ecclayout;
 
 	concat->num_subdev = num_devs;
 	concat->mtd.name = name;
diff --git a/drivers/mtd/mtdpart.c b/drivers/mtd/mtdpart.c
index b6b218952d49..6d7639b98eab 100644
--- a/drivers/mtd/mtdpart.c
+++ b/drivers/mtd/mtdpart.c
@@ -434,7 +434,7 @@ int add_mtd_partitions(struct mtd_info *master,
 				parts[i].name);
 		}
 
-		slave->mtd.oobinfo = master->oobinfo;
+		slave->mtd.ecclayout = master->ecclayout;
 
 		if(parts[i].mtdp)
 		{	/* store the object pointer (caller may or may not register it */
diff --git a/drivers/mtd/nand/diskonchip.c b/drivers/mtd/nand/diskonchip.c
index 82262a4a4208..463e12ced1b3 100644
--- a/drivers/mtd/nand/diskonchip.c
+++ b/drivers/mtd/nand/diskonchip.c
@@ -1058,8 +1058,7 @@ static int doc200x_correct_data(struct mtd_info *mtd, u_char *dat,
  * safer.  The only problem with it is that any code that parses oobfree must
  * be able to handle out-of-order segments.
  */
-static struct nand_oobinfo doc200x_oobinfo = {
-	.useecc = MTD_NANDECC_AUTOPLACE,
+static struct nand_ecclayout doc200x_oobinfo = {
 	.eccbytes = 6,
 	.eccpos = {0, 1, 2, 3, 4, 5},
 	.oobfree = {{8, 8}, {6, 2}}
@@ -1662,7 +1661,7 @@ static int __init doc_probe(unsigned long physadr)
 	nand->ecc.calculate	= doc200x_calculate_ecc;
 	nand->ecc.correct	= doc200x_correct_data;
 
-	nand->autooob		= &doc200x_oobinfo;
+	nand->ecc.layout	= &doc200x_oobinfo;
 	nand->ecc.mode		= NAND_ECC_HW_SYNDROME;
 	nand->ecc.size		= 512;
 	nand->ecc.bytes		= 6;
diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c
index 20f79fec73b5..e922b829c4be 100644
--- a/drivers/mtd/nand/nand_base.c
+++ b/drivers/mtd/nand/nand_base.c
@@ -52,28 +52,33 @@
 #endif
 
 /* Define default oob placement schemes for large and small page devices */
-static struct nand_oobinfo nand_oob_8 = {
-	.useecc = MTD_NANDECC_AUTOPLACE,
+static struct nand_ecclayout nand_oob_8 = {
 	.eccbytes = 3,
 	.eccpos = {0, 1, 2},
-	.oobfree = {{3, 2}, {6, 2}}
+	.oobfree = {
+		{.offset = 3,
+		 .length = 2},
+		{.offset = 6,
+		 .length = 2}}
 };
 
-static struct nand_oobinfo nand_oob_16 = {
-	.useecc = MTD_NANDECC_AUTOPLACE,
+static struct nand_ecclayout nand_oob_16 = {
 	.eccbytes = 6,
 	.eccpos = {0, 1, 2, 3, 6, 7},
-	.oobfree = {{8, 8}}
+	.oobfree = {
+		{.offset = 8,
+		 . length = 8}}
 };
 
-static struct nand_oobinfo nand_oob_64 = {
-	.useecc = MTD_NANDECC_AUTOPLACE,
+static struct nand_ecclayout nand_oob_64 = {
 	.eccbytes = 24,
 	.eccpos = {
 		   40, 41, 42, 43, 44, 45, 46, 47,
 		   48, 49, 50, 51, 52, 53, 54, 55,
 		   56, 57, 58, 59, 60, 61, 62, 63},
-	.oobfree = {{2, 38}}
+	.oobfree = {
+		{.offset = 2,
+		 .length = 38}}
 };
 
 /* This is used for padding purposes in nand_write_oob */
@@ -749,7 +754,7 @@ static int nand_read_page_swecc(struct mtd_info *mtd, struct nand_chip *chip,
 	uint8_t *p = buf;
 	uint8_t *ecc_calc = chip->buffers.ecccalc;
 	uint8_t *ecc_code = chip->buffers.ecccode;
-	int *eccpos = chip->autooob->eccpos;
+	int *eccpos = chip->ecc.layout->eccpos;
 
 	chip->read_buf(mtd, buf, mtd->writesize);
 	chip->read_buf(mtd, chip->oob_poi, mtd->oobsize);
@@ -795,7 +800,7 @@ static int nand_read_page_hwecc(struct mtd_info *mtd, struct nand_chip *chip,
 	uint8_t *p = buf;
 	uint8_t *ecc_calc = chip->buffers.ecccalc;
 	uint8_t *ecc_code = chip->buffers.ecccode;
-	int *eccpos = chip->autooob->eccpos;
+	int *eccpos = chip->ecc.layout->eccpos;
 
 	for (i = 0; eccsteps; eccsteps--, i += eccbytes, p += eccsize) {
 		chip->ecc.hwctl(mtd, NAND_ECC_READ);
@@ -1198,7 +1203,7 @@ static void nand_write_page_swecc(struct mtd_info *mtd, struct nand_chip *chip,
 	int eccsteps = chip->ecc.steps;
 	uint8_t *ecc_calc = chip->buffers.ecccalc;
 	const uint8_t *p = buf;
-	int *eccpos = chip->autooob->eccpos;
+	int *eccpos = chip->ecc.layout->eccpos;
 
 	if (chip->ecc.mode != NAND_ECC_NONE) {
 		/* Software ecc calculation */
@@ -1227,7 +1232,7 @@ static void nand_write_page_hwecc(struct mtd_info *mtd, struct nand_chip *chip,
 	int eccsteps = chip->ecc.steps;
 	uint8_t *ecc_calc = chip->buffers.ecccalc;
 	const uint8_t *p = buf;
-	int *eccpos = chip->autooob->eccpos;
+	int *eccpos = chip->ecc.layout->eccpos;
 
 	for (i = 0; eccsteps; eccsteps--, i += eccbytes, p += eccsize) {
 		chip->ecc.hwctl(mtd, NAND_ECC_WRITE);
@@ -2124,16 +2129,16 @@ int nand_scan(struct mtd_info *mtd, int maxchips)
 	/*
 	 * If no default placement scheme is given, select an appropriate one
 	 */
-	if (!chip->autooob) {
+	if (!chip->ecc.layout) {
 		switch (mtd->oobsize) {
 		case 8:
-			chip->autooob = &nand_oob_8;
+			chip->ecc.layout = &nand_oob_8;
 			break;
 		case 16:
-			chip->autooob = &nand_oob_16;
+			chip->ecc.layout = &nand_oob_16;
 			break;
 		case 64:
-			chip->autooob = &nand_oob_64;
+			chip->ecc.layout = &nand_oob_64;
 			break;
 		default:
 			printk(KERN_WARNING "No oob scheme defined for "
@@ -2197,6 +2202,15 @@ int nand_scan(struct mtd_info *mtd, int maxchips)
 		BUG();
 	}
 
+	/*
+	 * The number of bytes available for a client to place data into
+	 * the out of band area
+	 */
+	chip->ecc.layout->oobavail = 0;
+	for (i = 0; chip->ecc.layout->oobfree[i].length; i++)
+		chip->ecc.layout->oobavail +=
+			chip->ecc.layout->oobfree[i].length;
+
 	/*
 	 * Set the number of read / write steps for one page depending on ECC
 	 * mode
@@ -2236,8 +2250,8 @@ int nand_scan(struct mtd_info *mtd, int maxchips)
 	mtd->block_isbad = nand_block_isbad;
 	mtd->block_markbad = nand_block_markbad;
 
-	/* and make the autooob the default one */
-	mtd->oobinfo = chip->autooob;
+	/* propagate ecc.layout to mtd_info */
+	mtd->ecclayout = chip->ecc.layout;
 
 	/* Check, if we should skip the bad block table scan */
 	if (chip->options & NAND_SKIP_BBTSCAN)
diff --git a/drivers/mtd/nand/ndfc.c b/drivers/mtd/nand/ndfc.c
index 5790d630faed..551702ddcacb 100644
--- a/drivers/mtd/nand/ndfc.c
+++ b/drivers/mtd/nand/ndfc.c
@@ -168,7 +168,7 @@ static void ndfc_chip_init(struct ndfc_nand_mtd *mtd)
 	chip->ecc.mode = NAND_ECC_HW;
 	chip->ecc.size = 256;
 	chip->ecc.bytes = 3;
-	chip->autooob = mtd->pl_chip->oobinfo;
+	chip->ecclayout = mtd->pl_chip->ecclayout;
 	mtd->mtd.priv = chip;
 	mtd->mtd.owner = THIS_MODULE;
 }
diff --git a/drivers/mtd/nand/rtc_from4.c b/drivers/mtd/nand/rtc_from4.c
index de6de91fbad9..f8c49645324d 100644
--- a/drivers/mtd/nand/rtc_from4.c
+++ b/drivers/mtd/nand/rtc_from4.c
@@ -142,8 +142,7 @@ static struct rs_control *rs_decoder;
 /*
  *      hardware specific Out Of Band information
  */
-static struct nand_oobinfo rtc_from4_nand_oobinfo = {
-	.useecc = MTD_NANDECC_AUTOPLACE,
+static struct nand_ecclayout rtc_from4_nand_oobinfo = {
 	.eccbytes = 32,
 	.eccpos = {
 		   0, 1, 2, 3, 4, 5, 6, 7,
@@ -574,7 +573,7 @@ static int __init rtc_from4_init(void)
 	/* return the status of extra status and ECC checks */
 	this->errstat = rtc_from4_errstat;
 	/* set the nand_oobinfo to support FPGA H/W error detection */
-	this->autooob = &rtc_from4_nand_oobinfo;
+	this->ecc.layout = &rtc_from4_nand_oobinfo;
 	this->ecc.hwctl = rtc_from4_enable_hwecc;
 	this->ecc.calculate = rtc_from4_calculate_ecc;
 	this->ecc.correct = rtc_from4_correct_data;
diff --git a/drivers/mtd/nand/s3c2410.c b/drivers/mtd/nand/s3c2410.c
index 215227d1a65c..8429793a6288 100644
--- a/drivers/mtd/nand/s3c2410.c
+++ b/drivers/mtd/nand/s3c2410.c
@@ -76,8 +76,7 @@ static int hardware_ecc = 0;
 /* new oob placement block for use with hardware ecc generation
  */
 
-static struct nand_oobinfo nand_hw_eccoob = {
-	.useecc = MTD_NANDECC_AUTOPLACE,
+static struct nand_ecclayout nand_hw_eccoob = {
 	.eccbytes = 3,
 	.eccpos = {0, 1, 2},
 	.oobfree = {{8, 8}}
@@ -502,7 +501,7 @@ static void s3c2410_nand_init_chip(struct s3c2410_nand_info *info,
 		chip->ecc.mode	    = NAND_ECC_HW;
 		chip->ecc.size	    = 512;
 		chip->ecc.bytes	    = 3;
-		chip->autooob       = &nand_hw_eccoob;
+		chip->ecc.layout    = &nand_hw_eccoob;
 
 		if (info->is_s3c2440) {
 			chip->ecc.hwctl     = s3c2440_nand_enable_hwecc;
diff --git a/drivers/mtd/nand/sharpsl.c b/drivers/mtd/nand/sharpsl.c
index 45a1da724bff..21743658d150 100644
--- a/drivers/mtd/nand/sharpsl.c
+++ b/drivers/mtd/nand/sharpsl.c
@@ -115,8 +115,7 @@ static struct nand_bbt_descr sharpsl_akita_bbt = {
 	.pattern = scan_ff_pattern
 };
 
-static struct nand_oobinfo akita_oobinfo = {
-	.useecc = MTD_NANDECC_AUTOPLACE,
+static struct nand_ecclayout akita_oobinfo = {
 	.eccbytes = 24,
 	.eccpos = {
 		   0x5, 0x1, 0x2, 0x3, 0x6, 0x7, 0x15, 0x11,
@@ -202,7 +201,7 @@ static int __init sharpsl_nand_init(void)
 	this->badblock_pattern = &sharpsl_bbt;
 	if (machine_is_akita() || machine_is_borzoi()) {
 		this->badblock_pattern = &sharpsl_akita_bbt;
-		this->autooob = &akita_oobinfo;
+		this->ecc.layout = &akita_oobinfo;
 	}
 	this->ecc.hwctl = sharpsl_nand_enable_hwecc;
 	this->ecc.calculate = sharpsl_nand_calculate_ecc;
diff --git a/drivers/mtd/onenand/onenand_base.c b/drivers/mtd/onenand/onenand_base.c
index b24bfa6e202c..a0d3f011c0f2 100644
--- a/drivers/mtd/onenand/onenand_base.c
+++ b/drivers/mtd/onenand/onenand_base.c
@@ -23,8 +23,7 @@
 /**
  * onenand_oob_64 - oob info for large (2KB) page
  */
-static struct nand_oobinfo onenand_oob_64 = {
-	.useecc		= MTD_NANDECC_AUTOPLACE,
+static struct nand_ecclayout onenand_oob_64 = {
 	.eccbytes	= 20,
 	.eccpos		= {
 		8, 9, 10, 11, 12,
@@ -41,8 +40,7 @@ static struct nand_oobinfo onenand_oob_64 = {
 /**
  * onenand_oob_32 - oob info for middle (1KB) page
  */
-static struct nand_oobinfo onenand_oob_32 = {
-	.useecc		= MTD_NANDECC_AUTOPLACE,
+static struct nand_ecclayout onenand_oob_32 = {
 	.eccbytes	= 10,
 	.eccpos		= {
 		8, 9, 10, 11, 12,
@@ -1747,22 +1745,22 @@ int onenand_scan(struct mtd_info *mtd, int maxchips)
 
 	switch (mtd->oobsize) {
 	case 64:
-		this->autooob = &onenand_oob_64;
+		this->ecclayout = &onenand_oob_64;
 		break;
 
 	case 32:
-		this->autooob = &onenand_oob_32;
+		this->ecclayout = &onenand_oob_32;
 		break;
 
 	default:
 		printk(KERN_WARNING "No OOB scheme defined for oobsize %d\n",
 			mtd->oobsize);
 		/* To prevent kernel oops */
-		this->autooob = &onenand_oob_32;
+		this->ecclayout = &onenand_oob_32;
 		break;
 	}
 
-	mtd->oobinfo = this->autooob;
+	mtd->ecclayout = this->ecclayout;
 
 	/* Fill in remaining MTD driver data */
 	mtd->type = MTD_NANDFLASH;
diff --git a/fs/jffs2/jffs2_fs_sb.h b/fs/jffs2/jffs2_fs_sb.h
index 272fbea55192..506690cc9a78 100644
--- a/fs/jffs2/jffs2_fs_sb.h
+++ b/fs/jffs2/jffs2_fs_sb.h
@@ -107,7 +107,7 @@ struct jffs2_sb_info {
 	struct rw_semaphore wbuf_sem;	/* Protects the write buffer */
 
 	/* Information about out-of-band area usage... */
-	struct nand_oobinfo *oobinfo;
+	struct nand_ecclayout *ecclayout;
 	uint32_t badblock_pos;
 	uint32_t fsdata_pos;
 	uint32_t fsdata_len;
diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c
index dc275cedfe4a..c6a62e162963 100644
--- a/fs/jffs2/wbuf.c
+++ b/fs/jffs2/wbuf.c
@@ -1140,18 +1140,9 @@ int jffs2_write_nand_badblock(struct jffs2_sb_info *c, struct jffs2_eraseblock *
 	return 1;
 }
 
-#define NAND_JFFS2_OOB16_FSDALEN	8
-
-static struct nand_oobinfo jffs2_oobinfo_docecc = {
-	.useecc = MTD_NANDECC_PLACE,
-	.eccbytes = 6,
-	.eccpos = {0,1,2,3,4,5}
-};
-
-
 static int jffs2_nand_set_oobinfo(struct jffs2_sb_info *c)
 {
-	struct nand_oobinfo *oinfo = c->mtd->oobinfo;
+	struct nand_ecclayout *oinfo = c->mtd->ecclayout;
 
 	/* Do this only, if we have an oob buffer */
 	if (!c->mtd->oobsize)
@@ -1161,33 +1152,23 @@ static int jffs2_nand_set_oobinfo(struct jffs2_sb_info *c)
 	c->cleanmarker_size = 0;
 
 	/* Should we use autoplacement ? */
-	if (oinfo && oinfo->useecc == MTD_NANDECC_AUTOPLACE) {
-		D1(printk(KERN_DEBUG "JFFS2 using autoplace on NAND\n"));
-		/* Get the position of the free bytes */
-		if (!oinfo->oobfree[0][1]) {
-			printk (KERN_WARNING "jffs2_nand_set_oobinfo(): Eeep. Autoplacement selected and no empty space in oob\n");
-			return -ENOSPC;
-		}
-		c->fsdata_pos = oinfo->oobfree[0][0];
-		c->fsdata_len = oinfo->oobfree[0][1];
-		if (c->fsdata_len > 8)
-			c->fsdata_len = 8;
-	} else {
-		/* This is just a legacy fallback and should go away soon */
-		switch(c->mtd->ecctype) {
-		case MTD_ECC_RS_DiskOnChip:
-			printk(KERN_WARNING "JFFS2 using DiskOnChip hardware ECC without autoplacement. Fix it!\n");
-			c->oobinfo = &jffs2_oobinfo_docecc;
-			c->fsdata_pos = 6;
-			c->fsdata_len = NAND_JFFS2_OOB16_FSDALEN;
-			c->badblock_pos = 15;
-			break;
+	if (!oinfo) {
+		D1(printk(KERN_DEBUG "JFFS2 on NAND. No autoplacment info found\n"));
+		return -EINVAL;
+	}
 
-		default:
-			D1(printk(KERN_DEBUG "JFFS2 on NAND. No autoplacment info found\n"));
-			return -EINVAL;
-		}
+	D1(printk(KERN_DEBUG "JFFS2 using autoplace on NAND\n"));
+	/* Get the position of the free bytes */
+	if (!oinfo->oobfree[0].length) {
+		printk (KERN_WARNING "jffs2_nand_set_oobinfo(): Eeep."
+			" Autoplacement selected and no empty space in oob\n");
+		return -ENOSPC;
 	}
+	c->fsdata_pos = oinfo->oobfree[0].offset;
+	c->fsdata_len = oinfo->oobfree[0].length;
+	if (c->fsdata_len > 8)
+		c->fsdata_len = 8;
+
 	return 0;
 }
 
diff --git a/include/linux/mtd/inftl.h b/include/linux/mtd/inftl.h
index d7eaa40e5ab0..6977780e548f 100644
--- a/include/linux/mtd/inftl.h
+++ b/include/linux/mtd/inftl.h
@@ -46,7 +46,7 @@ struct INFTLrecord {
         unsigned int nb_blocks;		/* number of physical blocks */
         unsigned int nb_boot_blocks;	/* number of blocks used by the bios */
         struct erase_info instr;
-        struct nand_oobinfo oobinfo;
+        struct nand_ecclayout oobinfo;
 };
 
 int INFTL_mount(struct INFTLrecord *s);
diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h
index 8429da51bb09..48a9df21ab11 100644
--- a/include/linux/mtd/mtd.h
+++ b/include/linux/mtd/mtd.h
@@ -101,8 +101,8 @@ struct mtd_info {
 	char *name;
 	int index;
 
-	/* oobinfo structure pointer - read only ! */
-	struct nand_oobinfo *oobinfo;
+	/* ecc layout structure pointer - read only ! */
+	struct nand_ecclayout *ecclayout;
 
 	/* Data for variable erase regions. If numeraseregions is zero,
 	 * it means that the whole device has erasesize as given above.
diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h
index fd46bcf52281..dc2bf1bcf42b 100644
--- a/include/linux/mtd/nand.h
+++ b/include/linux/mtd/nand.h
@@ -244,6 +244,7 @@ struct nand_ecc_ctrl {
 	int			total;
 	int			prepad;
 	int			postpad;
+	struct nand_ecclayout	*layout;
 	void			(*hwctl)(struct mtd_info *mtd, int mode);
 	int			(*calculate)(struct mtd_info *mtd,
 					     const uint8_t *dat,
@@ -318,7 +319,7 @@ struct nand_buffers {
  * @chipsize:		[INTERN] the size of one chip for multichip arrays
  * @pagemask:		[INTERN] page number mask = number of (pages / chip) - 1
  * @pagebuf:		[INTERN] holds the pagenumber which is currently in data_buf
- * @autooob:		[REPLACEABLE] the default (auto)placement scheme
+ * @ecclayout:		[REPLACEABLE] the default ecc placement scheme
  * @bbt:		[INTERN] bad block table pointer
  * @bbt_td:		[REPLACEABLE] bad block table descriptor for flash lookup
  * @bbt_md:		[REPLACEABLE] bad block table mirror descriptor
@@ -368,7 +369,7 @@ struct nand_chip {
 
 	uint8_t		*oob_poi;
 	struct nand_hw_control  *controller;
-	struct nand_oobinfo	*autooob;
+	struct nand_ecclayout	*ecclayout;
 
 	struct nand_ecc_ctrl ecc;
 	struct nand_buffers buffers;
@@ -522,7 +523,7 @@ extern int nand_do_read(struct mtd_info *mtd, loff_t from, size_t len,
  * @partitions:		mtd partition list
  * @chip_delay:		R/B delay value in us
  * @options:		Option flags, e.g. 16bit buswidth
- * @oobinfo:		oob info structure (ecc placement)
+ * @ecclayout:		ecc layout info structure
  * @priv:		hardware controller specific settings
  */
 struct platform_nand_chip {
@@ -530,7 +531,7 @@ struct platform_nand_chip {
 	int			chip_offset;
 	int			nr_partitions;
 	struct mtd_partition	*partitions;
-	struct nand_oobinfo	*oobinfo;
+	struct nand_ecclayout	*ecclayout;
 	int			chip_delay;
 	unsigned int		options;
 	void			*priv;
diff --git a/include/linux/mtd/nftl.h b/include/linux/mtd/nftl.h
index d35d2c21ff3e..bcf2fb3fa4a7 100644
--- a/include/linux/mtd/nftl.h
+++ b/include/linux/mtd/nftl.h
@@ -37,7 +37,7 @@ struct NFTLrecord {
         unsigned int nb_blocks;		/* number of physical blocks */
         unsigned int nb_boot_blocks;	/* number of blocks used by the bios */
         struct erase_info instr;
-	struct nand_oobinfo oobinfo;
+	struct nand_ecclayout oobinfo;
 };
 
 int NFTL_mount(struct NFTLrecord *s);
diff --git a/include/linux/mtd/onenand.h b/include/linux/mtd/onenand.h
index 3f5919f2e9da..9ce9a48db444 100644
--- a/include/linux/mtd/onenand.h
+++ b/include/linux/mtd/onenand.h
@@ -77,7 +77,7 @@ struct onenand_bufferram {
  * @param chip_lock	[INTERN] spinlock used to protect access to this structure and the chip
  * @param wq		[INTERN] wait queue to sleep on if a OneNAND operation is in progress
  * @param state		[INTERN] the current state of the OneNAND device
- * @param autooob	[REPLACEABLE] the default (auto)placement scheme
+ * @param ecclayout	[REPLACEABLE] the default ecc placement scheme
  * @param bbm		[REPLACEABLE] pointer to Bad Block Management
  * @param priv		[OPTIONAL] pointer to private chip date
  */
@@ -113,9 +113,9 @@ struct onenand_chip {
 	onenand_state_t		state;
 	unsigned char		*page_buf;
 
-	struct nand_oobinfo	*autooob;
+	struct nand_ecclayout	*ecclayout;
 
-	void 			*bbm;
+	void			*bbm;
 
 	void			*priv;
 };
diff --git a/include/linux/mtd/partitions.h b/include/linux/mtd/partitions.h
index b03f512d51b9..da6b3d6f12a7 100644
--- a/include/linux/mtd/partitions.h
+++ b/include/linux/mtd/partitions.h
@@ -41,7 +41,7 @@ struct mtd_partition {
 	u_int32_t size;			/* partition size */
 	u_int32_t offset;		/* offset within the master MTD space */
 	u_int32_t mask_flags;		/* master MTD flags to mask out for this partition */
-	struct nand_oobinfo *oobsel;	/* out of band layout for this partition (NAND only)*/
+	struct nand_ecclayout *ecclayout;	/* out of band layout for this partition (NAND only)*/
 	struct mtd_info **mtdp;		/* pointer to store the MTD object */
 };
 
diff --git a/include/mtd/mtd-abi.h b/include/mtd/mtd-abi.h
index 1e09e4c8f485..54c673f9648d 100644
--- a/include/mtd/mtd-abi.h
+++ b/include/mtd/mtd-abi.h
@@ -82,12 +82,12 @@ struct otp_info {
 	uint32_t locked;
 };
 
-#define MEMGETINFO              _IOR('M', 1, struct mtd_info_user)
-#define MEMERASE                _IOW('M', 2, struct erase_info_user)
-#define MEMWRITEOOB             _IOWR('M', 3, struct mtd_oob_buf)
-#define MEMREADOOB              _IOWR('M', 4, struct mtd_oob_buf)
-#define MEMLOCK                 _IOW('M', 5, struct erase_info_user)
-#define MEMUNLOCK               _IOW('M', 6, struct erase_info_user)
+#define MEMGETINFO		_IOR('M', 1, struct mtd_info_user)
+#define MEMERASE		_IOW('M', 2, struct erase_info_user)
+#define MEMWRITEOOB		_IOWR('M', 3, struct mtd_oob_buf)
+#define MEMREADOOB		_IOWR('M', 4, struct mtd_oob_buf)
+#define MEMLOCK			_IOW('M', 5, struct erase_info_user)
+#define MEMUNLOCK		_IOW('M', 6, struct erase_info_user)
 #define MEMGETREGIONCOUNT	_IOR('M', 7, int)
 #define MEMGETREGIONINFO	_IOWR('M', 8, struct region_info_user)
 #define MEMSETOOBSEL		_IOW('M', 9, struct nand_oobinfo)
@@ -97,8 +97,13 @@ struct otp_info {
 #define OTPSELECT		_IOR('M', 13, int)
 #define OTPGETREGIONCOUNT	_IOW('M', 14, int)
 #define OTPGETREGIONINFO	_IOW('M', 15, struct otp_info)
-#define OTPLOCK		_IOR('M', 16, struct otp_info)
+#define OTPLOCK			_IOR('M', 16, struct otp_info)
+#define ECCGETLAYOUT		_IOR('M', 17, struct nand_ecclayout)
 
+/*
+ * Obsolete legacy interface. Keep it in order not to break userspace
+ * interfaces
+ */
 struct nand_oobinfo {
 	uint32_t useecc;
 	uint32_t eccbytes;
@@ -106,4 +111,21 @@ struct nand_oobinfo {
 	uint32_t eccpos[32];
 };
 
+struct nand_oobfree {
+	uint32_t offset;
+	uint32_t length;
+};
+
+#define MTD_MAX_OOBFREE_ENTRIES	8
+/*
+ * ECC layout control structure. Exported to userspace for
+ * diagnosis and to allow creation of raw images
+ */
+struct nand_ecclayout {
+	uint32_t eccbytes;
+	uint32_t eccpos[64];
+	uint32_t oobavail;
+	struct nand_oobfree oobfree[MTD_MAX_OOBFREE_ENTRIES];
+};
+
 #endif /* __MTD_ABI_H__ */
diff --git a/include/mtd/mtd-user.h b/include/mtd/mtd-user.h
index 1c13fc7161fe..713f34d3e62e 100644
--- a/include/mtd/mtd-user.h
+++ b/include/mtd/mtd-user.h
@@ -16,5 +16,6 @@ typedef struct mtd_info_user mtd_info_t;
 typedef struct erase_info_user erase_info_t;
 typedef struct region_info_user region_info_t;
 typedef struct nand_oobinfo nand_oobinfo_t;
+typedef struct nand_ecclayout nand_ecclayout_t;
 
 #endif /* __MTD_USER_H__ */
-- 
cgit v1.2.3


From f4a43cfcecfcaeeaa40a9dbc1d1378298c22446e Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@cruncher.tec.linutronix.de>
Date: Sun, 28 May 2006 11:01:53 +0200
Subject: [MTD] Remove silly MTD_WRITE/READ macros

Most of those macros are unused and the used ones just obfuscate
the code. Remove them and fixup all users.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 drivers/mtd/inftlcore.c   |  63 ++++++++++----------
 drivers/mtd/inftlmount.c  |  43 ++++++++------
 drivers/mtd/maps/nettel.c |   2 +-
 drivers/mtd/mtdblock.c    |  13 +++--
 drivers/mtd/mtdchar.c     |   4 +-
 drivers/mtd/nftlcore.c    | 144 ++++++++++++++++++++++++----------------------
 drivers/mtd/nftlmount.c   |  74 ++++++++++++++----------
 fs/jffs/intrep.c          |  15 ++---
 include/linux/mtd/mtd.h   |  16 +-----
 9 files changed, 195 insertions(+), 179 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/inftlcore.c b/drivers/mtd/inftlcore.c
index ddd12993780d..3396f0e1ac5f 100644
--- a/drivers/mtd/inftlcore.c
+++ b/drivers/mtd/inftlcore.c
@@ -197,10 +197,11 @@ static u16 INFTL_foldchain(struct INFTLrecord *inftl, unsigned thisVUC, unsigned
 	u16 BlockMap[MAX_SECTORS_PER_UNIT];
 	unsigned char BlockDeleted[MAX_SECTORS_PER_UNIT];
 	unsigned int thisEUN, prevEUN, status;
+	struct mtd_info *mtd = inftl->mbd.mtd;
 	int block, silly;
 	unsigned int targetEUN;
 	struct inftl_oob oob;
-        size_t retlen;
+	size_t retlen;
 
 	DEBUG(MTD_DEBUG_LEVEL3, "INFTL: INFTL_foldchain(inftl=%p,thisVUC=%d,"
 		"pending=%d)\n", inftl, thisVUC, pendingblock);
@@ -226,9 +227,9 @@ static u16 INFTL_foldchain(struct INFTLrecord *inftl, unsigned thisVUC, unsigned
 			if ((BlockMap[block] != 0xffff) || BlockDeleted[block])
 				continue;
 
-			if (MTD_READOOB(inftl->mbd.mtd, (thisEUN * inftl->EraseSize)
-			     + (block * SECTORSIZE), 16 , &retlen,
-			     (char *)&oob) < 0)
+			if (mtd->read_oob(mtd, (thisEUN * inftl->EraseSize)
+					  + (block * SECTORSIZE), 16 , &retlen,
+					  (char *)&oob) < 0)
 				status = SECTOR_IGNORE;
 			else
 				status = oob.b.Status | oob.b.Status1;
@@ -288,13 +289,14 @@ static u16 INFTL_foldchain(struct INFTLrecord *inftl, unsigned thisVUC, unsigned
 		if (BlockMap[block] == BLOCK_NIL)
 			continue;
 
-		ret = MTD_READ(inftl->mbd.mtd, (inftl->EraseSize *
-			BlockMap[block]) + (block * SECTORSIZE), SECTORSIZE,
-			&retlen, movebuf);
+		ret = mtd->read(mtd, (inftl->EraseSize * BlockMap[block]) +
+				(block * SECTORSIZE), SECTORSIZE, &retlen,
+				movebuf);
 		if (ret < 0) {
-			ret = MTD_READ(inftl->mbd.mtd, (inftl->EraseSize *
-				BlockMap[block]) + (block * SECTORSIZE),
-				SECTORSIZE, &retlen, movebuf);
+			ret = mtd->read(mtd,
+					(inftl->EraseSize * BlockMap[block]) +
+					(block * SECTORSIZE), SECTORSIZE,
+					&retlen, movebuf);
 			if (ret != -EIO)
 				DEBUG(MTD_DEBUG_LEVEL1, "INFTL: error went "
 				      "away on retry?\n");
@@ -415,6 +417,7 @@ static inline u16 INFTL_findwriteunit(struct INFTLrecord *inftl, unsigned block)
 	unsigned int thisVUC = block / (inftl->EraseSize / SECTORSIZE);
 	unsigned int thisEUN, writeEUN, prev_block, status;
 	unsigned long blockofs = (block * SECTORSIZE) & (inftl->EraseSize -1);
+	struct mtd_info *mtd = inftl->mbd.mtd;
 	struct inftl_oob oob;
 	struct inftl_bci bci;
 	unsigned char anac, nacs, parity;
@@ -434,8 +437,8 @@ static inline u16 INFTL_findwriteunit(struct INFTLrecord *inftl, unsigned block)
 		silly = MAX_LOOPS;
 
 		while (thisEUN <= inftl->lastEUN) {
-			MTD_READOOB(inftl->mbd.mtd, (thisEUN * inftl->EraseSize) +
-				blockofs, 8, &retlen, (char *)&bci);
+			mtd->read_oob(mtd, (thisEUN * inftl->EraseSize) +
+				      blockofs, 8, &retlen, (char *)&bci);
 
 			status = bci.Status | bci.Status1;
 			DEBUG(MTD_DEBUG_LEVEL3, "INFTL: status of block %d in "
@@ -522,8 +525,8 @@ hitused:
 		nacs = 0;
 		thisEUN = inftl->VUtable[thisVUC];
 		if (thisEUN != BLOCK_NIL) {
-			MTD_READOOB(inftl->mbd.mtd, thisEUN * inftl->EraseSize
-				+ 8, 8, &retlen, (char *)&oob.u);
+			mtd->read_oob(mtd, thisEUN * inftl->EraseSize
+				      + 8, 8, &retlen, (char *)&oob.u);
 			anac = oob.u.a.ANAC + 1;
 			nacs = oob.u.a.NACs + 1;
 		}
@@ -544,8 +547,8 @@ hitused:
 		oob.u.a.parityPerField = parity;
 		oob.u.a.discarded = 0xaa;
 
-		MTD_WRITEOOB(inftl->mbd.mtd, writeEUN * inftl->EraseSize + 8, 8,
-			&retlen, (char *)&oob.u);
+		mtd->write_oob(mtd, writeEUN * inftl->EraseSize + 8, 8,
+			       &retlen, (char *)&oob.u);
 
 		/* Also back up header... */
 		oob.u.b.virtualUnitNo = cpu_to_le16(thisVUC);
@@ -555,8 +558,8 @@ hitused:
 		oob.u.b.parityPerField = parity;
 		oob.u.b.discarded = 0xaa;
 
-		MTD_WRITEOOB(inftl->mbd.mtd, writeEUN * inftl->EraseSize +
-			SECTORSIZE * 4 + 8, 8, &retlen, (char *)&oob.u);
+		mtd->write_oob(mtd, writeEUN * inftl->EraseSize +
+			       SECTORSIZE * 4 + 8, 8, &retlen, (char *)&oob.u);
 
 		inftl->PUtable[writeEUN] = inftl->VUtable[thisVUC];
 		inftl->VUtable[thisVUC] = writeEUN;
@@ -576,6 +579,7 @@ hitused:
  */
 static void INFTL_trydeletechain(struct INFTLrecord *inftl, unsigned thisVUC)
 {
+	struct mtd_info *mtd = inftl->mbd.mtd;
 	unsigned char BlockUsed[MAX_SECTORS_PER_UNIT];
 	unsigned char BlockDeleted[MAX_SECTORS_PER_UNIT];
 	unsigned int thisEUN, status;
@@ -606,9 +610,9 @@ static void INFTL_trydeletechain(struct INFTLrecord *inftl, unsigned thisVUC)
 			if (BlockUsed[block] || BlockDeleted[block])
 				continue;
 
-			if (MTD_READOOB(inftl->mbd.mtd, (thisEUN * inftl->EraseSize)
-			    + (block * SECTORSIZE), 8 , &retlen,
-			    (char *)&bci) < 0)
+			if (mtd->read_oob(mtd, (thisEUN * inftl->EraseSize)
+					  + (block * SECTORSIZE), 8 , &retlen,
+					  (char *)&bci) < 0)
 				status = SECTOR_IGNORE;
 			else
 				status = bci.Status | bci.Status1;
@@ -697,6 +701,7 @@ static int INFTL_deleteblock(struct INFTLrecord *inftl, unsigned block)
 {
 	unsigned int thisEUN = inftl->VUtable[block / (inftl->EraseSize / SECTORSIZE)];
 	unsigned long blockofs = (block * SECTORSIZE) & (inftl->EraseSize - 1);
+	struct mtd_info *mtd = inftl->mbd.mtd;
 	unsigned int status;
 	int silly = MAX_LOOPS;
 	size_t retlen;
@@ -706,8 +711,8 @@ static int INFTL_deleteblock(struct INFTLrecord *inftl, unsigned block)
 		"block=%d)\n", inftl, block);
 
 	while (thisEUN < inftl->nb_blocks) {
-		if (MTD_READOOB(inftl->mbd.mtd, (thisEUN * inftl->EraseSize) +
-		    blockofs, 8, &retlen, (char *)&bci) < 0)
+		if (mtd->read_oob(mtd, (thisEUN * inftl->EraseSize) +
+				  blockofs, 8, &retlen, (char *)&bci) < 0)
 			status = SECTOR_IGNORE;
 		else
 			status = bci.Status | bci.Status1;
@@ -741,10 +746,10 @@ foundit:
 	if (thisEUN != BLOCK_NIL) {
 		loff_t ptr = (thisEUN * inftl->EraseSize) + blockofs;
 
-		if (MTD_READOOB(inftl->mbd.mtd, ptr, 8, &retlen, (char *)&bci) < 0)
+		if (mtd->read_oob(mtd, ptr, 8, &retlen, (char *)&bci) < 0)
 			return -EIO;
 		bci.Status = bci.Status1 = SECTOR_DELETED;
-		if (MTD_WRITEOOB(inftl->mbd.mtd, ptr, 8, &retlen, (char *)&bci) < 0)
+		if (mtd->write_oob(mtd, ptr, 8, &retlen, (char *)&bci) < 0)
 			return -EIO;
 		INFTL_trydeletechain(inftl, block / (inftl->EraseSize / SECTORSIZE));
 	}
@@ -805,6 +810,7 @@ static int inftl_readblock(struct mtd_blktrans_dev *mbd, unsigned long block,
 	struct INFTLrecord *inftl = (void *)mbd;
 	unsigned int thisEUN = inftl->VUtable[block / (inftl->EraseSize / SECTORSIZE)];
 	unsigned long blockofs = (block * SECTORSIZE) & (inftl->EraseSize - 1);
+	struct mtd_info *mtd = inftl->mbd.mtd;
 	unsigned int status;
 	int silly = MAX_LOOPS;
 	struct inftl_bci bci;
@@ -814,8 +820,8 @@ static int inftl_readblock(struct mtd_blktrans_dev *mbd, unsigned long block,
 		"buffer=%p)\n", inftl, block, buffer);
 
 	while (thisEUN < inftl->nb_blocks) {
-		if (MTD_READOOB(inftl->mbd.mtd, (thisEUN * inftl->EraseSize) +
-		     blockofs, 8, &retlen, (char *)&bci) < 0)
+		if (mtd->read_oob(mtd, (thisEUN * inftl->EraseSize) +
+				  blockofs, 8, &retlen, (char *)&bci) < 0)
 			status = SECTOR_IGNORE;
 		else
 			status = bci.Status | bci.Status1;
@@ -853,8 +859,7 @@ foundit:
 	} else {
 		size_t retlen;
 		loff_t ptr = (thisEUN * inftl->EraseSize) + blockofs;
-		if (MTD_READ(inftl->mbd.mtd, ptr, SECTORSIZE, &retlen,
-		    buffer))
+		if (mtd->read(mtd, ptr, SECTORSIZE, &retlen, buffer))
 			return -EIO;
 	}
 	return 0;
diff --git a/drivers/mtd/inftlmount.c b/drivers/mtd/inftlmount.c
index f89a03795e76..b4cda7d0a52d 100644
--- a/drivers/mtd/inftlmount.c
+++ b/drivers/mtd/inftlmount.c
@@ -57,6 +57,7 @@ static int find_boot_record(struct INFTLrecord *inftl)
 	unsigned int i, block;
 	u8 buf[SECTORSIZE];
 	struct INFTLMediaHeader *mh = &inftl->MediaHdr;
+	struct mtd_info *mtd = inftl->mbd.mtd;
 	struct INFTLPartition *ip;
 	size_t retlen;
 
@@ -80,8 +81,8 @@ static int find_boot_record(struct INFTLrecord *inftl)
 		 * Check for BNAND header first. Then whinge if it's found
 		 * but later checks fail.
 		 */
-		ret = MTD_READ(inftl->mbd.mtd, block * inftl->EraseSize,
-		    SECTORSIZE, &retlen, buf);
+		ret = mtd->read(mtd, block * inftl->EraseSize,
+				SECTORSIZE, &retlen, buf);
 		/* We ignore ret in case the ECC of the MediaHeader is invalid
 		   (which is apparently acceptable) */
 		if (retlen != SECTORSIZE) {
@@ -106,8 +107,9 @@ static int find_boot_record(struct INFTLrecord *inftl)
 		}
 
 		/* To be safer with BIOS, also use erase mark as discriminant */
-		if ((ret = MTD_READOOB(inftl->mbd.mtd, block * inftl->EraseSize +
-		    SECTORSIZE + 8, 8, &retlen, (char *)&h1) < 0)) {
+		if ((ret = mtd->read_oob(mtd, block * inftl->EraseSize +
+					 SECTORSIZE + 8, 8, &retlen,
+					 (char *)&h1) < 0)) {
 			printk(KERN_WARNING "INFTL: ANAND header found at "
 				"0x%x in mtd%d, but OOB data read failed "
 				"(err %d)\n", block * inftl->EraseSize,
@@ -123,8 +125,8 @@ static int find_boot_record(struct INFTLrecord *inftl)
 		memcpy(mh, buf, sizeof(struct INFTLMediaHeader));
 
 		/* Read the spare media header at offset 4096 */
-		MTD_READ(inftl->mbd.mtd, block * inftl->EraseSize + 4096,
-		    SECTORSIZE, &retlen, buf);
+		mtd->read(mtd, block * inftl->EraseSize + 4096,
+			  SECTORSIZE, &retlen, buf);
 		if (retlen != SECTORSIZE) {
 			printk(KERN_WARNING "INFTL: Unable to read spare "
 			       "Media Header\n");
@@ -233,7 +235,7 @@ static int find_boot_record(struct INFTLrecord *inftl)
 				 */
 				instr->addr = ip->Reserved0 * inftl->EraseSize;
 				instr->len = inftl->EraseSize;
-				MTD_ERASE(inftl->mbd.mtd, instr);
+				mtd->erase(mtd, instr);
 			}
 			if ((ip->lastUnit - ip->firstUnit + 1) < ip->virtualUnits) {
 				printk(KERN_WARNING "INFTL: Media Header "
@@ -387,6 +389,7 @@ int INFTL_formatblock(struct INFTLrecord *inftl, int block)
 	size_t retlen;
 	struct inftl_unittail uci;
 	struct erase_info *instr = &inftl->instr;
+	struct mtd_info *mtd = inftl->mbd.mtd;
 	int physblock;
 
 	DEBUG(MTD_DEBUG_LEVEL3, "INFTL: INFTL_formatblock(inftl=%p,"
@@ -404,8 +407,9 @@ int INFTL_formatblock(struct INFTLrecord *inftl, int block)
 	/* Erase one physical eraseblock at a time, even though the NAND api
 	   allows us to group them.  This way we if we have a failure, we can
 	   mark only the failed block in the bbt. */
-	for (physblock = 0; physblock < inftl->EraseSize; physblock += instr->len, instr->addr += instr->len) {
-		MTD_ERASE(inftl->mbd.mtd, instr);
+	for (physblock = 0; physblock < inftl->EraseSize;
+	     physblock += instr->len, instr->addr += instr->len) {
+		mtd->erase(inftl->mbd.mtd, instr);
 
 		if (instr->state == MTD_ERASE_FAILED) {
 			printk(KERN_WARNING "INFTL: error while formatting block %d\n",
@@ -414,10 +418,10 @@ int INFTL_formatblock(struct INFTLrecord *inftl, int block)
 		}
 
 		/*
-	 	* Check the "freeness" of Erase Unit before updating metadata.
-	 	* FixMe: is this check really necessary? Since we have check the
-	 	*        return code after the erase operation.
-	 	*/
+		 * Check the "freeness" of Erase Unit before updating metadata.
+		 * FixMe: is this check really necessary? Since we have check
+		 * the return code after the erase operation.
+		 */
 		if (check_free_sectors(inftl, instr->addr, instr->len, 1) != 0)
 			goto fail;
 	}
@@ -429,8 +433,7 @@ int INFTL_formatblock(struct INFTLrecord *inftl, int block)
 	uci.Reserved[2] = 0;
 	uci.Reserved[3] = 0;
 	instr->addr = block * inftl->EraseSize + SECTORSIZE * 2;
-	if (MTD_WRITEOOB(inftl->mbd.mtd, instr->addr +
-	    8, 8, &retlen, (char *)&uci) < 0)
+	if (mtd->write_oob(mtd, instr->addr + 8, 8, &retlen, (char *)&uci) < 0)
 		goto fail;
 	return 0;
 fail:
@@ -549,6 +552,7 @@ void INFTL_dumpVUchains(struct INFTLrecord *s)
 
 int INFTL_mount(struct INFTLrecord *s)
 {
+	struct mtd_info *mtd = s->mbd.mtd;
 	unsigned int block, first_block, prev_block, last_block;
 	unsigned int first_logical_block, logical_block, erase_mark;
 	int chain_length, do_format_chain;
@@ -607,10 +611,11 @@ int INFTL_mount(struct INFTLrecord *s)
 				break;
 			}
 
-			if (MTD_READOOB(s->mbd.mtd, block * s->EraseSize + 8,
-			    8, &retlen, (char *)&h0) < 0 ||
-			    MTD_READOOB(s->mbd.mtd, block * s->EraseSize +
-			    2 * SECTORSIZE + 8, 8, &retlen, (char *)&h1) < 0) {
+			if (mtd->read_oob(mtd, block * s->EraseSize + 8,
+					  8, &retlen, (char *)&h0) < 0 ||
+			    mtd->read_oob(mtd, block * s->EraseSize +
+					  2 * SECTORSIZE + 8, 8, &retlen,
+					  (char *)&h1) < 0) {
 				/* Should never happen? */
 				do_format_chain++;
 				break;
diff --git a/drivers/mtd/maps/nettel.c b/drivers/mtd/maps/nettel.c
index 20771b2a05e1..0994b5b2e331 100644
--- a/drivers/mtd/maps/nettel.c
+++ b/drivers/mtd/maps/nettel.c
@@ -190,7 +190,7 @@ int nettel_eraseconfig(void)
 		set_current_state(TASK_INTERRUPTIBLE);
 		add_wait_queue(&wait_q, &wait);
 
-		ret = MTD_ERASE(mtd, &nettel_erase);
+		ret = mtd->erase(mtd, &nettel_erase);
 		if (ret) {
 			set_current_state(TASK_RUNNING);
 			remove_wait_queue(&wait_q, &wait);
diff --git a/drivers/mtd/mtdblock.c b/drivers/mtd/mtdblock.c
index 8e50170137e0..9b0bc20e4d8d 100644
--- a/drivers/mtd/mtdblock.c
+++ b/drivers/mtd/mtdblock.c
@@ -71,7 +71,7 @@ static int erase_write (struct mtd_info *mtd, unsigned long pos,
 	set_current_state(TASK_INTERRUPTIBLE);
 	add_wait_queue(&wait_q, &wait);
 
-	ret = MTD_ERASE(mtd, &erase);
+	ret = mtd->erase(mtd, &erase);
 	if (ret) {
 		set_current_state(TASK_RUNNING);
 		remove_wait_queue(&wait_q, &wait);
@@ -88,7 +88,7 @@ static int erase_write (struct mtd_info *mtd, unsigned long pos,
 	 * Next, writhe data to flash.
 	 */
 
-	ret = MTD_WRITE (mtd, pos, len, &retlen, buf);
+	ret = mtd->write(mtd, pos, len, &retlen, buf);
 	if (ret)
 		return ret;
 	if (retlen != len)
@@ -138,7 +138,7 @@ static int do_cached_write (struct mtdblk_dev *mtdblk, unsigned long pos,
 		mtd->name, pos, len);
 
 	if (!sect_size)
-		return MTD_WRITE (mtd, pos, len, &retlen, buf);
+		return mtd->write(mtd, pos, len, &retlen, buf);
 
 	while (len > 0) {
 		unsigned long sect_start = (pos/sect_size)*sect_size;
@@ -170,7 +170,8 @@ static int do_cached_write (struct mtdblk_dev *mtdblk, unsigned long pos,
 			    mtdblk->cache_offset != sect_start) {
 				/* fill the cache with the current sector */
 				mtdblk->cache_state = STATE_EMPTY;
-				ret = MTD_READ(mtd, sect_start, sect_size, &retlen, mtdblk->cache_data);
+				ret = mtd->read(mtd, sect_start, sect_size,
+						&retlen, mtdblk->cache_data);
 				if (ret)
 					return ret;
 				if (retlen != sect_size)
@@ -207,7 +208,7 @@ static int do_cached_read (struct mtdblk_dev *mtdblk, unsigned long pos,
 			mtd->name, pos, len);
 
 	if (!sect_size)
-		return MTD_READ (mtd, pos, len, &retlen, buf);
+		return mtd->read(mtd, pos, len, &retlen, buf);
 
 	while (len > 0) {
 		unsigned long sect_start = (pos/sect_size)*sect_size;
@@ -226,7 +227,7 @@ static int do_cached_read (struct mtdblk_dev *mtdblk, unsigned long pos,
 		    mtdblk->cache_offset == sect_start) {
 			memcpy (buf, mtdblk->cache_data + offset, size);
 		} else {
-			ret = MTD_READ (mtd, pos, size, &retlen, buf);
+			ret = mtd->read(mtd, pos, size, &retlen, buf);
 			if (ret)
 				return ret;
 			if (retlen != size)
diff --git a/drivers/mtd/mtdchar.c b/drivers/mtd/mtdchar.c
index e75ec5fe7760..b45e7747daa3 100644
--- a/drivers/mtd/mtdchar.c
+++ b/drivers/mtd/mtdchar.c
@@ -194,7 +194,7 @@ static ssize_t mtd_read(struct file *file, char __user *buf, size_t count,loff_t
 			ret = mtd->read_user_prot_reg(mtd, *ppos, len, &retlen, kbuf);
 			break;
 		default:
-			ret = MTD_READ(mtd, *ppos, len, &retlen, kbuf);
+			ret = mtd->read(mtd, *ppos, len, &retlen, kbuf);
 		}
 		/* Nand returns -EBADMSG on ecc errors, but it returns
 		 * the data. For our userspace tools it is important
@@ -205,7 +205,7 @@ static ssize_t mtd_read(struct file *file, char __user *buf, size_t count,loff_t
 		if (!ret || (ret == -EBADMSG)) {
 			*ppos += retlen;
 			if (copy_to_user(buf, kbuf, retlen)) {
-			        kfree(kbuf);
+				kfree(kbuf);
 				return -EFAULT;
 			}
 			else
diff --git a/drivers/mtd/nftlcore.c b/drivers/mtd/nftlcore.c
index dd03349946c2..359533b33d9b 100644
--- a/drivers/mtd/nftlcore.c
+++ b/drivers/mtd/nftlcore.c
@@ -183,6 +183,7 @@ static u16 NFTL_findfreeblock(struct NFTLrecord *nftl, int desperate )
 
 static u16 NFTL_foldchain (struct NFTLrecord *nftl, unsigned thisVUC, unsigned pendingblock )
 {
+	struct mtd_info *mtd = nftl->mbd.mtd;
 	u16 BlockMap[MAX_SECTORS_PER_UNIT];
 	unsigned char BlockLastState[MAX_SECTORS_PER_UNIT];
 	unsigned char BlockFreeFound[MAX_SECTORS_PER_UNIT];
@@ -192,7 +193,7 @@ static u16 NFTL_foldchain (struct NFTLrecord *nftl, unsigned thisVUC, unsigned p
 	unsigned int targetEUN;
 	struct nftl_oob oob;
 	int inplace = 1;
-        size_t retlen;
+	size_t retlen;
 
 	memset(BlockMap, 0xff, sizeof(BlockMap));
 	memset(BlockFreeFound, 0, sizeof(BlockFreeFound));
@@ -208,21 +209,21 @@ static u16 NFTL_foldchain (struct NFTLrecord *nftl, unsigned thisVUC, unsigned p
 	/* Scan to find the Erase Unit which holds the actual data for each
 	   512-byte block within the Chain.
 	*/
-        silly = MAX_LOOPS;
+	silly = MAX_LOOPS;
 	targetEUN = BLOCK_NIL;
 	while (thisEUN <= nftl->lastEUN ) {
-                unsigned int status, foldmark;
+		unsigned int status, foldmark;
 
 		targetEUN = thisEUN;
 		for (block = 0; block < nftl->EraseSize / 512; block ++) {
-			MTD_READOOB(nftl->mbd.mtd,
-				    (thisEUN * nftl->EraseSize) + (block * 512),
-				    16 , &retlen, (char *)&oob);
+			mtd->read_oob(mtd, (thisEUN * nftl->EraseSize) +
+				      (block * 512), 16 , &retlen,
+				      (char *)&oob);
 			if (block == 2) {
-                                foldmark = oob.u.c.FoldMark | oob.u.c.FoldMark1;
-                                if (foldmark == FOLD_MARK_IN_PROGRESS) {
-                                        DEBUG(MTD_DEBUG_LEVEL1,
-                                              "Write Inhibited on EUN %d\n", thisEUN);
+				foldmark = oob.u.c.FoldMark | oob.u.c.FoldMark1;
+				if (foldmark == FOLD_MARK_IN_PROGRESS) {
+					DEBUG(MTD_DEBUG_LEVEL1,
+					      "Write Inhibited on EUN %d\n", thisEUN);
 					inplace = 0;
 				} else {
 					/* There's no other reason not to do inplace,
@@ -231,7 +232,7 @@ static u16 NFTL_foldchain (struct NFTLrecord *nftl, unsigned thisVUC, unsigned p
 					inplace = 1;
 				}
 			}
-                        status = oob.b.Status | oob.b.Status1;
+			status = oob.b.Status | oob.b.Status1;
 			BlockLastState[block] = status;
 
 			switch(status) {
@@ -326,15 +327,15 @@ static u16 NFTL_foldchain (struct NFTLrecord *nftl, unsigned thisVUC, unsigned p
 			return BLOCK_NIL;
 		}
 	} else {
-            /* We put a fold mark in the chain we are folding only if
-               we fold in place to help the mount check code. If we do
-               not fold in place, it is possible to find the valid
-               chain by selecting the longer one */
-            oob.u.c.FoldMark = oob.u.c.FoldMark1 = cpu_to_le16(FOLD_MARK_IN_PROGRESS);
-            oob.u.c.unused = 0xffffffff;
-            MTD_WRITEOOB(nftl->mbd.mtd, (nftl->EraseSize * targetEUN) + 2 * 512 + 8,
-                         8, &retlen, (char *)&oob.u);
-        }
+		/* We put a fold mark in the chain we are folding only if we
+               fold in place to help the mount check code. If we do not fold in
+               place, it is possible to find the valid chain by selecting the
+               longer one */
+		oob.u.c.FoldMark = oob.u.c.FoldMark1 = cpu_to_le16(FOLD_MARK_IN_PROGRESS);
+		oob.u.c.unused = 0xffffffff;
+		mtd->write_oob(mtd, (nftl->EraseSize * targetEUN) + 2 * 512 + 8,
+			       8, &retlen, (char *)&oob.u);
+	}
 
 	/* OK. We now know the location of every block in the Virtual Unit Chain,
 	   and the Erase Unit into which we are supposed to be copying.
@@ -351,20 +352,20 @@ static u16 NFTL_foldchain (struct NFTLrecord *nftl, unsigned thisVUC, unsigned p
 			continue;
 		}
 
-                /* copy only in non free block (free blocks can only
+		/* copy only in non free block (free blocks can only
                    happen in case of media errors or deleted blocks) */
-                if (BlockMap[block] == BLOCK_NIL)
-                        continue;
-
-                ret = MTD_READ(nftl->mbd.mtd, (nftl->EraseSize * BlockMap[block]) + (block * 512),
-				  512, &retlen, movebuf);
-                if (ret < 0) {
-                    ret = MTD_READ(nftl->mbd.mtd, (nftl->EraseSize * BlockMap[block])
-                                      + (block * 512), 512, &retlen,
-                                      movebuf);
-                    if (ret != -EIO)
-                        printk("Error went away on retry.\n");
-                }
+		if (BlockMap[block] == BLOCK_NIL)
+			continue;
+
+		ret = mtd->read(mtd, (nftl->EraseSize * BlockMap[block]) + (block * 512),
+				512, &retlen, movebuf);
+		if (ret < 0) {
+			ret = mtd->read(mtd, (nftl->EraseSize * BlockMap[block])
+					+ (block * 512), 512, &retlen,
+					movebuf);
+			if (ret != -EIO)
+				printk("Error went away on retry.\n");
+		}
 		memset(&oob, 0xff, sizeof(struct nftl_oob));
 		oob.b.Status = oob.b.Status1 = SECTOR_USED;
 
@@ -374,13 +375,12 @@ static u16 NFTL_foldchain (struct NFTLrecord *nftl, unsigned thisVUC, unsigned p
 
 	}
 
-        /* add the header so that it is now a valid chain */
-        oob.u.a.VirtUnitNum = oob.u.a.SpareVirtUnitNum
-                = cpu_to_le16(thisVUC);
-        oob.u.a.ReplUnitNum = oob.u.a.SpareReplUnitNum = 0xffff;
+	/* add the header so that it is now a valid chain */
+	oob.u.a.VirtUnitNum = oob.u.a.SpareVirtUnitNum = cpu_to_le16(thisVUC);
+	oob.u.a.ReplUnitNum = oob.u.a.SpareReplUnitNum = 0xffff;
 
-        MTD_WRITEOOB(nftl->mbd.mtd, (nftl->EraseSize * targetEUN) + 8,
-                     8, &retlen, (char *)&oob.u);
+	mtd->write_oob(mtd, (nftl->EraseSize * targetEUN) + 8,
+		       8, &retlen, (char *)&oob.u);
 
 	/* OK. We've moved the whole lot into the new block. Now we have to free the original blocks. */
 
@@ -397,18 +397,18 @@ static u16 NFTL_foldchain (struct NFTLrecord *nftl, unsigned thisVUC, unsigned p
 	while (thisEUN <= nftl->lastEUN && thisEUN != targetEUN) {
 		unsigned int EUNtmp;
 
-                EUNtmp = nftl->ReplUnitTable[thisEUN];
+		EUNtmp = nftl->ReplUnitTable[thisEUN];
 
-                if (NFTL_formatblock(nftl, thisEUN) < 0) {
+		if (NFTL_formatblock(nftl, thisEUN) < 0) {
 			/* could not erase : mark block as reserved
 			 */
 			nftl->ReplUnitTable[thisEUN] = BLOCK_RESERVED;
-                } else {
+		} else {
 			/* correctly erased : mark it as free */
 			nftl->ReplUnitTable[thisEUN] = BLOCK_FREE;
 			nftl->numfreeEUNs++;
-                }
-                thisEUN = EUNtmp;
+		}
+		thisEUN = EUNtmp;
 	}
 
 	/* Make this the new start of chain for thisVUC */
@@ -474,6 +474,7 @@ static inline u16 NFTL_findwriteunit(struct NFTLrecord *nftl, unsigned block)
 {
 	u16 lastEUN;
 	u16 thisVUC = block / (nftl->EraseSize / 512);
+	struct mtd_info *mtd = nftl->mbd.mtd;
 	unsigned int writeEUN;
 	unsigned long blockofs = (block * 512) & (nftl->EraseSize -1);
 	size_t retlen;
@@ -490,21 +491,22 @@ static inline u16 NFTL_findwriteunit(struct NFTLrecord *nftl, unsigned block)
 		*/
 		lastEUN = BLOCK_NIL;
 		writeEUN = nftl->EUNtable[thisVUC];
-                silly = MAX_LOOPS;
+		silly = MAX_LOOPS;
 		while (writeEUN <= nftl->lastEUN) {
 			struct nftl_bci bci;
 			size_t retlen;
-                        unsigned int status;
+			unsigned int status;
 
 			lastEUN = writeEUN;
 
-			MTD_READOOB(nftl->mbd.mtd, (writeEUN * nftl->EraseSize) + blockofs,
-				    8, &retlen, (char *)&bci);
+			mtd->read_oob(mtd,
+				      (writeEUN * nftl->EraseSize) + blockofs,
+				      8, &retlen, (char *)&bci);
 
 			DEBUG(MTD_DEBUG_LEVEL2, "Status of block %d in EUN %d is %x\n",
 			      block , writeEUN, le16_to_cpu(bci.Status));
 
-                        status = bci.Status | bci.Status1;
+			status = bci.Status | bci.Status1;
 			switch(status) {
 			case SECTOR_FREE:
 				return writeEUN;
@@ -575,10 +577,10 @@ static inline u16 NFTL_findwriteunit(struct NFTLrecord *nftl, unsigned block)
 		/* We've found a free block. Insert it into the chain. */
 
 		if (lastEUN != BLOCK_NIL) {
-                    thisVUC |= 0x8000; /* It's a replacement block */
+			thisVUC |= 0x8000; /* It's a replacement block */
 		} else {
-                    /* The first block in a new chain */
-                    nftl->EUNtable[thisVUC] = writeEUN;
+			/* The first block in a new chain */
+			nftl->EUNtable[thisVUC] = writeEUN;
 		}
 
 		/* set up the actual EUN we're writing into */
@@ -586,29 +588,29 @@ static inline u16 NFTL_findwriteunit(struct NFTLrecord *nftl, unsigned block)
 		nftl->ReplUnitTable[writeEUN] = BLOCK_NIL;
 
 		/* ... and on the flash itself */
-		MTD_READOOB(nftl->mbd.mtd, writeEUN * nftl->EraseSize + 8, 8,
-			    &retlen, (char *)&oob.u);
+		mtd->read_oob(mtd, writeEUN * nftl->EraseSize + 8, 8,
+			      &retlen, (char *)&oob.u);
 
 		oob.u.a.VirtUnitNum = oob.u.a.SpareVirtUnitNum = cpu_to_le16(thisVUC);
 
-		MTD_WRITEOOB(nftl->mbd.mtd, writeEUN * nftl->EraseSize + 8, 8,
-                             &retlen, (char *)&oob.u);
+		mtd->write_oob(mtd, writeEUN * nftl->EraseSize + 8, 8,
+			       &retlen, (char *)&oob.u);
 
-                /* we link the new block to the chain only after the
+		/* we link the new block to the chain only after the
                    block is ready. It avoids the case where the chain
                    could point to a free block */
-                if (lastEUN != BLOCK_NIL) {
+		if (lastEUN != BLOCK_NIL) {
 			/* Both in our cache... */
 			nftl->ReplUnitTable[lastEUN] = writeEUN;
 			/* ... and on the flash itself */
-			MTD_READOOB(nftl->mbd.mtd, (lastEUN * nftl->EraseSize) + 8,
-				    8, &retlen, (char *)&oob.u);
+			mtd->read_oob(mtd, (lastEUN * nftl->EraseSize) + 8,
+				      8, &retlen, (char *)&oob.u);
 
 			oob.u.a.ReplUnitNum = oob.u.a.SpareReplUnitNum
 				= cpu_to_le16(writeEUN);
 
-			MTD_WRITEOOB(nftl->mbd.mtd, (lastEUN * nftl->EraseSize) + 8,
-				     8, &retlen, (char *)&oob.u);
+			mtd->write_oob(mtd, (lastEUN * nftl->EraseSize) + 8,
+				       8, &retlen, (char *)&oob.u);
 		}
 
 		return writeEUN;
@@ -652,20 +654,22 @@ static int nftl_readblock(struct mtd_blktrans_dev *mbd, unsigned long block,
 			  char *buffer)
 {
 	struct NFTLrecord *nftl = (void *)mbd;
+	struct mtd_info *mtd = nftl->mbd.mtd;
 	u16 lastgoodEUN;
 	u16 thisEUN = nftl->EUNtable[block / (nftl->EraseSize / 512)];
 	unsigned long blockofs = (block * 512) & (nftl->EraseSize - 1);
-        unsigned int status;
+	unsigned int status;
 	int silly = MAX_LOOPS;
-        size_t retlen;
-        struct nftl_bci bci;
+	size_t retlen;
+	struct nftl_bci bci;
 
 	lastgoodEUN = BLOCK_NIL;
 
-        if (thisEUN != BLOCK_NIL) {
+	if (thisEUN != BLOCK_NIL) {
 		while (thisEUN < nftl->nb_blocks) {
-			if (MTD_READOOB(nftl->mbd.mtd, (thisEUN * nftl->EraseSize) + blockofs,
-					8, &retlen, (char *)&bci) < 0)
+			if (mtd->read_oob(mtd, (thisEUN * nftl->EraseSize) +
+					  blockofs, 8, &retlen,
+					  (char *)&bci) < 0)
 				status = SECTOR_IGNORE;
 			else
 				status = bci.Status | bci.Status1;
@@ -695,7 +699,7 @@ static int nftl_readblock(struct mtd_blktrans_dev *mbd, unsigned long block,
 			}
 			thisEUN = nftl->ReplUnitTable[thisEUN];
 		}
-        }
+	}
 
  the_end:
 	if (lastgoodEUN == BLOCK_NIL) {
@@ -704,7 +708,7 @@ static int nftl_readblock(struct mtd_blktrans_dev *mbd, unsigned long block,
 	} else {
 		loff_t ptr = (lastgoodEUN * nftl->EraseSize) + blockofs;
 		size_t retlen;
-		if (MTD_READ(nftl->mbd.mtd, ptr, 512, &retlen, buffer))
+		if (mtd->read(mtd, ptr, 512, &retlen, buffer))
 			return -EIO;
 	}
 	return 0;
diff --git a/drivers/mtd/nftlmount.c b/drivers/mtd/nftlmount.c
index 90e5e7e97fdc..521b07cd2326 100644
--- a/drivers/mtd/nftlmount.c
+++ b/drivers/mtd/nftlmount.c
@@ -45,6 +45,7 @@ static int find_boot_record(struct NFTLrecord *nftl)
 	size_t retlen;
 	u8 buf[SECTORSIZE];
 	struct NFTLMediaHeader *mh = &nftl->MediaHdr;
+	struct mtd_info *mtd = nftl->mbd.mtd;
 	unsigned int i;
 
         /* Assume logical EraseSize == physical erasesize for starting the scan.
@@ -65,7 +66,8 @@ static int find_boot_record(struct NFTLrecord *nftl)
 
 		/* Check for ANAND header first. Then can whinge if it's found but later
 		   checks fail */
-		ret = MTD_READ(nftl->mbd.mtd, block * nftl->EraseSize, SECTORSIZE, &retlen, buf);
+		ret = mtd->read(mtd, block * nftl->EraseSize, SECTORSIZE,
+				&retlen, buf);
 		/* We ignore ret in case the ECC of the MediaHeader is invalid
 		   (which is apparently acceptable) */
 		if (retlen != SECTORSIZE) {
@@ -90,8 +92,9 @@ static int find_boot_record(struct NFTLrecord *nftl)
 		}
 
 		/* To be safer with BIOS, also use erase mark as discriminant */
-		if ((ret = MTD_READOOB(nftl->mbd.mtd, block * nftl->EraseSize + SECTORSIZE + 8,
-				8, &retlen, (char *)&h1) < 0)) {
+		if ((ret = mtd->read_oob(mtd, block * nftl->EraseSize +
+					 SECTORSIZE + 8, 8, &retlen,
+					 (char *)&h1) < 0)) {
 			printk(KERN_WARNING "ANAND header found at 0x%x in mtd%d, but OOB data read failed (err %d)\n",
 			       block * nftl->EraseSize, nftl->mbd.mtd->index, ret);
 			continue;
@@ -109,8 +112,8 @@ static int find_boot_record(struct NFTLrecord *nftl)
 		}
 
 		/* Finally reread to check ECC */
-		if ((ret = MTD_READECC(nftl->mbd.mtd, block * nftl->EraseSize, SECTORSIZE,
-				&retlen, buf, (char *)&oob, NULL) < 0)) {
+		if ((ret = mtd->read(mtd, block * nftl->EraseSize, SECTORSIZE,
+				     &retlen, buf) < 0)) {
 			printk(KERN_NOTICE "ANAND header found at 0x%x in mtd%d, but ECC read failed (err %d)\n",
 			       block * nftl->EraseSize, nftl->mbd.mtd->index, ret);
 			continue;
@@ -228,9 +231,9 @@ device is already correct.
 The new DiskOnChip driver already scanned the bad block table.  Just query it.
 			if ((i & (SECTORSIZE - 1)) == 0) {
 				/* read one sector for every SECTORSIZE of blocks */
-				if ((ret = MTD_READECC(nftl->mbd.mtd, block * nftl->EraseSize +
-						       i + SECTORSIZE, SECTORSIZE, &retlen, buf,
-						       (char *)&oob, NULL)) < 0) {
+				if ((ret = mtd->read(nftl->mbd.mtd, block * nftl->EraseSize +
+						     i + SECTORSIZE, SECTORSIZE, &retlen,
+						     buf)) < 0) {
 					printk(KERN_NOTICE "Read of bad sector table failed (err %d)\n",
 					       ret);
 					kfree(nftl->ReplUnitTable);
@@ -305,10 +308,11 @@ int NFTL_formatblock(struct NFTLrecord *nftl, int block)
 	unsigned int nb_erases, erase_mark;
 	struct nftl_uci1 uci;
 	struct erase_info *instr = &nftl->instr;
+	struct mtd_info *mtd = nftl->mbd.mtd;
 
 	/* Read the Unit Control Information #1 for Wear-Leveling */
-	if (MTD_READOOB(nftl->mbd.mtd, block * nftl->EraseSize + SECTORSIZE + 8,
-			8, &retlen, (char *)&uci) < 0)
+	if (mtd->read_oob(mtd, block * nftl->EraseSize + SECTORSIZE + 8,
+			  8, &retlen, (char *)&uci) < 0)
 		goto default_uci1;
 
 	erase_mark = le16_to_cpu ((uci.EraseMark | uci.EraseMark1));
@@ -325,7 +329,7 @@ int NFTL_formatblock(struct NFTLrecord *nftl, int block)
 	instr->mtd = nftl->mbd.mtd;
 	instr->addr = block * nftl->EraseSize;
 	instr->len = nftl->EraseSize;
-	MTD_ERASE(nftl->mbd.mtd, instr);
+	mtd->erase(mtd, instr);
 
 	if (instr->state == MTD_ERASE_FAILED) {
 		printk("Error while formatting block %d\n", block);
@@ -347,8 +351,8 @@ int NFTL_formatblock(struct NFTLrecord *nftl, int block)
 			goto fail;
 
 		uci.WearInfo = le32_to_cpu(nb_erases);
-		if (MTD_WRITEOOB(nftl->mbd.mtd, block * nftl->EraseSize + SECTORSIZE + 8, 8,
-				 &retlen, (char *)&uci) < 0)
+		if (mtd->write_oob(mtd, block * nftl->EraseSize + SECTORSIZE +
+				   8, 8, &retlen, (char *)&uci) < 0)
 			goto fail;
 		return 0;
 fail:
@@ -369,6 +373,7 @@ fail:
  *	case. */
 static void check_sectors_in_chain(struct NFTLrecord *nftl, unsigned int first_block)
 {
+	struct mtd_info *mtd = nftl->mbd.mtd;
 	unsigned int block, i, status;
 	struct nftl_bci bci;
 	int sectors_per_block;
@@ -378,8 +383,9 @@ static void check_sectors_in_chain(struct NFTLrecord *nftl, unsigned int first_b
 	block = first_block;
 	for (;;) {
 		for (i = 0; i < sectors_per_block; i++) {
-			if (MTD_READOOB(nftl->mbd.mtd, block * nftl->EraseSize + i * SECTORSIZE,
-					8, &retlen, (char *)&bci) < 0)
+			if (mtd->read_oob(mtd,
+					  block * nftl->EraseSize + i * SECTORSIZE,
+					  8, &retlen, (char *)&bci) < 0)
 				status = SECTOR_IGNORE;
 			else
 				status = bci.Status | bci.Status1;
@@ -398,9 +404,10 @@ static void check_sectors_in_chain(struct NFTLrecord *nftl, unsigned int first_b
 					/* sector not free actually : mark it as SECTOR_IGNORE  */
 					bci.Status = SECTOR_IGNORE;
 					bci.Status1 = SECTOR_IGNORE;
-					MTD_WRITEOOB(nftl->mbd.mtd,
-						     block * nftl->EraseSize + i * SECTORSIZE,
-						     8, &retlen, (char *)&bci);
+					mtd->write_oob(mtd, block *
+						       nftl->EraseSize +
+						       i * SECTORSIZE, 8,
+						       &retlen, (char *)&bci);
 				}
 				break;
 			default:
@@ -485,13 +492,14 @@ static void format_chain(struct NFTLrecord *nftl, unsigned int first_block)
  *	1. */
 static int check_and_mark_free_block(struct NFTLrecord *nftl, int block)
 {
+	struct mtd_info *mtd = nftl->mbd.mtd;
 	struct nftl_uci1 h1;
 	unsigned int erase_mark;
 	size_t retlen;
 
 	/* check erase mark. */
-	if (MTD_READOOB(nftl->mbd.mtd, block * nftl->EraseSize + SECTORSIZE + 8, 8,
-			&retlen, (char *)&h1) < 0)
+	if (mtd->read_oob(mtd, block * nftl->EraseSize + SECTORSIZE + 8, 8,
+			  &retlen, (char *)&h1) < 0)
 		return -1;
 
 	erase_mark = le16_to_cpu ((h1.EraseMark | h1.EraseMark1));
@@ -505,8 +513,9 @@ static int check_and_mark_free_block(struct NFTLrecord *nftl, int block)
 		h1.EraseMark = cpu_to_le16(ERASE_MARK);
 		h1.EraseMark1 = cpu_to_le16(ERASE_MARK);
 		h1.WearInfo = cpu_to_le32(0);
-		if (MTD_WRITEOOB(nftl->mbd.mtd, block * nftl->EraseSize + SECTORSIZE + 8, 8,
-				 &retlen, (char *)&h1) < 0)
+		if (mtd->write_oob(mtd,
+				   block * nftl->EraseSize + SECTORSIZE + 8, 8,
+				   &retlen, (char *)&h1) < 0)
 			return -1;
 	} else {
 #if 0
@@ -517,8 +526,8 @@ static int check_and_mark_free_block(struct NFTLrecord *nftl, int block)
 						SECTORSIZE, 0) != 0)
 				return -1;
 
-			if (MTD_READOOB(nftl->mbd.mtd, block * nftl->EraseSize + i,
-					16, &retlen, buf) < 0)
+			if (mtd->read_oob(mtd, block * nftl->EraseSize + i,
+					  16, &retlen, buf) < 0)
 				return -1;
 			if (i == SECTORSIZE) {
 				/* skip erase mark */
@@ -544,11 +553,12 @@ static int check_and_mark_free_block(struct NFTLrecord *nftl, int block)
  */
 static int get_fold_mark(struct NFTLrecord *nftl, unsigned int block)
 {
+	struct mtd_info *mtd = nftl->mbd.mtd;
 	struct nftl_uci2 uci;
 	size_t retlen;
 
-	if (MTD_READOOB(nftl->mbd.mtd, block * nftl->EraseSize + 2 * SECTORSIZE + 8,
-			8, &retlen, (char *)&uci) < 0)
+	if (mtd->read_oob(mtd, block * nftl->EraseSize + 2 * SECTORSIZE + 8,
+			  8, &retlen, (char *)&uci) < 0)
 		return 0;
 
 	return le16_to_cpu((uci.FoldMark | uci.FoldMark1));
@@ -562,6 +572,7 @@ int NFTL_mount(struct NFTLrecord *s)
 	int chain_length, do_format_chain;
 	struct nftl_uci0 h0;
 	struct nftl_uci1 h1;
+	struct mtd_info *mtd = s->mbd.mtd;
 	size_t retlen;
 
 	/* search for NFTL MediaHeader and Spare NFTL Media Header */
@@ -586,10 +597,13 @@ int NFTL_mount(struct NFTLrecord *s)
 
 			for (;;) {
 				/* read the block header. If error, we format the chain */
-				if (MTD_READOOB(s->mbd.mtd, block * s->EraseSize + 8, 8,
-						&retlen, (char *)&h0) < 0 ||
-				    MTD_READOOB(s->mbd.mtd, block * s->EraseSize + SECTORSIZE + 8, 8,
-						&retlen, (char *)&h1) < 0) {
+				if (mtd->read_oob(mtd,
+						  block * s->EraseSize + 8, 8,
+						  &retlen, (char *)&h0) < 0 ||
+				    mtd->read_oob(mtd,
+						  block * s->EraseSize +
+						  SECTORSIZE + 8, 8,
+						  &retlen, (char *)&h1) < 0) {
 					s->ReplUnitTable[block] = BLOCK_NIL;
 					do_format_chain = 1;
 					break;
diff --git a/fs/jffs/intrep.c b/fs/jffs/intrep.c
index 0ef207dfaf6f..5371a403130a 100644
--- a/fs/jffs/intrep.c
+++ b/fs/jffs/intrep.c
@@ -247,7 +247,7 @@ flash_safe_read(struct mtd_info *mtd, loff_t from,
 	D3(printk(KERN_NOTICE "flash_safe_read(%p, %08x, %p, %08x)\n",
 		  mtd, (unsigned int) from, buf, count));
 
-	res = MTD_READ(mtd, from, count, &retlen, buf);
+	res = mtd->read(mtd, from, count, &retlen, buf);
 	if (retlen != count) {
 		panic("Didn't read all bytes in flash_safe_read(). Returned %d\n", res);
 	}
@@ -262,7 +262,7 @@ flash_read_u32(struct mtd_info *mtd, loff_t from)
 	__u32 ret;
 	int res;
 
-	res = MTD_READ(mtd, from, 4, &retlen, (unsigned char *)&ret);
+	res = mtd->read(mtd, from, 4, &retlen, (unsigned char *)&ret);
 	if (retlen != 4) {
 		printk("Didn't read all bytes in flash_read_u32(). Returned %d\n", res);
 		return 0;
@@ -282,7 +282,7 @@ flash_safe_write(struct mtd_info *mtd, loff_t to,
 	D3(printk(KERN_NOTICE "flash_safe_write(%p, %08x, %p, %08x)\n",
 		  mtd, (unsigned int) to, buf, count));
 
-	res = MTD_WRITE(mtd, to, count, &retlen, buf);
+	res = mtd->write(mtd, to, count, &retlen, buf);
 	if (retlen != count) {
 		printk("Didn't write all bytes in flash_safe_write(). Returned %d\n", res);
 	}
@@ -300,9 +300,9 @@ flash_safe_writev(struct mtd_info *mtd, const struct kvec *vecs,
 
 	D3(printk(KERN_NOTICE "flash_safe_writev(%p, %08x, %p)\n",
 		  mtd, (unsigned int) to, vecs));
-	
+
 	if (mtd->writev) {
-		res = MTD_WRITEV(mtd, vecs, iovec_cnt, to, &retlen);
+		res = mtd->writev(mtd, vecs, iovec_cnt, to, &retlen);
 		return res ? res : retlen;
 	}
 	/* Not implemented writev. Repeatedly use write - on the not so
@@ -312,7 +312,8 @@ flash_safe_writev(struct mtd_info *mtd, const struct kvec *vecs,
 	retlen=0;
 
 	for (i=0; !res && i<iovec_cnt; i++) {
-		res = MTD_WRITE(mtd, to, vecs[i].iov_len, &retlen_a, vecs[i].iov_base);
+		res = mtd->write(mtd, to, vecs[i].iov_len, &retlen_a,
+				 vecs[i].iov_base);
 		if (retlen_a != vecs[i].iov_len) {
 			printk("Didn't write all bytes in flash_safe_writev(). Returned %d\n", res);
 			if (i != iovec_cnt-1)
@@ -393,7 +394,7 @@ flash_erase_region(struct mtd_info *mtd, loff_t start,
 	set_current_state(TASK_UNINTERRUPTIBLE);
 	add_wait_queue(&wait_q, &wait);
 
-	if (MTD_ERASE(mtd, erase) < 0) {
+	if (mtd->erase(mtd, erase) < 0) {
 		set_current_state(TASK_RUNNING);
 		remove_wait_queue(&wait_q, &wait);
 		kfree(erase);
diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h
index 48a9df21ab11..4970c2e96fbf 100644
--- a/include/linux/mtd/mtd.h
+++ b/include/linux/mtd/mtd.h
@@ -199,20 +199,6 @@ int default_mtd_writev(struct mtd_info *mtd, const struct kvec *vecs,
 int default_mtd_readv(struct mtd_info *mtd, struct kvec *vecs,
 		      unsigned long count, loff_t from, size_t *retlen);
 
-#define MTD_ERASE(mtd, args...) (*(mtd->erase))(mtd, args)
-#define MTD_POINT(mtd, a,b,c,d) (*(mtd->point))(mtd, a,b,c, (u_char **)(d))
-#define MTD_UNPOINT(mtd, arg) (*(mtd->unpoint))(mtd, (u_char *)arg)
-#define MTD_READ(mtd, args...) (*(mtd->read))(mtd, args)
-#define MTD_WRITE(mtd, args...) (*(mtd->write))(mtd, args)
-#define MTD_READV(mtd, args...) (*(mtd->readv))(mtd, args)
-#define MTD_WRITEV(mtd, args...) (*(mtd->writev))(mtd, args)
-#define MTD_READECC(mtd, args...) (*(mtd->read_ecc))(mtd, args)
-#define MTD_WRITEECC(mtd, args...) (*(mtd->write_ecc))(mtd, args)
-#define MTD_READOOB(mtd, args...) (*(mtd->read_oob))(mtd, args)
-#define MTD_WRITEOOB(mtd, args...) (*(mtd->write_oob))(mtd, args)
-#define MTD_SYNC(mtd) do { if (mtd->sync) (*(mtd->sync))(mtd);  } while (0)
-
-
 #ifdef CONFIG_MTD_PARTITIONS
 void mtd_erase_callback(struct erase_info *instr);
 #else
@@ -233,7 +219,7 @@ static inline void mtd_erase_callback(struct erase_info *instr)
 
 #ifdef CONFIG_MTD_DEBUG
 #define DEBUG(n, args...)				\
- 	do {						\
+	do {						\
 		if (n <= CONFIG_MTD_DEBUG_VERBOSE)	\
 			printk(KERN_INFO args);		\
 	} while(0)
-- 
cgit v1.2.3


From 8593fbc68b0df1168995de76d1af38eb62fd6b62 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@cruncher.tec.linutronix.de>
Date: Mon, 29 May 2006 03:26:58 +0200
Subject: [MTD] Rework the out of band handling completely

Hopefully the last iteration on this!

The handling of out of band data on NAND was accompanied by tons of fruitless
discussions and halfarsed patches to make it work for a particular
problem. Sufficiently annoyed by I all those "I know it better" mails and the
resonable amount of discarded "it solves my problem" patches, I finally decided
to go for the big rework. After removing the _ecc variants of mtd read/write
functions the solution to satisfy the various requirements was to refactor the
read/write _oob functions in mtd.

The major change is that read/write_oob now takes a pointer to an operation
descriptor structure "struct mtd_oob_ops".instead of having a function with at
least seven arguments.

read/write_oob which should probably renamed to a more descriptive name, can do
the following tasks:

- read/write out of band data
- read/write data content and out of band data
- read/write raw data content and out of band data (ecc disabled)

struct mtd_oob_ops has a mode field, which determines the oob handling mode.

Aside of the MTD_OOB_RAW mode, which is intended to be especially for
diagnostic purposes and some internal functions e.g. bad block table creation,
the other two modes are for mtd clients:

MTD_OOB_PLACE puts/gets the given oob data exactly to/from the place which is
described by the ooboffs and ooblen fields of the mtd_oob_ops strcuture. It's
up to the caller to make sure that the byte positions are not used by the ECC
placement algorithms.

MTD_OOB_AUTO puts/gets the given oob data automaticaly to/from the places in
the out of band area which are described by the oobfree tuples in the ecclayout
data structre which is associated to the devicee.

The decision whether data plus oob or oob only handling is done depends on the
setting of the datbuf member of the data structure. When datbuf == NULL then
the internal read/write_oob functions are selected, otherwise the read/write
data routines are invoked.

Tested on a few platforms with all variants. Please be aware of possible
regressions for your particular device / application scenario

Disclaimer: Any whining will be ignored from those who just contributed "hot
air blurb" and never sat down to tackle the underlying problem of the mess in
the NAND driver grown over time and the big chunk of work to fix up the
existing users. The problem was not the holiness of the existing MTD
interfaces. The problems was the lack of time to go for the big overhaul. It's
easy to add more mess to the existing one, but it takes alot of effort to go
for a real solution.

Improvements and bugfixes are welcome!

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 drivers/mtd/devices/doc2000.c      |  39 ++-
 drivers/mtd/devices/doc2001.c      |  34 ++-
 drivers/mtd/devices/doc2001plus.c  |  34 ++-
 drivers/mtd/inftlcore.c            | 111 ++++++--
 drivers/mtd/inftlmount.c           |  27 +-
 drivers/mtd/mtdchar.c              |  59 ++--
 drivers/mtd/mtdconcat.c            |  90 +++---
 drivers/mtd/mtdpart.c              |  29 +-
 drivers/mtd/nand/nand_base.c       | 542 ++++++++++++++++++++++---------------
 drivers/mtd/nand/nand_bbt.c        | 188 +++++++++----
 drivers/mtd/nftlcore.c             |  92 +++++--
 drivers/mtd/nftlmount.c            |  29 +-
 drivers/mtd/onenand/onenand_base.c |  46 +++-
 drivers/mtd/onenand/onenand_bbt.c  |   7 +-
 fs/jffs2/jffs2_fs_sb.h             |   1 +
 fs/jffs2/wbuf.c                    | 230 ++++++++--------
 include/linux/mtd/mtd.h            |  50 +++-
 include/linux/mtd/nand.h           |  10 +-
 18 files changed, 1028 insertions(+), 590 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/devices/doc2000.c b/drivers/mtd/devices/doc2000.c
index d9ba1ee658f6..c54e40464d82 100644
--- a/drivers/mtd/devices/doc2000.c
+++ b/drivers/mtd/devices/doc2000.c
@@ -59,10 +59,10 @@ static int doc_read_ecc(struct mtd_info *mtd, loff_t from, size_t len,
 			size_t *retlen, u_char *buf, u_char *eccbuf, struct nand_oobinfo *oobsel);
 static int doc_write_ecc(struct mtd_info *mtd, loff_t to, size_t len,
 			 size_t *retlen, const u_char *buf, u_char *eccbuf, struct nand_oobinfo *oobsel);
-static int doc_read_oob(struct mtd_info *mtd, loff_t ofs, size_t len,
-			size_t *retlen, u_char *buf);
-static int doc_write_oob(struct mtd_info *mtd, loff_t ofs, size_t len,
-			 size_t *retlen, const u_char *buf);
+static int doc_read_oob(struct mtd_info *mtd, loff_t ofs,
+			struct mtd_oob_ops *ops);
+static int doc_write_oob(struct mtd_info *mtd, loff_t ofs,
+			 struct mtd_oob_ops *ops);
 static int doc_write_oob_nolock(struct mtd_info *mtd, loff_t ofs, size_t len,
 			 size_t *retlen, const u_char *buf);
 static int doc_erase (struct mtd_info *mtd, struct erase_info *instr);
@@ -959,12 +959,18 @@ static int doc_write_ecc(struct mtd_info *mtd, loff_t to, size_t len,
 	return 0;
 }
 
-static int doc_read_oob(struct mtd_info *mtd, loff_t ofs, size_t len,
-			size_t * retlen, u_char * buf)
+static int doc_read_oob(struct mtd_info *mtd, loff_t ofs,
+			struct mtd_oob_ops *ops)
 {
 	struct DiskOnChip *this = mtd->priv;
 	int len256 = 0, ret;
 	struct Nand *mychip;
+	uint8_t *buf = ops->oobbuf;
+	size_t len = ops->len;
+
+	BUG_ON(ops->mode != MTD_OOB_PLACE);
+
+	ofs += ops->ooboffs;
 
 	mutex_lock(&this->lock);
 
@@ -1005,7 +1011,7 @@ static int doc_read_oob(struct mtd_info *mtd, loff_t ofs, size_t len,
 
 	DoC_ReadBuf(this, &buf[len256], len - len256);
 
-	*retlen = len;
+	ops->retlen = len;
 	/* Reading the full OOB data drops us off of the end of the page,
          * causing the flash device to go into busy mode, so we need
          * to wait until ready 11.4.1 and Toshiba TC58256FT docs */
@@ -1120,17 +1126,20 @@ static int doc_write_oob_nolock(struct mtd_info *mtd, loff_t ofs, size_t len,
 
 }
 
-static int doc_write_oob(struct mtd_info *mtd, loff_t ofs, size_t len,
- 			 size_t * retlen, const u_char * buf)
+static int doc_write_oob(struct mtd_info *mtd, loff_t ofs,
+			 struct mtd_oob_ops *ops)
 {
- 	struct DiskOnChip *this = mtd->priv;
- 	int ret;
+	struct DiskOnChip *this = mtd->priv;
+	int ret;
 
- 	mutex_lock(&this->lock);
- 	ret = doc_write_oob_nolock(mtd, ofs, len, retlen, buf);
+	BUG_ON(ops->mode != MTD_OOB_PLACE);
+
+	mutex_lock(&this->lock);
+	ret = doc_write_oob_nolock(mtd, ofs + ops->ooboffs, ops->len,
+				   &ops->retlen, ops->oobbuf);
 
- 	mutex_unlock(&this->lock);
- 	return ret;
+	mutex_unlock(&this->lock);
+	return ret;
 }
 
 static int doc_erase(struct mtd_info *mtd, struct erase_info *instr)
diff --git a/drivers/mtd/devices/doc2001.c b/drivers/mtd/devices/doc2001.c
index 579c0b570ae5..0cf022a69e65 100644
--- a/drivers/mtd/devices/doc2001.c
+++ b/drivers/mtd/devices/doc2001.c
@@ -43,10 +43,10 @@ static int doc_read_ecc(struct mtd_info *mtd, loff_t from, size_t len,
 static int doc_write_ecc(struct mtd_info *mtd, loff_t to, size_t len,
 			 size_t *retlen, const u_char *buf, u_char *eccbuf,
 			 struct nand_oobinfo *oobsel);
-static int doc_read_oob(struct mtd_info *mtd, loff_t ofs, size_t len,
-			size_t *retlen, u_char *buf);
-static int doc_write_oob(struct mtd_info *mtd, loff_t ofs, size_t len,
-			 size_t *retlen, const u_char *buf);
+static int doc_read_oob(struct mtd_info *mtd, loff_t ofs,
+			struct mtd_oob_ops *ops);
+static int doc_write_oob(struct mtd_info *mtd, loff_t ofs,
+			 struct mtd_oob_ops *ops);
 static int doc_erase (struct mtd_info *mtd, struct erase_info *instr);
 
 static struct mtd_info *docmillist = NULL;
@@ -662,8 +662,8 @@ static int doc_write_ecc (struct mtd_info *mtd, loff_t to, size_t len,
 	return ret;
 }
 
-static int doc_read_oob(struct mtd_info *mtd, loff_t ofs, size_t len,
-			size_t *retlen, u_char *buf)
+static int doc_read_oob(struct mtd_info *mtd, loff_t ofs,
+			struct mtd_oob_ops *ops)
 {
 #ifndef USE_MEMCPY
 	int i;
@@ -672,6 +672,12 @@ static int doc_read_oob(struct mtd_info *mtd, loff_t ofs, size_t len,
 	struct DiskOnChip *this = mtd->priv;
 	void __iomem *docptr = this->virtadr;
 	struct Nand *mychip = &this->chips[ofs >> this->chipshift];
+	uint8_t *buf = ops->oobbuf;
+	size_t len = ops->len;
+
+	BUG_ON(ops->mode != MTD_OOB_PLACE);
+
+	ofs += ops->ooboffs;
 
 	/* Find the chip which is to be used and select it */
 	if (this->curfloor != mychip->floor) {
@@ -708,13 +714,13 @@ static int doc_read_oob(struct mtd_info *mtd, loff_t ofs, size_t len,
 #endif
 	buf[len - 1] = ReadDOC(docptr, LastDataRead);
 
-	*retlen = len;
+	ops->retlen = len;
 
 	return 0;
 }
 
-static int doc_write_oob(struct mtd_info *mtd, loff_t ofs, size_t len,
-			 size_t *retlen, const u_char *buf)
+static int doc_write_oob(struct mtd_info *mtd, loff_t ofs,
+			 struct mtd_oob_ops *ops)
 {
 #ifndef USE_MEMCPY
 	int i;
@@ -724,6 +730,12 @@ static int doc_write_oob(struct mtd_info *mtd, loff_t ofs, size_t len,
 	struct DiskOnChip *this = mtd->priv;
 	void __iomem *docptr = this->virtadr;
 	struct Nand *mychip = &this->chips[ofs >> this->chipshift];
+	uint8_t *buf = ops->oobbuf;
+	size_t len = ops->len;
+
+	BUG_ON(ops->mode != MTD_OOB_PLACE);
+
+	ofs += ops->ooboffs;
 
 	/* Find the chip which is to be used and select it */
 	if (this->curfloor != mychip->floor) {
@@ -775,12 +787,12 @@ static int doc_write_oob(struct mtd_info *mtd, loff_t ofs, size_t len,
 	if (ReadDOC(docptr, Mil_CDSN_IO) & 1) {
 		printk("Error programming oob data\n");
 		/* FIXME: implement Bad Block Replacement (in nftl.c ??) */
-		*retlen = 0;
+		ops->retlen = 0;
 		ret = -EIO;
 	}
 	dummy = ReadDOC(docptr, LastDataRead);
 
-	*retlen = len;
+	ops->retlen = len;
 
 	return ret;
 }
diff --git a/drivers/mtd/devices/doc2001plus.c b/drivers/mtd/devices/doc2001plus.c
index 1ee0c0dcb53b..66cb1e50469a 100644
--- a/drivers/mtd/devices/doc2001plus.c
+++ b/drivers/mtd/devices/doc2001plus.c
@@ -47,10 +47,10 @@ static int doc_read_ecc(struct mtd_info *mtd, loff_t from, size_t len,
 static int doc_write_ecc(struct mtd_info *mtd, loff_t to, size_t len,
 		size_t *retlen, const u_char *buf, u_char *eccbuf,
 		struct nand_oobinfo *oobsel);
-static int doc_read_oob(struct mtd_info *mtd, loff_t ofs, size_t len,
-		size_t *retlen, u_char *buf);
-static int doc_write_oob(struct mtd_info *mtd, loff_t ofs, size_t len,
-		size_t *retlen, const u_char *buf);
+static int doc_read_oob(struct mtd_info *mtd, loff_t ofs,
+			struct mtd_oob_ops *ops);
+static int doc_write_oob(struct mtd_info *mtd, loff_t ofs,
+			 struct mtd_oob_ops *ops);
 static int doc_erase (struct mtd_info *mtd, struct erase_info *instr);
 
 static struct mtd_info *docmilpluslist = NULL;
@@ -868,14 +868,20 @@ static int doc_write_ecc(struct mtd_info *mtd, loff_t to, size_t len,
 	return ret;
 }
 
-static int doc_read_oob(struct mtd_info *mtd, loff_t ofs, size_t len,
-			size_t *retlen, u_char *buf)
+static int doc_read_oob(struct mtd_info *mtd, loff_t ofs,
+			struct mtd_oob_ops *ops)
 {
 	loff_t fofs, base;
 	struct DiskOnChip *this = mtd->priv;
 	void __iomem * docptr = this->virtadr;
 	struct Nand *mychip = &this->chips[ofs >> this->chipshift];
 	size_t i, size, got, want;
+	uint8_t *buf = ops->oobbuf;
+	size_t len = ops->len;
+
+	BUG_ON(ops->mode != MTD_OOB_PLACE);
+
+	ofs += ops->ooboffs;
 
 	DoC_CheckASIC(docptr);
 
@@ -941,12 +947,12 @@ static int doc_read_oob(struct mtd_info *mtd, loff_t ofs, size_t len,
 	/* Disable flash internally */
 	WriteDOC(0, docptr, Mplus_FlashSelect);
 
-	*retlen = len;
+	ops->retlen = len;
 	return 0;
 }
 
-static int doc_write_oob(struct mtd_info *mtd, loff_t ofs, size_t len,
-			 size_t *retlen, const u_char *buf)
+static int doc_write_oob(struct mtd_info *mtd, loff_t ofs,
+			 struct mtd_oob_ops *ops)
 {
 	volatile char dummy;
 	loff_t fofs, base;
@@ -955,6 +961,12 @@ static int doc_write_oob(struct mtd_info *mtd, loff_t ofs, size_t len,
 	struct Nand *mychip = &this->chips[ofs >> this->chipshift];
 	size_t i, size, got, want;
 	int ret = 0;
+	uint8_t *buf = ops->oobbuf;
+	size_t len = ops->len;
+
+	BUG_ON(ops->mode != MTD_OOB_PLACE);
+
+	ofs += ops->ooboffs;
 
 	DoC_CheckASIC(docptr);
 
@@ -1030,7 +1042,7 @@ static int doc_write_oob(struct mtd_info *mtd, loff_t ofs, size_t len,
 			printk("MTD: Error 0x%x programming oob at 0x%x\n",
 				dummy, (int)ofs);
 			/* FIXME: implement Bad Block Replacement */
-			*retlen = 0;
+			ops->retlen = 0;
 			ret = -EIO;
 		}
 		dummy = ReadDOC(docptr, Mplus_LastDataRead);
@@ -1043,7 +1055,7 @@ static int doc_write_oob(struct mtd_info *mtd, loff_t ofs, size_t len,
 	/* Disable flash internally */
 	WriteDOC(0, docptr, Mplus_FlashSelect);
 
-	*retlen = len;
+	ops->retlen = len;
 	return ret;
 }
 
diff --git a/drivers/mtd/inftlcore.c b/drivers/mtd/inftlcore.c
index 3396f0e1ac5f..efb1a95aa0a0 100644
--- a/drivers/mtd/inftlcore.c
+++ b/drivers/mtd/inftlcore.c
@@ -150,6 +150,69 @@ static void inftl_remove_dev(struct mtd_blktrans_dev *dev)
  * Actual INFTL access routines.
  */
 
+/*
+ * Read oob data from flash
+ */
+int inftl_read_oob(struct mtd_info *mtd, loff_t offs, size_t len,
+		   size_t *retlen, uint8_t *buf)
+{
+	struct mtd_oob_ops ops;
+	int res;
+
+	ops.mode = MTD_OOB_PLACE;
+	ops.ooboffs = offs & (mtd->writesize - 1);
+	ops.ooblen = len;
+	ops.oobbuf = buf;
+	ops.datbuf = NULL;
+	ops.len = len;
+
+	res = mtd->read_oob(mtd, offs & ~(mtd->writesize - 1), &ops);
+	*retlen = ops.retlen;
+	return res;
+}
+
+/*
+ * Write oob data to flash
+ */
+int inftl_write_oob(struct mtd_info *mtd, loff_t offs, size_t len,
+		    size_t *retlen, uint8_t *buf)
+{
+	struct mtd_oob_ops ops;
+	int res;
+
+	ops.mode = MTD_OOB_PLACE;
+	ops.ooboffs = offs & (mtd->writesize - 1);
+	ops.ooblen = len;
+	ops.oobbuf = buf;
+	ops.datbuf = NULL;
+	ops.len = len;
+
+	res = mtd->write_oob(mtd, offs & ~(mtd->writesize - 1), &ops);
+	*retlen = ops.retlen;
+	return res;
+}
+
+/*
+ * Write data and oob to flash
+ */
+static int inftl_write(struct mtd_info *mtd, loff_t offs, size_t len,
+		       size_t *retlen, uint8_t *buf, uint8_t *oob)
+{
+	struct mtd_oob_ops ops;
+	int res;
+
+	ops.mode = MTD_OOB_PLACE;
+	ops.ooboffs = offs;
+	ops.ooblen = mtd->oobsize;
+	ops.oobbuf = oob;
+	ops.datbuf = buf;
+	ops.len = len;
+
+	res = mtd->write_oob(mtd, offs & ~(mtd->writesize - 1), &ops);
+	*retlen = ops.retlen;
+	return res;
+}
+
 /*
  * INFTL_findfreeblock: Find a free Erase Unit on the INFTL partition.
  *	This function is used when the give Virtual Unit Chain.
@@ -227,9 +290,9 @@ static u16 INFTL_foldchain(struct INFTLrecord *inftl, unsigned thisVUC, unsigned
 			if ((BlockMap[block] != 0xffff) || BlockDeleted[block])
 				continue;
 
-			if (mtd->read_oob(mtd, (thisEUN * inftl->EraseSize)
-					  + (block * SECTORSIZE), 16 , &retlen,
-					  (char *)&oob) < 0)
+			if (inftl_read_oob(mtd, (thisEUN * inftl->EraseSize)
+					   + (block * SECTORSIZE), 16, &retlen,
+					   (char *)&oob) < 0)
 				status = SECTOR_IGNORE;
 			else
 				status = oob.b.Status | oob.b.Status1;
@@ -304,9 +367,9 @@ static u16 INFTL_foldchain(struct INFTLrecord *inftl, unsigned thisVUC, unsigned
 		memset(&oob, 0xff, sizeof(struct inftl_oob));
 		oob.b.Status = oob.b.Status1 = SECTOR_USED;
 
-		nand_write_raw(inftl->mbd.mtd, (inftl->EraseSize * targetEUN) +
-			       (block * SECTORSIZE), SECTORSIZE, &retlen,
-			       movebuf, (char *)&oob);
+		inftl_write(inftl->mbd.mtd, (inftl->EraseSize * targetEUN) +
+			    (block * SECTORSIZE), SECTORSIZE, &retlen,
+			    movebuf, (char *)&oob);
 	}
 
 	/*
@@ -437,8 +500,8 @@ static inline u16 INFTL_findwriteunit(struct INFTLrecord *inftl, unsigned block)
 		silly = MAX_LOOPS;
 
 		while (thisEUN <= inftl->lastEUN) {
-			mtd->read_oob(mtd, (thisEUN * inftl->EraseSize) +
-				      blockofs, 8, &retlen, (char *)&bci);
+			inftl_read_oob(mtd, (thisEUN * inftl->EraseSize) +
+				       blockofs, 8, &retlen, (char *)&bci);
 
 			status = bci.Status | bci.Status1;
 			DEBUG(MTD_DEBUG_LEVEL3, "INFTL: status of block %d in "
@@ -525,8 +588,8 @@ hitused:
 		nacs = 0;
 		thisEUN = inftl->VUtable[thisVUC];
 		if (thisEUN != BLOCK_NIL) {
-			mtd->read_oob(mtd, thisEUN * inftl->EraseSize
-				      + 8, 8, &retlen, (char *)&oob.u);
+			inftl_read_oob(mtd, thisEUN * inftl->EraseSize
+				       + 8, 8, &retlen, (char *)&oob.u);
 			anac = oob.u.a.ANAC + 1;
 			nacs = oob.u.a.NACs + 1;
 		}
@@ -547,8 +610,8 @@ hitused:
 		oob.u.a.parityPerField = parity;
 		oob.u.a.discarded = 0xaa;
 
-		mtd->write_oob(mtd, writeEUN * inftl->EraseSize + 8, 8,
-			       &retlen, (char *)&oob.u);
+		inftl_write_oob(mtd, writeEUN * inftl->EraseSize + 8, 8,
+				&retlen, (char *)&oob.u);
 
 		/* Also back up header... */
 		oob.u.b.virtualUnitNo = cpu_to_le16(thisVUC);
@@ -558,8 +621,8 @@ hitused:
 		oob.u.b.parityPerField = parity;
 		oob.u.b.discarded = 0xaa;
 
-		mtd->write_oob(mtd, writeEUN * inftl->EraseSize +
-			       SECTORSIZE * 4 + 8, 8, &retlen, (char *)&oob.u);
+		inftl_write_oob(mtd, writeEUN * inftl->EraseSize +
+				SECTORSIZE * 4 + 8, 8, &retlen, (char *)&oob.u);
 
 		inftl->PUtable[writeEUN] = inftl->VUtable[thisVUC];
 		inftl->VUtable[thisVUC] = writeEUN;
@@ -610,8 +673,8 @@ static void INFTL_trydeletechain(struct INFTLrecord *inftl, unsigned thisVUC)
 			if (BlockUsed[block] || BlockDeleted[block])
 				continue;
 
-			if (mtd->read_oob(mtd, (thisEUN * inftl->EraseSize)
-					  + (block * SECTORSIZE), 8 , &retlen,
+			if (inftl_read_oob(mtd, (thisEUN * inftl->EraseSize)
+					   + (block * SECTORSIZE), 8 , &retlen,
 					  (char *)&bci) < 0)
 				status = SECTOR_IGNORE;
 			else
@@ -711,8 +774,8 @@ static int INFTL_deleteblock(struct INFTLrecord *inftl, unsigned block)
 		"block=%d)\n", inftl, block);
 
 	while (thisEUN < inftl->nb_blocks) {
-		if (mtd->read_oob(mtd, (thisEUN * inftl->EraseSize) +
-				  blockofs, 8, &retlen, (char *)&bci) < 0)
+		if (inftl_read_oob(mtd, (thisEUN * inftl->EraseSize) +
+				   blockofs, 8, &retlen, (char *)&bci) < 0)
 			status = SECTOR_IGNORE;
 		else
 			status = bci.Status | bci.Status1;
@@ -746,10 +809,10 @@ foundit:
 	if (thisEUN != BLOCK_NIL) {
 		loff_t ptr = (thisEUN * inftl->EraseSize) + blockofs;
 
-		if (mtd->read_oob(mtd, ptr, 8, &retlen, (char *)&bci) < 0)
+		if (inftl_read_oob(mtd, ptr, 8, &retlen, (char *)&bci) < 0)
 			return -EIO;
 		bci.Status = bci.Status1 = SECTOR_DELETED;
-		if (mtd->write_oob(mtd, ptr, 8, &retlen, (char *)&bci) < 0)
+		if (inftl_write_oob(mtd, ptr, 8, &retlen, (char *)&bci) < 0)
 			return -EIO;
 		INFTL_trydeletechain(inftl, block / (inftl->EraseSize / SECTORSIZE));
 	}
@@ -790,9 +853,9 @@ static int inftl_writeblock(struct mtd_blktrans_dev *mbd, unsigned long block,
 		memset(&oob, 0xff, sizeof(struct inftl_oob));
 		oob.b.Status = oob.b.Status1 = SECTOR_USED;
 
-		nand_write_raw(inftl->mbd.mtd, (writeEUN * inftl->EraseSize) +
-			       blockofs, SECTORSIZE, &retlen, (char *)buffer,
-			       (char *)&oob);
+		inftl_write(inftl->mbd.mtd, (writeEUN * inftl->EraseSize) +
+			    blockofs, SECTORSIZE, &retlen, (char *)buffer,
+			    (char *)&oob);
 		/*
 		 * need to write SECTOR_USED flags since they are not written
 		 * in mtd_writeecc
@@ -820,7 +883,7 @@ static int inftl_readblock(struct mtd_blktrans_dev *mbd, unsigned long block,
 		"buffer=%p)\n", inftl, block, buffer);
 
 	while (thisEUN < inftl->nb_blocks) {
-		if (mtd->read_oob(mtd, (thisEUN * inftl->EraseSize) +
+		if (inftl_read_oob(mtd, (thisEUN * inftl->EraseSize) +
 				  blockofs, 8, &retlen, (char *)&bci) < 0)
 			status = SECTOR_IGNORE;
 		else
diff --git a/drivers/mtd/inftlmount.c b/drivers/mtd/inftlmount.c
index b4cda7d0a52d..8f6006f1a519 100644
--- a/drivers/mtd/inftlmount.c
+++ b/drivers/mtd/inftlmount.c
@@ -43,6 +43,11 @@
 
 char inftlmountrev[]="$Revision: 1.18 $";
 
+extern int inftl_read_oob(struct mtd_info *mtd, loff_t offs, size_t len,
+			  size_t *retlen, uint8_t *buf);
+extern int inftl_write_oob(struct mtd_info *mtd, loff_t offs, size_t len,
+			   size_t *retlen, uint8_t *buf);
+
 /*
  * find_boot_record: Find the INFTL Media Header and its Spare copy which
  *	contains the various device information of the INFTL partition and
@@ -107,9 +112,9 @@ static int find_boot_record(struct INFTLrecord *inftl)
 		}
 
 		/* To be safer with BIOS, also use erase mark as discriminant */
-		if ((ret = mtd->read_oob(mtd, block * inftl->EraseSize +
-					 SECTORSIZE + 8, 8, &retlen,
-					 (char *)&h1) < 0)) {
+		if ((ret = inftl_read_oob(mtd, block * inftl->EraseSize +
+					  SECTORSIZE + 8, 8, &retlen,
+					  (char *)&h1) < 0)) {
 			printk(KERN_WARNING "INFTL: ANAND header found at "
 				"0x%x in mtd%d, but OOB data read failed "
 				"(err %d)\n", block * inftl->EraseSize,
@@ -363,8 +368,8 @@ static int check_free_sectors(struct INFTLrecord *inftl, unsigned int address,
 			return -1;
 
 		if (check_oob) {
-			if(mtd->read_oob(mtd, address, mtd->oobsize,
-					 &retlen, &buf[SECTORSIZE]) < 0)
+			if(inftl_read_oob(mtd, address, mtd->oobsize,
+					  &retlen, &buf[SECTORSIZE]) < 0)
 				return -1;
 			if (memcmpb(buf + SECTORSIZE, 0xff, mtd->oobsize) != 0)
 				return -1;
@@ -433,7 +438,7 @@ int INFTL_formatblock(struct INFTLrecord *inftl, int block)
 	uci.Reserved[2] = 0;
 	uci.Reserved[3] = 0;
 	instr->addr = block * inftl->EraseSize + SECTORSIZE * 2;
-	if (mtd->write_oob(mtd, instr->addr + 8, 8, &retlen, (char *)&uci) < 0)
+	if (inftl_write_oob(mtd, instr->addr + 8, 8, &retlen, (char *)&uci) < 0)
 		goto fail;
 	return 0;
 fail:
@@ -611,11 +616,11 @@ int INFTL_mount(struct INFTLrecord *s)
 				break;
 			}
 
-			if (mtd->read_oob(mtd, block * s->EraseSize + 8,
-					  8, &retlen, (char *)&h0) < 0 ||
-			    mtd->read_oob(mtd, block * s->EraseSize +
-					  2 * SECTORSIZE + 8, 8, &retlen,
-					  (char *)&h1) < 0) {
+			if (inftl_read_oob(mtd, block * s->EraseSize + 8,
+					   8, &retlen, (char *)&h0) < 0 ||
+			    inftl_read_oob(mtd, block * s->EraseSize +
+					   2 * SECTORSIZE + 8, 8, &retlen,
+					   (char *)&h1) < 0) {
 				/* Should never happen? */
 				do_format_chain++;
 				break;
diff --git a/drivers/mtd/mtdchar.c b/drivers/mtd/mtdchar.c
index b45e7747daa3..7522fc3a2827 100644
--- a/drivers/mtd/mtdchar.c
+++ b/drivers/mtd/mtdchar.c
@@ -408,8 +408,7 @@ static int mtd_ioctl(struct inode *inode, struct file *file,
 	case MEMWRITEOOB:
 	{
 		struct mtd_oob_buf buf;
-		void *databuf;
-		ssize_t retlen;
+		struct mtd_oob_ops ops;
 
 		if(!(file->f_mode & 2))
 			return -EPERM;
@@ -417,7 +416,7 @@ static int mtd_ioctl(struct inode *inode, struct file *file,
 		if (copy_from_user(&buf, argp, sizeof(struct mtd_oob_buf)))
 			return -EFAULT;
 
-		if (buf.length > 0x4096)
+		if (buf.length > 4096)
 			return -EINVAL;
 
 		if (!mtd->write_oob)
@@ -429,21 +428,32 @@ static int mtd_ioctl(struct inode *inode, struct file *file,
 		if (ret)
 			return ret;
 
-		databuf = kmalloc(buf.length, GFP_KERNEL);
-		if (!databuf)
+		ops.len = buf.length;
+		ops.ooblen = mtd->oobsize;
+		ops.ooboffs = buf.start & (mtd->oobsize - 1);
+		ops.datbuf = NULL;
+		ops.mode = MTD_OOB_PLACE;
+
+		if (ops.ooboffs && ops.len > (ops.ooblen - ops.ooboffs))
+			return -EINVAL;
+
+		ops.oobbuf = kmalloc(buf.length, GFP_KERNEL);
+		if (!ops.oobbuf)
 			return -ENOMEM;
 
-		if (copy_from_user(databuf, buf.ptr, buf.length)) {
-			kfree(databuf);
+		if (copy_from_user(ops.oobbuf, buf.ptr, buf.length)) {
+			kfree(ops.oobbuf);
 			return -EFAULT;
 		}
 
-		ret = (mtd->write_oob)(mtd, buf.start, buf.length, &retlen, databuf);
+		buf.start &= ~(mtd->oobsize - 1);
+		ret = mtd->write_oob(mtd, buf.start, &ops);
 
-		if (copy_to_user(argp + sizeof(uint32_t), &retlen, sizeof(uint32_t)))
+		if (copy_to_user(argp + sizeof(uint32_t), &ops.retlen,
+				 sizeof(uint32_t)))
 			ret = -EFAULT;
 
-		kfree(databuf);
+		kfree(ops.oobbuf);
 		break;
 
 	}
@@ -451,13 +461,12 @@ static int mtd_ioctl(struct inode *inode, struct file *file,
 	case MEMREADOOB:
 	{
 		struct mtd_oob_buf buf;
-		void *databuf;
-		ssize_t retlen;
+		struct mtd_oob_ops ops;
 
 		if (copy_from_user(&buf, argp, sizeof(struct mtd_oob_buf)))
 			return -EFAULT;
 
-		if (buf.length > 0x4096)
+		if (buf.length > 4096)
 			return -EINVAL;
 
 		if (!mtd->read_oob)
@@ -465,22 +474,32 @@ static int mtd_ioctl(struct inode *inode, struct file *file,
 		else
 			ret = access_ok(VERIFY_WRITE, buf.ptr,
 					buf.length) ? 0 : -EFAULT;
-
 		if (ret)
 			return ret;
 
-		databuf = kmalloc(buf.length, GFP_KERNEL);
-		if (!databuf)
+		ops.len = buf.length;
+		ops.ooblen = mtd->oobsize;
+		ops.ooboffs = buf.start & (mtd->oobsize - 1);
+		ops.datbuf = NULL;
+		ops.mode = MTD_OOB_PLACE;
+
+		if (ops.ooboffs && ops.len > (ops.ooblen - ops.ooboffs))
+			return -EINVAL;
+
+		ops.oobbuf = kmalloc(buf.length, GFP_KERNEL);
+		if (!ops.oobbuf)
 			return -ENOMEM;
 
-		ret = (mtd->read_oob)(mtd, buf.start, buf.length, &retlen, databuf);
+		buf.start &= ~(mtd->oobsize - 1);
+		ret = mtd->read_oob(mtd, buf.start, &ops);
 
-		if (put_user(retlen, (uint32_t __user *)argp))
+		if (put_user(ops.retlen, (uint32_t __user *)argp))
 			ret = -EFAULT;
-		else if (retlen && copy_to_user(buf.ptr, databuf, retlen))
+		else if (ops.retlen && copy_to_user(buf.ptr, ops.oobbuf,
+						    ops.retlen))
 			ret = -EFAULT;
 
-		kfree(databuf);
+		kfree(ops.oobbuf);
 		break;
 	}
 
diff --git a/drivers/mtd/mtdconcat.c b/drivers/mtd/mtdconcat.c
index ec15abcdbdfa..38151b8e6631 100644
--- a/drivers/mtd/mtdconcat.c
+++ b/drivers/mtd/mtdconcat.c
@@ -231,101 +231,85 @@ concat_writev(struct mtd_info *mtd, const struct kvec *vecs,
 }
 
 static int
-concat_read_oob(struct mtd_info *mtd, loff_t from, size_t len,
-		size_t * retlen, u_char * buf)
+concat_read_oob(struct mtd_info *mtd, loff_t from, struct mtd_oob_ops *ops)
 {
 	struct mtd_concat *concat = CONCAT(mtd);
-	int err = -EINVAL;
-	int i;
+	struct mtd_oob_ops devops = *ops;
+	int i, err;
 
-	*retlen = 0;
+	ops->retlen = 0;
 
 	for (i = 0; i < concat->num_subdev; i++) {
 		struct mtd_info *subdev = concat->subdev[i];
-		size_t size, retsize;
 
 		if (from >= subdev->size) {
-			/* Not destined for this subdev */
-			size = 0;
 			from -= subdev->size;
 			continue;
 		}
-		if (from + len > subdev->size)
-			/* First part goes into this subdev */
-			size = subdev->size - from;
-		else
-			/* Entire transaction goes into this subdev */
-			size = len;
 
-		if (subdev->read_oob)
-			err = subdev->read_oob(subdev, from, size,
-					       &retsize, buf);
-		else
-			err = -EINVAL;
+		/* partial read ? */
+		if (from + devops.len > subdev->size)
+			devops.len = subdev->size - from;
 
+		err = subdev->read_oob(subdev, from, &devops);
+		ops->retlen += devops.retlen;
 		if (err)
-			break;
+			return err;
 
-		*retlen += retsize;
-		len -= size;
-		if (len == 0)
-			break;
+		devops.len = ops->len - ops->retlen;
+		if (!devops.len)
+			return 0;
+
+		if (devops.datbuf)
+			devops.datbuf += devops.retlen;
+		if (devops.oobbuf)
+			devops.oobbuf += devops.ooblen;
 
-		err = -EINVAL;
-		buf += size;
 		from = 0;
 	}
-	return err;
+	return -EINVAL;
 }
 
 static int
-concat_write_oob(struct mtd_info *mtd, loff_t to, size_t len,
-		 size_t * retlen, const u_char * buf)
+concat_write_oob(struct mtd_info *mtd, loff_t to, struct mtd_oob_ops *ops)
 {
 	struct mtd_concat *concat = CONCAT(mtd);
-	int err = -EINVAL;
-	int i;
+	struct mtd_oob_ops devops = *ops;
+	int i, err;
 
 	if (!(mtd->flags & MTD_WRITEABLE))
 		return -EROFS;
 
-	*retlen = 0;
+	ops->retlen = 0;
 
 	for (i = 0; i < concat->num_subdev; i++) {
 		struct mtd_info *subdev = concat->subdev[i];
-		size_t size, retsize;
 
 		if (to >= subdev->size) {
-			size = 0;
 			to -= subdev->size;
 			continue;
 		}
-		if (to + len > subdev->size)
-			size = subdev->size - to;
-		else
-			size = len;
 
-		if (!(subdev->flags & MTD_WRITEABLE))
-			err = -EROFS;
-		else if (subdev->write_oob)
-			err = subdev->write_oob(subdev, to, size, &retsize,
-						buf);
-		else
-			err = -EINVAL;
+		/* partial write ? */
+		if (to + devops.len > subdev->size)
+			devops.len = subdev->size - to;
 
+		err = subdev->write_oob(subdev, to, &devops);
+		ops->retlen += devops.retlen;
 		if (err)
-			break;
+			return err;
 
-		*retlen += retsize;
-		len -= size;
-		if (len == 0)
-			break;
+		devops.len = ops->len - ops->retlen;
+		if (!devops.len)
+			return 0;
 
-		err = -EINVAL;
-		buf += size;
+		if (devops.datbuf)
+			devops.datbuf += devops.retlen;
+		if (devops.oobbuf)
+			devops.oobbuf += devops.ooblen;
 		to = 0;
 	}
-	return err;
+	return -EINVAL;
 }
 
 static void concat_erase_callback(struct erase_info *instr)
diff --git a/drivers/mtd/mtdpart.c b/drivers/mtd/mtdpart.c
index 6d7639b98eab..f22aeccf01e7 100644
--- a/drivers/mtd/mtdpart.c
+++ b/drivers/mtd/mtdpart.c
@@ -78,16 +78,16 @@ static void part_unpoint (struct mtd_info *mtd, u_char *addr, loff_t from, size_
 	part->master->unpoint (part->master, addr, from + part->offset, len);
 }
 
-static int part_read_oob (struct mtd_info *mtd, loff_t from, size_t len,
-			size_t *retlen, u_char *buf)
+static int part_read_oob(struct mtd_info *mtd, loff_t from,
+			 struct mtd_oob_ops *ops)
 {
 	struct mtd_part *part = PART(mtd);
+
 	if (from >= mtd->size)
-		len = 0;
-	else if (from + len > mtd->size)
-		len = mtd->size - from;
-	return part->master->read_oob (part->master, from + part->offset,
-					len, retlen, buf);
+		return -EINVAL;
+	if (from + ops->len > mtd->size)
+		return -EINVAL;
+	return part->master->read_oob(part->master, from + part->offset, ops);
 }
 
 static int part_read_user_prot_reg (struct mtd_info *mtd, loff_t from, size_t len,
@@ -134,18 +134,19 @@ static int part_write (struct mtd_info *mtd, loff_t to, size_t len,
 				    len, retlen, buf);
 }
 
-static int part_write_oob (struct mtd_info *mtd, loff_t to, size_t len,
-			size_t *retlen, const u_char *buf)
+static int part_write_oob(struct mtd_info *mtd, loff_t to,
+			 struct mtd_oob_ops *ops)
 {
 	struct mtd_part *part = PART(mtd);
+
 	if (!(mtd->flags & MTD_WRITEABLE))
 		return -EROFS;
+
 	if (to >= mtd->size)
-		len = 0;
-	else if (to + len > mtd->size)
-		len = mtd->size - to;
-	return part->master->write_oob (part->master, to + part->offset,
-					len, retlen, buf);
+		return -EINVAL;
+	if (to + ops->len > mtd->size)
+		return -EINVAL;
+	return part->master->write_oob(part->master, to + part->offset, ops);
 }
 
 static int part_write_user_prot_reg (struct mtd_info *mtd, loff_t from, size_t len,
diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c
index e922b829c4be..b8e6e1579cf1 100644
--- a/drivers/mtd/nand/nand_base.c
+++ b/drivers/mtd/nand/nand_base.c
@@ -81,23 +81,12 @@ static struct nand_ecclayout nand_oob_64 = {
 		 .length = 38}}
 };
 
-/* This is used for padding purposes in nand_write_oob */
-static uint8_t ffchars[] = {
-	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-};
-
-static int nand_write_oob(struct mtd_info *mtd, loff_t to, size_t len,
-			  size_t *retlen, const uint8_t *buf);
 static int nand_get_device(struct nand_chip *chip, struct mtd_info *mtd,
 			   int new_state);
 
+static int nand_do_write_oob(struct mtd_info *mtd, loff_t to,
+			     struct mtd_oob_ops *ops);
+
 /*
  * For devices which display every fart in the system on a seperate LED. Is
  * compiled away when LED support is disabled.
@@ -358,7 +347,6 @@ static int nand_default_block_markbad(struct mtd_info *mtd, loff_t ofs)
 {
 	struct nand_chip *chip = mtd->priv;
 	uint8_t buf[2] = { 0, 0 };
-	size_t retlen;
 	int block;
 
 	/* Get block number */
@@ -371,8 +359,13 @@ static int nand_default_block_markbad(struct mtd_info *mtd, loff_t ofs)
 		return nand_update_bbt(mtd, ofs);
 
 	/* We write two bytes, so we dont have to mess with 16 bit access */
-	ofs += mtd->oobsize + (chip->badblockpos & ~0x01);
-	return nand_write_oob(mtd, ofs, 2, &retlen, buf);
+	ofs += mtd->oobsize;
+	chip->ops.len = 2;
+	chip->ops.datbuf = NULL;
+	chip->ops.oobbuf = buf;
+	chip->ops.ooboffs = chip->badblockpos & ~0x01;
+
+	return nand_do_write_oob(mtd, ofs, &chip->ops);
 }
 
 /**
@@ -739,6 +732,20 @@ static int nand_wait(struct mtd_info *mtd, struct nand_chip *chip, int state)
 	return status;
 }
 
+/**
+ * nand_read_page_raw - [Intern] read raw page data without ecc
+ * @mtd:	mtd info structure
+ * @chip:	nand chip info structure
+ * @buf:	buffer to store read data
+ */
+static int nand_read_page_raw(struct mtd_info *mtd, struct nand_chip *chip,
+			      uint8_t *buf)
+{
+	chip->read_buf(mtd, buf, mtd->writesize);
+	chip->read_buf(mtd, chip->oob_poi, mtd->oobsize);
+	return 0;
+}
+
 /**
  * nand_read_page_swecc - {REPLACABLE] software ecc based page read function
  * @mtd:	mtd info structure
@@ -756,11 +763,7 @@ static int nand_read_page_swecc(struct mtd_info *mtd, struct nand_chip *chip,
 	uint8_t *ecc_code = chip->buffers.ecccode;
 	int *eccpos = chip->ecc.layout->eccpos;
 
-	chip->read_buf(mtd, buf, mtd->writesize);
-	chip->read_buf(mtd, chip->oob_poi, mtd->oobsize);
-
-	if (chip->ecc.mode == NAND_ECC_NONE)
-		return 0;
+	nand_read_page_raw(mtd, chip, buf);
 
 	for (i = 0; eccsteps; eccsteps--, i += eccbytes, p += eccsize)
 		chip->ecc.calculate(mtd, p, &ecc_calc[i]);
@@ -882,18 +885,50 @@ static int nand_read_page_syndrome(struct mtd_info *mtd, struct nand_chip *chip,
 }
 
 /**
- * nand_do_read - [Internal] Read data with ECC
+ * nand_transfer_oob - [Internal] Transfer oob to client buffer
+ * @chip:	nand chip structure
+ * @ops:	oob ops structure
+ */
+static uint8_t *nand_transfer_oob(struct nand_chip *chip, uint8_t *oob,
+				  struct mtd_oob_ops *ops)
+{
+	size_t len = ops->ooblen;
+
+	switch(ops->mode) {
+
+	case MTD_OOB_PLACE:
+	case MTD_OOB_RAW:
+		memcpy(oob, chip->oob_poi + ops->ooboffs, len);
+		return oob + len;
+
+	case MTD_OOB_AUTO: {
+		struct nand_oobfree *free = chip->ecc.layout->oobfree;
+		size_t bytes;
+
+		for(; free->length && len; free++, len -= bytes) {
+			bytes = min(len, free->length);
+
+			memcpy(oob, chip->oob_poi + free->offset, bytes);
+			oob += bytes;
+		}
+		return oob;
+	}
+	default:
+		BUG();
+	}
+	return NULL;
+}
+
+/**
+ * nand_do_read_ops - [Internal] Read data with ECC
  *
  * @mtd:	MTD device structure
  * @from:	offset to read from
- * @len:	number of bytes to read
- * @retlen:	pointer to variable to store the number of read bytes
- * @buf:	the databuffer to put data
  *
  * Internal function. Called with chip held.
  */
-int nand_do_read(struct mtd_info *mtd, loff_t from, size_t len,
-		 size_t *retlen, uint8_t *buf)
+static int nand_do_read_ops(struct mtd_info *mtd, loff_t from,
+			    struct mtd_oob_ops *ops)
 {
 	int chipnr, page, realpage, col, bytes, aligned;
 	struct nand_chip *chip = mtd->priv;
@@ -901,8 +936,8 @@ int nand_do_read(struct mtd_info *mtd, loff_t from, size_t len,
 	int blkcheck = (1 << (chip->phys_erase_shift - chip->page_shift)) - 1;
 	int sndcmd = 1;
 	int ret = 0;
-	uint32_t readlen = len;
-	uint8_t *bufpoi;
+	uint32_t readlen = ops->len;
+	uint8_t *bufpoi, *oob, *buf;
 
 	stats = mtd->ecc_stats;
 
@@ -915,12 +950,15 @@ int nand_do_read(struct mtd_info *mtd, loff_t from, size_t len,
 	col = (int)(from & (mtd->writesize - 1));
 	chip->oob_poi = chip->buffers.oobrbuf;
 
+	buf = ops->datbuf;
+	oob = ops->oobbuf;
+
 	while(1) {
 		bytes = min(mtd->writesize - col, readlen);
 		aligned = (bytes == mtd->writesize);
 
 		/* Is the current page in the buffer ? */
-		if (realpage != chip->pagebuf) {
+		if (realpage != chip->pagebuf || oob) {
 			bufpoi = aligned ? buf : chip->buffers.databuf;
 
 			if (likely(sndcmd)) {
@@ -939,6 +977,16 @@ int nand_do_read(struct mtd_info *mtd, loff_t from, size_t len,
 				memcpy(buf, chip->buffers.databuf + col, bytes);
 			}
 
+			buf += bytes;
+
+			if (unlikely(oob)) {
+				/* Raw mode does data:oob:data:oob */
+				if (ops->mode != MTD_OOB_RAW)
+					oob = nand_transfer_oob(chip, oob, ops);
+				else
+					buf = nand_transfer_oob(chip, buf, ops);
+			}
+
 			if (!(chip->options & NAND_NO_READRDY)) {
 				/*
 				 * Apply delay or wait for ready/busy pin. Do
@@ -952,10 +1000,11 @@ int nand_do_read(struct mtd_info *mtd, loff_t from, size_t len,
 				else
 					nand_wait_ready(mtd);
 			}
-		} else
+		} else {
 			memcpy(buf, chip->buffers.databuf + col, bytes);
+			buf += bytes;
+		}
 
-		buf += bytes;
 		readlen -= bytes;
 
 		if (!readlen)
@@ -981,7 +1030,7 @@ int nand_do_read(struct mtd_info *mtd, loff_t from, size_t len,
 			sndcmd = 1;
 	}
 
-	*retlen = len - (size_t) readlen;
+	ops->retlen = ops->len - (size_t) readlen;
 
 	if (ret)
 		return ret;
@@ -1002,57 +1051,49 @@ int nand_do_read(struct mtd_info *mtd, loff_t from, size_t len,
 static int nand_read(struct mtd_info *mtd, loff_t from, size_t len,
 		     size_t *retlen, uint8_t *buf)
 {
+	struct nand_chip *chip = mtd->priv;
 	int ret;
 
-	*retlen = 0;
 	/* Do not allow reads past end of device */
 	if ((from + len) > mtd->size)
 		return -EINVAL;
 	if (!len)
 		return 0;
 
-	nand_get_device(mtd->priv, mtd, FL_READING);
+	nand_get_device(chip, mtd, FL_READING);
 
-	ret = nand_do_read(mtd, from, len, retlen, buf);
+	chip->ops.len = len;
+	chip->ops.datbuf = buf;
+	chip->ops.oobbuf = NULL;
+
+	ret = nand_do_read_ops(mtd, from, &chip->ops);
 
 	nand_release_device(mtd);
 
+	*retlen = chip->ops.retlen;
 	return ret;
 }
 
 /**
- * nand_read_oob - [MTD Interface] NAND read out-of-band
+ * nand_do_read_oob - [Intern] NAND read out-of-band
  * @mtd:	MTD device structure
  * @from:	offset to read from
- * @len:	number of bytes to read
- * @retlen:	pointer to variable to store the number of read bytes
- * @buf:	the databuffer to put data
+ * @ops:	oob operations description structure
  *
  * NAND read out-of-band data from the spare area
  */
-static int nand_read_oob(struct mtd_info *mtd, loff_t from, size_t len,
-			 size_t *retlen, uint8_t *buf)
+static int nand_do_read_oob(struct mtd_info *mtd, loff_t from,
+			    struct mtd_oob_ops *ops)
 {
 	int col, page, realpage, chipnr, sndcmd = 1;
 	struct nand_chip *chip = mtd->priv;
 	int blkcheck = (1 << (chip->phys_erase_shift - chip->page_shift)) - 1;
-	int readlen = len;
+	int direct, bytes, readlen = ops->len;
+	uint8_t *bufpoi, *buf = ops->oobbuf;
 
 	DEBUG(MTD_DEBUG_LEVEL3, "nand_read_oob: from = 0x%08x, len = %i\n",
 	      (unsigned int)from, (int)len);
 
-	/* Initialize return length value */
-	*retlen = 0;
-
-	/* Do not allow reads past end of device */
-	if ((from + len) > mtd->size) {
-		DEBUG(MTD_DEBUG_LEVEL0, "nand_read_oob: "
-		      "Attempt read beyond end of device\n");
-		return -EINVAL;
-	}
-
-	nand_get_device(chip, mtd, FL_READING);
-
 	chipnr = (int)(from >> chip->chip_shift);
 	chip->select_chip(mtd, chipnr);
 
@@ -1060,20 +1101,31 @@ static int nand_read_oob(struct mtd_info *mtd, loff_t from, size_t len,
 	realpage = (int)(from >> chip->page_shift);
 	page = realpage & chip->pagemask;
 
-	/* Mask to get column */
-	col = from & (mtd->oobsize - 1);
+	if (ops->mode != MTD_OOB_AUTO) {
+		col = ops->ooboffs;
+		direct = 1;
+	} else {
+		col = 0;
+		direct = 0;
+	}
 
 	while(1) {
-		int bytes = min((int)(mtd->oobsize - col), readlen);
+		bytes = direct ? ops->ooblen : mtd->oobsize;
+		bufpoi = direct ? buf : chip->buffers.oobrbuf;
 
 		if (likely(sndcmd)) {
 			chip->cmdfunc(mtd, NAND_CMD_READOOB, col, page);
 			sndcmd = 0;
 		}
 
-		chip->read_buf(mtd, buf, bytes);
+		chip->read_buf(mtd, bufpoi, bytes);
 
-		readlen -= bytes;
+		if (unlikely(!direct))
+			buf = nand_transfer_oob(chip, buf, ops);
+		else
+			buf += ops->ooblen;
+
+		readlen -= ops->ooblen;
 		if (!readlen)
 			break;
 
@@ -1090,10 +1142,6 @@ static int nand_read_oob(struct mtd_info *mtd, loff_t from, size_t len,
 				nand_wait_ready(mtd);
 		}
 
-		buf += bytes;
-		bytes = mtd->oobsize;
-		col = 0;
-
 		/* Increment page address */
 		realpage++;
 
@@ -1112,81 +1160,76 @@ static int nand_read_oob(struct mtd_info *mtd, loff_t from, size_t len,
 			sndcmd = 1;
 	}
 
-	/* Deselect and wake up anyone waiting on the device */
-	nand_release_device(mtd);
-
-	*retlen = len;
+	ops->retlen = ops->len;
 	return 0;
 }
 
 /**
- * nand_read_raw - [GENERIC] Read raw data including oob into buffer
+ * nand_read_oob - [MTD Interface] NAND read data and/or out-of-band
  * @mtd:	MTD device structure
- * @buf:	temporary buffer
  * @from:	offset to read from
- * @len:	number of bytes to read
- * @ooblen:	number of oob data bytes to read
+ * @ops:	oob operation description structure
  *
- * Read raw data including oob into buffer
+ * NAND read data and/or out-of-band data
  */
-int nand_read_raw(struct mtd_info *mtd, uint8_t *buf, loff_t from, size_t len,
-		  size_t ooblen)
+static int nand_read_oob(struct mtd_info *mtd, loff_t from,
+			 struct mtd_oob_ops *ops)
 {
+	int (*read_page)(struct mtd_info *mtd, struct nand_chip *chip,
+			 uint8_t *buf) = NULL;
 	struct nand_chip *chip = mtd->priv;
-	int page = (int)(from >> chip->page_shift);
-	int chipnr = (int)(from >> chip->chip_shift);
-	int sndcmd = 1;
-	int cnt = 0;
-	int pagesize = mtd->writesize + mtd->oobsize;
-	int blockcheck;
+	int ret = -ENOTSUPP;
+
+	ops->retlen = 0;
 
 	/* Do not allow reads past end of device */
-	if ((from + len) > mtd->size) {
-		DEBUG(MTD_DEBUG_LEVEL0, "nand_read_raw: "
+	if ((from + ops->len) > mtd->size) {
+		DEBUG(MTD_DEBUG_LEVEL0, "nand_read_oob: "
 		      "Attempt read beyond end of device\n");
 		return -EINVAL;
 	}
 
-	/* Grab the lock and see if the device is available */
 	nand_get_device(chip, mtd, FL_READING);
 
-	chip->select_chip(mtd, chipnr);
-
-	/* Add requested oob length */
-	len += ooblen;
-	blockcheck = (1 << (chip->phys_erase_shift - chip->page_shift)) - 1;
+	switch(ops->mode) {
+	case MTD_OOB_PLACE:
+	case MTD_OOB_AUTO:
+		break;
 
-	while (len) {
-		if (likely(sndcmd)) {
-			chip->cmdfunc(mtd, NAND_CMD_READ0, 0,
-				      page & chip->pagemask);
-			sndcmd = 0;
-		}
+	case MTD_OOB_RAW:
+		/* Replace the read_page algorithm temporary */
+		read_page = chip->ecc.read_page;
+		chip->ecc.read_page = nand_read_page_raw;
+		break;
 
-		chip->read_buf(mtd, &buf[cnt], pagesize);
+	default:
+		goto out;
+	}
 
-		len -= pagesize;
-		cnt += pagesize;
-		page++;
+	if (!ops->datbuf)
+		ret = nand_do_read_oob(mtd, from, ops);
+	else
+		ret = nand_do_read_ops(mtd, from, ops);
 
-		if (!(chip->options & NAND_NO_READRDY)) {
-			if (!chip->dev_ready)
-				udelay(chip->chip_delay);
-			else
-				nand_wait_ready(mtd);
-		}
+	if (unlikely(ops->mode == MTD_OOB_RAW))
+		chip->ecc.read_page = read_page;
+ out:
+	nand_release_device(mtd);
+	return ret;
+}
 
-		/*
-		 * Check, if the chip supports auto page increment or if we
-		 * cross a block boundary.
-		 */
-		if (!NAND_CANAUTOINCR(chip) || !(page & blockcheck))
-			sndcmd = 1;
-	}
 
-	/* Deselect and wake up anyone waiting on the device */
-	nand_release_device(mtd);
-	return 0;
+/**
+ * nand_write_page_raw - [Intern] raw page write function
+ * @mtd:	mtd info structure
+ * @chip:	nand chip info structure
+ * @buf:	data buffer
+ */
+static void nand_write_page_raw(struct mtd_info *mtd, struct nand_chip *chip,
+				const uint8_t *buf)
+{
+	chip->write_buf(mtd, buf, mtd->writesize);
+	chip->write_buf(mtd, chip->oob_poi, mtd->oobsize);
 }
 
 /**
@@ -1205,17 +1248,14 @@ static void nand_write_page_swecc(struct mtd_info *mtd, struct nand_chip *chip,
 	const uint8_t *p = buf;
 	int *eccpos = chip->ecc.layout->eccpos;
 
-	if (chip->ecc.mode != NAND_ECC_NONE) {
-		/* Software ecc calculation */
-		for (i = 0; eccsteps; eccsteps--, i += eccbytes, p += eccsize)
-			chip->ecc.calculate(mtd, p, &ecc_calc[i]);
+	/* Software ecc calculation */
+	for (i = 0; eccsteps; eccsteps--, i += eccbytes, p += eccsize)
+		chip->ecc.calculate(mtd, p, &ecc_calc[i]);
 
-		for (i = 0; i < chip->ecc.total; i++)
-			chip->oob_poi[eccpos[i]] = ecc_calc[i];
-	}
+	for (i = 0; i < chip->ecc.total; i++)
+		chip->oob_poi[eccpos[i]] = ecc_calc[i];
 
-	chip->write_buf(mtd, buf, mtd->writesize);
-	chip->write_buf(mtd, chip->oob_poi, mtd->oobsize);
+	nand_write_page_raw(mtd, chip, buf);
 }
 
 /**
@@ -1342,51 +1382,77 @@ static int nand_write_page(struct mtd_info *mtd, struct nand_chip *chip,
 	return 0;
 }
 
+/**
+ * nand_fill_oob - [Internal] Transfer client buffer to oob
+ * @chip:	nand chip structure
+ * @oob:	oob data buffer
+ * @ops:	oob ops structure
+ */
+static uint8_t *nand_fill_oob(struct nand_chip *chip, uint8_t *oob,
+				  struct mtd_oob_ops *ops)
+{
+	size_t len = ops->ooblen;
+
+	switch(ops->mode) {
+
+	case MTD_OOB_PLACE:
+	case MTD_OOB_RAW:
+		memcpy(chip->oob_poi + ops->ooboffs, oob, len);
+		return oob + len;
+
+	case MTD_OOB_AUTO: {
+		struct nand_oobfree *free = chip->ecc.layout->oobfree;
+		size_t bytes;
+
+		for(; free->length && len; free++, len -= bytes) {
+			bytes = min(len, free->length);
+			memcpy(chip->oob_poi + free->offset, oob, bytes);
+			oob += bytes;
+		}
+		return oob;
+	}
+	default:
+		BUG();
+	}
+	return NULL;
+}
+
 #define NOTALIGNED(x) (x & (mtd->writesize-1)) != 0
 
 /**
- * nand_write - [MTD Interface] NAND write with ECC
+ * nand_do_write_ops - [Internal] NAND write with ECC
  * @mtd:	MTD device structure
  * @to:		offset to write to
- * @len:	number of bytes to write
- * @retlen:	pointer to variable to store the number of written bytes
- * @buf:	the data to write
+ * @ops:	oob operations description structure
  *
  * NAND write with ECC
  */
-static int nand_write(struct mtd_info *mtd, loff_t to, size_t len,
-			  size_t *retlen, const uint8_t *buf)
+static int nand_do_write_ops(struct mtd_info *mtd, loff_t to,
+			     struct mtd_oob_ops *ops)
 {
 	int chipnr, realpage, page, blockmask;
 	struct nand_chip *chip = mtd->priv;
-	uint32_t writelen = len;
+	uint32_t writelen = ops->len;
+	uint8_t *oob = ops->oobbuf;
+	uint8_t *buf = ops->datbuf;
 	int bytes = mtd->writesize;
-	int ret = -EIO;
+	int ret;
 
-	*retlen = 0;
-
-	/* Do not allow write past end of device */
-	if ((to + len) > mtd->size) {
-		DEBUG(MTD_DEBUG_LEVEL0, "nand_write: "
-		      "Attempt to write past end of page\n");
-		return -EINVAL;
-	}
+	ops->retlen = 0;
 
 	/* reject writes, which are not page aligned */
-	if (NOTALIGNED(to) || NOTALIGNED(len)) {
+	if (NOTALIGNED(to) || NOTALIGNED(ops->len)) {
 		printk(KERN_NOTICE "nand_write: "
 		       "Attempt to write not page aligned data\n");
 		return -EINVAL;
 	}
 
-	if (!len)
+	if (!writelen)
 		return 0;
 
-	nand_get_device(chip, mtd, FL_WRITING);
-
 	/* Check, if it is write protected */
 	if (nand_check_wp(mtd))
-		goto out;
+		return -EIO;
 
 	chipnr = (int)(to >> chip->chip_shift);
 	chip->select_chip(mtd, chipnr);
@@ -1397,7 +1463,7 @@ static int nand_write(struct mtd_info *mtd, loff_t to, size_t len,
 
 	/* Invalidate the page cache, when we write to the cached page */
 	if (to <= (chip->pagebuf << chip->page_shift) &&
-	    (chip->pagebuf << chip->page_shift) < (to + len))
+	    (chip->pagebuf << chip->page_shift) < (to + ops->len))
 		chip->pagebuf = -1;
 
 	chip->oob_poi = chip->buffers.oobwbuf;
@@ -1405,6 +1471,9 @@ static int nand_write(struct mtd_info *mtd, loff_t to, size_t len,
 	while(1) {
 		int cached = writelen > bytes && page != blockmask;
 
+		if (unlikely(oob))
+			oob = nand_fill_oob(chip, oob, ops);
+
 		ret = nand_write_page(mtd, chip, buf, page, cached);
 		if (ret)
 			break;
@@ -1424,94 +1493,74 @@ static int nand_write(struct mtd_info *mtd, loff_t to, size_t len,
 			chip->select_chip(mtd, chipnr);
 		}
 	}
- out:
-	*retlen = len - writelen;
-	nand_release_device(mtd);
+
+	if (unlikely(oob))
+		memset(chip->oob_poi, 0xff, mtd->oobsize);
+
+	ops->retlen = ops->len - writelen;
 	return ret;
 }
 
 /**
- * nand_write_raw - [GENERIC] Write raw data including oob
+ * nand_write - [MTD Interface] NAND write with ECC
  * @mtd:	MTD device structure
- * @buf:	source buffer
  * @to:		offset to write to
  * @len:	number of bytes to write
- * @buf:	source buffer
- * @oob:	oob buffer
+ * @retlen:	pointer to variable to store the number of written bytes
+ * @buf:	the data to write
  *
- * Write raw data including oob
+ * NAND write with ECC
  */
-int nand_write_raw(struct mtd_info *mtd, loff_t to, size_t len, size_t *retlen,
-		   const uint8_t *buf, uint8_t *oob)
+static int nand_write(struct mtd_info *mtd, loff_t to, size_t len,
+			  size_t *retlen, const uint8_t *buf)
 {
 	struct nand_chip *chip = mtd->priv;
-	int page = (int)(to >> chip->page_shift);
-	int chipnr = (int)(to >> chip->chip_shift);
 	int ret;
 
-	*retlen = 0;
-
-	/* Do not allow writes past end of device */
-	if ((to + len) > mtd->size) {
-		DEBUG(MTD_DEBUG_LEVEL0, "nand_read_raw: Attempt write "
-		      "beyond end of device\n");
+	/* Do not allow reads past end of device */
+	if ((to + len) > mtd->size)
 		return -EINVAL;
-	}
+	if (!len)
+		return 0;
 
-	/* Grab the lock and see if the device is available */
-	nand_get_device(chip, mtd, FL_WRITING);
+	nand_get_device(chip, mtd, FL_READING);
 
-	chip->select_chip(mtd, chipnr);
-	chip->oob_poi = oob;
+	chip->ops.len = len;
+	chip->ops.datbuf = (uint8_t *)buf;
+	chip->ops.oobbuf = NULL;
 
-	while (len != *retlen) {
-		ret = nand_write_page(mtd, chip, buf, page, 0);
-		if (ret)
-			return ret;
-		page++;
-		*retlen += mtd->writesize;
-		buf += mtd->writesize;
-		chip->oob_poi += mtd->oobsize;
-	}
+	ret = nand_do_write_ops(mtd, to, &chip->ops);
 
-	/* Deselect and wake up anyone waiting on the device */
 	nand_release_device(mtd);
-	return 0;
+
+	*retlen = chip->ops.retlen;
+	return ret;
 }
-EXPORT_SYMBOL_GPL(nand_write_raw);
 
 /**
- * nand_write_oob - [MTD Interface] NAND write out-of-band
+ * nand_do_write_oob - [MTD Interface] NAND write out-of-band
  * @mtd:	MTD device structure
  * @to:		offset to write to
- * @len:	number of bytes to write
- * @retlen:	pointer to variable to store the number of written bytes
- * @buf:	the data to write
+ * @ops:	oob operation description structure
  *
  * NAND write out-of-band
  */
-static int nand_write_oob(struct mtd_info *mtd, loff_t to, size_t len,
-			  size_t *retlen, const uint8_t *buf)
+static int nand_do_write_oob(struct mtd_info *mtd, loff_t to,
+			     struct mtd_oob_ops *ops)
 {
-	int column, page, status, ret = -EIO, chipnr;
+	int chipnr, page, status;
 	struct nand_chip *chip = mtd->priv;
 
 	DEBUG(MTD_DEBUG_LEVEL3, "nand_write_oob: to = 0x%08x, len = %i\n",
-	      (unsigned int)to, (int)len);
-
-	/* Initialize return length value */
-	*retlen = 0;
+	      (unsigned int)to, (int)ops->len);
 
 	/* Do not allow write past end of page */
-	column = to & (mtd->oobsize - 1);
-	if ((column + len) > mtd->oobsize) {
+	if ((ops->ooboffs + ops->len) > mtd->oobsize) {
 		DEBUG(MTD_DEBUG_LEVEL0, "nand_write_oob: "
 		      "Attempt to write past end of page\n");
 		return -EINVAL;
 	}
 
-	nand_get_device(chip, mtd, FL_WRITING);
-
 	chipnr = (int)(to >> chip->chip_shift);
 	chip->select_chip(mtd, chipnr);
 
@@ -1528,26 +1577,27 @@ static int nand_write_oob(struct mtd_info *mtd, loff_t to, size_t len,
 
 	/* Check, if it is write protected */
 	if (nand_check_wp(mtd))
-		goto out;
+		return -EROFS;
 
 	/* Invalidate the page cache, if we write to the cached page */
 	if (page == chip->pagebuf)
 		chip->pagebuf = -1;
 
-	if (NAND_MUST_PAD(chip)) {
+	if (ops->mode == MTD_OOB_AUTO || NAND_MUST_PAD(chip)) {
+		chip->oob_poi = chip->buffers.oobwbuf;
+		memset(chip->oob_poi, 0xff, mtd->oobsize);
+		nand_fill_oob(chip, ops->oobbuf, ops);
 		chip->cmdfunc(mtd, NAND_CMD_SEQIN, mtd->writesize,
 			      page & chip->pagemask);
-		/* prepad 0xff for partial programming */
-		chip->write_buf(mtd, ffchars, column);
-		/* write data */
-		chip->write_buf(mtd, buf, len);
-		/* postpad 0xff for partial programming */
-		chip->write_buf(mtd, ffchars, mtd->oobsize - (len + column));
+		chip->write_buf(mtd, chip->oob_poi, mtd->oobsize);
+		memset(chip->oob_poi, 0xff, mtd->oobsize);
 	} else {
-		chip->cmdfunc(mtd, NAND_CMD_SEQIN, mtd->writesize + column,
+		chip->cmdfunc(mtd, NAND_CMD_SEQIN,
+			      mtd->writesize + ops->ooboffs,
 			      page & chip->pagemask);
-		chip->write_buf(mtd, buf, len);
+		chip->write_buf(mtd, ops->oobbuf, ops->len);
 	}
+
 	/* Send command to program the OOB data */
 	chip->cmdfunc(mtd, NAND_CMD_PAGEPROG, -1, -1);
 
@@ -1557,27 +1607,75 @@ static int nand_write_oob(struct mtd_info *mtd, loff_t to, size_t len,
 	if (status & NAND_STATUS_FAIL) {
 		DEBUG(MTD_DEBUG_LEVEL0, "nand_write_oob: "
 		      "Failed write, page 0x%08x\n", page);
-		ret = -EIO;
-		goto out;
+		return -EIO;
 	}
-	*retlen = len;
+	ops->retlen = ops->len;
 
 #ifdef CONFIG_MTD_NAND_VERIFY_WRITE
-	/* Send command to read back the data */
-	chip->cmdfunc(mtd, NAND_CMD_READOOB, column, page & chip->pagemask);
+	if (ops->mode != MTD_OOB_AUTO) {
+		/* Send command to read back the data */
+		chip->cmdfunc(mtd, NAND_CMD_READOOB, ops->ooboffs,
+			      page & chip->pagemask);
 
-	if (chip->verify_buf(mtd, buf, len)) {
-		DEBUG(MTD_DEBUG_LEVEL0, "nand_write_oob: "
-		      "Failed write verify, page 0x%08x\n", page);
-		ret = -EIO;
-		goto out;
+		if (chip->verify_buf(mtd, ops->oobbuf, ops->len)) {
+			DEBUG(MTD_DEBUG_LEVEL0, "nand_write_oob: "
+			      "Failed write verify, page 0x%08x\n", page);
+			return -EIO;
+		}
 	}
 #endif
-	ret = 0;
+		return 0;
+}
+
+/**
+ * nand_write_oob - [MTD Interface] NAND write data and/or out-of-band
+ * @mtd:	MTD device structure
+ * @from:	offset to read from
+ * @ops:	oob operation description structure
+ */
+static int nand_write_oob(struct mtd_info *mtd, loff_t to,
+			  struct mtd_oob_ops *ops)
+{
+	void (*write_page)(struct mtd_info *mtd, struct nand_chip *chip,
+			  const uint8_t *buf) = NULL;
+	struct nand_chip *chip = mtd->priv;
+	int ret = -ENOTSUPP;
+
+	ops->retlen = 0;
+
+	/* Do not allow writes past end of device */
+	if ((to + ops->len) > mtd->size) {
+		DEBUG(MTD_DEBUG_LEVEL0, "nand_read_oob: "
+		      "Attempt read beyond end of device\n");
+		return -EINVAL;
+	}
+
+	nand_get_device(chip, mtd, FL_READING);
+
+	switch(ops->mode) {
+	case MTD_OOB_PLACE:
+	case MTD_OOB_AUTO:
+		break;
+
+	case MTD_OOB_RAW:
+		/* Replace the write_page algorithm temporary */
+		write_page = chip->ecc.write_page;
+		chip->ecc.write_page = nand_write_page_raw;
+		break;
+
+	default:
+		goto out;
+	}
+
+	if (!ops->datbuf)
+		ret = nand_do_write_oob(mtd, to, ops);
+	else
+		ret = nand_do_write_ops(mtd, to, ops);
+
+	if (unlikely(ops->mode == MTD_OOB_RAW))
+		chip->ecc.write_page = write_page;
  out:
-	/* Deselect and wake up anyone waiting on the device */
 	nand_release_device(mtd);
-
 	return ret;
 }
 
@@ -2191,8 +2289,8 @@ int nand_scan(struct mtd_info *mtd, int maxchips)
 	case NAND_ECC_NONE:
 		printk(KERN_WARNING "NAND_ECC_NONE selected by board driver. "
 		       "This is not recommended !!\n");
-		chip->ecc.read_page = nand_read_page_swecc;
-		chip->ecc.write_page = nand_write_page_swecc;
+		chip->ecc.read_page = nand_read_page_raw;
+		chip->ecc.write_page = nand_write_page_raw;
 		chip->ecc.size = mtd->writesize;
 		chip->ecc.bytes = 0;
 		break;
diff --git a/drivers/mtd/nand/nand_bbt.c b/drivers/mtd/nand/nand_bbt.c
index 40f99304df76..480c3cbf9bf9 100644
--- a/drivers/mtd/nand/nand_bbt.c
+++ b/drivers/mtd/nand/nand_bbt.c
@@ -230,6 +230,42 @@ static int read_abs_bbt(struct mtd_info *mtd, uint8_t *buf, struct nand_bbt_desc
 	return 0;
 }
 
+/*
+ * Scan read raw data from flash
+ */
+static int scan_read_raw(struct mtd_info *mtd, uint8_t *buf, loff_t offs,
+			 size_t len)
+{
+	struct mtd_oob_ops ops;
+
+	ops.mode = MTD_OOB_RAW;
+	ops.ooboffs = 0;
+	ops.ooblen = mtd->oobsize;
+	ops.oobbuf = buf;
+	ops.datbuf = buf;
+	ops.len = len;
+
+	return mtd->read_oob(mtd, offs, &ops);
+}
+
+/*
+ * Scan write data with oob to flash
+ */
+static int scan_write_bbt(struct mtd_info *mtd, loff_t offs, size_t len,
+			  uint8_t *buf, uint8_t *oob)
+{
+	struct mtd_oob_ops ops;
+
+	ops.mode = MTD_OOB_PLACE;
+	ops.ooboffs = 0;
+	ops.ooblen = mtd->oobsize;
+	ops.datbuf = buf;
+	ops.oobbuf = oob;
+	ops.len = len;
+
+	return mtd->write_oob(mtd, offs, &ops);
+}
+
 /**
  * read_abs_bbts - [GENERIC] Read the bad block table(s) for all chips starting at a given page
  * @mtd:	MTD device structure
@@ -241,27 +277,85 @@ static int read_abs_bbt(struct mtd_info *mtd, uint8_t *buf, struct nand_bbt_desc
  * We assume that the bbt bits are in consecutive order.
  *
 */
-static int read_abs_bbts(struct mtd_info *mtd, uint8_t *buf, struct nand_bbt_descr *td, struct nand_bbt_descr *md)
+static int read_abs_bbts(struct mtd_info *mtd, uint8_t *buf,
+			 struct nand_bbt_descr *td, struct nand_bbt_descr *md)
 {
 	struct nand_chip *this = mtd->priv;
 
 	/* Read the primary version, if available */
 	if (td->options & NAND_BBT_VERSION) {
-		nand_read_raw(mtd, buf, td->pages[0] << this->page_shift, mtd->writesize, mtd->oobsize);
+		scan_read_raw(mtd, buf, td->pages[0] << this->page_shift,
+			      mtd->writesize);
 		td->version[0] = buf[mtd->writesize + td->veroffs];
-		printk(KERN_DEBUG "Bad block table at page %d, version 0x%02X\n", td->pages[0], td->version[0]);
+		printk(KERN_DEBUG "Bad block table at page %d, version 0x%02X\n",
+		       td->pages[0], td->version[0]);
 	}
 
 	/* Read the mirror version, if available */
 	if (md && (md->options & NAND_BBT_VERSION)) {
-		nand_read_raw(mtd, buf, md->pages[0] << this->page_shift, mtd->writesize, mtd->oobsize);
+		scan_read_raw(mtd, buf, md->pages[0] << this->page_shift,
+			      mtd->writesize);
 		md->version[0] = buf[mtd->writesize + md->veroffs];
-		printk(KERN_DEBUG "Bad block table at page %d, version 0x%02X\n", md->pages[0], md->version[0]);
+		printk(KERN_DEBUG "Bad block table at page %d, version 0x%02X\n",
+		       md->pages[0], md->version[0]);
 	}
-
 	return 1;
 }
 
+/*
+ * Scan a given block full
+ */
+static int scan_block_full(struct mtd_info *mtd, struct nand_bbt_descr *bd,
+			   loff_t offs, uint8_t *buf, size_t readlen,
+			   int scanlen, int len)
+{
+	int ret, j;
+
+	ret = scan_read_raw(mtd, buf, offs, readlen);
+	if (ret)
+		return ret;
+
+	for (j = 0; j < len; j++, buf += scanlen) {
+		if (check_pattern(buf, scanlen, mtd->writesize, bd))
+			return 1;
+	}
+	return 0;
+}
+
+/*
+ * Scan a given block partially
+ */
+static int scan_block_fast(struct mtd_info *mtd, struct nand_bbt_descr *bd,
+			   loff_t offs, uint8_t *buf, int len)
+{
+	struct mtd_oob_ops ops;
+	int j, ret;
+
+	ops.len = mtd->oobsize;
+	ops.ooblen = mtd->oobsize;
+	ops.oobbuf = buf;
+	ops.ooboffs = 0;
+	ops.datbuf = NULL;
+	ops.mode = MTD_OOB_PLACE;
+
+	for (j = 0; j < len; j++) {
+		/*
+		 * Read the full oob until read_oob is fixed to
+		 * handle single byte reads for 16 bit
+		 * buswidth
+		 */
+		ret = mtd->read_oob(mtd, offs, &ops);
+		if (ret)
+			return ret;
+
+		if (check_short_pattern(buf, bd))
+			return 1;
+
+		offs += mtd->writesize;
+	}
+	return 0;
+}
+
 /**
  * create_bbt - [GENERIC] Create a bad block table by scanning the device
  * @mtd:	MTD device structure
@@ -273,13 +367,14 @@ static int read_abs_bbts(struct mtd_info *mtd, uint8_t *buf, struct nand_bbt_des
  * Create a bad block table by scanning the device
  * for the given good/bad block identify pattern
  */
-static int create_bbt(struct mtd_info *mtd, uint8_t *buf, struct nand_bbt_descr *bd, int chip)
+static int create_bbt(struct mtd_info *mtd, uint8_t *buf,
+	struct nand_bbt_descr *bd, int chip)
 {
 	struct nand_chip *this = mtd->priv;
-	int i, j, numblocks, len, scanlen;
+	int i, numblocks, len, scanlen;
 	int startblock;
 	loff_t from;
-	size_t readlen, ooblen;
+	size_t readlen;
 
 	printk(KERN_INFO "Scanning device for bad blocks\n");
 
@@ -294,18 +389,17 @@ static int create_bbt(struct mtd_info *mtd, uint8_t *buf, struct nand_bbt_descr
 
 	if (!(bd->options & NAND_BBT_SCANEMPTY)) {
 		/* We need only read few bytes from the OOB area */
-		scanlen = ooblen = 0;
+		scanlen = 0;
 		readlen = bd->len;
 	} else {
 		/* Full page content should be read */
 		scanlen = mtd->writesize + mtd->oobsize;
 		readlen = len * mtd->writesize;
-		ooblen = len * mtd->oobsize;
 	}
 
 	if (chip == -1) {
-		/* Note that numblocks is 2 * (real numblocks) here, see i+=2 below as it
-		 * makes shifting and masking less painful */
+		/* Note that numblocks is 2 * (real numblocks) here, see i+=2
+		 * below as it makes shifting and masking less painful */
 		numblocks = mtd->size >> (this->bbt_erase_shift - 1);
 		startblock = 0;
 		from = 0;
@@ -324,35 +418,21 @@ static int create_bbt(struct mtd_info *mtd, uint8_t *buf, struct nand_bbt_descr
 	for (i = startblock; i < numblocks;) {
 		int ret;
 
-		if (bd->options & NAND_BBT_SCANEMPTY)
-			if ((ret = nand_read_raw(mtd, buf, from, readlen, ooblen)))
-				return ret;
-
-		for (j = 0; j < len; j++) {
-			if (!(bd->options & NAND_BBT_SCANEMPTY)) {
-				size_t retlen;
-
-				/* Read the full oob until read_oob is fixed to
-				 * handle single byte reads for 16 bit buswidth */
-				ret = mtd->read_oob(mtd, from + j * mtd->writesize, mtd->oobsize, &retlen, buf);
-				if (ret)
-					return ret;
-
-				if (check_short_pattern(buf, bd)) {
-					this->bbt[i >> 3] |= 0x03 << (i & 0x6);
-					printk(KERN_WARNING "Bad eraseblock %d at 0x%08x\n",
-					       i >> 1, (unsigned int)from);
-					break;
-				}
-			} else {
-				if (check_pattern(&buf[j * scanlen], scanlen, mtd->writesize, bd)) {
-					this->bbt[i >> 3] |= 0x03 << (i & 0x6);
-					printk(KERN_WARNING "Bad eraseblock %d at 0x%08x\n",
-					       i >> 1, (unsigned int)from);
-					break;
-				}
-			}
+		if (bd->options & NAND_BBT_SCANALLPAGES)
+			ret = scan_block_full(mtd, bd, from, buf, readlen,
+					      scanlen, len);
+		else
+			ret = scan_block_fast(mtd, bd, from, buf, len);
+
+		if (ret < 0)
+			return ret;
+
+		if (ret) {
+			this->bbt[i >> 3] |= 0x03 << (i & 0x6);
+			printk(KERN_WARNING "Bad eraseblock %d at 0x%08x\n",
+			       i >> 1, (unsigned int)from);
 		}
+
 		i += 2;
 		from += (1 << this->bbt_erase_shift);
 	}
@@ -383,6 +463,7 @@ static int search_bbt(struct mtd_info *mtd, uint8_t *buf, struct nand_bbt_descr
 	int bits, startblock, block, dir;
 	int scanlen = mtd->writesize + mtd->oobsize;
 	int bbtblocks;
+	int blocktopage = this->bbt_erase_shift - this->page_shift;
 
 	/* Search direction top -> down ? */
 	if (td->options & NAND_BBT_LASTBLOCK) {
@@ -412,11 +493,14 @@ static int search_bbt(struct mtd_info *mtd, uint8_t *buf, struct nand_bbt_descr
 		td->pages[i] = -1;
 		/* Scan the maximum number of blocks */
 		for (block = 0; block < td->maxblocks; block++) {
+
 			int actblock = startblock + dir * block;
+			loff_t offs = actblock << this->bbt_erase_shift;
+
 			/* Read first page */
-			nand_read_raw(mtd, buf, actblock << this->bbt_erase_shift, mtd->writesize, mtd->oobsize);
+			scan_read_raw(mtd, buf, offs, mtd->writesize);
 			if (!check_pattern(buf, scanlen, mtd->writesize, td)) {
-				td->pages[i] = actblock << (this->bbt_erase_shift - this->page_shift);
+				td->pages[i] = actblock << blocktopage;
 				if (td->options & NAND_BBT_VERSION) {
 					td->version[i] = buf[mtd->writesize + td->veroffs];
 				}
@@ -481,8 +565,14 @@ static int write_bbt(struct mtd_info *mtd, uint8_t *buf,
 	int nrchips, bbtoffs, pageoffs, ooboffs;
 	uint8_t msk[4];
 	uint8_t rcode = td->reserved_block_code;
-	size_t retlen, len = 0, ooblen;
+	size_t retlen, len = 0;
 	loff_t to;
+	struct mtd_oob_ops ops;
+
+	ops.ooblen = mtd->oobsize;
+	ops.ooboffs = 0;
+	ops.datbuf = NULL;
+	ops.mode = MTD_OOB_PLACE;
 
 	if (!rcode)
 		rcode = 0xff;
@@ -583,10 +673,10 @@ static int write_bbt(struct mtd_info *mtd, uint8_t *buf,
 				       "bad block table\n");
 			}
 			/* Read oob data */
-			ooblen = (len >> this->page_shift) * mtd->oobsize;
-			res = mtd->read_oob(mtd, to + mtd->writesize, ooblen,
-					    &retlen, &buf[len]);
-			if (res < 0 || retlen != ooblen)
+			ops.len = (len >> this->page_shift) * mtd->oobsize;
+			ops.oobbuf = &buf[len];
+			res = mtd->read_oob(mtd, to + mtd->writesize, &ops);
+			if (res < 0 || ops.retlen != ops.len)
 				goto outerr;
 
 			/* Calc the byte offset in the buffer */
@@ -635,7 +725,7 @@ static int write_bbt(struct mtd_info *mtd, uint8_t *buf,
 		if (res < 0)
 			goto outerr;
 
-		res = nand_write_raw(mtd, to, len, &retlen, buf, &buf[len]);
+		res = scan_write_bbt(mtd, to, len, buf, &buf[len]);
 		if (res < 0)
 			goto outerr;
 
diff --git a/drivers/mtd/nftlcore.c b/drivers/mtd/nftlcore.c
index 359533b33d9b..f6ffe7949b26 100644
--- a/drivers/mtd/nftlcore.c
+++ b/drivers/mtd/nftlcore.c
@@ -134,6 +134,69 @@ static void nftl_remove_dev(struct mtd_blktrans_dev *dev)
 	kfree(nftl);
 }
 
+/*
+ * Read oob data from flash
+ */
+int nftl_read_oob(struct mtd_info *mtd, loff_t offs, size_t len,
+		  size_t *retlen, uint8_t *buf)
+{
+	struct mtd_oob_ops ops;
+	int res;
+
+	ops.mode = MTD_OOB_PLACE;
+	ops.ooboffs = offs & (mtd->writesize - 1);
+	ops.ooblen = len;
+	ops.oobbuf = buf;
+	ops.datbuf = NULL;
+	ops.len = len;
+
+	res = mtd->read_oob(mtd, offs & ~(mtd->writesize - 1), &ops);
+	*retlen = ops.retlen;
+	return res;
+}
+
+/*
+ * Write oob data to flash
+ */
+int nftl_write_oob(struct mtd_info *mtd, loff_t offs, size_t len,
+		   size_t *retlen, uint8_t *buf)
+{
+	struct mtd_oob_ops ops;
+	int res;
+
+	ops.mode = MTD_OOB_PLACE;
+	ops.ooboffs = offs & (mtd->writesize - 1);
+	ops.ooblen = len;
+	ops.oobbuf = buf;
+	ops.datbuf = NULL;
+	ops.len = len;
+
+	res = mtd->write_oob(mtd, offs & ~(mtd->writesize - 1), &ops);
+	*retlen = ops.retlen;
+	return res;
+}
+
+/*
+ * Write data and oob to flash
+ */
+static int nftl_write(struct mtd_info *mtd, loff_t offs, size_t len,
+		      size_t *retlen, uint8_t *buf, uint8_t *oob)
+{
+	struct mtd_oob_ops ops;
+	int res;
+
+	ops.mode = MTD_OOB_PLACE;
+	ops.ooboffs = offs;
+	ops.ooblen = mtd->oobsize;
+	ops.oobbuf = oob;
+	ops.datbuf = buf;
+	ops.len = len;
+
+	res = mtd->write_oob(mtd, offs & ~(mtd->writesize - 1), &ops);
+	*retlen = ops.retlen;
+	return res;
+}
+
 #ifdef CONFIG_NFTL_RW
 
 /* Actual NFTL access routines */
@@ -216,7 +279,7 @@ static u16 NFTL_foldchain (struct NFTLrecord *nftl, unsigned thisVUC, unsigned p
 
 		targetEUN = thisEUN;
 		for (block = 0; block < nftl->EraseSize / 512; block ++) {
-			mtd->read_oob(mtd, (thisEUN * nftl->EraseSize) +
+			nftl_read_oob(mtd, (thisEUN * nftl->EraseSize) +
 				      (block * 512), 16 , &retlen,
 				      (char *)&oob);
 			if (block == 2) {
@@ -333,7 +396,7 @@ static u16 NFTL_foldchain (struct NFTLrecord *nftl, unsigned thisVUC, unsigned p
                longer one */
 		oob.u.c.FoldMark = oob.u.c.FoldMark1 = cpu_to_le16(FOLD_MARK_IN_PROGRESS);
 		oob.u.c.unused = 0xffffffff;
-		mtd->write_oob(mtd, (nftl->EraseSize * targetEUN) + 2 * 512 + 8,
+		nftl_write_oob(mtd, (nftl->EraseSize * targetEUN) + 2 * 512 + 8,
 			       8, &retlen, (char *)&oob.u);
 	}
 
@@ -369,17 +432,15 @@ static u16 NFTL_foldchain (struct NFTLrecord *nftl, unsigned thisVUC, unsigned p
 		memset(&oob, 0xff, sizeof(struct nftl_oob));
 		oob.b.Status = oob.b.Status1 = SECTOR_USED;
 
-		nand_write_raw(nftl->mbd.mtd, (nftl->EraseSize * targetEUN) +
-			       (block * 512), 512, &retlen, movebuf,
-			       (char *)&oob);
-
+		nftl_write(nftl->mbd.mtd, (nftl->EraseSize * targetEUN) +
+			   (block * 512), 512, &retlen, movebuf, (char *)&oob);
 	}
 
 	/* add the header so that it is now a valid chain */
 	oob.u.a.VirtUnitNum = oob.u.a.SpareVirtUnitNum = cpu_to_le16(thisVUC);
 	oob.u.a.ReplUnitNum = oob.u.a.SpareReplUnitNum = 0xffff;
 
-	mtd->write_oob(mtd, (nftl->EraseSize * targetEUN) + 8,
+	nftl_write_oob(mtd, (nftl->EraseSize * targetEUN) + 8,
 		       8, &retlen, (char *)&oob.u);
 
 	/* OK. We've moved the whole lot into the new block. Now we have to free the original blocks. */
@@ -499,7 +560,7 @@ static inline u16 NFTL_findwriteunit(struct NFTLrecord *nftl, unsigned block)
 
 			lastEUN = writeEUN;
 
-			mtd->read_oob(mtd,
+			nftl_read_oob(mtd,
 				      (writeEUN * nftl->EraseSize) + blockofs,
 				      8, &retlen, (char *)&bci);
 
@@ -588,12 +649,12 @@ static inline u16 NFTL_findwriteunit(struct NFTLrecord *nftl, unsigned block)
 		nftl->ReplUnitTable[writeEUN] = BLOCK_NIL;
 
 		/* ... and on the flash itself */
-		mtd->read_oob(mtd, writeEUN * nftl->EraseSize + 8, 8,
+		nftl_read_oob(mtd, writeEUN * nftl->EraseSize + 8, 8,
 			      &retlen, (char *)&oob.u);
 
 		oob.u.a.VirtUnitNum = oob.u.a.SpareVirtUnitNum = cpu_to_le16(thisVUC);
 
-		mtd->write_oob(mtd, writeEUN * nftl->EraseSize + 8, 8,
+		nftl_write_oob(mtd, writeEUN * nftl->EraseSize + 8, 8,
 			       &retlen, (char *)&oob.u);
 
 		/* we link the new block to the chain only after the
@@ -603,13 +664,13 @@ static inline u16 NFTL_findwriteunit(struct NFTLrecord *nftl, unsigned block)
 			/* Both in our cache... */
 			nftl->ReplUnitTable[lastEUN] = writeEUN;
 			/* ... and on the flash itself */
-			mtd->read_oob(mtd, (lastEUN * nftl->EraseSize) + 8,
+			nftl_read_oob(mtd, (lastEUN * nftl->EraseSize) + 8,
 				      8, &retlen, (char *)&oob.u);
 
 			oob.u.a.ReplUnitNum = oob.u.a.SpareReplUnitNum
 				= cpu_to_le16(writeEUN);
 
-			mtd->write_oob(mtd, (lastEUN * nftl->EraseSize) + 8,
+			nftl_write_oob(mtd, (lastEUN * nftl->EraseSize) + 8,
 				       8, &retlen, (char *)&oob.u);
 		}
 
@@ -643,9 +704,8 @@ static int nftl_writeblock(struct mtd_blktrans_dev *mbd, unsigned long block,
 	memset(&oob, 0xff, sizeof(struct nftl_oob));
 	oob.b.Status = oob.b.Status1 = SECTOR_USED;
 
-	nand_write_raw(nftl->mbd.mtd, (writeEUN * nftl->EraseSize) +
-		       blockofs, 512, &retlen, (char *)buffer,
-		       (char *)&oob);
+	nftl_write(nftl->mbd.mtd, (writeEUN * nftl->EraseSize) + blockofs,
+		   512, &retlen, (char *)buffer, (char *)&oob);
 	return 0;
 }
 #endif /* CONFIG_NFTL_RW */
@@ -667,7 +727,7 @@ static int nftl_readblock(struct mtd_blktrans_dev *mbd, unsigned long block,
 
 	if (thisEUN != BLOCK_NIL) {
 		while (thisEUN < nftl->nb_blocks) {
-			if (mtd->read_oob(mtd, (thisEUN * nftl->EraseSize) +
+			if (nftl_read_oob(mtd, (thisEUN * nftl->EraseSize) +
 					  blockofs, 8, &retlen,
 					  (char *)&bci) < 0)
 				status = SECTOR_IGNORE;
diff --git a/drivers/mtd/nftlmount.c b/drivers/mtd/nftlmount.c
index 521b07cd2326..067262ee8df0 100644
--- a/drivers/mtd/nftlmount.c
+++ b/drivers/mtd/nftlmount.c
@@ -33,6 +33,11 @@
 
 char nftlmountrev[]="$Revision: 1.41 $";
 
+extern int nftl_read_oob(struct mtd_info *mtd, loff_t offs, size_t len,
+			 size_t *retlen, uint8_t *buf);
+extern int nftl_write_oob(struct mtd_info *mtd, loff_t offs, size_t len,
+			  size_t *retlen, uint8_t *buf);
+
 /* find_boot_record: Find the NFTL Media Header and its Spare copy which contains the
  *	various device information of the NFTL partition and Bad Unit Table. Update
  *	the ReplUnitTable[] table accroding to the Bad Unit Table. ReplUnitTable[]
@@ -92,7 +97,7 @@ static int find_boot_record(struct NFTLrecord *nftl)
 		}
 
 		/* To be safer with BIOS, also use erase mark as discriminant */
-		if ((ret = mtd->read_oob(mtd, block * nftl->EraseSize +
+		if ((ret = nftl_read_oob(mtd, block * nftl->EraseSize +
 					 SECTORSIZE + 8, 8, &retlen,
 					 (char *)&h1) < 0)) {
 			printk(KERN_WARNING "ANAND header found at 0x%x in mtd%d, but OOB data read failed (err %d)\n",
@@ -283,7 +288,7 @@ static int check_free_sectors(struct NFTLrecord *nftl, unsigned int address, int
 			return -1;
 
 		if (check_oob) {
-			if(mtd->read_oob(mtd, address, mtd->oobsize,
+			if(nftl_read_oob(mtd, address, mtd->oobsize,
 					 &retlen, &buf[SECTORSIZE]) < 0)
 				return -1;
 			if (memcmpb(buf + SECTORSIZE, 0xff, mtd->oobsize) != 0)
@@ -311,7 +316,7 @@ int NFTL_formatblock(struct NFTLrecord *nftl, int block)
 	struct mtd_info *mtd = nftl->mbd.mtd;
 
 	/* Read the Unit Control Information #1 for Wear-Leveling */
-	if (mtd->read_oob(mtd, block * nftl->EraseSize + SECTORSIZE + 8,
+	if (nftl_read_oob(mtd, block * nftl->EraseSize + SECTORSIZE + 8,
 			  8, &retlen, (char *)&uci) < 0)
 		goto default_uci1;
 
@@ -351,7 +356,7 @@ int NFTL_formatblock(struct NFTLrecord *nftl, int block)
 			goto fail;
 
 		uci.WearInfo = le32_to_cpu(nb_erases);
-		if (mtd->write_oob(mtd, block * nftl->EraseSize + SECTORSIZE +
+		if (nftl_write_oob(mtd, block * nftl->EraseSize + SECTORSIZE +
 				   8, 8, &retlen, (char *)&uci) < 0)
 			goto fail;
 		return 0;
@@ -383,7 +388,7 @@ static void check_sectors_in_chain(struct NFTLrecord *nftl, unsigned int first_b
 	block = first_block;
 	for (;;) {
 		for (i = 0; i < sectors_per_block; i++) {
-			if (mtd->read_oob(mtd,
+			if (nftl_read_oob(mtd,
 					  block * nftl->EraseSize + i * SECTORSIZE,
 					  8, &retlen, (char *)&bci) < 0)
 				status = SECTOR_IGNORE;
@@ -404,7 +409,7 @@ static void check_sectors_in_chain(struct NFTLrecord *nftl, unsigned int first_b
 					/* sector not free actually : mark it as SECTOR_IGNORE  */
 					bci.Status = SECTOR_IGNORE;
 					bci.Status1 = SECTOR_IGNORE;
-					mtd->write_oob(mtd, block *
+					nftl_write_oob(mtd, block *
 						       nftl->EraseSize +
 						       i * SECTORSIZE, 8,
 						       &retlen, (char *)&bci);
@@ -498,7 +503,7 @@ static int check_and_mark_free_block(struct NFTLrecord *nftl, int block)
 	size_t retlen;
 
 	/* check erase mark. */
-	if (mtd->read_oob(mtd, block * nftl->EraseSize + SECTORSIZE + 8, 8,
+	if (nftl_read_oob(mtd, block * nftl->EraseSize + SECTORSIZE + 8, 8,
 			  &retlen, (char *)&h1) < 0)
 		return -1;
 
@@ -513,7 +518,7 @@ static int check_and_mark_free_block(struct NFTLrecord *nftl, int block)
 		h1.EraseMark = cpu_to_le16(ERASE_MARK);
 		h1.EraseMark1 = cpu_to_le16(ERASE_MARK);
 		h1.WearInfo = cpu_to_le32(0);
-		if (mtd->write_oob(mtd,
+		if (nftl_write_oob(mtd,
 				   block * nftl->EraseSize + SECTORSIZE + 8, 8,
 				   &retlen, (char *)&h1) < 0)
 			return -1;
@@ -526,7 +531,7 @@ static int check_and_mark_free_block(struct NFTLrecord *nftl, int block)
 						SECTORSIZE, 0) != 0)
 				return -1;
 
-			if (mtd->read_oob(mtd, block * nftl->EraseSize + i,
+			if (nftl_read_oob(mtd, block * nftl->EraseSize + i,
 					  16, &retlen, buf) < 0)
 				return -1;
 			if (i == SECTORSIZE) {
@@ -557,7 +562,7 @@ static int get_fold_mark(struct NFTLrecord *nftl, unsigned int block)
 	struct nftl_uci2 uci;
 	size_t retlen;
 
-	if (mtd->read_oob(mtd, block * nftl->EraseSize + 2 * SECTORSIZE + 8,
+	if (nftl_read_oob(mtd, block * nftl->EraseSize + 2 * SECTORSIZE + 8,
 			  8, &retlen, (char *)&uci) < 0)
 		return 0;
 
@@ -597,10 +602,10 @@ int NFTL_mount(struct NFTLrecord *s)
 
 			for (;;) {
 				/* read the block header. If error, we format the chain */
-				if (mtd->read_oob(mtd,
+				if (nftl_read_oob(mtd,
 						  block * s->EraseSize + 8, 8,
 						  &retlen, (char *)&h0) < 0 ||
-				    mtd->read_oob(mtd,
+				    nftl_read_oob(mtd,
 						  block * s->EraseSize +
 						  SECTORSIZE + 8, 8,
 						  &retlen, (char *)&h1) < 0) {
diff --git a/drivers/mtd/onenand/onenand_base.c b/drivers/mtd/onenand/onenand_base.c
index a0d3f011c0f2..84ec40d25438 100644
--- a/drivers/mtd/onenand/onenand_base.c
+++ b/drivers/mtd/onenand/onenand_base.c
@@ -671,7 +671,7 @@ out:
 }
 
 /**
- * onenand_read_oob - [MTD Interface] OneNAND read out-of-band
+ * onenand_do_read_oob - [MTD Interface] OneNAND read out-of-band
  * @param mtd		MTD device structure
  * @param from		offset to read from
  * @param len		number of bytes to read
@@ -680,8 +680,8 @@ out:
  *
  * OneNAND read out-of-band data from the spare area
  */
-static int onenand_read_oob(struct mtd_info *mtd, loff_t from, size_t len,
-	size_t *retlen, u_char *buf)
+int onenand_do_read_oob(struct mtd_info *mtd, loff_t from, size_t len,
+			size_t *retlen, u_char *buf)
 {
 	struct onenand_chip *this = mtd->priv;
 	int read = 0, thislen, column;
@@ -744,6 +744,21 @@ out:
 	return ret;
 }
 
+/**
+ * onenand_read_oob - [MTD Interface] NAND write data and/or out-of-band
+ * @mtd:	MTD device structure
+ * @from:	offset to read from
+ * @ops:	oob operation description structure
+ */
+static int onenand_read_oob(struct mtd_info *mtd, loff_t from,
+			    struct mtd_oob_ops *ops)
+{
+	BUG_ON(ops->mode != MTD_OOB_PLACE);
+
+	return onenand_do_read_oob(mtd, from + ops->ooboffs, ops->len,
+				   &ops->retlen, ops->oobbuf);
+}
+
 #ifdef CONFIG_MTD_ONENAND_VERIFY_WRITE
 /**
  * onenand_verify_oob - [GENERIC] verify the oob contents after a write
@@ -894,7 +909,7 @@ out:
 }
 
 /**
- * onenand_write_oob - [MTD Interface] OneNAND write out-of-band
+ * onenand_do_write_oob - [Internal] OneNAND write out-of-band
  * @param mtd		MTD device structure
  * @param to		offset to write to
  * @param len		number of bytes to write
@@ -903,8 +918,8 @@ out:
  *
  * OneNAND write out-of-band
  */
-static int onenand_write_oob(struct mtd_info *mtd, loff_t to, size_t len,
-	size_t *retlen, const u_char *buf)
+static int onenand_do_write_oob(struct mtd_info *mtd, loff_t to, size_t len,
+				size_t *retlen, const u_char *buf)
 {
 	struct onenand_chip *this = mtd->priv;
 	int column, ret = 0;
@@ -972,6 +987,21 @@ out:
 	return ret;
 }
 
+/**
+ * onenand_write_oob - [MTD Interface] NAND write data and/or out-of-band
+ * @mtd:	MTD device structure
+ * @from:	offset to read from
+ * @ops:	oob operation description structure
+ */
+static int onenand_write_oob(struct mtd_info *mtd, loff_t to,
+			     struct mtd_oob_ops *ops)
+{
+	BUG_ON(ops->mode != MTD_OOB_PLACE);
+
+	return onenand_do_write_oob(mtd, to + ops->ooboffs, ops->len,
+				    &ops->retlen, ops->oobbuf);
+}
+
 /**
  * onenand_block_checkbad - [GENERIC] Check if a block is marked bad
  * @param mtd		MTD device structure
@@ -1138,7 +1168,7 @@ static int onenand_default_block_markbad(struct mtd_info *mtd, loff_t ofs)
 
         /* We write two bytes, so we dont have to mess with 16 bit access */
         ofs += mtd->oobsize + (bbm->badblockpos & ~0x01);
-        return mtd->write_oob(mtd, ofs , 2, &retlen, buf);
+        return onenand_do_write_oob(mtd, ofs , 2, &retlen, buf);
 }
 
 /**
@@ -1328,7 +1358,7 @@ static int do_otp_lock(struct mtd_info *mtd, loff_t from, size_t len,
 	this->command(mtd, ONENAND_CMD_OTP_ACCESS, 0, 0);
 	this->wait(mtd, FL_OTPING);
 
-	ret = mtd->write_oob(mtd, from, len, retlen, buf);
+	ret = onenand_do_write_oob(mtd, from, len, retlen, buf);
 
 	/* Exit OTP access mode */
 	this->command(mtd, ONENAND_CMD_RESET, 0, 0);
diff --git a/drivers/mtd/onenand/onenand_bbt.c b/drivers/mtd/onenand/onenand_bbt.c
index aafd7c2f7802..1b00dac3d7d6 100644
--- a/drivers/mtd/onenand/onenand_bbt.c
+++ b/drivers/mtd/onenand/onenand_bbt.c
@@ -17,6 +17,9 @@
 #include <linux/mtd/onenand.h>
 #include <linux/mtd/compatmac.h>
 
+extern int onenand_do_read_oob(struct mtd_info *mtd, loff_t from, size_t len,
+			       size_t *retlen, u_char *buf);
+
 /**
  * check_short_pattern - [GENERIC] check if a pattern is in the buffer
  * @param buf		the buffer to search
@@ -87,8 +90,8 @@ static int create_bbt(struct mtd_info *mtd, uint8_t *buf, struct nand_bbt_descr
 
 			/* No need to read pages fully,
 			 * just read required OOB bytes */
-			ret = mtd->read_oob(mtd, from + j * mtd->writesize + bd->offs,
-						readlen, &retlen, &buf[0]);
+			ret = onenand_do_read_oob(mtd, from + j * mtd->writesize + bd->offs,
+						  readlen, &retlen, &buf[0]);
 
 			if (ret)
 				return ret;
diff --git a/fs/jffs2/jffs2_fs_sb.h b/fs/jffs2/jffs2_fs_sb.h
index 506690cc9a78..935fec1b1201 100644
--- a/fs/jffs2/jffs2_fs_sb.h
+++ b/fs/jffs2/jffs2_fs_sb.h
@@ -100,6 +100,7 @@ struct jffs2_sb_info {
 #ifdef CONFIG_JFFS2_FS_WRITEBUFFER
 	/* Write-behind buffer for NAND flash */
 	unsigned char *wbuf;
+	unsigned char *oobbuf;
 	uint32_t wbuf_ofs;
 	uint32_t wbuf_len;
 	struct jffs2_inodirty *wbuf_inodes;
diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c
index c6a62e162963..1195d06d4373 100644
--- a/fs/jffs2/wbuf.c
+++ b/fs/jffs2/wbuf.c
@@ -955,158 +955,159 @@ exit:
 	return ret;
 }
 
+#define NR_OOB_SCAN_PAGES	4
+
 /*
- *	Check, if the out of band area is empty
+ * Check, if the out of band area is empty
  */
-int jffs2_check_oob_empty( struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, int mode)
+int jffs2_check_oob_empty(struct jffs2_sb_info *c,
+			  struct jffs2_eraseblock *jeb, int mode)
 {
-	unsigned char *buf;
-	int 	ret = 0;
-	int	i,len,page;
-	size_t  retlen;
-	int	oob_size;
-
-	/* allocate a buffer for all oob data in this sector */
-	oob_size = c->mtd->oobsize;
-	len = 4 * oob_size;
-	buf = kmalloc(len, GFP_KERNEL);
-	if (!buf) {
-		printk(KERN_NOTICE "jffs2_check_oob_empty(): allocation of temporary data buffer for oob check failed\n");
-		return -ENOMEM;
-	}
-	/*
-	 * if mode = 0, we scan for a total empty oob area, else we have
-	 * to take care of the cleanmarker in the first page of the block
-	*/
-	ret = jffs2_flash_read_oob(c, jeb->offset, len , &retlen, buf);
+	int i, page, ret;
+	int oobsize = c->mtd->oobsize;
+	struct mtd_oob_ops ops;
+
+	ops.len = NR_OOB_SCAN_PAGES * oobsize;
+	ops.ooblen = oobsize;
+	ops.oobbuf = c->oobbuf;
+	ops.ooboffs = 0;
+	ops.datbuf = NULL;
+	ops.mode = MTD_OOB_PLACE;
+
+	ret = c->mtd->read_oob(c->mtd, jeb->offset, &ops);
 	if (ret) {
-		D1(printk(KERN_WARNING "jffs2_check_oob_empty(): Read OOB failed %d for block at %08x\n", ret, jeb->offset));
-		goto out;
+		D1(printk(KERN_WARNING "jffs2_check_oob_empty(): Read OOB "
+			  "failed %d for block at %08x\n", ret, jeb->offset));
+		return ret;
 	}
 
-	if (retlen < len) {
-		D1(printk(KERN_WARNING "jffs2_check_oob_empty(): Read OOB return short read "
-			  "(%zd bytes not %d) for block at %08x\n", retlen, len, jeb->offset));
-		ret = -EIO;
-		goto out;
+	if (ops.retlen < ops.len) {
+		D1(printk(KERN_WARNING "jffs2_check_oob_empty(): Read OOB "
+			  "returned short read (%zd bytes not %d) for block "
+			  "at %08x\n", ops.retlen, ops.len, jeb->offset));
+		return -EIO;
 	}
 
 	/* Special check for first page */
-	for(i = 0; i < oob_size ; i++) {
+	for(i = 0; i < oobsize ; i++) {
 		/* Yeah, we know about the cleanmarker. */
 		if (mode && i >= c->fsdata_pos &&
 		    i < c->fsdata_pos + c->fsdata_len)
 			continue;
 
-		if (buf[i] != 0xFF) {
-			D2(printk(KERN_DEBUG "Found %02x at %x in OOB for %08x\n",
-				  buf[i], i, jeb->offset));
-			ret = 1;
-			goto out;
+		if (ops.oobbuf[i] != 0xFF) {
+			D2(printk(KERN_DEBUG "Found %02x at %x in OOB for "
+				  "%08x\n", ops.oobbuf[i], i, jeb->offset));
+			return 1;
 		}
 	}
 
 	/* we know, we are aligned :) */
-	for (page = oob_size; page < len; page += sizeof(long)) {
-		unsigned long dat = *(unsigned long *)(&buf[page]);
-		if(dat != -1) {
-			ret = 1;
-			goto out;
-		}
+	for (page = oobsize; page < ops.len; page += sizeof(long)) {
+		long dat = *(long *)(&ops.oobbuf[page]);
+		if(dat != -1)
+			return 1;
 	}
-
-out:
-	kfree(buf);
-
-	return ret;
+	return 0;
 }
 
 /*
-*	Scan for a valid cleanmarker and for bad blocks
-*	For virtual blocks (concatenated physical blocks) check the cleanmarker
-*	only in the first page of the first physical block, but scan for bad blocks in all
-*	physical blocks
-*/
-int jffs2_check_nand_cleanmarker (struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb)
+ * Scan for a valid cleanmarker and for bad blocks
+ */
+int jffs2_check_nand_cleanmarker (struct jffs2_sb_info *c,
+				  struct jffs2_eraseblock *jeb)
 {
 	struct jffs2_unknown_node n;
-	unsigned char buf[2 * NAND_MAX_OOBSIZE];
-	unsigned char *p;
-	int ret, i, cnt, retval = 0;
-	size_t retlen, offset;
-	int oob_size;
-
-	offset = jeb->offset;
-	oob_size = c->mtd->oobsize;
-
-	/* Loop through the physical blocks */
-	for (cnt = 0; cnt < (c->sector_size / c->mtd->erasesize); cnt++) {
-		/* Check first if the block is bad. */
-		if (c->mtd->block_isbad (c->mtd, offset)) {
-			D1 (printk (KERN_WARNING "jffs2_check_nand_cleanmarker(): Bad block at %08x\n", jeb->offset));
-			return 2;
-		}
-		/*
-		   *    We read oob data from page 0 and 1 of the block.
-		   *    page 0 contains cleanmarker and badblock info
-		   *    page 1 contains failure count of this block
-		 */
-		ret = c->mtd->read_oob (c->mtd, offset, oob_size << 1, &retlen, buf);
+	struct mtd_oob_ops ops;
+	int oobsize = c->mtd->oobsize;
+	unsigned char *p,*b;
+	int i, ret;
+	size_t offset = jeb->offset;
+
+	/* Check first if the block is bad. */
+	if (c->mtd->block_isbad(c->mtd, offset)) {
+		D1 (printk(KERN_WARNING "jffs2_check_nand_cleanmarker()"
+			   ": Bad block at %08x\n", jeb->offset));
+		return 2;
+	}
 
-		if (ret) {
-			D1 (printk (KERN_WARNING "jffs2_check_nand_cleanmarker(): Read OOB failed %d for block at %08x\n", ret, jeb->offset));
-			return ret;
-		}
-		if (retlen < (oob_size << 1)) {
-			D1 (printk (KERN_WARNING "jffs2_check_nand_cleanmarker(): Read OOB return short read (%zd bytes not %d) for block at %08x\n", retlen, oob_size << 1, jeb->offset));
-			return -EIO;
-		}
+	ops.len = oobsize;
+	ops.ooblen = oobsize;
+	ops.oobbuf = c->oobbuf;
+	ops.ooboffs = 0;
+	ops.datbuf = NULL;
+	ops.mode = MTD_OOB_PLACE;
 
-		/* Check cleanmarker only on the first physical block */
-		if (!cnt) {
-			n.magic = cpu_to_je16 (JFFS2_MAGIC_BITMASK);
-			n.nodetype = cpu_to_je16 (JFFS2_NODETYPE_CLEANMARKER);
-			n.totlen = cpu_to_je32 (8);
-			p = (unsigned char *) &n;
+	ret = c->mtd->read_oob(c->mtd, offset, &ops);
+	if (ret) {
+		D1 (printk(KERN_WARNING "jffs2_check_nand_cleanmarker(): "
+			   "Read OOB failed %d for block at %08x\n",
+			   ret, jeb->offset));
+		return ret;
+	}
 
-			for (i = 0; i < c->fsdata_len; i++) {
-				if (buf[c->fsdata_pos + i] != p[i]) {
-					retval = 1;
-				}
-			}
-			D1(if (retval == 1) {
-				printk(KERN_WARNING "jffs2_check_nand_cleanmarker(): Cleanmarker node not detected in block at %08x\n", jeb->offset);
-				printk(KERN_WARNING "OOB at %08zx was ", offset);
-				for (i=0; i < oob_size; i++) {
-					printk("%02x ", buf[i]);
-				}
-				printk("\n");
-			})
-		}
-		offset += c->mtd->erasesize;
+	if (ops.retlen < ops.len) {
+		D1 (printk (KERN_WARNING "jffs2_check_nand_cleanmarker(): "
+			    "Read OOB return short read (%zd bytes not %d) "
+			    "for block at %08x\n", ops.retlen, ops.len,
+			    jeb->offset));
+		return -EIO;
 	}
-	return retval;
+
+	n.magic = cpu_to_je16 (JFFS2_MAGIC_BITMASK);
+	n.nodetype = cpu_to_je16 (JFFS2_NODETYPE_CLEANMARKER);
+	n.totlen = cpu_to_je32 (8);
+	p = (unsigned char *) &n;
+	b = c->oobbuf + c->fsdata_pos;
+
+	for (i = c->fsdata_len; i; i--) {
+		if (*b++ != *p++)
+			ret = 1;
+	}
+
+	D1(if (ret == 1) {
+		printk(KERN_WARNING "jffs2_check_nand_cleanmarker(): "
+		       "Cleanmarker node not detected in block at %08x\n",
+		       offset);
+		printk(KERN_WARNING "OOB at %08zx was ", offset);
+		for (i=0; i < oobsize; i++)
+			printk("%02x ", c->oobbuf[i]);
+		printk("\n");
+	});
+	return ret;
 }
 
-int jffs2_write_nand_cleanmarker(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb)
+int jffs2_write_nand_cleanmarker(struct jffs2_sb_info *c,
+				 struct jffs2_eraseblock *jeb)
 {
-	struct 	jffs2_unknown_node n;
-	int 	ret;
-	size_t 	retlen;
+	struct jffs2_unknown_node n;
+	int	ret;
+	struct mtd_oob_ops ops;
 
 	n.magic = cpu_to_je16(JFFS2_MAGIC_BITMASK);
 	n.nodetype = cpu_to_je16(JFFS2_NODETYPE_CLEANMARKER);
 	n.totlen = cpu_to_je32(8);
 
-	ret = jffs2_flash_write_oob(c, jeb->offset + c->fsdata_pos, c->fsdata_len, &retlen, (unsigned char *)&n);
+	ops.len = c->fsdata_len;
+	ops.ooblen = c->fsdata_len;;
+	ops.oobbuf = (uint8_t *)&n;
+	ops.ooboffs = c->fsdata_pos;
+	ops.datbuf = NULL;
+	ops.mode = MTD_OOB_PLACE;
+
+	ret = c->mtd->write_oob(c->mtd, jeb->offset, &ops);
 
 	if (ret) {
-		D1(printk(KERN_WARNING "jffs2_write_nand_cleanmarker(): Write failed for block at %08x: error %d\n", jeb->offset, ret));
+		D1(printk(KERN_WARNING "jffs2_write_nand_cleanmarker(): "
+			  "Write failed for block at %08x: error %d\n",
+			  jeb->offset, ret));
 		return ret;
 	}
-	if (retlen != c->fsdata_len) {
-		D1(printk(KERN_WARNING "jffs2_write_nand_cleanmarker(): Short write for block at %08x: %zd not %d\n", jeb->offset, retlen, c->fsdata_len));
-		return ret;
+	if (ops.retlen != ops.len) {
+		D1(printk(KERN_WARNING "jffs2_write_nand_cleanmarker(): "
+			  "Short write for block at %08x: %zd not %d\n",
+			  jeb->offset, ops.retlen, ops.len));
+		return -EIO;
 	}
 	return 0;
 }
@@ -1185,6 +1186,10 @@ int jffs2_nand_flash_setup(struct jffs2_sb_info *c)
 	if (!c->wbuf)
 		return -ENOMEM;
 
+	c->oobbuf = kmalloc(NR_OOB_SCAN_PAGES * c->mtd->oobsize, GFP_KERNEL);
+	if (!c->oobbuf)
+		return -ENOMEM;
+
 	res = jffs2_nand_set_oobinfo(c);
 
 #ifdef BREAKME
@@ -1202,6 +1207,7 @@ int jffs2_nand_flash_setup(struct jffs2_sb_info *c)
 void jffs2_nand_flash_cleanup(struct jffs2_sb_info *c)
 {
 	kfree(c->wbuf);
+	kfree(c->oobbuf);
 }
 
 int jffs2_dataflash_setup(struct jffs2_sb_info *c) {
diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h
index 4970c2e96fbf..e75bb584e80b 100644
--- a/include/linux/mtd/mtd.h
+++ b/include/linux/mtd/mtd.h
@@ -67,6 +67,50 @@ struct mtd_ecc_stats {
 	unsigned long failed;
 };
 
+/*
+ * oob operation modes
+ *
+ * MTD_OOB_PLACE:	oob data are placed at the given offset
+ * MTD_OOB_AUTO:	oob data are automatically placed at the free areas
+ *			which are defined by the ecclayout
+ * MTD_OOB_RAW:		mode to read raw data+oob in one chunk. The oob data
+ *			is inserted into the data. Thats a raw image of the
+ *			flash contents.
+ */
+typedef enum {
+	MTD_OOB_PLACE,
+	MTD_OOB_AUTO,
+	MTD_OOB_RAW,
+} mtd_oob_mode_t;
+
+/**
+ * struct mtd_oob_ops - oob operation operands
+ * @mode:	operation mode
+ *
+ * @len:	number of bytes to write/read. When a data buffer is given
+ *		(datbuf != NULL) this is the number of data bytes. When
+ +		no data buffer is available this is the number of oob bytes.
+ *
+ * @retlen:	number of bytes written/read. When a data buffer is given
+ *		(datbuf != NULL) this is the number of data bytes. When
+ +		no data buffer is available this is the number of oob bytes.
+ *
+ * @ooblen:	number of oob bytes per page
+ * @ooboffs:	offset of oob data in the oob area (only relevant when
+ *		mode = MTD_OOB_PLACE)
+ * @datbuf:	data buffer - if NULL only oob data are read/written
+ * @oobbuf:	oob data buffer
+ */
+struct mtd_oob_ops {
+	mtd_oob_mode_t	mode;
+	size_t		len;
+	size_t		retlen;
+	size_t		ooblen;
+	uint32_t	ooboffs;
+	uint8_t		*datbuf;
+	uint8_t		*oobbuf;
+};
+
 struct mtd_info {
 	u_char type;
 	u_int32_t flags;
@@ -125,8 +169,10 @@ struct mtd_info {
 	int (*read) (struct mtd_info *mtd, loff_t from, size_t len, size_t *retlen, u_char *buf);
 	int (*write) (struct mtd_info *mtd, loff_t to, size_t len, size_t *retlen, const u_char *buf);
 
-	int (*read_oob) (struct mtd_info *mtd, loff_t from, size_t len, size_t *retlen, u_char *buf);
-	int (*write_oob) (struct mtd_info *mtd, loff_t to, size_t len, size_t *retlen, const u_char *buf);
+	int (*read_oob) (struct mtd_info *mtd, loff_t from,
+			 struct mtd_oob_ops *ops);
+	int (*write_oob) (struct mtd_info *mtd, loff_t to,
+			 struct mtd_oob_ops *ops);
 
 	/*
 	 * Methods to access the protection register area, present in some
diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h
index dc2bf1bcf42b..bf2ce68901f5 100644
--- a/include/linux/mtd/nand.h
+++ b/include/linux/mtd/nand.h
@@ -31,14 +31,6 @@ extern int nand_scan (struct mtd_info *mtd, int max_chips);
 /* Free resources held by the NAND device */
 extern void nand_release (struct mtd_info *mtd);
 
-/* Read raw data from the device without ECC */
-extern int nand_read_raw (struct mtd_info *mtd, uint8_t *buf, loff_t from,
-			  size_t len, size_t ooblen);
-
-
-extern int nand_write_raw(struct mtd_info *mtd, loff_t to, size_t len,
-			  size_t *retlen, const uint8_t *buf, uint8_t *oob);
-
 /* The maximum number of NAND chips in an array */
 #define NAND_MAX_CHIPS		8
 
@@ -375,6 +367,8 @@ struct nand_chip {
 	struct nand_buffers buffers;
 	struct nand_hw_control hwcontrol;
 
+	struct mtd_oob_ops ops;
+
 	uint8_t		*bbt;
 	struct nand_bbt_descr	*bbt_td;
 	struct nand_bbt_descr	*bbt_md;
-- 
cgit v1.2.3


From f1a28c02843efcfcc41982149880bac3ac180234 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@cruncher.tec.linutronix.de>
Date: Tue, 30 May 2006 00:37:34 +0200
Subject: [MTD] NAND Expose the new raw mode function and status info to
 userspace

The raw read/write access to NAND (without ECC) has been changed in the
NAND rework. Expose the new way - setting the file mode via ioctl - to
userspace. Also allow to read out the ecc statistics information so userspace
tools can see that bitflips happened and whether errors where correctable
or not. Also expose the number of bad blocks for the partition, so nandwrite
can check if the data fits into the parition before writing to it.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 drivers/mtd/mtdchar.c        | 200 ++++++++++++++++++++++++++++++-------------
 drivers/mtd/mtdconcat.c      |  51 +++++++----
 drivers/mtd/mtdpart.c        |  39 ++++++++-
 drivers/mtd/nand/nand_base.c |  26 +++---
 drivers/mtd/nand/nand_bbt.c  |   3 +
 include/linux/mtd/mtd.h      |  11 ---
 include/mtd/mtd-abi.h        |  27 ++++++
 7 files changed, 259 insertions(+), 98 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/mtdchar.c b/drivers/mtd/mtdchar.c
index a48210d58b92..fdc535b22e39 100644
--- a/drivers/mtd/mtdchar.c
+++ b/drivers/mtd/mtdchar.c
@@ -49,24 +49,18 @@ static struct mtd_notifier notifier = {
 };
 
 /*
- * We use file->private_data to store a pointer to the MTDdevice.
- * Since alighment is at least 32 bits, we have 2 bits free for OTP
- * modes as well.
+ * Data structure to hold the pointer to the mtd device as well
+ * as mode information ofr various use cases.
  */
-
-#define TO_MTD(file) (struct mtd_info *)((long)((file)->private_data) & ~3L)
-
-#define MTD_MODE_OTP_FACT	1
-#define MTD_MODE_OTP_USER	2
-#define MTD_MODE(file)		((long)((file)->private_data) & 3)
-
-#define SET_MTD_MODE(file, mode) \
-	do { long __p = (long)((file)->private_data); \
-	     (file)->private_data = (void *)((__p & ~3L) | mode); } while (0)
+struct mtd_file_info {
+	struct mtd_info *mtd;
+	enum mtd_file_modes mode;
+};
 
 static loff_t mtd_lseek (struct file *file, loff_t offset, int orig)
 {
-	struct mtd_info *mtd = TO_MTD(file);
+	struct mtd_file_info *mfi = file->private_data;
+	struct mtd_info *mtd = mfi->mtd;
 
 	switch (orig) {
 	case 0:
@@ -97,6 +91,7 @@ static int mtd_open(struct inode *inode, struct file *file)
 	int minor = iminor(inode);
 	int devnum = minor >> 1;
 	struct mtd_info *mtd;
+	struct mtd_file_info *mfi;
 
 	DEBUG(MTD_DEBUG_LEVEL0, "MTD_open\n");
 
@@ -117,14 +112,20 @@ static int mtd_open(struct inode *inode, struct file *file)
 		return -ENODEV;
 	}
 
-	file->private_data = mtd;
-
 	/* You can't open it RW if it's not a writeable device */
 	if ((file->f_mode & 2) && !(mtd->flags & MTD_WRITEABLE)) {
 		put_mtd_device(mtd);
 		return -EACCES;
 	}
 
+	mfi = kzalloc(sizeof(*mfi), GFP_KERNEL);
+	if (!mfi) {
+		put_mtd_device(mtd);
+		return -ENOMEM;
+	}
+	mfi->mtd = mtd;
+	file->private_data = mfi;
+
 	return 0;
 } /* mtd_open */
 
@@ -132,16 +133,17 @@ static int mtd_open(struct inode *inode, struct file *file)
 
 static int mtd_close(struct inode *inode, struct file *file)
 {
-	struct mtd_info *mtd;
+	struct mtd_file_info *mfi = file->private_data;
+	struct mtd_info *mtd = mfi->mtd;
 
 	DEBUG(MTD_DEBUG_LEVEL0, "MTD_close\n");
 
-	mtd = TO_MTD(file);
-
 	if (mtd->sync)
 		mtd->sync(mtd);
 
 	put_mtd_device(mtd);
+	file->private_data = NULL;
+	kfree(mfi);
 
 	return 0;
 } /* mtd_close */
@@ -153,7 +155,8 @@ static int mtd_close(struct inode *inode, struct file *file)
 
 static ssize_t mtd_read(struct file *file, char __user *buf, size_t count,loff_t *ppos)
 {
-	struct mtd_info *mtd = TO_MTD(file);
+	struct mtd_file_info *mfi = file->private_data;
+	struct mtd_info *mtd = mfi->mtd;
 	size_t retlen=0;
 	size_t total_retlen=0;
 	int ret=0;
@@ -186,13 +189,26 @@ static ssize_t mtd_read(struct file *file, char __user *buf, size_t count,loff_t
 		else
 			len = count;
 
-		switch (MTD_MODE(file)) {
-		case MTD_MODE_OTP_FACT:
+		switch (mfi->mode) {
+		case MTD_MODE_OTP_FACTORY:
 			ret = mtd->read_fact_prot_reg(mtd, *ppos, len, &retlen, kbuf);
 			break;
 		case MTD_MODE_OTP_USER:
 			ret = mtd->read_user_prot_reg(mtd, *ppos, len, &retlen, kbuf);
 			break;
+		case MTD_MODE_RAW:
+		{
+			struct mtd_oob_ops ops;
+
+			ops.mode = MTD_OOB_RAW;
+			ops.datbuf = kbuf;
+			ops.oobbuf = NULL;
+			ops.len = len;
+
+			ret = mtd->read_oob(mtd, *ppos, &ops);
+			retlen = ops.retlen;
+			break;
+		}
 		default:
 			ret = mtd->read(mtd, *ppos, len, &retlen, kbuf);
 		}
@@ -232,7 +248,8 @@ static ssize_t mtd_read(struct file *file, char __user *buf, size_t count,loff_t
 
 static ssize_t mtd_write(struct file *file, const char __user *buf, size_t count,loff_t *ppos)
 {
-	struct mtd_info *mtd = TO_MTD(file);
+	struct mtd_file_info *mfi = file->private_data;
+	struct mtd_info *mtd = mfi->mtd;
 	char *kbuf;
 	size_t retlen;
 	size_t total_retlen=0;
@@ -270,8 +287,8 @@ static ssize_t mtd_write(struct file *file, const char __user *buf, size_t count
 			return -EFAULT;
 		}
 
-		switch (MTD_MODE(file)) {
-		case MTD_MODE_OTP_FACT:
+		switch (mfi->mode) {
+		case MTD_MODE_OTP_FACTORY:
 			ret = -EROFS;
 			break;
 		case MTD_MODE_OTP_USER:
@@ -281,6 +298,21 @@ static ssize_t mtd_write(struct file *file, const char __user *buf, size_t count
 			}
 			ret = mtd->write_user_prot_reg(mtd, *ppos, len, &retlen, kbuf);
 			break;
+
+		case MTD_MODE_RAW:
+		{
+			struct mtd_oob_ops ops;
+
+			ops.mode = MTD_OOB_RAW;
+			ops.datbuf = kbuf;
+			ops.oobbuf = NULL;
+			ops.len = len;
+
+			ret = mtd->write_oob(mtd, *ppos, &ops);
+			retlen = ops.retlen;
+			break;
+		}
+
 		default:
 			ret = (*(mtd->write))(mtd, *ppos, len, &retlen, kbuf);
 		}
@@ -310,10 +342,41 @@ static void mtdchar_erase_callback (struct erase_info *instr)
 	wake_up((wait_queue_head_t *)instr->priv);
 }
 
+#if defined(CONFIG_MTD_OTP) || defined(CONFIG_MTD_ONENAND_OTP)
+static int otp_select_filemode(struct mtd_file_info *mfi, int mode)
+{
+	struct mtd_info *mtd = mfi->mtd;
+	int ret = 0;
+
+	switch (mode) {
+	case MTD_OTP_FACTORY:
+		if (!mtd->read_fact_prot_reg)
+			ret = -EOPNOTSUPP;
+		else
+			mfi->mode = MTD_MODE_OTP_FACTORY;
+		break;
+	case MTD_OTP_USER:
+		if (!mtd->read_fact_prot_reg)
+			ret = -EOPNOTSUPP;
+		else
+			mfi->mode = MTD_MODE_OTP_USER;
+		break;
+	default:
+		ret = -EINVAL;
+	case MTD_OTP_OFF:
+		break;
+	}
+	return ret;
+}
+#else
+# define otp_select_filemode(f,m)	-EOPNOTSUPP
+#endif
+
 static int mtd_ioctl(struct inode *inode, struct file *file,
 		     u_int cmd, u_long arg)
 {
-	struct mtd_info *mtd = TO_MTD(file);
+	struct mtd_file_info *mfi = file->private_data;
+	struct mtd_info *mtd = mfi->mtd;
 	void __user *argp = (void __user *)arg;
 	int ret = 0;
 	u_long size;
@@ -554,16 +617,6 @@ static int mtd_ioctl(struct inode *inode, struct file *file,
 		break;
 	}
 
-	case ECCGETLAYOUT:
-
-		if (!mtd->ecclayout)
-			return -EOPNOTSUPP;
-
-		if (copy_to_user(argp, &mtd->ecclayout,
-				 sizeof(struct nand_ecclayout)))
-			return -EFAULT;
-		break;
-
 	case MEMGETBADBLOCK:
 	{
 		loff_t offs;
@@ -596,25 +649,11 @@ static int mtd_ioctl(struct inode *inode, struct file *file,
 		int mode;
 		if (copy_from_user(&mode, argp, sizeof(int)))
 			return -EFAULT;
-		SET_MTD_MODE(file, 0);
-		switch (mode) {
-		case MTD_OTP_FACTORY:
-			if (!mtd->read_fact_prot_reg)
-				ret = -EOPNOTSUPP;
-			else
-				SET_MTD_MODE(file, MTD_MODE_OTP_FACT);
-			break;
-		case MTD_OTP_USER:
-			if (!mtd->read_fact_prot_reg)
-				ret = -EOPNOTSUPP;
-			else
-				SET_MTD_MODE(file, MTD_MODE_OTP_USER);
-			break;
-		default:
-			ret = -EINVAL;
-		case MTD_OTP_OFF:
-			break;
-		}
+
+		mfi->mode = MTD_MODE_NORMAL;
+
+		ret = otp_select_filemode(mfi, mode);
+
 		file->f_pos = 0;
 		break;
 	}
@@ -626,8 +665,8 @@ static int mtd_ioctl(struct inode *inode, struct file *file,
 		if (!buf)
 			return -ENOMEM;
 		ret = -EOPNOTSUPP;
-		switch (MTD_MODE(file)) {
-		case MTD_MODE_OTP_FACT:
+		switch (mfi->mode) {
+		case MTD_MODE_OTP_FACTORY:
 			if (mtd->get_fact_prot_info)
 				ret = mtd->get_fact_prot_info(mtd, buf, 4096);
 			break;
@@ -635,6 +674,8 @@ static int mtd_ioctl(struct inode *inode, struct file *file,
 			if (mtd->get_user_prot_info)
 				ret = mtd->get_user_prot_info(mtd, buf, 4096);
 			break;
+		default:
+			break;
 		}
 		if (ret >= 0) {
 			if (cmd == OTPGETREGIONCOUNT) {
@@ -653,7 +694,7 @@ static int mtd_ioctl(struct inode *inode, struct file *file,
 	{
 		struct otp_info info;
 
-		if (MTD_MODE(file) != MTD_MODE_OTP_USER)
+		if (mfi->mode != MTD_MODE_OTP_USER)
 			return -EINVAL;
 		if (copy_from_user(&info, argp, sizeof(info)))
 			return -EFAULT;
@@ -664,6 +705,49 @@ static int mtd_ioctl(struct inode *inode, struct file *file,
 	}
 #endif
 
+	case ECCGETLAYOUT:
+	{
+		if (!mtd->ecclayout)
+			return -EOPNOTSUPP;
+
+		if (copy_to_user(argp, &mtd->ecclayout,
+				 sizeof(struct nand_ecclayout)))
+			return -EFAULT;
+		break;
+	}
+
+	case ECCGETSTATS:
+	{
+		if (copy_to_user(argp, &mtd->ecc_stats,
+				 sizeof(struct mtd_ecc_stats)))
+			return -EFAULT;
+		break;
+	}
+
+	case MTDFILEMODE:
+	{
+		mfi->mode = 0;
+
+		switch(arg) {
+		case MTD_MODE_OTP_FACTORY:
+		case MTD_MODE_OTP_USER:
+			ret = otp_select_filemode(mfi, arg);
+			break;
+
+		case MTD_MODE_RAW:
+			if (!mtd->read_oob || !mtd->write_oob)
+				return -EOPNOTSUPP;
+			mfi->mode = arg;
+
+		case MTD_MODE_NORMAL:
+			break;
+		default:
+			ret = -EINVAL;
+		}
+		file->f_pos = 0;
+		break;
+	}
+
 	default:
 		ret = -ENOTTY;
 	}
diff --git a/drivers/mtd/mtdconcat.c b/drivers/mtd/mtdconcat.c
index 3c8d5e6fa010..1fea631b5852 100644
--- a/drivers/mtd/mtdconcat.c
+++ b/drivers/mtd/mtdconcat.c
@@ -56,7 +56,7 @@ concat_read(struct mtd_info *mtd, loff_t from, size_t len,
 	    size_t * retlen, u_char * buf)
 {
 	struct mtd_concat *concat = CONCAT(mtd);
-	int ret = 0, err = -EINVAL;
+	int ret = 0, err;
 	int i;
 
 	*retlen = 0;
@@ -80,28 +80,29 @@ concat_read(struct mtd_info *mtd, loff_t from, size_t len,
 
 		err = subdev->read(subdev, from, size, &retsize, buf);
 
-		if (err && (err != -EBADMSG) && (err != -EUCLEAN))
-			break;
-
 		/* Save information about bitflips! */
-		if (err) {
-			if (err == -EBADMSG)
-				ret = err;
-			else if (!ret)
+		if (unlikely(err)) {
+			if (err == -EBADMSG) {
+				mtd->ecc_stats.failed++;
 				ret = err;
-			err = 0;
+			} else if (err == -EUCLEAN) {
+				mtd->ecc_stats.corrected++;
+				/* Do not overwrite -EBADMSG !! */
+				if (!ret)
+					ret = err;
+			} else
+				return err;
 		}
 
 		*retlen += retsize;
 		len -= size;
 		if (len == 0)
-			break;
+			return ret;
 
-		err = -EINVAL;
 		buf += size;
 		from = 0;
 	}
-	return err ? err : ret;
+	return -EINVAL;
 }
 
 static int
@@ -244,7 +245,7 @@ concat_read_oob(struct mtd_info *mtd, loff_t from, struct mtd_oob_ops *ops)
 {
 	struct mtd_concat *concat = CONCAT(mtd);
 	struct mtd_oob_ops devops = *ops;
-	int i, err;
+	int i, err, ret = 0;
 
 	ops->retlen = 0;
 
@@ -262,12 +263,24 @@ concat_read_oob(struct mtd_info *mtd, loff_t from, struct mtd_oob_ops *ops)
 
 		err = subdev->read_oob(subdev, from, &devops);
 		ops->retlen += devops.retlen;
-		if (err)
-			return err;
+
+		/* Save information about bitflips! */
+		if (unlikely(err)) {
+			if (err == -EBADMSG) {
+				mtd->ecc_stats.failed++;
+				ret = err;
+			} else if (err == -EUCLEAN) {
+				mtd->ecc_stats.corrected++;
+				/* Do not overwrite -EBADMSG !! */
+				if (!ret)
+					ret = err;
+			} else
+				return err;
+		}
 
 		devops.len = ops->len - ops->retlen;
 		if (!devops.len)
-			return 0;
+			return ret;
 
 		if (devops.datbuf)
 			devops.datbuf += devops.retlen;
@@ -655,6 +668,8 @@ static int concat_block_markbad(struct mtd_info *mtd, loff_t ofs)
 		}
 
 		err = subdev->block_markbad(subdev, ofs);
+		if (!err)
+			mtd->ecc_stats.badblocks++;
 		break;
 	}
 
@@ -717,6 +732,8 @@ struct mtd_info *mtd_concat_create(struct mtd_info *subdev[],	/* subdevices to c
 	if (subdev[0]->block_markbad)
 		concat->mtd.block_markbad = concat_block_markbad;
 
+	concat->mtd.ecc_stats.badblocks = subdev[0]->ecc_stats.badblocks;
+
 	concat->subdev[0] = subdev[0];
 
 	for (i = 1; i < num_devs; i++) {
@@ -744,6 +761,8 @@ struct mtd_info *mtd_concat_create(struct mtd_info *subdev[],	/* subdevices to c
 				    subdev[i]->flags & MTD_WRITEABLE;
 		}
 		concat->mtd.size += subdev[i]->size;
+		concat->mtd.ecc_stats.badblocks +=
+			subdev[i]->ecc_stats.badblocks;
 		if (concat->mtd.writesize   !=  subdev[i]->writesize ||
 		    concat->mtd.oobsize    !=  subdev[i]->oobsize ||
 		    concat->mtd.ecctype    !=  subdev[i]->ecctype ||
diff --git a/drivers/mtd/mtdpart.c b/drivers/mtd/mtdpart.c
index f22aeccf01e7..77a7123a5c56 100644
--- a/drivers/mtd/mtdpart.c
+++ b/drivers/mtd/mtdpart.c
@@ -51,12 +51,21 @@ static int part_read (struct mtd_info *mtd, loff_t from, size_t len,
 			size_t *retlen, u_char *buf)
 {
 	struct mtd_part *part = PART(mtd);
+	int res;
+
 	if (from >= mtd->size)
 		len = 0;
 	else if (from + len > mtd->size)
 		len = mtd->size - from;
-	return part->master->read (part->master, from + part->offset,
+	res = part->master->read (part->master, from + part->offset,
 				   len, retlen, buf);
+	if (unlikely(res)) {
+		if (res == -EUCLEAN)
+			mtd->ecc_stats.corrected++;
+		if (res == -EBADMSG)
+			mtd->ecc_stats.failed++;
+	}
+	return res;
 }
 
 static int part_point (struct mtd_info *mtd, loff_t from, size_t len,
@@ -82,12 +91,21 @@ static int part_read_oob(struct mtd_info *mtd, loff_t from,
 			 struct mtd_oob_ops *ops)
 {
 	struct mtd_part *part = PART(mtd);
+	int res;
 
 	if (from >= mtd->size)
 		return -EINVAL;
 	if (from + ops->len > mtd->size)
 		return -EINVAL;
-	return part->master->read_oob(part->master, from + part->offset, ops);
+	res = part->master->read_oob(part->master, from + part->offset, ops);
+
+	if (unlikely(res)) {
+		if (res == -EUCLEAN)
+			mtd->ecc_stats.corrected++;
+		if (res == -EBADMSG)
+			mtd->ecc_stats.failed++;
+	}
+	return res;
 }
 
 static int part_read_user_prot_reg (struct mtd_info *mtd, loff_t from, size_t len,
@@ -246,12 +264,17 @@ static int part_block_isbad (struct mtd_info *mtd, loff_t ofs)
 static int part_block_markbad (struct mtd_info *mtd, loff_t ofs)
 {
 	struct mtd_part *part = PART(mtd);
+	int res;
+
 	if (!(mtd->flags & MTD_WRITEABLE))
 		return -EROFS;
 	if (ofs >= mtd->size)
 		return -EINVAL;
 	ofs += part->offset;
-	return part->master->block_markbad(part->master, ofs);
+	res = part->master->block_markbad(part->master, ofs);
+	if (!res)
+		mtd->ecc_stats.badblocks++;
+	return res;
 }
 
 /*
@@ -436,6 +459,16 @@ int add_mtd_partitions(struct mtd_info *master,
 		}
 
 		slave->mtd.ecclayout = master->ecclayout;
+		if (master->block_isbad) {
+			uint32_t offs = 0;
+
+			while(offs < slave->mtd.size) {
+				if (master->block_isbad(master,
+							offs + slave->offset))
+					slave->mtd.ecc_stats.badblocks++;
+				offs += slave->mtd.erasesize;
+			}
+		}
 
 		if(parts[i].mtdp)
 		{	/* store the object pointer (caller may or may not register it */
diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c
index 7a3a44907715..ea6d2c334aed 100644
--- a/drivers/mtd/nand/nand_base.c
+++ b/drivers/mtd/nand/nand_base.c
@@ -347,7 +347,7 @@ static int nand_default_block_markbad(struct mtd_info *mtd, loff_t ofs)
 {
 	struct nand_chip *chip = mtd->priv;
 	uint8_t buf[2] = { 0, 0 };
-	int block;
+	int block, ret;
 
 	/* Get block number */
 	block = ((int)ofs) >> chip->bbt_erase_shift;
@@ -356,16 +356,22 @@ static int nand_default_block_markbad(struct mtd_info *mtd, loff_t ofs)
 
 	/* Do we have a flash based bad block table ? */
 	if (chip->options & NAND_USE_FLASH_BBT)
-		return nand_update_bbt(mtd, ofs);
-
-	/* We write two bytes, so we dont have to mess with 16 bit access */
-	ofs += mtd->oobsize;
-	chip->ops.len = 2;
-	chip->ops.datbuf = NULL;
-	chip->ops.oobbuf = buf;
-	chip->ops.ooboffs = chip->badblockpos & ~0x01;
+		ret = nand_update_bbt(mtd, ofs);
+	else {
+		/* We write two bytes, so we dont have to mess with 16 bit
+		 * access
+		 */
+		ofs += mtd->oobsize;
+		chip->ops.len = 2;
+		chip->ops.datbuf = NULL;
+		chip->ops.oobbuf = buf;
+		chip->ops.ooboffs = chip->badblockpos & ~0x01;
 
-	return nand_do_write_oob(mtd, ofs, &chip->ops);
+		ret = nand_do_write_oob(mtd, ofs, &chip->ops);
+	}
+	if (!ret)
+		mtd->ecc_stats.badblocks++;
+	return ret;
 }
 
 /**
diff --git a/drivers/mtd/nand/nand_bbt.c b/drivers/mtd/nand/nand_bbt.c
index 480c3cbf9bf9..a612c4ea8194 100644
--- a/drivers/mtd/nand/nand_bbt.c
+++ b/drivers/mtd/nand/nand_bbt.c
@@ -176,6 +176,7 @@ static int read_bbt(struct mtd_info *mtd, uint8_t *buf, int page, int num,
 					printk(KERN_DEBUG "nand_read_bbt: Reserved block at 0x%08x\n",
 					       ((offs << 2) + (act >> 1)) << this->bbt_erase_shift);
 					this->bbt[offs + (act >> 3)] |= 0x2 << (act & 0x06);
+					mtd->ecc_stats.bbtblocks++;
 					continue;
 				}
 				/* Leave it for now, if its matured we can move this
@@ -187,6 +188,7 @@ static int read_bbt(struct mtd_info *mtd, uint8_t *buf, int page, int num,
 					this->bbt[offs + (act >> 3)] |= 0x3 << (act & 0x06);
 				else
 					this->bbt[offs + (act >> 3)] |= 0x1 << (act & 0x06);
+				mtd->ecc_stats.badblocks++;
 			}
 		}
 		totlen -= len;
@@ -431,6 +433,7 @@ static int create_bbt(struct mtd_info *mtd, uint8_t *buf,
 			this->bbt[i >> 3] |= 0x03 << (i & 0x6);
 			printk(KERN_WARNING "Bad eraseblock %d at 0x%08x\n",
 			       i >> 1, (unsigned int)from);
+			mtd->ecc_stats.badblocks++;
 		}
 
 		i += 2;
diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h
index e75bb584e80b..9536567d041b 100644
--- a/include/linux/mtd/mtd.h
+++ b/include/linux/mtd/mtd.h
@@ -56,17 +56,6 @@ struct mtd_erase_region_info {
 	u_int32_t numblocks;		/* Number of blocks of erasesize in this region */
 };
 
-/**
- * struct mtd_ecc_stats - error correction status
- *
- * @corrected:	number of corrected bits
- * @failed:	number of uncorrectable errors
- */
-struct mtd_ecc_stats {
-	unsigned long corrected;
-	unsigned long failed;
-};
-
 /*
  * oob operation modes
  *
diff --git a/include/mtd/mtd-abi.h b/include/mtd/mtd-abi.h
index 54c673f9648d..c11a589bdedf 100644
--- a/include/mtd/mtd-abi.h
+++ b/include/mtd/mtd-abi.h
@@ -99,6 +99,8 @@ struct otp_info {
 #define OTPGETREGIONINFO	_IOW('M', 15, struct otp_info)
 #define OTPLOCK			_IOR('M', 16, struct otp_info)
 #define ECCGETLAYOUT		_IOR('M', 17, struct nand_ecclayout)
+#define ECCGETSTATS		_IOR('M', 18, struct mtd_ecc_stats)
+#define MTDFILEMODE		_IO('M', 19)
 
 /*
  * Obsolete legacy interface. Keep it in order not to break userspace
@@ -128,4 +130,29 @@ struct nand_ecclayout {
 	struct nand_oobfree oobfree[MTD_MAX_OOBFREE_ENTRIES];
 };
 
+/**
+ * struct mtd_ecc_stats - error correction status
+ *
+ * @corrected:	number of corrected bits
+ * @failed:	number of uncorrectable errors
+ * @badblocks:	number of bad blocks in this partition
+ * @bbtblocks:	number of blocks reserved for bad block tables
+ */
+struct mtd_ecc_stats {
+	uint32_t corrected;
+	uint32_t failed;
+	uint32_t badblocks;
+	uint32_t bbtblocks;
+};
+
+/*
+ * Read/write file modes for access to MTD
+ */
+enum mtd_file_modes {
+	MTD_MODE_NORMAL = MTD_OTP_OFF,
+	MTD_MODE_OTP_FACTORY = MTD_OTP_FACTORY,
+	MTD_MODE_OTP_USER = MTD_OTP_USER,
+	MTD_MODE_RAW,
+};
+
 #endif /* __MTD_ABI_H__ */
-- 
cgit v1.2.3


From 7395acb2c840fd4d0cacc91d6fb71440057141ab Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Wed, 31 May 2006 18:27:25 +0900
Subject: [PATCH] libata: shift host flag constants

Nudge host flag constants to make a room after ATA_FLAG_EH_PENDING.
New EH flag will be added.

Signed-off-by: Tejun Heo <htejun@gmail.com>
---
 include/linux/libata.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/libata.h b/include/linux/libata.h
index b0ee1c1437d6..3f9c65f1aafa 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -149,14 +149,14 @@ enum {
 	ATA_FLAG_NO_ATAPI	= (1 << 6), /* No ATAPI support */
 	ATA_FLAG_PIO_DMA	= (1 << 7), /* PIO cmds via DMA */
 	ATA_FLAG_PIO_LBA48	= (1 << 8), /* Host DMA engine is LBA28 only */
-	ATA_FLAG_PIO_POLLING	= (1 << 10), /* use polling PIO if LLD
-					      * doesn't handle PIO interrupts */
-	ATA_FLAG_NCQ		= (1 << 11), /* host supports NCQ */
+	ATA_FLAG_PIO_POLLING	= (1 << 9), /* use polling PIO if LLD
+					     * doesn't handle PIO interrupts */
+	ATA_FLAG_NCQ		= (1 << 10), /* host supports NCQ */
 
-	ATA_FLAG_DEBUGMSG	= (1 << 14),
-	ATA_FLAG_FLUSH_PORT_TASK = (1 << 15), /* flush port task */
+	ATA_FLAG_DEBUGMSG	= (1 << 13),
+	ATA_FLAG_FLUSH_PORT_TASK = (1 << 14), /* flush port task */
 
-	ATA_FLAG_EH_PENDING	= (1 << 16), /* EH pending */
+	ATA_FLAG_EH_PENDING	= (1 << 15), /* EH pending */
 	ATA_FLAG_FROZEN		= (1 << 17), /* port is frozen */
 	ATA_FLAG_RECOVERED	= (1 << 18), /* recovery action performed */
 
-- 
cgit v1.2.3


From c6cf9e99d1de5ca6a08fb639bb73031ffe50d802 Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Wed, 31 May 2006 18:27:27 +0900
Subject: [PATCH] libata: implement ata_eh_wait()

Implement ata_eh_wait().  On return from this function, it's
guaranteed that the EH which was pending or in progress when the
function was called is complete - including the tailing part of SCSI
EH.  This will be used by hotplug and others to synchronize with EH.

Signed-off-by: Tejun Heo <htejun@gmail.com>
---
 drivers/scsi/libata-core.c |  1 +
 drivers/scsi/libata-eh.c   | 38 ++++++++++++++++++++++++++++++++++++++
 drivers/scsi/libata.h      |  1 +
 include/linux/libata.h     |  2 ++
 4 files changed, 42 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c
index 11df827e166f..66df895c9617 100644
--- a/drivers/scsi/libata-core.c
+++ b/drivers/scsi/libata-core.c
@@ -5189,6 +5189,7 @@ static void ata_host_init(struct ata_port *ap, struct Scsi_Host *host,
 
 	INIT_WORK(&ap->port_task, NULL, NULL);
 	INIT_LIST_HEAD(&ap->eh_done_q);
+	init_waitqueue_head(&ap->eh_wait_q);
 
 	/* set cable type */
 	ap->cbl = ATA_CBL_NONE;
diff --git a/drivers/scsi/libata-eh.c b/drivers/scsi/libata-eh.c
index b88f492eab12..9173d8f2ce5d 100644
--- a/drivers/scsi/libata-eh.c
+++ b/drivers/scsi/libata-eh.c
@@ -237,6 +237,7 @@ void ata_scsi_error(struct Scsi_Host *host)
 		ap->eh_context.i = ap->eh_info;
 		memset(&ap->eh_info, 0, sizeof(ap->eh_info));
 
+		ap->flags |= ATA_FLAG_EH_IN_PROGRESS;
 		ap->flags &= ~ATA_FLAG_EH_PENDING;
 
 		spin_unlock_irqrestore(hs_lock, flags);
@@ -290,11 +291,48 @@ void ata_scsi_error(struct Scsi_Host *host)
 		ata_port_printk(ap, KERN_INFO, "EH complete\n");
 	ap->flags &= ~ATA_FLAG_RECOVERED;
 
+	/* tell wait_eh that we're done */
+	ap->flags &= ~ATA_FLAG_EH_IN_PROGRESS;
+	wake_up_all(&ap->eh_wait_q);
+
 	spin_unlock_irqrestore(hs_lock, flags);
 
 	DPRINTK("EXIT\n");
 }
 
+/**
+ *	ata_port_wait_eh - Wait for the currently pending EH to complete
+ *	@ap: Port to wait EH for
+ *
+ *	Wait until the currently pending EH is complete.
+ *
+ *	LOCKING:
+ *	Kernel thread context (may sleep).
+ */
+void ata_port_wait_eh(struct ata_port *ap)
+{
+	unsigned long flags;
+	DEFINE_WAIT(wait);
+
+ retry:
+	spin_lock_irqsave(&ap->host_set->lock, flags);
+
+	while (ap->flags & (ATA_FLAG_EH_PENDING | ATA_FLAG_EH_IN_PROGRESS)) {
+		prepare_to_wait(&ap->eh_wait_q, &wait, TASK_UNINTERRUPTIBLE);
+		spin_unlock_irqrestore(&ap->host_set->lock, flags);
+		schedule();
+		spin_lock_irqsave(&ap->host_set->lock, flags);
+	}
+
+	spin_unlock_irqrestore(&ap->host_set->lock, flags);
+
+	/* make sure SCSI EH is complete */
+	if (scsi_host_in_recovery(ap->host)) {
+		msleep(10);
+		goto retry;
+	}
+}
+
 /**
  *	ata_qc_timeout - Handle timeout of queued command
  *	@qc: Command that timed out
diff --git a/drivers/scsi/libata.h b/drivers/scsi/libata.h
index b76ad7d7062a..d56d9e1d73dc 100644
--- a/drivers/scsi/libata.h
+++ b/drivers/scsi/libata.h
@@ -103,6 +103,7 @@ extern void ata_schedule_scsi_eh(struct Scsi_Host *shost);
 /* libata-eh.c */
 extern enum scsi_eh_timer_return ata_scsi_timed_out(struct scsi_cmnd *cmd);
 extern void ata_scsi_error(struct Scsi_Host *host);
+extern void ata_port_wait_eh(struct ata_port *ap);
 extern void ata_qc_schedule_eh(struct ata_queued_cmd *qc);
 
 #endif /* __LIBATA_H__ */
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 3f9c65f1aafa..2eb5828839e4 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -157,6 +157,7 @@ enum {
 	ATA_FLAG_FLUSH_PORT_TASK = (1 << 14), /* flush port task */
 
 	ATA_FLAG_EH_PENDING	= (1 << 15), /* EH pending */
+	ATA_FLAG_EH_IN_PROGRESS	= (1 << 16), /* EH in progress */
 	ATA_FLAG_FROZEN		= (1 << 17), /* port is frozen */
 	ATA_FLAG_RECOVERED	= (1 << 18), /* recovery action performed */
 
@@ -490,6 +491,7 @@ struct ata_port {
 
 	u32			msg_enable;
 	struct list_head	eh_done_q;
+	wait_queue_head_t	eh_wait_q;
 
 	void			*private_data;
 
-- 
cgit v1.2.3


From abdda7331d469fa965167365f011d05e226008fb Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Wed, 31 May 2006 18:27:29 +0900
Subject: [PATCH] libata-hp-prep: add flags and eh_info/context fields for
 hotplug

Add hotplug related flags and eh_info/context fields.

Signed-off-by: Tejun Heo <htejun@gmail.com>
---
 include/linux/libata.h | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/libata.h b/include/linux/libata.h
index 2eb5828839e4..d4a668cf143b 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -131,6 +131,9 @@ enum {
 
 	ATA_DFLAG_PIO		= (1 << 8), /* device currently in PIO mode */
 
+	ATA_DFLAG_DETACH	= (1 << 16),
+	ATA_DFLAG_DETACHED	= (1 << 17),
+
 	ATA_DEV_UNKNOWN		= 0,	/* unknown device */
 	ATA_DEV_ATA		= 1,	/* ATA device */
 	ATA_DEV_ATA_UNSUP	= 2,	/* ATA device (unsupported) */
@@ -152,6 +155,9 @@ enum {
 	ATA_FLAG_PIO_POLLING	= (1 << 9), /* use polling PIO if LLD
 					     * doesn't handle PIO interrupts */
 	ATA_FLAG_NCQ		= (1 << 10), /* host supports NCQ */
+	ATA_FLAG_HRST_TO_RESUME	= (1 << 11), /* hardreset to resume phy */
+	ATA_FLAG_SKIP_D2H_BSY	= (1 << 12), /* can't wait for the first D2H
+					      * Register FIS clearing BSY */
 
 	ATA_FLAG_DEBUGMSG	= (1 << 13),
 	ATA_FLAG_FLUSH_PORT_TASK = (1 << 14), /* flush port task */
@@ -160,6 +166,9 @@ enum {
 	ATA_FLAG_EH_IN_PROGRESS	= (1 << 16), /* EH in progress */
 	ATA_FLAG_FROZEN		= (1 << 17), /* port is frozen */
 	ATA_FLAG_RECOVERED	= (1 << 18), /* recovery action performed */
+	ATA_FLAG_LOADING	= (1 << 19), /* boot/loading probe */
+	ATA_FLAG_UNLOADING	= (1 << 20), /* module is unloading */
+	ATA_FLAG_SCSI_HOTPLUG	= (1 << 21), /* SCSI hotplug scheduled */
 
 	ATA_FLAG_DISABLED	= (1 << 22), /* port is disabled, ignore it */
 	ATA_FLAG_SUSPENDED	= (1 << 23), /* port is suspended (power) */
@@ -241,7 +250,9 @@ enum {
 	ATA_EH_RESET_MASK	= ATA_EH_SOFTRESET | ATA_EH_HARDRESET,
 
 	/* ata_eh_info->flags */
-	ATA_EHI_DID_RESET	= (1 << 0), /* already reset this port */
+	ATA_EHI_HOTPLUGGED	= (1 << 0),  /* could have been hotplugged */
+
+	ATA_EHI_DID_RESET	= (1 << 16), /* already reset this port */
 
 	/* max repeat if error condition is still set after ->error_handler */
 	ATA_EH_MAX_REPEAT	= 5,
@@ -434,6 +445,10 @@ struct ata_eh_info {
 	unsigned int		err_mask;	/* port-wide err_mask */
 	unsigned int		action;		/* ATA_EH_* action mask */
 	unsigned int		flags;		/* ATA_EHI_* flags */
+
+	unsigned long		hotplug_timestamp;
+	unsigned int		probe_mask;
+
 	char			desc[ATA_EH_DESC_LEN];
 	int			desc_len;
 };
@@ -441,6 +456,8 @@ struct ata_eh_info {
 struct ata_eh_context {
 	struct ata_eh_info	i;
 	int			tries[ATA_MAX_DEVICES];
+	unsigned int		classes[ATA_MAX_DEVICES];
+	unsigned int		did_probe_mask;
 };
 
 struct ata_port {
-- 
cgit v1.2.3


From 72fa4b742b327bd1b07985d79a61c61dbd9fd4e6 Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Wed, 31 May 2006 18:27:32 +0900
Subject: [PATCH] libata-hp-prep: make some ata_device fields persistent

Lifetimes of some fields span over device plugging/unplugging.  This
patch moves such persistent fields to the top of ata_device and
separate them with ATA_DEVICE_CLEAR_OFFSET.  Fields above the offset
are initialized once during host initializatino while all other fields
are cleared before hotplugging.  Currently ->ap, devno and part of
flags are persistent.

Note that flags is partially cleared while holding host_set lock.
This is to synchronize with later warm plug implementation which will
record hotplug request in dev->flags.

Signed-off-by: Tejun Heo <htejun@gmail.com>
---
 drivers/scsi/libata-core.c | 14 ++++++++++++--
 include/linux/libata.h     | 11 +++++++++--
 2 files changed, 21 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c
index 8fda8228159c..a07ab77d32d6 100644
--- a/drivers/scsi/libata-core.c
+++ b/drivers/scsi/libata-core.c
@@ -5153,9 +5153,18 @@ static void ata_host_remove(struct ata_port *ap, unsigned int do_unregister)
 void ata_dev_init(struct ata_device *dev)
 {
 	struct ata_port *ap = dev->ap;
+	unsigned long flags;
+
+	/* High bits of dev->flags are used to record warm plug
+	 * requests which occur asynchronously.  Synchronize using
+	 * host_set lock.
+	 */
+	spin_lock_irqsave(&ap->host_set->lock, flags);
+	dev->flags &= ~ATA_DFLAG_INIT_MASK;
+	spin_unlock_irqrestore(&ap->host_set->lock, flags);
 
-	memset((void *)dev, 0, sizeof(*dev));
-	dev->devno = dev - ap->device;
+	memset((void *)dev + ATA_DEVICE_CLEAR_OFFSET, 0,
+	       sizeof(*dev) - ATA_DEVICE_CLEAR_OFFSET);
 	dev->pio_mask = UINT_MAX;
 	dev->mwdma_mask = UINT_MAX;
 	dev->udma_mask = UINT_MAX;
@@ -5218,6 +5227,7 @@ static void ata_host_init(struct ata_port *ap, struct Scsi_Host *host,
 	for (i = 0; i < ATA_MAX_DEVICES; i++) {
 		struct ata_device *dev = &ap->device[i];
 		dev->ap = ap;
+		dev->devno = i;
 		ata_dev_init(dev);
 	}
 
diff --git a/include/linux/libata.h b/include/linux/libata.h
index d4a668cf143b..aa14eda0656c 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -130,6 +130,7 @@ enum {
 	ATA_DFLAG_CFG_MASK	= (1 << 8) - 1,
 
 	ATA_DFLAG_PIO		= (1 << 8), /* device currently in PIO mode */
+	ATA_DFLAG_INIT_MASK	= (1 << 16) - 1,
 
 	ATA_DFLAG_DETACH	= (1 << 16),
 	ATA_DFLAG_DETACHED	= (1 << 17),
@@ -410,10 +411,11 @@ struct ata_ering {
 
 struct ata_device {
 	struct ata_port		*ap;
-	u64			n_sectors;	/* size of device, if ATA */
+	unsigned int		devno;		/* 0 or 1 */
 	unsigned long		flags;		/* ATA_DFLAG_xxx */
+	/* n_sector is used as CLEAR_OFFSET, read comment above CLEAR_OFFSET */
+	u64			n_sectors;	/* size of device, if ATA */
 	unsigned int		class;		/* ATA_DEV_xxx */
-	unsigned int		devno;		/* 0 or 1 */
 	u16			id[ATA_ID_WORDS]; /* IDENTIFY xxx DEVICE data */
 	u8			pio_mode;
 	u8			dma_mode;
@@ -439,6 +441,11 @@ struct ata_device {
 	struct ata_ering	ering;
 };
 
+/* Offset into struct ata_device.  Fields above it are maintained
+ * acress device init.  Fields below are zeroed.
+ */
+#define ATA_DEVICE_CLEAR_OFFSET		offsetof(struct ata_device, n_sectors)
+
 struct ata_eh_info {
 	struct ata_device	*dev;		/* offending device */
 	u32			serror;		/* SError from LLDD */
-- 
cgit v1.2.3


From 5a04bf4befa8bffa012eedc3a0903c158b9131a9 Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Wed, 31 May 2006 18:27:38 +0900
Subject: [PATCH] libata-hp-prep: implement ap->hw_sata_spd_limit

Add ap->hw_sata_spd_limit and initialize it once during the boot
initialization (or driver load initialization).  ap->sata_spd_limit is
reset to ap->hw_sata_spd_limit on hotplug.  This prevents spd limits
introduced by earlier devices from affecting new devices.

Signed-off-by: Tejun Heo <htejun@gmail.com>
---
 drivers/scsi/libata-core.c | 21 ++++++++++++---------
 include/linux/libata.h     |  1 +
 2 files changed, 13 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c
index a07ab77d32d6..b1a02fdbb0a5 100644
--- a/drivers/scsi/libata-core.c
+++ b/drivers/scsi/libata-core.c
@@ -2466,17 +2466,9 @@ static int sata_phy_resume(struct ata_port *ap)
  */
 void ata_std_probeinit(struct ata_port *ap)
 {
-	u32 scontrol;
-
 	/* resume link */
 	sata_phy_resume(ap);
 
-	/* init sata_spd_limit to the current value */
-	if (sata_scr_read(ap, SCR_CONTROL, &scontrol) == 0) {
-		int spd = (scontrol >> 4) & 0xf;
-		ap->sata_spd_limit &= (1 << spd) - 1;
-	}
-
 	/* wait for device */
 	if (ata_port_online(ap))
 		ata_busy_sleep(ap, ATA_TMOUT_BOOT_QUICK, ATA_TMOUT_BOOT);
@@ -5155,6 +5147,9 @@ void ata_dev_init(struct ata_device *dev)
 	struct ata_port *ap = dev->ap;
 	unsigned long flags;
 
+	/* SATA spd limit is bound to the first device */
+	ap->sata_spd_limit = ap->hw_sata_spd_limit;
+
 	/* High bits of dev->flags are used to record warm plug
 	 * requests which occur asynchronously.  Synchronize using
 	 * host_set lock.
@@ -5210,7 +5205,7 @@ static void ata_host_init(struct ata_port *ap, struct Scsi_Host *host,
 	ap->udma_mask = ent->udma_mask;
 	ap->flags |= ent->host_flags;
 	ap->ops = ent->port_ops;
-	ap->sata_spd_limit = UINT_MAX;
+	ap->hw_sata_spd_limit = UINT_MAX;
 	ap->active_tag = ATA_TAG_POISON;
 	ap->last_ctl = 0xFF;
 	ap->msg_enable = ATA_MSG_DRV;
@@ -5375,10 +5370,18 @@ int ata_device_add(const struct ata_probe_ent *ent)
 	DPRINTK("probe begin\n");
 	for (i = 0; i < count; i++) {
 		struct ata_port *ap;
+		u32 scontrol;
 		int rc;
 
 		ap = host_set->ports[i];
 
+		/* init sata_spd_limit to the current value */
+		if (sata_scr_read(ap, SCR_CONTROL, &scontrol) == 0) {
+			int spd = (scontrol >> 4) & 0xf;
+			ap->hw_sata_spd_limit &= (1 << spd) - 1;
+		}
+		ap->sata_spd_limit = ap->hw_sata_spd_limit;
+
 		DPRINTK("ata%u: bus probe begin\n", ap->id);
 		rc = ata_bus_probe(ap);
 		DPRINTK("ata%u: bus probe end\n", ap->id);
diff --git a/include/linux/libata.h b/include/linux/libata.h
index aa14eda0656c..10dc235ad8bc 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -489,6 +489,7 @@ struct ata_port {
 	unsigned int		mwdma_mask;
 	unsigned int		udma_mask;
 	unsigned int		cbl;	/* cable type; ATA_CBL_xxx */
+	unsigned int		hw_sata_spd_limit;
 	unsigned int		sata_spd_limit;	/* SATA PHY speed limit */
 
 	/* record runtime error info, protected by host_set lock */
-- 
cgit v1.2.3


From 3edebac41bab7e146578ad9e723ee7fff71c99c0 Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Wed, 31 May 2006 18:27:40 +0900
Subject: [PATCH] libata-hp-prep: store attached SCSI device

Add device persistent field dev->sdev and store the attached SCSI
device.  With hotplug, libata needs to know the attached SCSI device
to offline and detach it, but scsi_device_lookup() cannot be used
because libata will reuse SCSI ID numbers - dead but not gone devices
(due to zombie opens, etc...) interfere with the lookup.

dev->sdev doesn't hold reference to the SCSI device.  It's cleared
when the SCSI device goes away.

Signed-off-by: Tejun Heo <htejun@gmail.com>
---
 drivers/scsi/libata-scsi.c | 14 ++++++++++----
 include/linux/libata.h     |  1 +
 2 files changed, 11 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/scsi/libata-scsi.c b/drivers/scsi/libata-scsi.c
index 05090768d9a8..da9689b70826 100644
--- a/drivers/scsi/libata-scsi.c
+++ b/drivers/scsi/libata-scsi.c
@@ -2743,16 +2743,22 @@ void ata_scsi_simulate(struct ata_device *dev, struct scsi_cmnd *cmd,
 
 void ata_scsi_scan_host(struct ata_port *ap)
 {
-	struct ata_device *dev;
 	unsigned int i;
 
 	if (ap->flags & ATA_FLAG_DISABLED)
 		return;
 
 	for (i = 0; i < ATA_MAX_DEVICES; i++) {
-		dev = &ap->device[i];
+		struct ata_device *dev = &ap->device[i];
+		struct scsi_device *sdev;
+
+		if (!ata_dev_enabled(dev) || dev->sdev)
+			continue;
 
-		if (ata_dev_enabled(dev))
-			scsi_scan_target(&ap->host->shost_gendev, 0, i, 0, 0);
+		sdev = __scsi_add_device(ap->host, 0, i, 0, NULL);
+		if (!IS_ERR(sdev)) {
+			dev->sdev = sdev;
+			scsi_device_put(sdev);
+		}
 	}
 }
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 10dc235ad8bc..c0513c752751 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -413,6 +413,7 @@ struct ata_device {
 	struct ata_port		*ap;
 	unsigned int		devno;		/* 0 or 1 */
 	unsigned long		flags;		/* ATA_DFLAG_xxx */
+	struct scsi_device	*sdev;		/* attached SCSI device */
 	/* n_sector is used as CLEAR_OFFSET, read comment above CLEAR_OFFSET */
 	u64			n_sectors;	/* size of device, if ATA */
 	unsigned int		class;		/* ATA_DEV_xxx */
-- 
cgit v1.2.3


From d7bb4cc7575929a60b0a718daa1bce87bea9a9cc Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Wed, 31 May 2006 18:27:46 +0900
Subject: [PATCH] libata-hp-prep: implement sata_phy_debounce()

With hotplug, PHY always needs to be debounced before a reset as any
reset might find new devices.  Extract PHY waiting code from
sata_phy_resume() and extend it to include SStatus debouncing.  Note
that sata_phy_debounce() is superset of what used to be done inside
sata_phy_resume().

Three default debounce timing parameters are defined to be used by
hot/boot plug.  As resume failure during probing will be properly
handled as errors, timeout doesn't have to be long as before.
probeinit() uses the same timeout to retain the original behavior.

Signed-off-by: Tejun Heo <htejun@gmail.com>
---
 drivers/scsi/libata-core.c | 107 +++++++++++++++++++++++++++++++++++++++------
 include/linux/libata.h     |   6 +++
 2 files changed, 99 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c
index a9f79b47b4c3..4823ecefb8a1 100644
--- a/drivers/scsi/libata-core.c
+++ b/drivers/scsi/libata-core.c
@@ -61,6 +61,11 @@
 
 #include "libata.h"
 
+/* debounce timing parameters in msecs { interval, duration, timeout } */
+const unsigned long sata_deb_timing_boot[]		= {   5,  100, 2000 };
+const unsigned long sata_deb_timing_eh[]		= {  25,  500, 2000 };
+const unsigned long sata_deb_timing_before_fsrst[]	= { 100, 2000, 5000 };
+
 static unsigned int ata_dev_init_params(struct ata_device *dev,
 					u16 heads, u16 sectors);
 static unsigned int ata_dev_set_xfermode(struct ata_device *dev);
@@ -2427,10 +2432,81 @@ err_out:
 	DPRINTK("EXIT\n");
 }
 
-static int sata_phy_resume(struct ata_port *ap)
+/**
+ *	sata_phy_debounce - debounce SATA phy status
+ *	@ap: ATA port to debounce SATA phy status for
+ *	@params: timing parameters { interval, duratinon, timeout } in msec
+ *
+ *	Make sure SStatus of @ap reaches stable state, determined by
+ *	holding the same value where DET is not 1 for @duration polled
+ *	every @interval, before @timeout.  Timeout constraints the
+ *	beginning of the stable state.  Because, after hot unplugging,
+ *	DET gets stuck at 1 on some controllers, this functions waits
+ *	until timeout then returns 0 if DET is stable at 1.
+ *
+ *	LOCKING:
+ *	Kernel thread context (may sleep)
+ *
+ *	RETURNS:
+ *	0 on success, -errno on failure.
+ */
+int sata_phy_debounce(struct ata_port *ap, const unsigned long *params)
 {
-	unsigned long timeout = jiffies + (HZ * 5);
-	u32 scontrol, sstatus;
+	unsigned long interval_msec = params[0];
+	unsigned long duration = params[1] * HZ / 1000;
+	unsigned long timeout = jiffies + params[2] * HZ / 1000;
+	unsigned long last_jiffies;
+	u32 last, cur;
+	int rc;
+
+	if ((rc = sata_scr_read(ap, SCR_STATUS, &cur)))
+		return rc;
+	cur &= 0xf;
+
+	last = cur;
+	last_jiffies = jiffies;
+
+	while (1) {
+		msleep(interval_msec);
+		if ((rc = sata_scr_read(ap, SCR_STATUS, &cur)))
+			return rc;
+		cur &= 0xf;
+
+		/* DET stable? */
+		if (cur == last) {
+			if (cur == 1 && time_before(jiffies, timeout))
+				continue;
+			if (time_after(jiffies, last_jiffies + duration))
+				return 0;
+			continue;
+		}
+
+		/* unstable, start over */
+		last = cur;
+		last_jiffies = jiffies;
+
+		/* check timeout */
+		if (time_after(jiffies, timeout))
+			return -EBUSY;
+	}
+}
+
+/**
+ *	sata_phy_resume - resume SATA phy
+ *	@ap: ATA port to resume SATA phy for
+ *	@params: timing parameters { interval, duratinon, timeout } in msec
+ *
+ *	Resume SATA phy of @ap and debounce it.
+ *
+ *	LOCKING:
+ *	Kernel thread context (may sleep)
+ *
+ *	RETURNS:
+ *	0 on success, -errno on failure.
+ */
+int sata_phy_resume(struct ata_port *ap, const unsigned long *params)
+{
+	u32 scontrol;
 	int rc;
 
 	if ((rc = sata_scr_read(ap, SCR_CONTROL, &scontrol)))
@@ -2441,16 +2517,12 @@ static int sata_phy_resume(struct ata_port *ap)
 	if ((rc = sata_scr_write(ap, SCR_CONTROL, scontrol)))
 		return rc;
 
-	/* Wait for phy to become ready, if necessary. */
-	do {
-		msleep(200);
-		if ((rc = sata_scr_read(ap, SCR_STATUS, &sstatus)))
-			return rc;
-		if ((sstatus & 0xf) != 1)
-			return 0;
-	} while (time_before(jiffies, timeout));
+	/* Some PHYs react badly if SStatus is pounded immediately
+	 * after resuming.  Delay 200ms before debouncing.
+	 */
+	msleep(200);
 
-	return -EBUSY;
+	return sata_phy_debounce(ap, params);
 }
 
 /**
@@ -2468,8 +2540,10 @@ static int sata_phy_resume(struct ata_port *ap)
  */
 void ata_std_probeinit(struct ata_port *ap)
 {
+	static const unsigned long deb_timing[] = { 5, 100, 5000 };
+
 	/* resume link */
-	sata_phy_resume(ap);
+	sata_phy_resume(ap, deb_timing);
 
 	/* wait for device */
 	if (ata_port_online(ap))
@@ -2585,7 +2659,7 @@ int sata_std_hardreset(struct ata_port *ap, unsigned int *class)
 	msleep(1);
 
 	/* bring phy back */
-	sata_phy_resume(ap);
+	sata_phy_resume(ap, sata_deb_timing_eh);
 
 	/* TODO: phy layer with polling, timeouts, etc. */
 	if (ata_port_offline(ap)) {
@@ -5718,6 +5792,9 @@ u32 ata_wait_register(void __iomem *reg, u32 mask, u32 val,
  * Do not depend on ABI/API stability.
  */
 
+EXPORT_SYMBOL_GPL(sata_deb_timing_boot);
+EXPORT_SYMBOL_GPL(sata_deb_timing_eh);
+EXPORT_SYMBOL_GPL(sata_deb_timing_before_fsrst);
 EXPORT_SYMBOL_GPL(ata_std_bios_param);
 EXPORT_SYMBOL_GPL(ata_std_ports);
 EXPORT_SYMBOL_GPL(ata_device_add);
@@ -5757,6 +5834,8 @@ EXPORT_SYMBOL_GPL(ata_bmdma_error_handler);
 EXPORT_SYMBOL_GPL(ata_bmdma_post_internal_cmd);
 EXPORT_SYMBOL_GPL(ata_port_probe);
 EXPORT_SYMBOL_GPL(sata_set_spd);
+EXPORT_SYMBOL_GPL(sata_phy_debounce);
+EXPORT_SYMBOL_GPL(sata_phy_resume);
 EXPORT_SYMBOL_GPL(sata_phy_reset);
 EXPORT_SYMBOL_GPL(__sata_phy_reset);
 EXPORT_SYMBOL_GPL(ata_bus_reset);
diff --git a/include/linux/libata.h b/include/linux/libata.h
index c0513c752751..1c167f728fb4 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -607,11 +607,17 @@ struct ata_timing {
 
 #define FIT(v,vmin,vmax)	max_t(short,min_t(short,v,vmax),vmin)
 
+extern const unsigned long sata_deb_timing_boot[];
+extern const unsigned long sata_deb_timing_eh[];
+extern const unsigned long sata_deb_timing_before_fsrst[];
+
 extern void ata_port_probe(struct ata_port *);
 extern void __sata_phy_reset(struct ata_port *ap);
 extern void sata_phy_reset(struct ata_port *ap);
 extern void ata_bus_reset(struct ata_port *ap);
 extern int sata_set_spd(struct ata_port *ap);
+extern int sata_phy_debounce(struct ata_port *ap, const unsigned long *param);
+extern int sata_phy_resume(struct ata_port *ap, const unsigned long *param);
 extern int ata_drive_probe_reset(struct ata_port *ap,
 			ata_probeinit_fn_t probeinit,
 			ata_reset_fn_t softreset, ata_reset_fn_t hardreset,
-- 
cgit v1.2.3


From f5914a461eb9703773226a0813f6ffcae10c0861 Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Wed, 31 May 2006 18:27:48 +0900
Subject: [PATCH] libata-hp-prep: add prereset() method and implement
 ata_std_prereset()

With hotplug, every reset might be a probing reset and thus something
similar to probe_init() is needed.  prereset() method is called before
a series of resets to a port and is the counterpart of postreset().
prereset() can tell EH to use different type of reset or skip reset by
modifying ehc->i.action.

This patch also implements ata_std_prereset().  Most controllers
should be able to use this function directly or with some wrapping.
After hotplug, different controllers need different actions to resume
the PHY and detect the newly attached device.  Controllers can be
categorized as follows.

* Controllers which can wait for the first D2H FIS after hotplug.
  Note that if the waiting is implemented by polling TF status, there
  needs to be a way to set BSY on PHY status change.  It can be
  implemented by hardware or with the help of the driver.

* Controllers which can wait for the first D2H FIS after sending
  COMRESET.  These controllers need to issue COMRESET to wait for the
  first FIS.  Note that the received D2H FIS could be the first D2H
  FIS after POR (power-on-reset) or D2H FIS in response to the
  COMRESET.  Some controllers use COMRESET as TF status
  synchronization point and clear TF automatically (sata_sil).

* Controllers which cannot wait for the first D2H FIS reliably.
  Blindly issuing SRST to spinning-up device often results in command
  issue failure or timeout, causing extended delay.  For these
  controllers, ata_std_prereset() explicitly waits ATA_SPINUP_WAIT
  (currently 8s) to give newly attached device time to spin up, then
  issues reset.  Note that failing to getting ready in ATA_SPINUP_WAIT
  is not critical.  libata will retry.  So, the timeout needs to be
  long enough to spin up most devices.

LLDDs can tell ata_std_prereset() which of above action is needed with
ATA_FLAG_HRST_TO_RESUME and ATA_FLAG_SKIP_D2H_BSY flags.  These flags
are PHY-specific property and will be moved to ata_link later.

While at it, this patch unifies function typedef's such that they all
have named arguments.

Signed-off-by: Tejun Heo <htejun@gmail.com>
---
 drivers/scsi/ahci.c         |  3 +-
 drivers/scsi/libata-bmdma.c | 11 +++---
 drivers/scsi/libata-core.c  | 85 +++++++++++++++++++++++++++++++++++++++++++++
 drivers/scsi/libata-eh.c    | 60 +++++++++++++++++++++++++++-----
 drivers/scsi/sata_sil24.c   |  3 +-
 include/linux/libata.h      | 24 +++++++++----
 6 files changed, 165 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/scsi/ahci.c b/drivers/scsi/ahci.c
index 45fd71d80128..8493b021cc07 100644
--- a/drivers/scsi/ahci.c
+++ b/drivers/scsi/ahci.c
@@ -1026,7 +1026,8 @@ static void ahci_error_handler(struct ata_port *ap)
 	}
 
 	/* perform recovery */
-	ata_do_eh(ap, ahci_softreset, ahci_hardreset, ahci_postreset);
+	ata_do_eh(ap, ata_std_prereset, ahci_softreset, ahci_hardreset,
+		  ahci_postreset);
 }
 
 static void ahci_post_internal_cmd(struct ata_queued_cmd *qc)
diff --git a/drivers/scsi/libata-bmdma.c b/drivers/scsi/libata-bmdma.c
index 6d30d2c52960..4bc05371737c 100644
--- a/drivers/scsi/libata-bmdma.c
+++ b/drivers/scsi/libata-bmdma.c
@@ -695,6 +695,7 @@ void ata_bmdma_thaw(struct ata_port *ap)
 /**
  *	ata_bmdma_drive_eh - Perform EH with given methods for BMDMA controller
  *	@ap: port to handle error for
+ *	@prereset: prereset method (can be NULL)
  *	@softreset: softreset method (can be NULL)
  *	@hardreset: hardreset method (can be NULL)
  *	@postreset: postreset method (can be NULL)
@@ -710,8 +711,9 @@ void ata_bmdma_thaw(struct ata_port *ap)
  *	LOCKING:
  *	Kernel thread context (may sleep)
  */
-void ata_bmdma_drive_eh(struct ata_port *ap, ata_reset_fn_t softreset,
-			ata_reset_fn_t hardreset, ata_postreset_fn_t postreset)
+void ata_bmdma_drive_eh(struct ata_port *ap, ata_prereset_fn_t prereset,
+			ata_reset_fn_t softreset, ata_reset_fn_t hardreset,
+			ata_postreset_fn_t postreset)
 {
 	struct ata_host_set *host_set = ap->host_set;
 	struct ata_eh_context *ehc = &ap->eh_context;
@@ -759,7 +761,7 @@ void ata_bmdma_drive_eh(struct ata_port *ap, ata_reset_fn_t softreset,
 		ata_eh_thaw_port(ap);
 
 	/* PIO and DMA engines have been stopped, perform recovery */
-	ata_do_eh(ap, softreset, hardreset, postreset);
+	ata_do_eh(ap, prereset, softreset, hardreset, postreset);
 }
 
 /**
@@ -779,7 +781,8 @@ void ata_bmdma_error_handler(struct ata_port *ap)
 	if (sata_scr_valid(ap))
 		hardreset = sata_std_hardreset;
 
-	ata_bmdma_drive_eh(ap, ata_std_softreset, hardreset, ata_std_postreset);
+	ata_bmdma_drive_eh(ap, ata_std_prereset, ata_std_softreset, hardreset,
+			   ata_std_postreset);
 }
 
 /**
diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c
index 4823ecefb8a1..2531a701d6e9 100644
--- a/drivers/scsi/libata-core.c
+++ b/drivers/scsi/libata-core.c
@@ -2525,6 +2525,90 @@ int sata_phy_resume(struct ata_port *ap, const unsigned long *params)
 	return sata_phy_debounce(ap, params);
 }
 
+static void ata_wait_spinup(struct ata_port *ap)
+{
+	struct ata_eh_context *ehc = &ap->eh_context;
+	unsigned long end, secs;
+	int rc;
+
+	/* first, debounce phy if SATA */
+	if (ap->cbl == ATA_CBL_SATA) {
+		rc = sata_phy_debounce(ap, sata_deb_timing_eh);
+
+		/* if debounced successfully and offline, no need to wait */
+		if ((rc == 0 || rc == -EOPNOTSUPP) && ata_port_offline(ap))
+			return;
+	}
+
+	/* okay, let's give the drive time to spin up */
+	end = ehc->i.hotplug_timestamp + ATA_SPINUP_WAIT * HZ / 1000;
+	secs = ((end - jiffies) + HZ - 1) / HZ;
+
+	if (time_after(jiffies, end))
+		return;
+
+	if (secs > 5)
+		ata_port_printk(ap, KERN_INFO, "waiting for device to spin up "
+				"(%lu secs)\n", secs);
+
+	schedule_timeout_uninterruptible(end - jiffies);
+}
+
+/**
+ *	ata_std_prereset - prepare for reset
+ *	@ap: ATA port to be reset
+ *
+ *	@ap is about to be reset.  Initialize it.
+ *
+ *	LOCKING:
+ *	Kernel thread context (may sleep)
+ *
+ *	RETURNS:
+ *	0 on success, -errno otherwise.
+ */
+int ata_std_prereset(struct ata_port *ap)
+{
+	struct ata_eh_context *ehc = &ap->eh_context;
+	const unsigned long *timing;
+	int rc;
+
+	/* hotplug? */
+	if (ehc->i.flags & ATA_EHI_HOTPLUGGED) {
+		if (ap->flags & ATA_FLAG_HRST_TO_RESUME)
+			ehc->i.action |= ATA_EH_HARDRESET;
+		if (ap->flags & ATA_FLAG_SKIP_D2H_BSY)
+			ata_wait_spinup(ap);
+	}
+
+	/* if we're about to do hardreset, nothing more to do */
+	if (ehc->i.action & ATA_EH_HARDRESET)
+		return 0;
+
+	/* if SATA, resume phy */
+	if (ap->cbl == ATA_CBL_SATA) {
+		if (ap->flags & ATA_FLAG_LOADING)
+			timing = sata_deb_timing_boot;
+		else
+			timing = sata_deb_timing_eh;
+
+		rc = sata_phy_resume(ap, timing);
+		if (rc && rc != -EOPNOTSUPP) {
+			/* phy resume failed */
+			ata_port_printk(ap, KERN_WARNING, "failed to resume "
+					"link for reset (errno=%d)\n", rc);
+			return rc;
+		}
+	}
+
+	/* Wait for !BSY if the controller can wait for the first D2H
+	 * Reg FIS and we don't know that no device is attached.
+	 */
+	if (!(ap->flags & ATA_FLAG_SKIP_D2H_BSY) && !ata_port_offline(ap))
+		ata_busy_sleep(ap, ATA_TMOUT_BOOT_QUICK, ATA_TMOUT_BOOT);
+
+	return 0;
+}
+
 /**
  *	ata_std_probeinit - initialize probing
  *	@ap: port to be probed
@@ -5840,6 +5924,7 @@ EXPORT_SYMBOL_GPL(sata_phy_reset);
 EXPORT_SYMBOL_GPL(__sata_phy_reset);
 EXPORT_SYMBOL_GPL(ata_bus_reset);
 EXPORT_SYMBOL_GPL(ata_std_probeinit);
+EXPORT_SYMBOL_GPL(ata_std_prereset);
 EXPORT_SYMBOL_GPL(ata_std_softreset);
 EXPORT_SYMBOL_GPL(sata_std_hardreset);
 EXPORT_SYMBOL_GPL(ata_std_postreset);
diff --git a/drivers/scsi/libata-eh.c b/drivers/scsi/libata-eh.c
index 9173d8f2ce5d..0e66f140e53b 100644
--- a/drivers/scsi/libata-eh.c
+++ b/drivers/scsi/libata-eh.c
@@ -1318,20 +1318,58 @@ static void ata_eh_report(struct ata_port *ap)
 	}
 }
 
-static int ata_eh_reset(struct ata_port *ap, ata_reset_fn_t softreset,
+static int ata_eh_reset(struct ata_port *ap,
+			ata_prereset_fn_t prereset, ata_reset_fn_t softreset,
 			ata_reset_fn_t hardreset, ata_postreset_fn_t postreset)
 {
 	struct ata_eh_context *ehc = &ap->eh_context;
 	unsigned int classes[ATA_MAX_DEVICES];
 	int tries = ATA_EH_RESET_TRIES;
+	unsigned int action;
 	ata_reset_fn_t reset;
 	int i, rc;
 
+	/* Determine which reset to use and record in ehc->i.action.
+	 * prereset() may examine and modify it.
+	 */
+	action = ehc->i.action;
+	ehc->i.action &= ~ATA_EH_RESET_MASK;
 	if (softreset && (!hardreset || (!sata_set_spd_needed(ap) &&
-					 !(ehc->i.action & ATA_EH_HARDRESET))))
-		reset = softreset;
+					 !(action & ATA_EH_HARDRESET))))
+		ehc->i.action |= ATA_EH_SOFTRESET;
 	else
+		ehc->i.action |= ATA_EH_HARDRESET;
+
+	if (prereset) {
+		rc = prereset(ap);
+		if (rc) {
+			ata_port_printk(ap, KERN_ERR,
+					"prereset failed (errno=%d)\n", rc);
+			return rc;
+		}
+	}
+
+	/* prereset() might have modified ehc->i.action */
+	if (ehc->i.action & ATA_EH_HARDRESET)
 		reset = hardreset;
+	else if (ehc->i.action & ATA_EH_SOFTRESET)
+		reset = softreset;
+	else {
+		/* prereset told us not to reset, bang classes and return */
+		for (i = 0; i < ATA_MAX_DEVICES; i++)
+			classes[i] = ATA_DEV_NONE;
+		return 0;
+	}
+
+	/* did prereset() screw up?  if so, fix up to avoid oopsing */
+	if (!reset) {
+		ata_port_printk(ap, KERN_ERR, "BUG: prereset() requested "
+				"invalid reset type\n");
+		if (softreset)
+			reset = softreset;
+		else
+			reset = hardreset;
+	}
 
  retry:
 	ata_port_printk(ap, KERN_INFO, "%s resetting port\n",
@@ -1424,6 +1462,7 @@ static int ata_port_nr_enabled(struct ata_port *ap)
 /**
  *	ata_eh_recover - recover host port after error
  *	@ap: host port to recover
+ *	@prereset: prereset method (can be NULL)
  *	@softreset: softreset method (can be NULL)
  *	@hardreset: hardreset method (can be NULL)
  *	@postreset: postreset method (can be NULL)
@@ -1440,8 +1479,8 @@ static int ata_port_nr_enabled(struct ata_port *ap)
  *	RETURNS:
  *	0 on success, -errno on failure.
  */
-static int ata_eh_recover(struct ata_port *ap, ata_reset_fn_t softreset,
-			  ata_reset_fn_t hardreset,
+static int ata_eh_recover(struct ata_port *ap, ata_prereset_fn_t prereset,
+			  ata_reset_fn_t softreset, ata_reset_fn_t hardreset,
 			  ata_postreset_fn_t postreset)
 {
 	struct ata_eh_context *ehc = &ap->eh_context;
@@ -1469,7 +1508,8 @@ static int ata_eh_recover(struct ata_port *ap, ata_reset_fn_t softreset,
 	if (ehc->i.action & ATA_EH_RESET_MASK) {
 		ata_eh_freeze_port(ap);
 
-		rc = ata_eh_reset(ap, softreset, hardreset, postreset);
+		rc = ata_eh_reset(ap, prereset, softreset, hardreset,
+				  postreset);
 		if (rc) {
 			ata_port_printk(ap, KERN_ERR,
 					"reset failed, giving up\n");
@@ -1586,6 +1626,7 @@ static void ata_eh_finish(struct ata_port *ap)
 /**
  *	ata_do_eh - do standard error handling
  *	@ap: host port to handle error for
+ *	@prereset: prereset method (can be NULL)
  *	@softreset: softreset method (can be NULL)
  *	@hardreset: hardreset method (can be NULL)
  *	@postreset: postreset method (can be NULL)
@@ -1595,11 +1636,12 @@ static void ata_eh_finish(struct ata_port *ap)
  *	LOCKING:
  *	Kernel thread context (may sleep).
  */
-void ata_do_eh(struct ata_port *ap, ata_reset_fn_t softreset,
-	       ata_reset_fn_t hardreset, ata_postreset_fn_t postreset)
+void ata_do_eh(struct ata_port *ap, ata_prereset_fn_t prereset,
+	       ata_reset_fn_t softreset, ata_reset_fn_t hardreset,
+	       ata_postreset_fn_t postreset)
 {
 	ata_eh_autopsy(ap);
 	ata_eh_report(ap);
-	ata_eh_recover(ap, softreset, hardreset, postreset);
+	ata_eh_recover(ap, prereset, softreset, hardreset, postreset);
 	ata_eh_finish(ap);
 }
diff --git a/drivers/scsi/sata_sil24.c b/drivers/scsi/sata_sil24.c
index 4c76f05d9b65..26d7c54c175e 100644
--- a/drivers/scsi/sata_sil24.c
+++ b/drivers/scsi/sata_sil24.c
@@ -912,7 +912,8 @@ static void sil24_error_handler(struct ata_port *ap)
 	}
 
 	/* perform recovery */
-	ata_do_eh(ap, sil24_softreset, sil24_hardreset, ata_std_postreset);
+	ata_do_eh(ap, ata_std_prereset, sil24_softreset, sil24_hardreset,
+		  ata_std_postreset);
 }
 
 static void sil24_post_internal_cmd(struct ata_queued_cmd *qc)
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 1c167f728fb4..fe5f53943c44 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -262,6 +262,15 @@ enum {
 	ATA_PROBE_MAX_TRIES	= 3,
 	ATA_EH_RESET_TRIES	= 3,
 	ATA_EH_DEV_TRIES	= 3,
+
+	/* Drive spinup time (time from power-on to the first D2H FIS)
+	 * in msecs - 8s currently.  Failing to get ready in this time
+	 * isn't critical.  It will result in reset failure for
+	 * controllers which can't wait for the first D2H FIS.  libata
+	 * will retry, so it just has to be long enough to spin up
+	 * most devices.
+	 */
+	ATA_SPINUP_WAIT		= 8000,
 };
 
 enum hsm_task_states {
@@ -294,9 +303,10 @@ struct ata_queued_cmd;
 
 /* typedefs */
 typedef void (*ata_qc_cb_t) (struct ata_queued_cmd *qc);
-typedef void (*ata_probeinit_fn_t)(struct ata_port *);
-typedef int (*ata_reset_fn_t)(struct ata_port *, unsigned int *);
-typedef void (*ata_postreset_fn_t)(struct ata_port *ap, unsigned int *);
+typedef void (*ata_probeinit_fn_t)(struct ata_port *ap);
+typedef int (*ata_prereset_fn_t)(struct ata_port *ap);
+typedef int (*ata_reset_fn_t)(struct ata_port *ap, unsigned int *classes);
+typedef void (*ata_postreset_fn_t)(struct ata_port *ap, unsigned int *classes);
 
 struct ata_ioports {
 	unsigned long		cmd_addr;
@@ -623,6 +633,7 @@ extern int ata_drive_probe_reset(struct ata_port *ap,
 			ata_reset_fn_t softreset, ata_reset_fn_t hardreset,
 			ata_postreset_fn_t postreset, unsigned int *classes);
 extern void ata_std_probeinit(struct ata_port *ap);
+extern int ata_std_prereset(struct ata_port *ap);
 extern int ata_std_softreset(struct ata_port *ap, unsigned int *classes);
 extern int sata_std_hardreset(struct ata_port *ap, unsigned int *class);
 extern void ata_std_postreset(struct ata_port *ap, unsigned int *classes);
@@ -706,7 +717,7 @@ extern u8   ata_bmdma_status(struct ata_port *ap);
 extern void ata_bmdma_irq_clear(struct ata_port *ap);
 extern void ata_bmdma_freeze(struct ata_port *ap);
 extern void ata_bmdma_thaw(struct ata_port *ap);
-extern void ata_bmdma_drive_eh(struct ata_port *ap,
+extern void ata_bmdma_drive_eh(struct ata_port *ap, ata_prereset_fn_t prereset,
 			       ata_reset_fn_t softreset,
 			       ata_reset_fn_t hardreset,
 			       ata_postreset_fn_t postreset);
@@ -784,8 +795,9 @@ extern void ata_eh_thaw_port(struct ata_port *ap);
 extern void ata_eh_qc_complete(struct ata_queued_cmd *qc);
 extern void ata_eh_qc_retry(struct ata_queued_cmd *qc);
 
-extern void ata_do_eh(struct ata_port *ap, ata_reset_fn_t softreset,
-		      ata_reset_fn_t hardreset, ata_postreset_fn_t postreset);
+extern void ata_do_eh(struct ata_port *ap, ata_prereset_fn_t prereset,
+		      ata_reset_fn_t softreset, ata_reset_fn_t hardreset,
+		      ata_postreset_fn_t postreset);
 
 /*
  * printk helpers
-- 
cgit v1.2.3


From 9a1004d0c11be41c83d06a67dfe74567a41ae582 Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Wed, 31 May 2006 18:27:52 +0900
Subject: [PATCH] libata: export ata_hsm_move()

ata_hsm_move() will be used by LLDDs which depend on standard PIO HSM
but implement their own interrupt handlers.

Signed-off-by: Tejun Heo <htejun@gmail.com>
---
 drivers/scsi/libata-core.c | 6 +++---
 include/linux/libata.h     | 2 ++
 2 files changed, 5 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c
index 2531a701d6e9..c965eea3b3d4 100644
--- a/drivers/scsi/libata-core.c
+++ b/drivers/scsi/libata-core.c
@@ -4140,9 +4140,8 @@ static void ata_hsm_qc_complete(struct ata_queued_cmd *qc, int in_wq)
  *	RETURNS:
  *	1 when poll next status needed, 0 otherwise.
  */
-
-static int ata_hsm_move(struct ata_port *ap, struct ata_queued_cmd *qc,
-			 u8 status, int in_wq)
+int ata_hsm_move(struct ata_port *ap, struct ata_queued_cmd *qc,
+		 u8 status, int in_wq)
 {
 	unsigned long flags = 0;
 	int poll_next;
@@ -5885,6 +5884,7 @@ EXPORT_SYMBOL_GPL(ata_device_add);
 EXPORT_SYMBOL_GPL(ata_host_set_remove);
 EXPORT_SYMBOL_GPL(ata_sg_init);
 EXPORT_SYMBOL_GPL(ata_sg_init_one);
+EXPORT_SYMBOL_GPL(ata_hsm_move);
 EXPORT_SYMBOL_GPL(ata_qc_complete);
 EXPORT_SYMBOL_GPL(ata_qc_complete_multiple);
 EXPORT_SYMBOL_GPL(ata_qc_issue_prot);
diff --git a/include/linux/libata.h b/include/linux/libata.h
index fe5f53943c44..a1ceb5b67b97 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -723,6 +723,8 @@ extern void ata_bmdma_drive_eh(struct ata_port *ap, ata_prereset_fn_t prereset,
 			       ata_postreset_fn_t postreset);
 extern void ata_bmdma_error_handler(struct ata_port *ap);
 extern void ata_bmdma_post_internal_cmd(struct ata_queued_cmd *qc);
+extern int ata_hsm_move(struct ata_port *ap, struct ata_queued_cmd *qc,
+			u8 status, int in_wq);
 extern void ata_qc_complete(struct ata_queued_cmd *qc);
 extern int ata_qc_complete_multiple(struct ata_port *ap, u32 qc_active,
 				    void (*finish_qc)(struct ata_queued_cmd *));
-- 
cgit v1.2.3


From 084fe639b81c4d418a2cf714acb0475e3713cb73 Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Wed, 31 May 2006 18:28:03 +0900
Subject: [PATCH] libata-hp: implement hotplug

Implement ATA part of hotplug.  To avoid probing broken devices over
and over again, disabled devices are not automatically detached.  They
are detached only if probing is requested for the device or the
associated port is offline.  Also, to avoid infinite probing loop,
Each device is probed only once per EH run.

As SATA PHY status is fragile, devices are detached only after it has
used up its recovery chances unless explicitly requested by LLDD or
user (LLDD may request direct detach if, for example, it supports cold
presence detection).

Signed-off-by: Tejun Heo <htejun@gmail.com>
---
 drivers/scsi/libata-eh.c | 123 +++++++++++++++++++++++++++++++++++++++--------
 include/linux/libata.h   |  13 +++++
 2 files changed, 115 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/scsi/libata-eh.c b/drivers/scsi/libata-eh.c
index a049bffdf770..b53e2e7db498 100644
--- a/drivers/scsi/libata-eh.c
+++ b/drivers/scsi/libata-eh.c
@@ -932,10 +932,8 @@ static void ata_eh_analyze_serror(struct ata_port *ap)
 		err_mask |= AC_ERR_SYSTEM;
 		action |= ATA_EH_SOFTRESET;
 	}
-	if (serror & (SERR_PHYRDY_CHG | SERR_DEV_XCHG)) {
-		err_mask |= AC_ERR_ATA_BUS;
-		action |= ATA_EH_HARDRESET;
-	}
+	if (serror & (SERR_PHYRDY_CHG | SERR_DEV_XCHG))
+		ata_ehi_hotplugged(&ehc->i);
 
 	ehc->i.err_mask |= err_mask;
 	ehc->i.action |= action;
@@ -1487,11 +1485,12 @@ static int ata_eh_reset(struct ata_port *ap, int classify,
 	return rc;
 }
 
-static int ata_eh_revalidate(struct ata_port *ap,
-			     struct ata_device **r_failed_dev)
+static int ata_eh_revalidate_and_attach(struct ata_port *ap,
+					struct ata_device **r_failed_dev)
 {
 	struct ata_eh_context *ehc = &ap->eh_context;
 	struct ata_device *dev;
+	unsigned long flags;
 	int i, rc = 0;
 
 	DPRINTK("ENTER\n");
@@ -1513,6 +1512,23 @@ static int ata_eh_revalidate(struct ata_port *ap,
 				break;
 
 			ehc->i.action &= ~ATA_EH_REVALIDATE;
+		} else if (dev->class == ATA_DEV_UNKNOWN &&
+			   ehc->tries[dev->devno] &&
+			   ata_class_enabled(ehc->classes[dev->devno])) {
+			dev->class = ehc->classes[dev->devno];
+
+			rc = ata_dev_read_id(dev, &dev->class, 1, dev->id);
+			if (rc == 0)
+				rc = ata_dev_configure(dev, 1);
+
+			if (rc) {
+				dev->class = ATA_DEV_UNKNOWN;
+				break;
+			}
+
+			spin_lock_irqsave(&ap->host_set->lock, flags);
+			ap->flags |= ATA_FLAG_SCSI_HOTPLUG;
+			spin_unlock_irqrestore(&ap->host_set->lock, flags);
 		}
 	}
 
@@ -1533,6 +1549,36 @@ static int ata_port_nr_enabled(struct ata_port *ap)
 	return cnt;
 }
 
+static int ata_port_nr_vacant(struct ata_port *ap)
+{
+	int i, cnt = 0;
+
+	for (i = 0; i < ATA_MAX_DEVICES; i++)
+		if (ap->device[i].class == ATA_DEV_UNKNOWN)
+			cnt++;
+	return cnt;
+}
+
+static int ata_eh_skip_recovery(struct ata_port *ap)
+{
+	struct ata_eh_context *ehc = &ap->eh_context;
+	int i;
+
+	if (ap->flags & ATA_FLAG_FROZEN || ata_port_nr_enabled(ap))
+		return 0;
+
+	/* skip if class codes for all vacant slots are ATA_DEV_NONE */
+	for (i = 0; i < ATA_MAX_DEVICES; i++) {
+		struct ata_device *dev = &ap->device[i];
+
+		if (dev->class == ATA_DEV_UNKNOWN &&
+		    ehc->classes[dev->devno] != ATA_DEV_NONE)
+			return 0;
+	}
+
+	return 1;
+}
+
 /**
  *	ata_eh_recover - recover host port after error
  *	@ap: host port to recover
@@ -1543,9 +1589,10 @@ static int ata_port_nr_enabled(struct ata_port *ap)
  *
  *	This is the alpha and omega, eum and yang, heart and soul of
  *	libata exception handling.  On entry, actions required to
- *	recover each devices are recorded in eh_context.  This
- *	function executes all the operations with appropriate retrials
- *	and fallbacks to resurrect failed devices.
+ *	recover the port and hotplug requests are recorded in
+ *	eh_context.  This function executes all the operations with
+ *	appropriate retrials and fallbacks to resurrect failed
+ *	devices, detach goners and greet newcomers.
  *
  *	LOCKING:
  *	Kernel thread context (may sleep).
@@ -1568,6 +1615,19 @@ static int ata_eh_recover(struct ata_port *ap, ata_prereset_fn_t prereset,
 		dev = &ap->device[i];
 
 		ehc->tries[dev->devno] = ATA_EH_DEV_TRIES;
+
+		/* process hotplug request */
+		if (dev->flags & ATA_DFLAG_DETACH)
+			ata_eh_detach_dev(dev);
+
+		if (!ata_dev_enabled(dev) &&
+		    ((ehc->i.probe_mask & (1 << dev->devno)) &&
+		     !(ehc->did_probe_mask & (1 << dev->devno)))) {
+			ata_eh_detach_dev(dev);
+			ata_dev_init(dev);
+			ehc->did_probe_mask |= (1 << dev->devno);
+			ehc->i.action |= ATA_EH_SOFTRESET;
+		}
 	}
 
  retry:
@@ -1575,15 +1635,18 @@ static int ata_eh_recover(struct ata_port *ap, ata_prereset_fn_t prereset,
 	rc = 0;
 
 	/* skip EH if possible. */
-	if (!ata_port_nr_enabled(ap) && !(ap->flags & ATA_FLAG_FROZEN))
+	if (ata_eh_skip_recovery(ap))
 		ehc->i.action = 0;
 
+	for (i = 0; i < ATA_MAX_DEVICES; i++)
+		ehc->classes[i] = ATA_DEV_UNKNOWN;
+
 	/* reset */
 	if (ehc->i.action & ATA_EH_RESET_MASK) {
 		ata_eh_freeze_port(ap);
 
-		rc = ata_eh_reset(ap, 0, prereset, softreset, hardreset,
-				  postreset);
+		rc = ata_eh_reset(ap, ata_port_nr_vacant(ap), prereset,
+				  softreset, hardreset, postreset);
 		if (rc) {
 			ata_port_printk(ap, KERN_ERR,
 					"reset failed, giving up\n");
@@ -1593,8 +1656,8 @@ static int ata_eh_recover(struct ata_port *ap, ata_prereset_fn_t prereset,
 		ata_eh_thaw_port(ap);
 	}
 
-	/* revalidate existing devices */
-	rc = ata_eh_revalidate(ap, &dev);
+	/* revalidate existing devices and attach new ones */
+	rc = ata_eh_revalidate_and_attach(ap, &dev);
 	if (rc)
 		goto dev_fail;
 
@@ -1612,6 +1675,8 @@ static int ata_eh_recover(struct ata_port *ap, ata_prereset_fn_t prereset,
  dev_fail:
 	switch (rc) {
 	case -ENODEV:
+		/* device missing, schedule probing */
+		ehc->i.probe_mask |= (1 << dev->devno);
 	case -EINVAL:
 		ehc->tries[dev->devno] = 0;
 		break;
@@ -1624,15 +1689,31 @@ static int ata_eh_recover(struct ata_port *ap, ata_prereset_fn_t prereset,
 			ehc->tries[dev->devno] = 0;
 	}
 
-	/* disable device if it has used up all its chances */
-	if (ata_dev_enabled(dev) && !ehc->tries[dev->devno])
+	if (ata_dev_enabled(dev) && !ehc->tries[dev->devno]) {
+		/* disable device if it has used up all its chances */
 		ata_dev_disable(dev);
 
-	/* soft didn't work?  be haaaaard */
-	if (ehc->i.flags & ATA_EHI_DID_RESET)
-		ehc->i.action |= ATA_EH_HARDRESET;
-	else
-		ehc->i.action |= ATA_EH_SOFTRESET;
+		/* detach if offline */
+		if (ata_port_offline(ap))
+			ata_eh_detach_dev(dev);
+
+		/* probe if requested */
+		if ((ehc->i.probe_mask & (1 << dev->devno)) &&
+		    !(ehc->did_probe_mask & (1 << dev->devno))) {
+			ata_eh_detach_dev(dev);
+			ata_dev_init(dev);
+
+			ehc->tries[dev->devno] = ATA_EH_DEV_TRIES;
+			ehc->did_probe_mask |= (1 << dev->devno);
+			ehc->i.action |= ATA_EH_SOFTRESET;
+		}
+	} else {
+		/* soft didn't work?  be haaaaard */
+		if (ehc->i.flags & ATA_EHI_DID_RESET)
+			ehc->i.action |= ATA_EH_HARDRESET;
+		else
+			ehc->i.action |= ATA_EH_SOFTRESET;
+	}
 
 	if (ata_port_nr_enabled(ap)) {
 		ata_port_printk(ap, KERN_WARNING, "failed to recover some "
diff --git a/include/linux/libata.h b/include/linux/libata.h
index a1ceb5b67b97..56971943d261 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -824,6 +824,19 @@ extern void ata_do_eh(struct ata_port *ap, ata_prereset_fn_t prereset,
 	(ehi)->desc_len = 0; \
 } while (0)
 
+static inline void ata_ehi_hotplugged(struct ata_eh_info *ehi)
+{
+	if (ehi->flags & ATA_EHI_HOTPLUGGED)
+		return;
+
+	ehi->flags |= ATA_EHI_HOTPLUGGED;
+	ehi->hotplug_timestamp = jiffies;
+
+	ehi->err_mask |= AC_ERR_ATA_BUS;
+	ehi->action |= ATA_EH_SOFTRESET;
+	ehi->probe_mask |= (1 << ATA_MAX_DEVICES) - 1;
+}
+
 /*
  * qc helpers
  */
-- 
cgit v1.2.3


From 580b2102327ab8444af5bde4e70b50d268a1d558 Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Wed, 31 May 2006 18:28:05 +0900
Subject: [PATCH] libata-hp: implement SCSI part of hotplug

Implement SCSI part of hotplug.

This must be done in a separate context as SCSI makes use of EH during
probing.  SCSI scan fails silently if EH is in progress.  In such
cases, libata pauses briefly and retries until every device is
attached.

Signed-off-by: Tejun Heo <htejun@gmail.com>
---
 drivers/scsi/libata-core.c |   1 +
 drivers/scsi/libata-eh.c   |   6 ++-
 drivers/scsi/libata-scsi.c | 116 +++++++++++++++++++++++++++++++++++++++++++++
 drivers/scsi/libata.h      |   1 +
 include/linux/libata.h     |   2 +-
 5 files changed, 124 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c
index c965eea3b3d4..8df8ecc51a78 100644
--- a/drivers/scsi/libata-core.c
+++ b/drivers/scsi/libata-core.c
@@ -5370,6 +5370,7 @@ static void ata_host_init(struct ata_port *ap, struct Scsi_Host *host,
 	ap->msg_enable = ATA_MSG_DRV;
 
 	INIT_WORK(&ap->port_task, NULL, NULL);
+	INIT_WORK(&ap->hotplug_task, ata_scsi_hotplug, ap);
 	INIT_LIST_HEAD(&ap->eh_done_q);
 	init_waitqueue_head(&ap->eh_wait_q);
 
diff --git a/drivers/scsi/libata-eh.c b/drivers/scsi/libata-eh.c
index b53e2e7db498..733dfa532977 100644
--- a/drivers/scsi/libata-eh.c
+++ b/drivers/scsi/libata-eh.c
@@ -287,9 +287,13 @@ void ata_scsi_error(struct Scsi_Host *host)
 	/* clean up */
 	spin_lock_irqsave(hs_lock, flags);
 
+	if (ap->flags & ATA_FLAG_SCSI_HOTPLUG)
+		queue_work(ata_aux_wq, &ap->hotplug_task);
+
 	if (ap->flags & ATA_FLAG_RECOVERED)
 		ata_port_printk(ap, KERN_INFO, "EH complete\n");
-	ap->flags &= ~ATA_FLAG_RECOVERED;
+
+	ap->flags &= ~(ATA_FLAG_SCSI_HOTPLUG | ATA_FLAG_RECOVERED);
 
 	/* tell wait_eh that we're done */
 	ap->flags &= ~ATA_FLAG_EH_IN_PROGRESS;
diff --git a/drivers/scsi/libata-scsi.c b/drivers/scsi/libata-scsi.c
index 3dc6188af0e8..12563998d97c 100644
--- a/drivers/scsi/libata-scsi.c
+++ b/drivers/scsi/libata-scsi.c
@@ -2786,3 +2786,119 @@ int ata_scsi_offline_dev(struct ata_device *dev)
 	}
 	return 0;
 }
+
+/**
+ *	ata_scsi_remove_dev - remove attached SCSI device
+ *	@dev: ATA device to remove attached SCSI device for
+ *
+ *	This function is called from ata_eh_scsi_hotplug() and
+ *	responsible for removing the SCSI device attached to @dev.
+ *
+ *	LOCKING:
+ *	Kernel thread context (may sleep).
+ */
+static void ata_scsi_remove_dev(struct ata_device *dev)
+{
+	struct ata_port *ap = dev->ap;
+	struct scsi_device *sdev;
+	unsigned long flags;
+
+	/* Alas, we need to grab scan_mutex to ensure SCSI device
+	 * state doesn't change underneath us and thus
+	 * scsi_device_get() always succeeds.  The mutex locking can
+	 * be removed if there is __scsi_device_get() interface which
+	 * increments reference counts regardless of device state.
+	 */
+	mutex_lock(&ap->host->scan_mutex);
+	spin_lock_irqsave(&ap->host_set->lock, flags);
+
+	/* clearing dev->sdev is protected by host_set lock */
+	sdev = dev->sdev;
+	dev->sdev = NULL;
+
+	if (sdev) {
+		/* If user initiated unplug races with us, sdev can go
+		 * away underneath us after the host_set lock and
+		 * scan_mutex are released.  Hold onto it.
+		 */
+		if (scsi_device_get(sdev) == 0) {
+			/* The following ensures the attached sdev is
+			 * offline on return from ata_scsi_offline_dev()
+			 * regardless it wins or loses the race
+			 * against this function.
+			 */
+			scsi_device_set_state(sdev, SDEV_OFFLINE);
+		} else {
+			WARN_ON(1);
+			sdev = NULL;
+		}
+	}
+
+	spin_unlock_irqrestore(&ap->host_set->lock, flags);
+	mutex_unlock(&ap->host->scan_mutex);
+
+	if (sdev) {
+		ata_dev_printk(dev, KERN_INFO, "detaching (SCSI %s)\n",
+			       sdev->sdev_gendev.bus_id);
+
+		scsi_remove_device(sdev);
+		scsi_device_put(sdev);
+	}
+}
+
+/**
+ *	ata_scsi_hotplug - SCSI part of hotplug
+ *	@data: Pointer to ATA port to perform SCSI hotplug on
+ *
+ *	Perform SCSI part of hotplug.  It's executed from a separate
+ *	workqueue after EH completes.  This is necessary because SCSI
+ *	hot plugging requires working EH and hot unplugging is
+ *	synchronized with hot plugging with a mutex.
+ *
+ *	LOCKING:
+ *	Kernel thread context (may sleep).
+ */
+void ata_scsi_hotplug(void *data)
+{
+	struct ata_port *ap = data;
+	int i;
+
+	if (ap->flags & ATA_FLAG_UNLOADING) {
+		DPRINTK("ENTER/EXIT - unloading\n");
+		return;
+	}
+
+	DPRINTK("ENTER\n");
+
+	/* unplug detached devices */
+	for (i = 0; i < ATA_MAX_DEVICES; i++) {
+		struct ata_device *dev = &ap->device[i];
+		unsigned long flags;
+
+		if (!(dev->flags & ATA_DFLAG_DETACHED))
+			continue;
+
+		spin_lock_irqsave(&ap->host_set->lock, flags);
+		dev->flags &= ~ATA_DFLAG_DETACHED;
+		spin_unlock_irqrestore(&ap->host_set->lock, flags);
+
+		ata_scsi_remove_dev(dev);
+	}
+
+	/* scan for new ones */
+	ata_scsi_scan_host(ap);
+
+	/* If we scanned while EH was in progress, scan would have
+	 * failed silently.  Requeue if there are enabled but
+	 * unattached devices.
+	 */
+	for (i = 0; i < ATA_MAX_DEVICES; i++) {
+		struct ata_device *dev = &ap->device[i];
+		if (ata_dev_enabled(dev) && !dev->sdev) {
+			queue_delayed_work(ata_aux_wq, &ap->hotplug_task, HZ);
+			break;
+		}
+	}
+
+	DPRINTK("EXIT\n");
+}
diff --git a/drivers/scsi/libata.h b/drivers/scsi/libata.h
index e38759fdc183..0586b0cd73fd 100644
--- a/drivers/scsi/libata.h
+++ b/drivers/scsi/libata.h
@@ -76,6 +76,7 @@ extern struct scsi_transport_template ata_scsi_transport_template;
 
 extern void ata_scsi_scan_host(struct ata_port *ap);
 extern int ata_scsi_offline_dev(struct ata_device *dev);
+extern void ata_scsi_hotplug(void *data);
 extern unsigned int ata_scsiop_inq_std(struct ata_scsi_args *args, u8 *rbuf,
 			       unsigned int buflen);
 
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 56971943d261..407115624d9f 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -521,7 +521,7 @@ struct ata_port {
 	struct ata_host_set	*host_set;
 	struct device 		*dev;
 
-	struct work_struct	port_task;
+	struct work_struct	port_task, hotplug_task;
 
 	unsigned int		hsm_task_state;
 
-- 
cgit v1.2.3


From 83c47bcb3c533180a6dda78152334de50065358a Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Wed, 31 May 2006 18:28:07 +0900
Subject: [PATCH] libata-hp: implement warmplug

Implement warmplug.  User-initiated unplug can be detected by
hostt->slave_destroy() and plug by transportt->user_scan().  This
patch only implements the two callbacks.  The next function will hook
them.

Signed-off-by: Tejun Heo <htejun@gmail.com>
---
 drivers/scsi/libata-core.c |  1 +
 drivers/scsi/libata-scsi.c | 89 ++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/libata.h     |  1 +
 3 files changed, 91 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c
index 8df8ecc51a78..c61cfc742388 100644
--- a/drivers/scsi/libata-core.c
+++ b/drivers/scsi/libata-core.c
@@ -5942,6 +5942,7 @@ EXPORT_SYMBOL_GPL(ata_port_queue_task);
 EXPORT_SYMBOL_GPL(ata_scsi_ioctl);
 EXPORT_SYMBOL_GPL(ata_scsi_queuecmd);
 EXPORT_SYMBOL_GPL(ata_scsi_slave_config);
+EXPORT_SYMBOL_GPL(ata_scsi_slave_destroy);
 EXPORT_SYMBOL_GPL(ata_scsi_change_queue_depth);
 EXPORT_SYMBOL_GPL(ata_scsi_release);
 EXPORT_SYMBOL_GPL(ata_host_intr);
diff --git a/drivers/scsi/libata-scsi.c b/drivers/scsi/libata-scsi.c
index 12563998d97c..7c1ac58c430a 100644
--- a/drivers/scsi/libata-scsi.c
+++ b/drivers/scsi/libata-scsi.c
@@ -57,6 +57,8 @@ static struct ata_device * __ata_scsi_find_dev(struct ata_port *ap,
 					const struct scsi_device *scsidev);
 static struct ata_device * ata_scsi_find_dev(struct ata_port *ap,
 					    const struct scsi_device *scsidev);
+static int ata_scsi_user_scan(struct Scsi_Host *shost, unsigned int channel,
+			      unsigned int id, unsigned int lun);
 
 
 #define RW_RECOVERY_MPAGE 0x1
@@ -726,6 +728,40 @@ int ata_scsi_slave_config(struct scsi_device *sdev)
 	return 0;	/* scsi layer doesn't check return value, sigh */
 }
 
+/**
+ *	ata_scsi_slave_destroy - SCSI device is about to be destroyed
+ *	@sdev: SCSI device to be destroyed
+ *
+ *	@sdev is about to be destroyed for hot/warm unplugging.  If
+ *	this unplugging was initiated by libata as indicated by NULL
+ *	dev->sdev, this function doesn't have to do anything.
+ *	Otherwise, SCSI layer initiated warm-unplug is in progress.
+ *	Clear dev->sdev, schedule the device for ATA detach and invoke
+ *	EH.
+ *
+ *	LOCKING:
+ *	Defined by SCSI layer.  We don't really care.
+ */
+void ata_scsi_slave_destroy(struct scsi_device *sdev)
+{
+	struct ata_port *ap = ata_shost_to_port(sdev->host);
+	unsigned long flags;
+	struct ata_device *dev;
+
+	if (!ap->ops->error_handler)
+		return;
+
+	spin_lock_irqsave(&ap->host_set->lock, flags);
+	dev = __ata_scsi_find_dev(ap, sdev);
+	if (dev && dev->sdev) {
+		/* SCSI device already in CANCEL state, no need to offline it */
+		dev->sdev = NULL;
+		dev->flags |= ATA_DFLAG_DETACH;
+		ata_port_schedule_eh(ap);
+	}
+	spin_unlock_irqrestore(&ap->host_set->lock, flags);
+}
+
 /**
  *	ata_scsi_change_queue_depth - SCSI callback for queue depth config
  *	@sdev: SCSI device to configure queue depth for
@@ -2902,3 +2938,56 @@ void ata_scsi_hotplug(void *data)
 
 	DPRINTK("EXIT\n");
 }
+
+/**
+ *	ata_scsi_user_scan - indication for user-initiated bus scan
+ *	@shost: SCSI host to scan
+ *	@channel: Channel to scan
+ *	@id: ID to scan
+ *	@lun: LUN to scan
+ *
+ *	This function is called when user explicitly requests bus
+ *	scan.  Set probe pending flag and invoke EH.
+ *
+ *	LOCKING:
+ *	SCSI layer (we don't care)
+ *
+ *	RETURNS:
+ *	Zero.
+ */
+static int ata_scsi_user_scan(struct Scsi_Host *shost, unsigned int channel,
+			      unsigned int id, unsigned int lun)
+{
+	struct ata_port *ap = ata_shost_to_port(shost);
+	unsigned long flags;
+	int rc = 0;
+
+	if (!ap->ops->error_handler)
+		return -EOPNOTSUPP;
+
+	if ((channel != SCAN_WILD_CARD && channel != 0) ||
+	    (lun != SCAN_WILD_CARD && lun != 0))
+		return -EINVAL;
+
+	spin_lock_irqsave(&ap->host_set->lock, flags);
+
+	if (id == SCAN_WILD_CARD) {
+		ap->eh_info.probe_mask |= (1 << ATA_MAX_DEVICES) - 1;
+		ap->eh_info.action |= ATA_EH_SOFTRESET;
+	} else {
+		struct ata_device *dev = ata_find_dev(ap, id);
+
+		if (dev) {
+			ap->eh_info.probe_mask |= 1 << dev->devno;
+			ap->eh_info.action |= ATA_EH_SOFTRESET;
+		} else
+			rc = -EINVAL;
+	}
+
+	if (rc == 0)
+		ata_port_schedule_eh(ap);
+
+	spin_unlock_irqrestore(&ap->host_set->lock, flags);
+
+	return rc;
+}
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 407115624d9f..74786c33c526 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -734,6 +734,7 @@ extern int ata_std_bios_param(struct scsi_device *sdev,
 			      struct block_device *bdev,
 			      sector_t capacity, int geom[]);
 extern int ata_scsi_slave_config(struct scsi_device *sdev);
+extern void ata_scsi_slave_destroy(struct scsi_device *sdev);
 extern int ata_scsi_change_queue_depth(struct scsi_device *sdev,
 				       int queue_depth);
 extern struct ata_device *ata_dev_pair(struct ata_device *adev);
-- 
cgit v1.2.3


From 720ba12620ee09dce269adf4ad50958adac7bb54 Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Wed, 31 May 2006 18:28:13 +0900
Subject: [PATCH] libata-hp: update unload-unplug

Update unload unplug - driver unloading / PCI removal.  This is done
by ata_port_detach() which short-circuits EH, disables all devices and
freezes the port.  With this patch, EH and unloading/unplugging are
properly synchronized.

Signed-off-by: Tejun Heo <htejun@gmail.com>
---
 drivers/scsi/ahci.c        | 10 +++----
 drivers/scsi/libata-core.c | 67 +++++++++++++++++++++++++++++++++++++++++-----
 drivers/scsi/libata-eh.c   |  8 ++++--
 include/linux/libata.h     |  1 +
 4 files changed, 71 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/scsi/ahci.c b/drivers/scsi/ahci.c
index afb3805f9e95..60f455bf3696 100644
--- a/drivers/scsi/ahci.c
+++ b/drivers/scsi/ahci.c
@@ -1389,21 +1389,17 @@ static void ahci_remove_one (struct pci_dev *pdev)
 	struct device *dev = pci_dev_to_dev(pdev);
 	struct ata_host_set *host_set = dev_get_drvdata(dev);
 	struct ahci_host_priv *hpriv = host_set->private_data;
-	struct ata_port *ap;
 	unsigned int i;
 	int have_msi;
 
-	for (i = 0; i < host_set->n_ports; i++) {
-		ap = host_set->ports[i];
-
-		scsi_remove_host(ap->host);
-	}
+	for (i = 0; i < host_set->n_ports; i++)
+		ata_port_detach(host_set->ports[i]);
 
 	have_msi = hpriv->flags & AHCI_FLAG_MSI;
 	free_irq(host_set->irq, host_set);
 
 	for (i = 0; i < host_set->n_ports; i++) {
-		ap = host_set->ports[i];
+		struct ata_port *ap = host_set->ports[i];
 
 		ata_scsi_release(ap->host);
 		scsi_host_put(ap->host);
diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c
index a42877e6b865..01f2c59536bc 100644
--- a/drivers/scsi/libata-core.c
+++ b/drivers/scsi/libata-core.c
@@ -5609,6 +5609,63 @@ err_free_ret:
 	return 0;
 }
 
+/**
+ *	ata_port_detach - Detach ATA port in prepration of device removal
+ *	@ap: ATA port to be detached
+ *
+ *	Detach all ATA devices and the associated SCSI devices of @ap;
+ *	then, remove the associated SCSI host.  @ap is guaranteed to
+ *	be quiescent on return from this function.
+ *
+ *	LOCKING:
+ *	Kernel thread context (may sleep).
+ */
+void ata_port_detach(struct ata_port *ap)
+{
+	unsigned long flags;
+	int i;
+
+	if (!ap->ops->error_handler)
+		return;
+
+	/* tell EH we're leaving & flush EH */
+	spin_lock_irqsave(&ap->host_set->lock, flags);
+	ap->flags |= ATA_FLAG_UNLOADING;
+	spin_unlock_irqrestore(&ap->host_set->lock, flags);
+
+	ata_port_wait_eh(ap);
+
+	/* EH is now guaranteed to see UNLOADING, so no new device
+	 * will be attached.  Disable all existing devices.
+	 */
+	spin_lock_irqsave(&ap->host_set->lock, flags);
+
+	for (i = 0; i < ATA_MAX_DEVICES; i++)
+		ata_dev_disable(&ap->device[i]);
+
+	spin_unlock_irqrestore(&ap->host_set->lock, flags);
+
+	/* Final freeze & EH.  All in-flight commands are aborted.  EH
+	 * will be skipped and retrials will be terminated with bad
+	 * target.
+	 */
+	spin_lock_irqsave(&ap->host_set->lock, flags);
+	ata_port_freeze(ap);	/* won't be thawed */
+	spin_unlock_irqrestore(&ap->host_set->lock, flags);
+
+	ata_port_wait_eh(ap);
+
+	/* Flush hotplug task.  The sequence is similar to
+	 * ata_port_flush_task().
+	 */
+	flush_workqueue(ata_aux_wq);
+	cancel_delayed_work(&ap->hotplug_task);
+	flush_workqueue(ata_aux_wq);
+
+	/* remove the associated SCSI host */
+	scsi_remove_host(ap->host);
+}
+
 /**
  *	ata_host_set_remove - PCI layer callback for device removal
  *	@host_set: ATA host set that was removed
@@ -5622,18 +5679,15 @@ err_free_ret:
 
 void ata_host_set_remove(struct ata_host_set *host_set)
 {
-	struct ata_port *ap;
 	unsigned int i;
 
-	for (i = 0; i < host_set->n_ports; i++) {
-		ap = host_set->ports[i];
-		scsi_remove_host(ap->host);
-	}
+	for (i = 0; i < host_set->n_ports; i++)
+		ata_port_detach(host_set->ports[i]);
 
 	free_irq(host_set->irq, host_set);
 
 	for (i = 0; i < host_set->n_ports; i++) {
-		ap = host_set->ports[i];
+		struct ata_port *ap = host_set->ports[i];
 
 		ata_scsi_release(ap->host);
 
@@ -5901,6 +5955,7 @@ EXPORT_SYMBOL_GPL(sata_deb_timing_before_fsrst);
 EXPORT_SYMBOL_GPL(ata_std_bios_param);
 EXPORT_SYMBOL_GPL(ata_std_ports);
 EXPORT_SYMBOL_GPL(ata_device_add);
+EXPORT_SYMBOL_GPL(ata_port_detach);
 EXPORT_SYMBOL_GPL(ata_host_set_remove);
 EXPORT_SYMBOL_GPL(ata_sg_init);
 EXPORT_SYMBOL_GPL(ata_sg_init_one);
diff --git a/drivers/scsi/libata-eh.c b/drivers/scsi/libata-eh.c
index 70c132bef68e..30a83a57a12f 100644
--- a/drivers/scsi/libata-eh.c
+++ b/drivers/scsi/libata-eh.c
@@ -46,6 +46,7 @@
 #include "libata.h"
 
 static void __ata_port_freeze(struct ata_port *ap);
+static void ata_eh_finish(struct ata_port *ap);
 
 static void ata_ering_record(struct ata_ering *ering, int is_io,
 			     unsigned int err_mask)
@@ -242,8 +243,11 @@ void ata_scsi_error(struct Scsi_Host *host)
 
 		spin_unlock_irqrestore(hs_lock, flags);
 
-		/* invoke EH */
-		ap->ops->error_handler(ap);
+		/* invoke EH.  if unloading, just finish failed qcs */
+		if (!(ap->flags & ATA_FLAG_UNLOADING))
+			ap->ops->error_handler(ap);
+		else
+			ata_eh_finish(ap);
 
 		/* Exception might have happend after ->error_handler
 		 * recovered the port but before this point.  Repeat
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 74786c33c526..f11ba2715bef 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -649,6 +649,7 @@ extern int ata_pci_device_resume(struct pci_dev *pdev);
 extern int ata_pci_clear_simplex(struct pci_dev *pdev);
 #endif /* CONFIG_PCI */
 extern int ata_device_add(const struct ata_probe_ent *ent);
+extern void ata_port_detach(struct ata_port *ap);
 extern void ata_host_set_remove(struct ata_host_set *host_set);
 extern int ata_scsi_detect(struct scsi_host_template *sht);
 extern int ata_scsi_ioctl(struct scsi_device *dev, int cmd, void __user *arg);
-- 
cgit v1.2.3


From 52783c5dcc8d317bc8c3e2692d366e8a305abada Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Wed, 31 May 2006 18:28:22 +0900
Subject: [PATCH] libata-hp: killl ops->probe_reset

Now that all drivers implementing new EH are converted to new probing
mechanism, ops->probe_reset doesn't have any user.  Kill it.

Signed-off-by: Tejun Heo <htejun@gmail.com>
---
 drivers/scsi/libata-core.c | 187 ++++-----------------------------------------
 include/linux/libata.h     |   8 --
 2 files changed, 13 insertions(+), 182 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c
index 01f2c59536bc..30a6020c5121 100644
--- a/drivers/scsi/libata-core.c
+++ b/drivers/scsi/libata-core.c
@@ -1479,31 +1479,21 @@ static int ata_bus_probe(struct ata_port *ap)
 	down_xfermask = 0;
 
 	/* reset and determine device classes */
-	for (i = 0; i < ATA_MAX_DEVICES; i++)
-		classes[i] = ATA_DEV_UNKNOWN;
+	ap->ops->phy_reset(ap);
 
-	if (ap->ops->probe_reset) {
-		rc = ap->ops->probe_reset(ap, classes);
-		if (rc) {
-			ata_port_printk(ap, KERN_ERR,
-					"reset failed (errno=%d)\n", rc);
-			return rc;
-		}
-	} else {
-		ap->ops->phy_reset(ap);
+	for (i = 0; i < ATA_MAX_DEVICES; i++) {
+		dev = &ap->device[i];
 
-		for (i = 0; i < ATA_MAX_DEVICES; i++) {
-			if (!(ap->flags & ATA_FLAG_DISABLED))
-				classes[i] = ap->device[i].class;
-			ap->device[i].class = ATA_DEV_UNKNOWN;
-		}
+		if (!(ap->flags & ATA_FLAG_DISABLED) &&
+		    dev->class != ATA_DEV_UNKNOWN)
+			classes[dev->devno] = dev->class;
+		else
+			classes[dev->devno] = ATA_DEV_NONE;
 
-		ata_port_probe(ap);
+		dev->class = ATA_DEV_UNKNOWN;
 	}
 
-	for (i = 0; i < ATA_MAX_DEVICES; i++)
-		if (classes[i] == ATA_DEV_UNKNOWN)
-			classes[i] = ATA_DEV_NONE;
+	ata_port_probe(ap);
 
 	/* after the reset the device state is PIO 0 and the controller
 	   state is undefined. Record the mode */
@@ -2609,38 +2599,12 @@ int ata_std_prereset(struct ata_port *ap)
 	return 0;
 }
 
-/**
- *	ata_std_probeinit - initialize probing
- *	@ap: port to be probed
- *
- *	@ap is about to be probed.  Initialize it.  This function is
- *	to be used as standard callback for ata_drive_probe_reset().
- *
- *	NOTE!!! Do not use this function as probeinit if a low level
- *	driver implements only hardreset.  Just pass NULL as probeinit
- *	in that case.  Using this function is probably okay but doing
- *	so makes reset sequence different from the original
- *	->phy_reset implementation and Jeff nervous.  :-P
- */
-void ata_std_probeinit(struct ata_port *ap)
-{
-	static const unsigned long deb_timing[] = { 5, 100, 5000 };
-
-	/* resume link */
-	sata_phy_resume(ap, deb_timing);
-
-	/* wait for device */
-	if (ata_port_online(ap))
-		ata_busy_sleep(ap, ATA_TMOUT_BOOT_QUICK, ATA_TMOUT_BOOT);
-}
-
 /**
  *	ata_std_softreset - reset host port via ATA SRST
  *	@ap: port to reset
  *	@classes: resulting classes of attached devices
  *
- *	Reset host port using ATA SRST.  This function is to be used
- *	as standard callback for ata_drive_*_reset() functions.
+ *	Reset host port using ATA SRST.
  *
  *	LOCKING:
  *	Kernel thread context (may sleep)
@@ -2695,8 +2659,6 @@ int ata_std_softreset(struct ata_port *ap, unsigned int *classes)
  *	@class: resulting class of attached device
  *
  *	SATA phy-reset host port using DET bits of SControl register.
- *	This function is to be used as standard callback for
- *	ata_drive_*_reset().
  *
  *	LOCKING:
  *	Kernel thread context (may sleep)
@@ -2775,9 +2737,6 @@ int sata_std_hardreset(struct ata_port *ap, unsigned int *class)
  *	the device might have been reset more than once using
  *	different reset methods before postreset is invoked.
  *
- *	This function is to be used as standard callback for
- *	ata_drive_*_reset().
- *
  *	LOCKING:
  *	Kernel thread context (may sleep)
  */
@@ -2824,32 +2783,6 @@ void ata_std_postreset(struct ata_port *ap, unsigned int *classes)
 	DPRINTK("EXIT\n");
 }
 
-/**
- *	ata_std_probe_reset - standard probe reset method
- *	@ap: prot to perform probe-reset
- *	@classes: resulting classes of attached devices
- *
- *	The stock off-the-shelf ->probe_reset method.
- *
- *	LOCKING:
- *	Kernel thread context (may sleep)
- *
- *	RETURNS:
- *	0 on success, -errno otherwise.
- */
-int ata_std_probe_reset(struct ata_port *ap, unsigned int *classes)
-{
-	ata_reset_fn_t hardreset;
-
-	hardreset = NULL;
-	if (sata_scr_valid(ap))
-		hardreset = sata_std_hardreset;
-
-	return ata_drive_probe_reset(ap, ata_std_probeinit,
-				     ata_std_softreset, hardreset,
-				     ata_std_postreset, classes);
-}
-
 int ata_do_reset(struct ata_port *ap, ata_reset_fn_t reset,
 		 unsigned int *classes)
 {
@@ -2878,97 +2811,6 @@ int ata_do_reset(struct ata_port *ap, ata_reset_fn_t reset,
 	return 0;
 }
 
-/**
- *	ata_drive_probe_reset - Perform probe reset with given methods
- *	@ap: port to reset
- *	@probeinit: probeinit method (can be NULL)
- *	@softreset: softreset method (can be NULL)
- *	@hardreset: hardreset method (can be NULL)
- *	@postreset: postreset method (can be NULL)
- *	@classes: resulting classes of attached devices
- *
- *	Reset the specified port and classify attached devices using
- *	given methods.  This function prefers softreset but tries all
- *	possible reset sequences to reset and classify devices.  This
- *	function is intended to be used for constructing ->probe_reset
- *	callback by low level drivers.
- *
- *	Reset methods should follow the following rules.
- *
- *	- Return 0 on sucess, -errno on failure.
- *	- If classification is supported, fill classes[] with
- *	  recognized class codes.
- *	- If classification is not supported, leave classes[] alone.
- *
- *	LOCKING:
- *	Kernel thread context (may sleep)
- *
- *	RETURNS:
- *	0 on success, -EINVAL if no reset method is avaliable, -ENODEV
- *	if classification fails, and any error code from reset
- *	methods.
- */
-int ata_drive_probe_reset(struct ata_port *ap, ata_probeinit_fn_t probeinit,
-			  ata_reset_fn_t softreset, ata_reset_fn_t hardreset,
-			  ata_postreset_fn_t postreset, unsigned int *classes)
-{
-	int rc = -EINVAL;
-
-	ata_eh_freeze_port(ap);
-
-	if (probeinit)
-		probeinit(ap);
-
-	if (softreset && !sata_set_spd_needed(ap)) {
-		rc = ata_do_reset(ap, softreset, classes);
-		if (rc == 0 && classes[0] != ATA_DEV_UNKNOWN)
-			goto done;
-		ata_port_printk(ap, KERN_INFO, "softreset failed, "
-				"will try hardreset in 5 secs\n");
-		ssleep(5);
-	}
-
-	if (!hardreset)
-		goto done;
-
-	while (1) {
-		rc = ata_do_reset(ap, hardreset, classes);
-		if (rc == 0) {
-			if (classes[0] != ATA_DEV_UNKNOWN)
-				goto done;
-			break;
-		}
-
-		if (sata_down_spd_limit(ap))
-			goto done;
-
-		ata_port_printk(ap, KERN_INFO, "hardreset failed, "
-				"will retry in 5 secs\n");
-		ssleep(5);
-	}
-
-	if (softreset) {
-		ata_port_printk(ap, KERN_INFO,
-				"hardreset succeeded without classification, "
-				"will retry softreset in 5 secs\n");
-		ssleep(5);
-
-		rc = ata_do_reset(ap, softreset, classes);
-	}
-
- done:
-	if (rc == 0) {
-		if (postreset)
-			postreset(ap, classes);
-
-		ata_eh_thaw_port(ap);
-
-		if (classes[0] == ATA_DEV_UNKNOWN)
-			rc = -ENODEV;
-	}
-	return rc;
-}
-
 /**
  *	ata_dev_same_device - Determine whether new ID matches configured device
  *	@dev: device to compare against
@@ -5419,7 +5261,7 @@ static struct ata_port * ata_host_add(const struct ata_probe_ent *ent,
 
 	DPRINTK("ENTER\n");
 
-	if (!ent->port_ops->probe_reset && !ent->port_ops->error_handler &&
+	if (!ent->port_ops->error_handler &&
 	    !(ent->host_flags & (ATA_FLAG_SATA_RESET | ATA_FLAG_SRST))) {
 		printk(KERN_ERR "ata%u: no reset mechanism available\n",
 		       port_no);
@@ -5551,7 +5393,7 @@ int ata_device_add(const struct ata_probe_ent *ent)
 			 */
 		}
 
-		if (!ap->ops->probe_reset) {
+		if (ap->ops->error_handler) {
 			unsigned long flags;
 
 			ata_port_probe(ap);
@@ -5998,13 +5840,10 @@ EXPORT_SYMBOL_GPL(sata_phy_resume);
 EXPORT_SYMBOL_GPL(sata_phy_reset);
 EXPORT_SYMBOL_GPL(__sata_phy_reset);
 EXPORT_SYMBOL_GPL(ata_bus_reset);
-EXPORT_SYMBOL_GPL(ata_std_probeinit);
 EXPORT_SYMBOL_GPL(ata_std_prereset);
 EXPORT_SYMBOL_GPL(ata_std_softreset);
 EXPORT_SYMBOL_GPL(sata_std_hardreset);
 EXPORT_SYMBOL_GPL(ata_std_postreset);
-EXPORT_SYMBOL_GPL(ata_std_probe_reset);
-EXPORT_SYMBOL_GPL(ata_drive_probe_reset);
 EXPORT_SYMBOL_GPL(ata_dev_revalidate);
 EXPORT_SYMBOL_GPL(ata_dev_classify);
 EXPORT_SYMBOL_GPL(ata_dev_pair);
diff --git a/include/linux/libata.h b/include/linux/libata.h
index f11ba2715bef..a2a33a902917 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -303,7 +303,6 @@ struct ata_queued_cmd;
 
 /* typedefs */
 typedef void (*ata_qc_cb_t) (struct ata_queued_cmd *qc);
-typedef void (*ata_probeinit_fn_t)(struct ata_port *ap);
 typedef int (*ata_prereset_fn_t)(struct ata_port *ap);
 typedef int (*ata_reset_fn_t)(struct ata_port *ap, unsigned int *classes);
 typedef void (*ata_postreset_fn_t)(struct ata_port *ap, unsigned int *classes);
@@ -553,7 +552,6 @@ struct ata_port_operations {
 
 	void (*phy_reset) (struct ata_port *ap); /* obsolete */
 	void (*set_mode) (struct ata_port *ap);
-	int (*probe_reset) (struct ata_port *ap, unsigned int *classes);
 
 	void (*post_set_mode) (struct ata_port *ap);
 
@@ -628,11 +626,6 @@ extern void ata_bus_reset(struct ata_port *ap);
 extern int sata_set_spd(struct ata_port *ap);
 extern int sata_phy_debounce(struct ata_port *ap, const unsigned long *param);
 extern int sata_phy_resume(struct ata_port *ap, const unsigned long *param);
-extern int ata_drive_probe_reset(struct ata_port *ap,
-			ata_probeinit_fn_t probeinit,
-			ata_reset_fn_t softreset, ata_reset_fn_t hardreset,
-			ata_postreset_fn_t postreset, unsigned int *classes);
-extern void ata_std_probeinit(struct ata_port *ap);
 extern int ata_std_prereset(struct ata_port *ap);
 extern int ata_std_softreset(struct ata_port *ap, unsigned int *classes);
 extern int sata_std_hardreset(struct ata_port *ap, unsigned int *class);
@@ -688,7 +681,6 @@ extern void ata_std_dev_select (struct ata_port *ap, unsigned int device);
 extern u8 ata_check_status(struct ata_port *ap);
 extern u8 ata_altstatus(struct ata_port *ap);
 extern void ata_exec_command(struct ata_port *ap, const struct ata_taskfile *tf);
-extern int ata_std_probe_reset(struct ata_port *ap, unsigned int *classes);
 extern int ata_port_start (struct ata_port *ap);
 extern void ata_port_stop (struct ata_port *ap);
 extern void ata_host_stop (struct ata_host_set *host_set);
-- 
cgit v1.2.3


From 7b1c6ca73aa102e9dde5098f58c523bca0f8e2c3 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Thu, 1 Jun 2006 12:49:30 +0100
Subject: Add <sys/types.h> to headers included for userspace in
 <linux/input.h>

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
Signed-off-by: Vojtech Pavlik <vojtech@suse.cz>
---
 include/linux/input.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/input.h b/include/linux/input.h
index 50e338d2ffda..b48d9873cbbc 100644
--- a/include/linux/input.h
+++ b/include/linux/input.h
@@ -15,6 +15,7 @@
 #else
 #include <sys/time.h>
 #include <sys/ioctl.h>
+#include <sys/types.h>
 #include <asm/types.h>
 #endif
 
-- 
cgit v1.2.3


From d27317657ae18cfbc45def8f566e4c3ed1f51d74 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Sat, 3 Jun 2006 00:27:53 +0100
Subject: Switch to __s32 types in joystick.h instead of C99 types for
 consistency.

The rest of the file uses these types instead of C99 types.

Acked-by: Dmitry Torokhov <dtor@mail.ru>
Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 include/linux/joystick.h | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/joystick.h b/include/linux/joystick.h
index 5fd20ddd7ae3..e2d3a18af456 100644
--- a/include/linux/joystick.h
+++ b/include/linux/joystick.h
@@ -111,25 +111,25 @@ struct js_corr {
 #define JS_SET_ALL		8
 
 struct JS_DATA_TYPE {
-	int32_t buttons;
-	int32_t x;
-	int32_t y;
+	__s32 buttons;
+	__s32 x;
+	__s32 y;
 };
 
 struct JS_DATA_SAVE_TYPE_32 {
-	int32_t JS_TIMEOUT;
-	int32_t BUSY;
-	int32_t JS_EXPIRETIME;
-	int32_t JS_TIMELIMIT;
+	__s32 JS_TIMEOUT;
+	__s32 BUSY;
+	__s32 JS_EXPIRETIME;
+	__s32 JS_TIMELIMIT;
 	struct JS_DATA_TYPE JS_SAVE;
 	struct JS_DATA_TYPE JS_CORR;
 };
 
 struct JS_DATA_SAVE_TYPE_64 {
-	int32_t JS_TIMEOUT;
-	int32_t BUSY;
-	int64_t JS_EXPIRETIME;
-	int64_t JS_TIMELIMIT;
+	__s32 JS_TIMEOUT;
+	__s32 BUSY;
+	__s64 JS_EXPIRETIME;
+	__s64 JS_TIMELIMIT;
 	struct JS_DATA_TYPE JS_SAVE;
 	struct JS_DATA_TYPE JS_CORR;
 };
-- 
cgit v1.2.3


From 2f3243aebd8df4d9eecaeca04bbff6c7dbfb2142 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Mon, 5 Jun 2006 20:19:05 +0100
Subject: [RBTREE] Switch rb_colour() et al to en_US spelling of 'color' for
 consistency

Since rb_insert_color() is part of the _public_ API, while the others are
purely internal, switch to be consistent with that.

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 include/linux/rbtree.h | 22 +++++++++++-----------
 lib/rbtree.c           | 10 +++++-----
 2 files changed, 16 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h
index 3cc30b0ab828..f37006f21664 100644
--- a/include/linux/rbtree.h
+++ b/include/linux/rbtree.h
@@ -99,7 +99,7 @@ static inline struct page * rb_insert_page_cache(struct inode * inode,
 
 struct rb_node
 {
-	unsigned long  rb_parent_colour;
+	unsigned long  rb_parent_color;
 #define	RB_RED		0
 #define	RB_BLACK	1
 	struct rb_node *rb_right;
@@ -113,20 +113,20 @@ struct rb_root
 };
 
 
-#define rb_parent(r)   ((struct rb_node *)((r)->rb_parent_colour & ~3))
-#define rb_colour(r)   ((r)->rb_parent_colour & 1)
-#define rb_is_red(r)   (!rb_colour(r))
-#define rb_is_black(r) rb_colour(r)
-#define rb_set_red(r)  do { (r)->rb_parent_colour &= ~1; } while (0)
-#define rb_set_black(r)  do { (r)->rb_parent_colour |= 1; } while (0)
+#define rb_parent(r)   ((struct rb_node *)((r)->rb_parent_color & ~3))
+#define rb_color(r)   ((r)->rb_parent_color & 1)
+#define rb_is_red(r)   (!rb_color(r))
+#define rb_is_black(r) rb_color(r)
+#define rb_set_red(r)  do { (r)->rb_parent_color &= ~1; } while (0)
+#define rb_set_black(r)  do { (r)->rb_parent_color |= 1; } while (0)
 
 static inline void rb_set_parent(struct rb_node *rb, struct rb_node *p)
 {
-	rb->rb_parent_colour = (rb->rb_parent_colour & 3) | (unsigned long)p;
+	rb->rb_parent_color = (rb->rb_parent_color & 3) | (unsigned long)p;
 }
-static inline void rb_set_colour(struct rb_node *rb, int colour)
+static inline void rb_set_color(struct rb_node *rb, int color)
 {
-	rb->rb_parent_colour = (rb->rb_parent_colour & ~1) | colour;
+	rb->rb_parent_color = (rb->rb_parent_color & ~1) | color;
 }
 
 #define RB_ROOT	(struct rb_root) { NULL, }
@@ -148,7 +148,7 @@ extern void rb_replace_node(struct rb_node *victim, struct rb_node *new,
 static inline void rb_link_node(struct rb_node * node, struct rb_node * parent,
 				struct rb_node ** rb_link)
 {
-	node->rb_parent_colour = (unsigned long )parent;
+	node->rb_parent_color = (unsigned long )parent;
 	node->rb_left = node->rb_right = NULL;
 
 	*rb_link = node;
diff --git a/lib/rbtree.c b/lib/rbtree.c
index 4a7173cad149..1e55ba1c2edf 100644
--- a/lib/rbtree.c
+++ b/lib/rbtree.c
@@ -170,7 +170,7 @@ static void __rb_erase_color(struct rb_node *node, struct rb_node *parent,
 					__rb_rotate_right(other, root);
 					other = parent->rb_right;
 				}
-				rb_set_colour(other, rb_colour(parent));
+				rb_set_color(other, rb_color(parent));
 				rb_set_black(parent);
 				if (other->rb_right)
 					rb_set_black(other->rb_right);
@@ -207,7 +207,7 @@ static void __rb_erase_color(struct rb_node *node, struct rb_node *parent,
 					__rb_rotate_left(other, root);
 					other = parent->rb_left;
 				}
-				rb_set_colour(other, rb_colour(parent));
+				rb_set_color(other, rb_color(parent));
 				rb_set_black(parent);
 				if (other->rb_left)
 					rb_set_black(other->rb_left);
@@ -239,7 +239,7 @@ void rb_erase(struct rb_node *node, struct rb_root *root)
 			node = left;
 		child = node->rb_right;
 		parent = rb_parent(node);
-		color = rb_colour(node);
+		color = rb_color(node);
 
 		if (child)
 			rb_set_parent(child, parent);
@@ -249,7 +249,7 @@ void rb_erase(struct rb_node *node, struct rb_root *root)
 		} else
 			parent->rb_left = child;
 
-		node->rb_parent_colour = old->rb_parent_colour;
+		node->rb_parent_color = old->rb_parent_color;
 		node->rb_right = old->rb_right;
 		node->rb_left = old->rb_left;
 
@@ -269,7 +269,7 @@ void rb_erase(struct rb_node *node, struct rb_root *root)
 	}
 
 	parent = rb_parent(node);
-	color = rb_colour(node);
+	color = rb_color(node);
 
 	if (child)
 		rb_set_parent(child, parent);
-- 
cgit v1.2.3


From 651d765d0b2c72d33430487c8b6ef64c60cd2134 Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@samba.org>
Date: Wed, 7 Jun 2006 16:10:19 +1000
Subject: [PATCH] Add a prctl to change the endianness of a process.

This new prctl is intended for changing the execution mode of the
processor, on processors that support both a little-endian mode and a
big-endian mode.  It is intended for use by programs such as
instruction set emulators (for example an x86 emulator on PowerPC),
which may find it convenient to use the processor in an alternate
endianness mode when executing translated instructions.

Note that this does not imply the existence of a fully-fledged ABI for
both endiannesses, or of compatibility code for converting system
calls done in the non-native endianness mode.  The program is expected
to arrange for all of its system call arguments to be presented in the
native endianness.

Switching between big and little-endian mode will require some care in
constructing the instruction sequence for the switch.  Generally the
instructions up to the instruction that invokes the prctl system call
will have to be in the old endianness, and subsequent instructions
will have to be in the new endianness.

Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 include/linux/prctl.h |  7 +++++++
 kernel/sys.c          | 13 +++++++++++++
 2 files changed, 20 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/prctl.h b/include/linux/prctl.h
index bf022c43a18e..52a9be41250d 100644
--- a/include/linux/prctl.h
+++ b/include/linux/prctl.h
@@ -52,4 +52,11 @@
 #define PR_SET_NAME    15		/* Set process name */
 #define PR_GET_NAME    16		/* Get process name */
 
+/* Get/set process endian */
+#define PR_GET_ENDIAN	19
+#define PR_SET_ENDIAN	20
+# define PR_ENDIAN_BIG		0
+# define PR_ENDIAN_LITTLE	1	/* True little endian mode */
+# define PR_ENDIAN_PPC_LITTLE	2	/* "PowerPC" pseudo little endian */
+
 #endif /* _LINUX_PRCTL_H */
diff --git a/kernel/sys.c b/kernel/sys.c
index 0b6ec0e7936f..12d2d753dc3b 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -57,6 +57,12 @@
 #ifndef GET_FPEXC_CTL
 # define GET_FPEXC_CTL(a,b)	(-EINVAL)
 #endif
+#ifndef GET_ENDIAN
+# define GET_ENDIAN(a,b)	(-EINVAL)
+#endif
+#ifndef SET_ENDIAN
+# define SET_ENDIAN(a,b)	(-EINVAL)
+#endif
 
 /*
  * this is where the system-wide overflow UID and GID are defined, for
@@ -2057,6 +2063,13 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
 				return -EFAULT;
 			return 0;
 		}
+		case PR_GET_ENDIAN:
+			error = GET_ENDIAN(current, arg2);
+			break;
+		case PR_SET_ENDIAN:
+			error = SET_ENDIAN(current, arg2);
+			break;
+
 		default:
 			error = -EINVAL;
 			break;
-- 
cgit v1.2.3


From 73a3d07c1082145a3b78407bb5252df290470c4c Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Thu, 25 May 2006 01:40:47 -0400
Subject: NFS: Clean up inode metadata updates

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/inode.c          | 12 ------------
 fs/nfs/nfs4proc.c       |  2 +-
 include/linux/nfs_xdr.h |  3 +--
 3 files changed, 2 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index e870e4aae714..4f12c57456f4 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1360,12 +1360,6 @@ static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
 
-	if ((fattr->valid & NFS_ATTR_PRE_CHANGE) != 0
-			&& nfsi->change_attr == fattr->pre_change_attr) {
-		nfsi->change_attr = fattr->change_attr;
-		nfsi->cache_change_attribute = jiffies;
-	}
-
 	/* If we have atomic WCC data, we may update some attributes */
 	if ((fattr->valid & NFS_ATTR_WCC) != 0) {
 		if (timespec_equal(&inode->i_ctime, &fattr->pre_ctime)) {
@@ -1399,9 +1393,6 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
 	int data_unstable;
 
 
-	if ((fattr->valid & NFS_ATTR_FATTR) == 0)
-		return 0;
-
 	/* Has the inode gone and changed behind our back? */
 	if (nfsi->fileid != fattr->fileid
 			|| (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) {
@@ -1525,9 +1516,6 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 			__FUNCTION__, inode->i_sb->s_id, inode->i_ino,
 			atomic_read(&inode->i_count), fattr->valid);
 
-	if ((fattr->valid & NFS_ATTR_FATTR) == 0)
-		return 0;
-
 	if (nfsi->fileid != fattr->fileid)
 		goto out_fileid;
 
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index d86c0db7b1e8..e38a84874492 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2008,7 +2008,7 @@ static int _nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr *
 	if (!status) {
 		update_changeattr(dir, &res.cinfo);
 		nfs_post_op_update_inode(dir, res.dir_attr);
-		nfs_refresh_inode(inode, res.fattr);
+		nfs_post_op_update_inode(inode, res.fattr);
 	}
 
 	return status;
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 7fafc4c546b7..c483e239f993 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -57,8 +57,7 @@ struct nfs_fattr {
 #define NFS_ATTR_WCC		0x0001		/* pre-op WCC data    */
 #define NFS_ATTR_FATTR		0x0002		/* post-op attributes */
 #define NFS_ATTR_FATTR_V3	0x0004		/* NFSv3 attributes */
-#define NFS_ATTR_FATTR_V4	0x0008
-#define NFS_ATTR_PRE_CHANGE	0x0010
+#define NFS_ATTR_FATTR_V4	0x0008		/* NFSv4 change attribute */
 
 /*
  * Info on the file system
-- 
cgit v1.2.3


From 0d0b5cb36faf7002a11736032313f06d6f3d881c Mon Sep 17 00:00:00 2001
From: Chuck Lever <cel@netapp.com>
Date: Thu, 25 May 2006 01:40:53 -0400
Subject: NFS: Optimize allocation of nfs_read/write_data structures

Clean up use of page_array, and fix an off-by-one error noticed by Tom
Talpey which causes kmalloc calls in cases where using the page_array
is sufficient.

Test plan:
Normal client functional testing with r/wsize=32768.

Signed-off-by: Chuck Lever <cel@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/read.c           | 11 ++++-------
 fs/nfs/write.c          | 18 +++++++-----------
 include/linux/nfs_xdr.h |  4 ++--
 3 files changed, 13 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 4b5f58da5650..fd9018c692bb 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -51,14 +51,11 @@ struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount)
 	if (p) {
 		memset(p, 0, sizeof(*p));
 		INIT_LIST_HEAD(&p->pages);
-		if (pagecount < NFS_PAGEVEC_SIZE)
-			p->pagevec = &p->page_array[0];
+		if (pagecount <= ARRAY_SIZE(p->page_array))
+			p->pagevec = p->page_array;
 		else {
-			size_t size = ++pagecount * sizeof(struct page *);
-			p->pagevec = kmalloc(size, GFP_NOFS);
-			if (p->pagevec) {
-				memset(p->pagevec, 0, size);
-			} else {
+			p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_NOFS);
+			if (!p->pagevec) {
 				mempool_free(p, nfs_rdata_mempool);
 				p = NULL;
 			}
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 4cfada2cc09f..a515ec714bb6 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -98,11 +98,10 @@ struct nfs_write_data *nfs_commit_alloc(unsigned int pagecount)
 	if (p) {
 		memset(p, 0, sizeof(*p));
 		INIT_LIST_HEAD(&p->pages);
-		if (pagecount < NFS_PAGEVEC_SIZE)
-			p->pagevec = &p->page_array[0];
+		if (pagecount <= ARRAY_SIZE(p->page_array))
+			p->pagevec = p->page_array;
 		else {
-			size_t size = ++pagecount * sizeof(struct page *);
-			p->pagevec = kzalloc(size, GFP_NOFS);
+			p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_NOFS);
 			if (!p->pagevec) {
 				mempool_free(p, nfs_commit_mempool);
 				p = NULL;
@@ -126,14 +125,11 @@ struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount)
 	if (p) {
 		memset(p, 0, sizeof(*p));
 		INIT_LIST_HEAD(&p->pages);
-		if (pagecount < NFS_PAGEVEC_SIZE)
-			p->pagevec = &p->page_array[0];
+		if (pagecount <= ARRAY_SIZE(p->page_array))
+			p->pagevec = p->page_array;
 		else {
-			size_t size = ++pagecount * sizeof(struct page *);
-			p->pagevec = kmalloc(size, GFP_NOFS);
-			if (p->pagevec) {
-				memset(p->pagevec, 0, size);
-			} else {
+			p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_NOFS);
+			if (!p->pagevec) {
 				mempool_free(p, nfs_wdata_mempool);
 				p = NULL;
 			}
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index c483e239f993..e206c07080fe 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -694,7 +694,7 @@ struct nfs_read_data {
 #ifdef CONFIG_NFS_V4
 	unsigned long		timestamp;	/* For lease renewal */
 #endif
-	struct page		*page_array[NFS_PAGEVEC_SIZE + 1];
+	struct page		*page_array[NFS_PAGEVEC_SIZE];
 };
 
 struct nfs_write_data {
@@ -712,7 +712,7 @@ struct nfs_write_data {
 #ifdef CONFIG_NFS_V4
 	unsigned long		timestamp;	/* For lease renewal */
 #endif
-	struct page		*page_array[NFS_PAGEVEC_SIZE + 1];
+	struct page		*page_array[NFS_PAGEVEC_SIZE];
 };
 
 struct nfs_access_entry;
-- 
cgit v1.2.3


From 38478b24e37587f1c4fedf8ac070ca54f052ed28 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Thu, 25 May 2006 01:40:57 -0400
Subject: NFS: More page cache revalidation fixups

Whenever the directory changes, we want to make sure that we always
invalidate its page cache. Fix up update_changeattr() and
nfs_mark_for_revalidate() so that they do so.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4proc.c      | 10 +++++-----
 include/linux/nfs_fs.h |  6 +++++-
 2 files changed, 10 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index e38a84874492..ef4c6cccf958 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -185,15 +185,15 @@ static void renew_lease(const struct nfs_server *server, unsigned long timestamp
 	spin_unlock(&clp->cl_lock);
 }
 
-static void update_changeattr(struct inode *inode, struct nfs4_change_info *cinfo)
+static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo)
 {
-	struct nfs_inode *nfsi = NFS_I(inode);
+	struct nfs_inode *nfsi = NFS_I(dir);
 
-	spin_lock(&inode->i_lock);
-	nfsi->cache_validity |= NFS_INO_INVALID_ATTR;
+	spin_lock(&dir->i_lock);
+	nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE|NFS_INO_INVALID_DATA;
 	if (cinfo->before == nfsi->change_attr && cinfo->atomic)
 		nfsi->change_attr = cinfo->after;
-	spin_unlock(&inode->i_lock);
+	spin_unlock(&dir->i_lock);
 }
 
 struct nfs4_opendata {
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index c71227dd4389..1d81e7d82970 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -234,8 +234,12 @@ static inline int nfs_caches_unstable(struct inode *inode)
 
 static inline void nfs_mark_for_revalidate(struct inode *inode)
 {
+	struct nfs_inode *nfsi = NFS_I(inode);
+
 	spin_lock(&inode->i_lock);
-	NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS;
+	nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS;
+	if (S_ISDIR(inode->i_mode))
+		nfsi->cache_validity |= NFS_INO_REVAL_PAGECACHE|NFS_INO_INVALID_DATA;
 	spin_unlock(&inode->i_lock);
 }
 
-- 
cgit v1.2.3


From 44b11874ff583b6e766a05856b04f3c492c32b84 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Thu, 25 May 2006 01:40:59 -0400
Subject: NFS: Separate metadata and page cache revalidation mechanisms

Separate out the function of revalidating the inode metadata, and
revalidating the mapping. The former may be called by lookup(),
and only really needs to check that permissions, ctime, etc haven't changed
whereas the latter needs only done when we want to read data from the page
cache, and may need to sync and then invalidate the mapping.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/dir.c           |  2 +-
 fs/nfs/file.c          | 24 +++---------------------
 fs/nfs/inode.c         | 16 +++++++++++-----
 fs/nfs/symlink.c       |  2 +-
 include/linux/nfs_fs.h |  2 +-
 5 files changed, 17 insertions(+), 29 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index cae74dd4c7f5..1d3d8922a663 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -528,7 +528,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 
 	lock_kernel();
 
-	res = nfs_revalidate_inode(NFS_SERVER(inode), inode);
+	res = nfs_revalidate_mapping(inode, filp->f_mapping);
 	if (res < 0) {
 		unlock_kernel();
 		return res;
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index fade02c15e6e..63154070145a 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -126,23 +126,6 @@ nfs_file_release(struct inode *inode, struct file *filp)
 	return NFS_PROTO(inode)->file_release(inode, filp);
 }
 
-/**
- * nfs_revalidate_file - Revalidate the page cache & related metadata
- * @inode - pointer to inode struct
- * @file - pointer to file
- */
-static int nfs_revalidate_file(struct inode *inode, struct file *filp)
-{
-	struct nfs_inode *nfsi = NFS_I(inode);
-	int retval = 0;
-
-	if ((nfsi->cache_validity & (NFS_INO_REVAL_PAGECACHE|NFS_INO_INVALID_ATTR))
-			|| nfs_attribute_timeout(inode))
-		retval = __nfs_revalidate_inode(NFS_SERVER(inode), inode);
-	nfs_revalidate_mapping(inode, filp->f_mapping);
-	return 0;
-}
-
 /**
  * nfs_revalidate_size - Revalidate the file size
  * @inode - pointer to inode struct
@@ -228,7 +211,7 @@ nfs_file_read(struct kiocb *iocb, char __user * buf, size_t count, loff_t pos)
 		dentry->d_parent->d_name.name, dentry->d_name.name,
 		(unsigned long) count, (unsigned long) pos);
 
-	result = nfs_revalidate_file(inode, iocb->ki_filp);
+	result = nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping);
 	nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, count);
 	if (!result)
 		result = generic_file_aio_read(iocb, buf, count, pos);
@@ -247,7 +230,7 @@ nfs_file_sendfile(struct file *filp, loff_t *ppos, size_t count,
 		dentry->d_parent->d_name.name, dentry->d_name.name,
 		(unsigned long) count, (unsigned long long) *ppos);
 
-	res = nfs_revalidate_file(inode, filp);
+	res = nfs_revalidate_mapping(inode, filp->f_mapping);
 	if (!res)
 		res = generic_file_sendfile(filp, ppos, count, actor, target);
 	return res;
@@ -263,7 +246,7 @@ nfs_file_mmap(struct file * file, struct vm_area_struct * vma)
 	dfprintk(VFS, "nfs: mmap(%s/%s)\n",
 		dentry->d_parent->d_name.name, dentry->d_name.name);
 
-	status = nfs_revalidate_file(inode, file);
+	status = nfs_revalidate_mapping(inode, file->f_mapping);
 	if (!status)
 		status = generic_file_mmap(file, vma);
 	return status;
@@ -373,7 +356,6 @@ nfs_file_write(struct kiocb *iocb, const char __user *buf, size_t count, loff_t
 		if (result)
 			goto out;
 	}
-	nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping);
 
 	result = count;
 	if (!count)
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index eddd0e982d23..69036ef39866 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1220,7 +1220,7 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
 		status = -ESTALE;
 		/* Do we trust the cached ESTALE? */
 		if (NFS_ATTRTIMEO(inode) != 0) {
-			if (nfsi->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ATIME)) {
+			if (nfsi->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ATIME)) {
 				/* no */
 			} else
 				goto out;
@@ -1251,8 +1251,6 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
 	}
 	spin_unlock(&inode->i_lock);
 
-	nfs_revalidate_mapping(inode, inode->i_mapping);
-
 	if (nfsi->cache_validity & NFS_INO_INVALID_ACL)
 		nfs_zap_acl_cache(inode);
 
@@ -1287,7 +1285,7 @@ int nfs_attribute_timeout(struct inode *inode)
 int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
 {
 	nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE);
-	if (!(NFS_I(inode)->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA))
+	if (!(NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATTR)
 			&& !nfs_attribute_timeout(inode))
 		return NFS_STALE(inode) ? -ESTALE : 0;
 	return __nfs_revalidate_inode(server, inode);
@@ -1298,9 +1296,16 @@ int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
  * @inode - pointer to host inode
  * @mapping - pointer to mapping
  */
-void nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
+int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
+	int ret = 0;
+
+	if (NFS_STALE(inode))
+		ret = -ESTALE;
+	if ((nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE)
+			|| nfs_attribute_timeout(inode))
+		ret = __nfs_revalidate_inode(NFS_SERVER(inode), inode);
 
 	if (nfsi->cache_validity & NFS_INO_INVALID_DATA) {
 		nfs_inc_stats(inode, NFSIOS_DATAINVALIDATE);
@@ -1321,6 +1326,7 @@ void nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
 				inode->i_sb->s_id,
 				(long long)NFS_FILEID(inode));
 	}
+	return ret;
 }
 
 /**
diff --git a/fs/nfs/symlink.c b/fs/nfs/symlink.c
index 18dc95b0b646..636c479995bc 100644
--- a/fs/nfs/symlink.c
+++ b/fs/nfs/symlink.c
@@ -52,7 +52,7 @@ static void *nfs_follow_link(struct dentry *dentry, struct nameidata *nd)
 {
 	struct inode *inode = dentry->d_inode;
 	struct page *page;
-	void *err = ERR_PTR(nfs_revalidate_inode(NFS_SERVER(inode), inode));
+	void *err = ERR_PTR(nfs_revalidate_mapping(inode, inode->i_mapping));
 	if (err)
 		goto read_failed;
 	page = read_cache_page(&inode->i_data, 0,
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 1d81e7d82970..1b524b9f982a 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -301,7 +301,7 @@ extern int nfs_release(struct inode *, struct file *);
 extern int nfs_attribute_timeout(struct inode *inode);
 extern int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode);
 extern int __nfs_revalidate_inode(struct nfs_server *, struct inode *);
-extern void nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping);
+extern int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping);
 extern int nfs_setattr(struct dentry *, struct iattr *);
 extern void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr);
 extern void nfs_begin_attr_update(struct inode *);
-- 
cgit v1.2.3


From d2ccddf042c403b146159beea438c6bfc4a445e2 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 31 May 2006 01:13:38 -0400
Subject: NFS: Flesh out nfs_invalidate_page()

In the case of a call to truncate_inode_pages(), we should really try to
cancel any pending writes on the page.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/file.c            |  6 +++++-
 fs/nfs/pagelist.c        | 47 ++++++++++++++++++++++++++++-------------------
 fs/nfs/write.c           | 27 ++++++++++++++++++++++++---
 include/linux/nfs_fs.h   |  1 +
 include/linux/nfs_page.h |  4 ++--
 5 files changed, 60 insertions(+), 25 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 63154070145a..106ef0dec04d 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -303,7 +303,11 @@ static int nfs_commit_write(struct file *file, struct page *page, unsigned offse
 
 static void nfs_invalidate_page(struct page *page, unsigned long offset)
 {
-	/* FIXME: we really should cancel any unstarted writes on this page */
+	struct inode *inode = page->mapping->host;
+
+	/* Cancel any unstarted writes on this page */
+	if (offset == 0)
+		nfs_sync_inode_wait(inode, page->index, 1, FLUSH_INVALIDATE);
 }
 
 static int nfs_release_page(struct page *page, gfp_t gfp)
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 106aca388ebc..656481c0daa3 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -325,6 +325,7 @@ out:
 
 /**
  * nfs_scan_list - Scan a list for matching requests
+ * @nfsi: NFS inode
  * @head: One of the NFS inode request lists
  * @dst: Destination list
  * @idx_start: lower bound of page->index to scan
@@ -336,14 +337,15 @@ out:
  * The requests are *not* checked to ensure that they form a contiguous set.
  * You must be holding the inode's req_lock when calling this function
  */
-int
-nfs_scan_list(struct list_head *head, struct list_head *dst,
-	      unsigned long idx_start, unsigned int npages)
+int nfs_scan_list(struct nfs_inode *nfsi, struct list_head *head,
+		struct list_head *dst, unsigned long idx_start,
+		unsigned int npages)
 {
-	struct list_head	*pos, *tmp;
-	struct nfs_page		*req;
-	unsigned long		idx_end;
-	int			res;
+	struct nfs_page *pgvec[NFS_SCAN_MAXENTRIES];
+	struct nfs_page *req;
+	unsigned long idx_end;
+	int found, i;
+	int res;
 
 	res = 0;
 	if (npages == 0)
@@ -351,21 +353,28 @@ nfs_scan_list(struct list_head *head, struct list_head *dst,
 	else
 		idx_end = idx_start + npages - 1;
 
-	list_for_each_safe(pos, tmp, head) {
-
-		req = nfs_list_entry(pos);
-
-		if (req->wb_index < idx_start)
-			continue;
-		if (req->wb_index > idx_end)
+	for (;;) {
+		found = radix_tree_gang_lookup(&nfsi->nfs_page_tree,
+				(void **)&pgvec[0], idx_start,
+				NFS_SCAN_MAXENTRIES);
+		if (found <= 0)
 			break;
+		for (i = 0; i < found; i++) {
+			req = pgvec[i];
+			if (req->wb_index > idx_end)
+				goto out;
+			idx_start = req->wb_index + 1;
+			if (req->wb_list_head != head)
+				continue;
+			if (nfs_set_page_writeback_locked(req)) {
+				nfs_list_remove_request(req);
+				nfs_list_add_request(req, dst);
+				res++;
+			}
+		}
 
-		if (!nfs_set_page_writeback_locked(req))
-			continue;
-		nfs_list_remove_request(req);
-		nfs_list_add_request(req, dst);
-		res++;
 	}
+out:
 	return res;
 }
 
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index a515ec714bb6..e03abbd8302e 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -579,6 +579,17 @@ static int nfs_wait_on_requests(struct inode *inode, unsigned long idx_start, un
 	return ret;
 }
 
+static void nfs_cancel_requests(struct list_head *head)
+{
+	struct nfs_page *req;
+	while(!list_empty(head)) {
+		req = nfs_list_entry(head->next);
+		nfs_list_remove_request(req);
+		nfs_inode_remove_request(req);
+		nfs_clear_page_writeback(req);
+	}
+}
+
 /*
  * nfs_scan_dirty - Scan an inode for dirty requests
  * @inode: NFS inode to scan
@@ -623,7 +634,7 @@ nfs_scan_commit(struct inode *inode, struct list_head *dst, unsigned long idx_st
 	int res = 0;
 
 	if (nfsi->ncommit != 0) {
-		res = nfs_scan_list(&nfsi->commit, dst, idx_start, npages);
+		res = nfs_scan_list(nfsi, &nfsi->commit, dst, idx_start, npages);
 		nfsi->ncommit -= res;
 		if ((nfsi->ncommit == 0) != list_empty(&nfsi->commit))
 			printk(KERN_ERR "NFS: desynchronized value of nfs_i.ncommit.\n");
@@ -1491,15 +1502,25 @@ int nfs_sync_inode_wait(struct inode *inode, unsigned long idx_start,
 		pages = nfs_scan_dirty(inode, &head, idx_start, npages);
 		if (pages != 0) {
 			spin_unlock(&nfsi->req_lock);
-			ret = nfs_flush_list(inode, &head, pages, how);
+			if (how & FLUSH_INVALIDATE)
+				nfs_cancel_requests(&head);
+			else
+				ret = nfs_flush_list(inode, &head, pages, how);
 			spin_lock(&nfsi->req_lock);
 			continue;
 		}
 		if (nocommit)
 			break;
-		pages = nfs_scan_commit(inode, &head, 0, 0);
+		pages = nfs_scan_commit(inode, &head, idx_start, npages);
 		if (pages == 0)
 			break;
+		if (how & FLUSH_INVALIDATE) {
+			spin_unlock(&nfsi->req_lock);
+			nfs_cancel_requests(&head);
+			spin_lock(&nfsi->req_lock);
+			continue;
+		}
+		pages += nfs_scan_commit(inode, &head, 0, 0);
 		spin_unlock(&nfsi->req_lock);
 		ret = nfs_commit_list(inode, &head, how);
 		spin_lock(&nfsi->req_lock);
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 1b524b9f982a..fc48135621ed 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -61,6 +61,7 @@
 #define FLUSH_LOWPRI		8	/* low priority background flush */
 #define FLUSH_HIGHPRI		16	/* high priority memory reclaim flush */
 #define FLUSH_NOCOMMIT		32	/* Don't send the NFSv3/v4 COMMIT */
+#define FLUSH_INVALIDATE	64	/* Invalidate the page cache */
 
 #ifdef __KERNEL__
 
diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h
index 66e2ed658527..8cadb0a77a7a 100644
--- a/include/linux/nfs_page.h
+++ b/include/linux/nfs_page.h
@@ -63,8 +63,8 @@ extern	void nfs_release_request(struct nfs_page *req);
 
 extern  int nfs_scan_lock_dirty(struct nfs_inode *nfsi, struct list_head *dst,
 				unsigned long idx_start, unsigned int npages);
-extern	int nfs_scan_list(struct list_head *, struct list_head *,
-			  unsigned long, unsigned int);
+extern	int nfs_scan_list(struct nfs_inode *nfsi, struct list_head *head, struct list_head *dst,
+			  unsigned long idx_start, unsigned int npages);
 extern	int nfs_coalesce_requests(struct list_head *, struct list_head *,
 				  unsigned int);
 extern  int nfs_wait_on_request(struct nfs_page *);
-- 
cgit v1.2.3


From bb4a58bf46473e3e83d84054bbc110db3a0f85e4 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 9 Jun 2006 09:34:15 -0400
Subject: VFS: Add GPL_EXPORTED function vfs_kern_mount()

do_kern_mount() does not allow the kernel to use private mount interfaces
without exposing the same interfaces to userland. The problem is that the
filesystem is referenced by name, thus meaning that it and its mount
interface must be registered in the global filesystem list.

vfs_kern_mount() passes the struct file_system_type as an explicit
parameter in order to overcome this limitation.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/super.c            | 22 +++++++++++++++-------
 include/linux/mount.h |  5 +++++
 2 files changed, 20 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/fs/super.c b/fs/super.c
index a66f66bb8049..848be4fc67a2 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -800,17 +800,13 @@ struct super_block *get_sb_single(struct file_system_type *fs_type,
 EXPORT_SYMBOL(get_sb_single);
 
 struct vfsmount *
-do_kern_mount(const char *fstype, int flags, const char *name, void *data)
+vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data)
 {
-	struct file_system_type *type = get_fs_type(fstype);
 	struct super_block *sb = ERR_PTR(-ENOMEM);
 	struct vfsmount *mnt;
 	int error;
 	char *secdata = NULL;
 
-	if (!type)
-		return ERR_PTR(-ENODEV);
-
 	mnt = alloc_vfsmnt(name);
 	if (!mnt)
 		goto out;
@@ -841,7 +837,6 @@ do_kern_mount(const char *fstype, int flags, const char *name, void *data)
 	mnt->mnt_parent = mnt;
 	up_write(&sb->s_umount);
 	free_secdata(secdata);
-	put_filesystem(type);
 	return mnt;
 out_sb:
 	up_write(&sb->s_umount);
@@ -852,10 +847,23 @@ out_free_secdata:
 out_mnt:
 	free_vfsmnt(mnt);
 out:
-	put_filesystem(type);
 	return (struct vfsmount *)sb;
 }
 
+EXPORT_SYMBOL_GPL(vfs_kern_mount);
+
+struct vfsmount *
+do_kern_mount(const char *fstype, int flags, const char *name, void *data)
+{
+	struct file_system_type *type = get_fs_type(fstype);
+	struct vfsmount *mnt;
+	if (!type)
+		return ERR_PTR(-ENODEV);
+	mnt = vfs_kern_mount(type, flags, name, data);
+	put_filesystem(type);
+	return mnt;
+}
+
 EXPORT_SYMBOL_GPL(do_kern_mount);
 
 struct vfsmount *kern_mount(struct file_system_type *type)
diff --git a/include/linux/mount.h b/include/linux/mount.h
index b7472ae91fa4..aff68c3660f5 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -73,6 +73,11 @@ extern struct vfsmount *alloc_vfsmnt(const char *name);
 extern struct vfsmount *do_kern_mount(const char *fstype, int flags,
 				      const char *name, void *data);
 
+struct file_system_type;
+extern struct vfsmount *vfs_kern_mount(struct file_system_type *type,
+				      int flags, const char *name,
+				      void *data);
+
 struct nameidata;
 
 extern int do_add_mount(struct vfsmount *newmnt, struct nameidata *nd,
-- 
cgit v1.2.3


From 1f5ce9e93aa96a867f195ed45f6f77935175f12e Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 9 Jun 2006 09:34:16 -0400
Subject: VFS: Unexport do_kern_mount() and clean up simple_pin_fs()

Replace all module uses with the new vfs_kern_mount() interface, and fix up
simple_pin_fs().

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 Documentation/filesystems/automount-support.txt | 2 +-
 drivers/usb/core/inode.c                        | 2 +-
 fs/afs/mntpt.c                                  | 2 +-
 fs/afs/super.c                                  | 2 +-
 fs/afs/super.h                                  | 2 ++
 fs/binfmt_misc.c                                | 3 ++-
 fs/configfs/mount.c                             | 2 +-
 fs/debugfs/inode.c                              | 2 +-
 fs/libfs.c                                      | 4 ++--
 fs/super.c                                      | 4 +---
 include/linux/fs.h                              | 2 +-
 mm/shmem.c                                      | 2 +-
 net/sunrpc/rpc_pipe.c                           | 2 +-
 security/inode.c                                | 2 +-
 14 files changed, 17 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/filesystems/automount-support.txt b/Documentation/filesystems/automount-support.txt
index 58c65a1713e5..7cac200e2a85 100644
--- a/Documentation/filesystems/automount-support.txt
+++ b/Documentation/filesystems/automount-support.txt
@@ -19,7 +19,7 @@ following procedure:
 
  (2) Have the follow_link() op do the following steps:
 
-     (a) Call do_kern_mount() to call the appropriate filesystem to set up a
+     (a) Call vfs_kern_mount() to call the appropriate filesystem to set up a
          superblock and gain a vfsmount structure representing it.
 
      (b) Copy the nameidata provided as an argument and substitute the dentry
diff --git a/drivers/usb/core/inode.c b/drivers/usb/core/inode.c
index 3cf945cc5b9a..695b90a17a68 100644
--- a/drivers/usb/core/inode.c
+++ b/drivers/usb/core/inode.c
@@ -569,7 +569,7 @@ static int create_special_files (void)
 	ignore_mount = 1;
 
 	/* create the devices special file */
-	retval = simple_pin_fs("usbfs", &usbfs_mount, &usbfs_mount_count);
+	retval = simple_pin_fs(&usb_fs_type, &usbfs_mount, &usbfs_mount_count);
 	if (retval) {
 		err ("Unable to get usbfs mount");
 		goto exit;
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index 4e6eeb59b83c..7b6dc03caf44 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -210,7 +210,7 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
 
 	/* try and do the mount */
 	kdebug("--- attempting mount %s -o %s ---", devname, options);
-	mnt = do_kern_mount("afs", 0, devname, options);
+	mnt = vfs_kern_mount(&afs_fs_type, 0, devname, options);
 	kdebug("--- mount result %p ---", mnt);
 
 	free_page((unsigned long) devname);
diff --git a/fs/afs/super.c b/fs/afs/super.c
index 53c56e7231ab..93a7821db0d7 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -48,7 +48,7 @@ static void afs_put_super(struct super_block *sb);
 
 static void afs_destroy_inode(struct inode *inode);
 
-static struct file_system_type afs_fs_type = {
+struct file_system_type afs_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "afs",
 	.get_sb		= afs_get_sb,
diff --git a/fs/afs/super.h b/fs/afs/super.h
index ac11362f4e95..32de8cc6fae8 100644
--- a/fs/afs/super.h
+++ b/fs/afs/super.h
@@ -38,6 +38,8 @@ static inline struct afs_super_info *AFS_FS_S(struct super_block *sb)
 	return sb->s_fs_info;
 }
 
+extern struct file_system_type afs_fs_type;
+
 #endif /* __KERNEL__ */
 
 #endif /* _LINUX_AFS_SUPER_H */
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index d73d75591a39..c0a909e1d290 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -55,6 +55,7 @@ typedef struct {
 } Node;
 
 static DEFINE_RWLOCK(entries_lock);
+static struct file_system_type bm_fs_type;
 static struct vfsmount *bm_mnt;
 static int entry_count;
 
@@ -638,7 +639,7 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer,
 	if (!inode)
 		goto out2;
 
-	err = simple_pin_fs("binfmt_misc", &bm_mnt, &entry_count);
+	err = simple_pin_fs(&bm_fs_type, &bm_mnt, &entry_count);
 	if (err) {
 		iput(inode);
 		inode = NULL;
diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c
index f920d30478e5..be5d86ae56f0 100644
--- a/fs/configfs/mount.c
+++ b/fs/configfs/mount.c
@@ -118,7 +118,7 @@ static struct file_system_type configfs_fs_type = {
 
 int configfs_pin_fs(void)
 {
-	return simple_pin_fs("configfs", &configfs_mount,
+	return simple_pin_fs(&configfs_fs_type, &configfs_mount,
 			     &configfs_mnt_count);
 }
 
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index b55b4ea9a676..90f9417181fd 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -199,7 +199,7 @@ struct dentry *debugfs_create_file(const char *name, mode_t mode,
 
 	pr_debug("debugfs: creating file '%s'\n",name);
 
-	error = simple_pin_fs("debugfs", &debugfs_mount, &debugfs_mount_count);
+	error = simple_pin_fs(&debug_fs_type, &debugfs_mount, &debugfs_mount_count);
 	if (error)
 		goto exit;
 
diff --git a/fs/libfs.c b/fs/libfs.c
index 7145ba7a48d0..4a3ec9ad8bed 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -424,13 +424,13 @@ out:
 
 static DEFINE_SPINLOCK(pin_fs_lock);
 
-int simple_pin_fs(char *name, struct vfsmount **mount, int *count)
+int simple_pin_fs(struct file_system_type *type, struct vfsmount **mount, int *count)
 {
 	struct vfsmount *mnt = NULL;
 	spin_lock(&pin_fs_lock);
 	if (unlikely(!*mount)) {
 		spin_unlock(&pin_fs_lock);
-		mnt = do_kern_mount(name, 0, name, NULL);
+		mnt = vfs_kern_mount(type, 0, type->name, NULL);
 		if (IS_ERR(mnt))
 			return PTR_ERR(mnt);
 		spin_lock(&pin_fs_lock);
diff --git a/fs/super.c b/fs/super.c
index 848be4fc67a2..15f2afdbf82e 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -864,11 +864,9 @@ do_kern_mount(const char *fstype, int flags, const char *name, void *data)
 	return mnt;
 }
 
-EXPORT_SYMBOL_GPL(do_kern_mount);
-
 struct vfsmount *kern_mount(struct file_system_type *type)
 {
-	return do_kern_mount(type->name, 0, type->name, NULL);
+	return vfs_kern_mount(type, 0, type->name, NULL);
 }
 
 EXPORT_SYMBOL(kern_mount);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index f813bc8266aa..eca70cfe5b85 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1763,7 +1763,7 @@ extern struct inode_operations simple_dir_inode_operations;
 struct tree_descr { char *name; const struct file_operations *ops; int mode; };
 struct dentry *d_alloc_name(struct dentry *, const char *);
 extern int simple_fill_super(struct super_block *, int, struct tree_descr *);
-extern int simple_pin_fs(char *name, struct vfsmount **mount, int *count);
+extern int simple_pin_fs(struct file_system_type *, struct vfsmount **mount, int *count);
 extern void simple_release_fs(struct vfsmount **mount, int *count);
 
 extern ssize_t simple_read_from_buffer(void __user *, size_t, loff_t *, const void *, size_t);
diff --git a/mm/shmem.c b/mm/shmem.c
index 4c5e68e4e9ae..8184342440f0 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2261,7 +2261,7 @@ static int __init init_tmpfs(void)
 #ifdef CONFIG_TMPFS
 	devfs_mk_dir("shm");
 #endif
-	shm_mnt = do_kern_mount(tmpfs_fs_type.name, MS_NOUSER,
+	shm_mnt = vfs_kern_mount(&tmpfs_fs_type, MS_NOUSER,
 				tmpfs_fs_type.name, NULL);
 	if (IS_ERR(shm_mnt)) {
 		error = PTR_ERR(shm_mnt);
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index cc673dd8433f..a5226df8ac03 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -439,7 +439,7 @@ struct vfsmount *rpc_get_mount(void)
 {
 	int err;
 
-	err = simple_pin_fs("rpc_pipefs", &rpc_mount, &rpc_mount_count);
+	err = simple_pin_fs(&rpc_pipe_fs_type, &rpc_mount, &rpc_mount_count);
 	if (err != 0)
 		return ERR_PTR(err);
 	return rpc_mount;
diff --git a/security/inode.c b/security/inode.c
index 0f77b0223662..8bf40625c670 100644
--- a/security/inode.c
+++ b/security/inode.c
@@ -224,7 +224,7 @@ struct dentry *securityfs_create_file(const char *name, mode_t mode,
 
 	pr_debug("securityfs: creating file '%s'\n",name);
 
-	error = simple_pin_fs("securityfs", &mount, &mount_count);
+	error = simple_pin_fs(&fs_type, &mount, &mount_count);
 	if (error) {
 		dentry = ERR_PTR(error);
 		goto exit;
-- 
cgit v1.2.3


From 5528f911b4c43a5de5da34bcbd7e3f2a62503617 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 9 Jun 2006 09:34:17 -0400
Subject: VFS: Add shrink_submounts()

Allow a submount to be marked as being 'shrinkable' by means of the
vfsmount->mnt_flags, and then add a function 'shrink_submounts()' which
attempts to recursively unmount these submounts.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/namespace.c        | 124 ++++++++++++++++++++++++++++++++++++++++----------
 include/linux/mount.h |   3 ++
 2 files changed, 102 insertions(+), 25 deletions(-)

(limited to 'include/linux')

diff --git a/fs/namespace.c b/fs/namespace.c
index bf478addb852..b22e469ab560 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1162,6 +1162,40 @@ static void expire_mount(struct vfsmount *mnt, struct list_head *mounts,
 	}
 }
 
+/*
+ * go through the vfsmounts we've just consigned to the graveyard to
+ * - check that they're still dead
+ * - delete the vfsmount from the appropriate namespace under lock
+ * - dispose of the corpse
+ */
+static void expire_mount_list(struct list_head *graveyard, struct list_head *mounts)
+{
+	struct namespace *namespace;
+	struct vfsmount *mnt;
+
+	while (!list_empty(graveyard)) {
+		LIST_HEAD(umounts);
+		mnt = list_entry(graveyard->next, struct vfsmount, mnt_expire);
+		list_del_init(&mnt->mnt_expire);
+
+		/* don't do anything if the namespace is dead - all the
+		 * vfsmounts from it are going away anyway */
+		namespace = mnt->mnt_namespace;
+		if (!namespace || !namespace->root)
+			continue;
+		get_namespace(namespace);
+
+		spin_unlock(&vfsmount_lock);
+		down_write(&namespace_sem);
+		expire_mount(mnt, mounts, &umounts);
+		up_write(&namespace_sem);
+		release_mounts(&umounts);
+		mntput(mnt);
+		put_namespace(namespace);
+		spin_lock(&vfsmount_lock);
+	}
+}
+
 /*
  * process a list of expirable mountpoints with the intent of discarding any
  * mountpoints that aren't in use and haven't been touched since last we came
@@ -1169,7 +1203,6 @@ static void expire_mount(struct vfsmount *mnt, struct list_head *mounts,
  */
 void mark_mounts_for_expiry(struct list_head *mounts)
 {
-	struct namespace *namespace;
 	struct vfsmount *mnt, *next;
 	LIST_HEAD(graveyard);
 
@@ -1193,38 +1226,79 @@ void mark_mounts_for_expiry(struct list_head *mounts)
 		list_move(&mnt->mnt_expire, &graveyard);
 	}
 
-	/*
-	 * go through the vfsmounts we've just consigned to the graveyard to
-	 * - check that they're still dead
-	 * - delete the vfsmount from the appropriate namespace under lock
-	 * - dispose of the corpse
-	 */
-	while (!list_empty(&graveyard)) {
-		LIST_HEAD(umounts);
-		mnt = list_entry(graveyard.next, struct vfsmount, mnt_expire);
-		list_del_init(&mnt->mnt_expire);
+	expire_mount_list(&graveyard, mounts);
 
-		/* don't do anything if the namespace is dead - all the
-		 * vfsmounts from it are going away anyway */
-		namespace = mnt->mnt_namespace;
-		if (!namespace || !namespace->root)
+	spin_unlock(&vfsmount_lock);
+}
+
+EXPORT_SYMBOL_GPL(mark_mounts_for_expiry);
+
+/*
+ * Ripoff of 'select_parent()'
+ *
+ * search the list of submounts for a given mountpoint, and move any
+ * shrinkable submounts to the 'graveyard' list.
+ */
+static int select_submounts(struct vfsmount *parent, struct list_head *graveyard)
+{
+	struct vfsmount *this_parent = parent;
+	struct list_head *next;
+	int found = 0;
+
+repeat:
+	next = this_parent->mnt_mounts.next;
+resume:
+	while (next != &this_parent->mnt_mounts) {
+		struct list_head *tmp = next;
+		struct vfsmount *mnt = list_entry(tmp, struct vfsmount, mnt_child);
+
+		next = tmp->next;
+		if (!(mnt->mnt_flags & MNT_SHRINKABLE))
 			continue;
-		get_namespace(namespace);
+		/*
+		 * Descend a level if the d_mounts list is non-empty.
+		 */
+		if (!list_empty(&mnt->mnt_mounts)) {
+			this_parent = mnt;
+			goto repeat;
+		}
 
-		spin_unlock(&vfsmount_lock);
-		down_write(&namespace_sem);
-		expire_mount(mnt, mounts, &umounts);
-		up_write(&namespace_sem);
-		release_mounts(&umounts);
-		mntput(mnt);
-		put_namespace(namespace);
-		spin_lock(&vfsmount_lock);
+		if (!propagate_mount_busy(mnt, 1)) {
+			mntget(mnt);
+			list_move_tail(&mnt->mnt_expire, graveyard);
+			found++;
+		}
 	}
+	/*
+	 * All done at this level ... ascend and resume the search
+	 */
+	if (this_parent != parent) {
+		next = this_parent->mnt_child.next;
+		this_parent = this_parent->mnt_parent;
+		goto resume;
+	}
+	return found;
+}
+
+/*
+ * process a list of expirable mountpoints with the intent of discarding any
+ * submounts of a specific parent mountpoint
+ */
+void shrink_submounts(struct vfsmount *mountpoint, struct list_head *mounts)
+{
+	LIST_HEAD(graveyard);
+	int found;
+
+	spin_lock(&vfsmount_lock);
+
+	/* extract submounts of 'mountpoint' from the expiration list */
+	while ((found = select_submounts(mountpoint, &graveyard)) != 0)
+		expire_mount_list(&graveyard, mounts);
 
 	spin_unlock(&vfsmount_lock);
 }
 
-EXPORT_SYMBOL_GPL(mark_mounts_for_expiry);
+EXPORT_SYMBOL_GPL(shrink_submounts);
 
 /*
  * Some copy_from_user() implementations do not return the exact number of
diff --git a/include/linux/mount.h b/include/linux/mount.h
index aff68c3660f5..9b4e0071b92e 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -23,6 +23,8 @@
 #define MNT_NOATIME	0x08
 #define MNT_NODIRATIME	0x10
 
+#define MNT_SHRINKABLE	0x100
+
 #define MNT_SHARED	0x1000	/* if the vfsmount is a shared mount */
 #define MNT_UNBINDABLE	0x2000	/* if the vfsmount is a unbindable mount */
 #define MNT_PNODE_MASK	0x3000	/* propogation flag mask */
@@ -84,6 +86,7 @@ extern int do_add_mount(struct vfsmount *newmnt, struct nameidata *nd,
 			int mnt_flags, struct list_head *fslist);
 
 extern void mark_mounts_for_expiry(struct list_head *mounts);
+extern void shrink_submounts(struct vfsmount *mountpoint, struct list_head *mounts);
 
 extern spinlock_t vfsmount_lock;
 extern dev_t name_to_dev_t(char *name);
-- 
cgit v1.2.3


From 8b512d9a88875affe584bb3d2a7a235f84343b9e Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 9 Jun 2006 09:34:18 -0400
Subject: VFS: Remove dependency of ->umount_begin() call on MNT_FORCE

Allow filesystems to decide to perform pre-umount processing whether or not
MNT_FORCE is set.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/9p/vfs_super.c  |  7 ++++---
 fs/cifs/cifsfs.c   |  6 ++++--
 fs/fuse/inode.c    |  5 +++--
 fs/namespace.c     |  4 ++--
 fs/nfs/inode.c     | 14 +++++++++-----
 include/linux/fs.h |  2 +-
 6 files changed, 23 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 61c599b4a1e3..00c1f6baf870 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -253,11 +253,12 @@ static int v9fs_show_options(struct seq_file *m, struct vfsmount *mnt)
 }
 
 static void
-v9fs_umount_begin(struct super_block *sb)
+v9fs_umount_begin(struct vfsmount *vfsmnt, int flags)
 {
-	struct v9fs_session_info *v9ses = sb->s_fs_info;
+	struct v9fs_session_info *v9ses = vfsmnt->mnt_sb->s_fs_info;
 
-	v9fs_session_cancel(v9ses);
+	if (flags & MNT_FORCE)
+		v9fs_session_cancel(v9ses);
 }
 
 static struct super_operations v9fs_super_ops = {
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index c262d8874ce9..3fdc2258f447 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -402,12 +402,14 @@ static struct quotactl_ops cifs_quotactl_ops = {
 #endif
 
 #ifdef CONFIG_CIFS_EXPERIMENTAL
-static void cifs_umount_begin(struct super_block * sblock)
+static void cifs_umount_begin(struct vfsmount * vfsmnt, int flags)
 {
 	struct cifs_sb_info *cifs_sb;
 	struct cifsTconInfo * tcon;
 
-	cifs_sb = CIFS_SB(sblock);
+	if (!(flags & MNT_FORCE))
+		return;
+	cifs_sb = CIFS_SB(vfsmnt->mnt_sb);
 	if(cifs_sb == NULL)
 		return;
 
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 7627022446b2..13ebe5780c93 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -195,9 +195,10 @@ struct inode *fuse_iget(struct super_block *sb, unsigned long nodeid,
 	return inode;
 }
 
-static void fuse_umount_begin(struct super_block *sb)
+static void fuse_umount_begin(struct vfsmount *vfsmnt, int flags)
 {
-	fuse_abort_conn(get_fuse_conn_super(sb));
+	if (flags & MNT_FORCE)
+		fuse_abort_conn(get_fuse_conn_super(vfsmnt->mnt_sb));
 }
 
 static void fuse_put_super(struct super_block *sb)
diff --git a/fs/namespace.c b/fs/namespace.c
index b22e469ab560..6bb0b85293e7 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -576,8 +576,8 @@ static int do_umount(struct vfsmount *mnt, int flags)
 	 */
 
 	lock_kernel();
-	if ((flags & MNT_FORCE) && sb->s_op->umount_begin)
-		sb->s_op->umount_begin(sb);
+	if (sb->s_op->umount_begin)
+		sb->s_op->umount_begin(mnt, flags);
 	unlock_kernel();
 
 	/*
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 9ff039f9a836..fda2b4966179 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -63,7 +63,7 @@ static struct inode *nfs_alloc_inode(struct super_block *sb);
 static void nfs_destroy_inode(struct inode *);
 static int nfs_write_inode(struct inode *,int);
 static void nfs_clear_inode(struct inode *);
-static void nfs_umount_begin(struct super_block *);
+static void nfs_umount_begin(struct vfsmount *, int);
 static int  nfs_statfs(struct super_block *, struct kstatfs *);
 static int  nfs_show_options(struct seq_file *, struct vfsmount *);
 static int  nfs_show_stats(struct seq_file *, struct vfsmount *);
@@ -162,15 +162,19 @@ nfs_clear_inode(struct inode *inode)
 	BUG_ON(atomic_read(&nfsi->data_updates) != 0);
 }
 
-void
-nfs_umount_begin(struct super_block *sb)
+static void nfs_umount_begin(struct vfsmount *vfsmnt, int flags)
 {
-	struct rpc_clnt	*rpc = NFS_SB(sb)->client;
+	struct nfs_server *server;
+	struct rpc_clnt	*rpc;
 
+	if (!(flags & MNT_FORCE))
+		return;
 	/* -EIO all pending I/O */
+	server = NFS_SB(vfsmnt->mnt_sb);
+	rpc = server->client;
 	if (!IS_ERR(rpc))
 		rpc_killall_tasks(rpc);
-	rpc = NFS_SB(sb)->client_acl;
+	rpc = server->client_acl;
 	if (!IS_ERR(rpc))
 		rpc_killall_tasks(rpc);
 }
diff --git a/include/linux/fs.h b/include/linux/fs.h
index eca70cfe5b85..1d80ba747484 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1101,7 +1101,7 @@ struct super_operations {
 	int (*statfs) (struct super_block *, struct kstatfs *);
 	int (*remount_fs) (struct super_block *, int *, char *);
 	void (*clear_inode) (struct inode *);
-	void (*umount_begin) (struct super_block *);
+	void (*umount_begin) (struct vfsmount *, int);
 
 	int (*show_options)(struct seq_file *, struct vfsmount *);
 	int (*show_stats)(struct seq_file *, struct vfsmount *);
-- 
cgit v1.2.3


From 8b4bdcf8995dd92b23d2ec22b32aee8fbbb50e1c Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 9 Jun 2006 09:34:19 -0400
Subject: NFS: Store the file system "fsid" value in the NFS super block.

This should enable us to detect if we are crossing a mountpoint in the
case where the server is exporting "nohide" mounts.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/idmap.c            |  1 -
 fs/nfs/inode.c            |  8 ++++++++
 fs/nfs/nfs2xdr.c          |  3 ++-
 fs/nfs/nfs3xdr.c          |  3 ++-
 fs/nfs/nfs4xdr.c          |  4 ++--
 include/linux/nfs_fs.h    |  5 +++--
 include/linux/nfs_fs_sb.h |  1 +
 include/linux/nfs_page.h  |  1 -
 include/linux/nfs_xdr.h   | 19 ++++++++++++-------
 9 files changed, 30 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index 3fab5b0cfc5a..b81e7ed3c902 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -47,7 +47,6 @@
 #include <linux/workqueue.h>
 #include <linux/sunrpc/rpc_pipe_fs.h>
 
-#include <linux/nfs_fs_sb.h>
 #include <linux/nfs_fs.h>
 
 #include <linux/nfs_idmap.h>
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index fda2b4966179..1a809f6f8989 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -236,6 +236,7 @@ nfs_get_root(struct super_block *sb, struct nfs_fh *rootfh, struct nfs_fsinfo *f
 		return ERR_PTR(error);
 	}
 
+	server->fsid = fsinfo->fattr->fsid;
 	return nfs_fhget(sb, rootfh, fsinfo->fattr);
 }
 
@@ -1493,6 +1494,7 @@ out:
  */
 static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 {
+	struct nfs_server *server;
 	struct nfs_inode *nfsi = NFS_I(inode);
 	loff_t cur_isize, new_isize;
 	unsigned int	invalid = 0;
@@ -1511,6 +1513,12 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 	if ((inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT))
 		goto out_changed;
 
+	server = NFS_SERVER(inode);
+	/* Update the fsid if and only if this is the root directory */
+	if (inode == inode->i_sb->s_root->d_inode
+			&& !nfs_fsid_equal(&server->fsid, &fattr->fsid))
+		server->fsid = fattr->fsid;
+
 	/*
 	 * Update the read time so we don't revalidate too often.
 	 */
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index f0015fa876e1..a7ed88f97a11 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -131,7 +131,8 @@ xdr_decode_fattr(u32 *p, struct nfs_fattr *fattr)
 	fattr->du.nfs2.blocksize = ntohl(*p++);
 	rdev = ntohl(*p++);
 	fattr->du.nfs2.blocks = ntohl(*p++);
-	fattr->fsid_u.nfs3 = ntohl(*p++);
+	fattr->fsid.major = ntohl(*p++);
+	fattr->fsid.minor = 0;
 	fattr->fileid = ntohl(*p++);
 	p = xdr_decode_time(p, &fattr->atime);
 	p = xdr_decode_time(p, &fattr->mtime);
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index ec233619687e..f70eee2cac05 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -166,7 +166,8 @@ xdr_decode_fattr(u32 *p, struct nfs_fattr *fattr)
 	if (MAJOR(fattr->rdev) != major || MINOR(fattr->rdev) != minor)
 		fattr->rdev = 0;
 
-	p = xdr_decode_hyper(p, &fattr->fsid_u.nfs3);
+	p = xdr_decode_hyper(p, &fattr->fsid.major);
+	fattr->fsid.minor = 0;
 	p = xdr_decode_hyper(p, &fattr->fileid);
 	p = xdr_decode_time3(p, &fattr->atime);
 	p = xdr_decode_time3(p, &fattr->mtime);
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 7e9a840057f2..0d5794675944 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -2217,7 +2217,7 @@ static int decode_attr_symlink_support(struct xdr_stream *xdr, uint32_t *bitmap,
 	return 0;
 }
 
-static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs4_fsid *fsid)
+static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fsid *fsid)
 {
 	uint32_t *p;
 
@@ -2863,7 +2863,7 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, cons
 		goto xdr_error;
 	if ((status = decode_attr_size(xdr, bitmap, &fattr->size)) != 0)
 		goto xdr_error;
-	if ((status = decode_attr_fsid(xdr, bitmap, &fattr->fsid_u.nfs4)) != 0)
+	if ((status = decode_attr_fsid(xdr, bitmap, &fattr->fsid)) != 0)
 		goto xdr_error;
 	if ((status = decode_attr_fileid(xdr, bitmap, &fattr->fileid)) != 0)
 		goto xdr_error;
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index fc48135621ed..6763a0089ee4 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -16,8 +16,6 @@
 #include <linux/rwsem.h>
 #include <linux/wait.h>
 
-#include <linux/nfs_fs_sb.h>
-
 #include <linux/sunrpc/debug.h>
 #include <linux/sunrpc/auth.h>
 #include <linux/sunrpc/clnt.h>
@@ -27,6 +25,9 @@
 #include <linux/nfs3.h>
 #include <linux/nfs4.h>
 #include <linux/nfs_xdr.h>
+
+#include <linux/nfs_fs_sb.h>
+
 #include <linux/rwsem.h>
 #include <linux/mempool.h>
 
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index 65dec21af774..6b4a13c79474 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -35,6 +35,7 @@ struct nfs_server {
 	char *			hostname;	/* remote hostname */
 	struct nfs_fh		fh;
 	struct sockaddr_in	addr;
+	struct nfs_fsid		fsid;
 	unsigned long		mount_time;	/* when this fs was mounted */
 #ifdef CONFIG_NFS_V4
 	/* Our own IP address, as a null-terminated string.
diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h
index 8cadb0a77a7a..1f7bd287c230 100644
--- a/include/linux/nfs_page.h
+++ b/include/linux/nfs_page.h
@@ -13,7 +13,6 @@
 #include <linux/list.h>
 #include <linux/pagemap.h>
 #include <linux/wait.h>
-#include <linux/nfs_fs_sb.h>
 #include <linux/sunrpc/auth.h>
 #include <linux/nfs_xdr.h>
 
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index e206c07080fe..95682f7d738a 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -14,11 +14,19 @@
 #define NFS_DEF_FILE_IO_SIZE	(4096U)
 #define NFS_MIN_FILE_IO_SIZE	(1024U)
 
-struct nfs4_fsid {
-	__u64 major;
-	__u64 minor;
+struct nfs_fsid {
+	uint64_t		major;
+	uint64_t		minor;
 };
 
+/*
+ * Helper for checking equality between 2 fsids.
+ */
+static inline int nfs_fsid_equal(const struct nfs_fsid *a, const struct nfs_fsid *b)
+{
+	return a->major == b->major && a->minor == b->minor;
+}
+
 struct nfs_fattr {
 	unsigned short		valid;		/* which fields are valid */
 	__u64			pre_size;	/* pre_op_attr.size	  */
@@ -40,10 +48,7 @@ struct nfs_fattr {
 		} nfs3;
 	} du;
 	dev_t			rdev;
-	union {
-		__u64		nfs3;		/* also nfs2 */
-		struct nfs4_fsid nfs4;
-	} fsid_u;
+	struct nfs_fsid		fsid;
 	__u64			fileid;
 	struct timespec		atime;
 	struct timespec		mtime;
-- 
cgit v1.2.3


From 55a975937d40cac582e981ddc8ed783b3dcc043c Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 9 Jun 2006 09:34:19 -0400
Subject: NFS: Ensure the client submounts, when it crosses a server
 mountpoint.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/Makefile        |   3 +-
 fs/nfs/dir.c           |  16 +++
 fs/nfs/inode.c         | 303 ++++++++++++++++++++++++++++++++++++++++++++++++-
 fs/nfs/namespace.c     |  89 +++++++++++++++
 fs/nfs/nfs4_fs.h       |   1 +
 fs/nfs/nfs4proc.c      |   2 +-
 include/linux/nfs_fs.h |   9 ++
 7 files changed, 418 insertions(+), 5 deletions(-)
 create mode 100644 fs/nfs/namespace.c

(limited to 'include/linux')

diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index ec61fd56a1a9..d9d494cee388 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -5,7 +5,8 @@
 obj-$(CONFIG_NFS_FS) += nfs.o
 
 nfs-y 			:= dir.o file.o inode.o nfs2xdr.o pagelist.o \
-			   proc.o read.o symlink.o unlink.o write.o
+			   proc.o read.o symlink.o unlink.o write.o \
+			   namespace.o
 nfs-$(CONFIG_ROOT_NFS)	+= nfsroot.o mount_clnt.o      
 nfs-$(CONFIG_NFS_V3)	+= nfs3proc.o nfs3xdr.o
 nfs-$(CONFIG_NFS_V3_ACL)	+= nfs3acl.o
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 1d3d8922a663..3ddda6f7ecc2 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -868,6 +868,17 @@ int nfs_is_exclusive_create(struct inode *dir, struct nameidata *nd)
 	return (nd->intent.open.flags & O_EXCL) != 0;
 }
 
+static inline int nfs_reval_fsid(struct inode *dir,
+		struct nfs_fh *fh, struct nfs_fattr *fattr)
+{
+	struct nfs_server *server = NFS_SERVER(dir);
+
+	if (!nfs_fsid_equal(&server->fsid, &fattr->fsid))
+		/* Revalidate fsid on root dir */
+		return __nfs_revalidate_inode(server, dir->i_sb->s_root->d_inode);
+	return 0;
+}
+
 static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd)
 {
 	struct dentry *res;
@@ -900,6 +911,11 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru
 		res = ERR_PTR(error);
 		goto out_unlock;
 	}
+	error = nfs_reval_fsid(dir, &fhandle, &fattr);
+	if (error < 0) {
+		res = ERR_PTR(error);
+		goto out_unlock;
+	}
 	inode = nfs_fhget(dentry->d_sb, &fhandle, &fattr);
 	res = (struct dentry *)inode;
 	if (IS_ERR(res))
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 1a809f6f8989..47167ab64f5b 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -221,6 +221,14 @@ nfs_block_size(unsigned long bsize, unsigned char *nrbitsp)
 	return nfs_block_bits(bsize, nrbitsp);
 }
 
+static inline void
+nfs_super_set_maxbytes(struct super_block *sb, __u64 maxfilesize)
+{
+	sb->s_maxbytes = (loff_t)maxfilesize;
+	if (sb->s_maxbytes > MAX_LFS_FILESIZE || sb->s_maxbytes <= 0)
+		sb->s_maxbytes = MAX_LFS_FILESIZE;
+}
+
 /*
  * Obtain the root inode of the file system.
  */
@@ -331,9 +339,7 @@ nfs_sb_init(struct super_block *sb, rpc_authflavor_t authflavor)
 	}
 	server->backing_dev_info.ra_pages = server->rpages * NFS_MAX_READAHEAD;
 
-	sb->s_maxbytes = fsinfo.maxfilesize;
-	if (sb->s_maxbytes > MAX_LFS_FILESIZE) 
-		sb->s_maxbytes = MAX_LFS_FILESIZE; 
+	nfs_super_set_maxbytes(sb, fsinfo.maxfilesize);
 
 	server->client->cl_intr = (server->flags & NFS_MOUNT_INTR) ? 1 : 0;
 	server->client->cl_softrtry = (server->flags & NFS_MOUNT_SOFT) ? 1 : 0;
@@ -877,6 +883,11 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
 			if (nfs_server_capable(inode, NFS_CAP_READDIRPLUS)
 			    && fattr->size <= NFS_LIMIT_READDIRPLUS)
 				set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_FLAGS(inode));
+			/* Deal with crossing mountpoints */
+			if (!nfs_fsid_equal(&NFS_SB(sb)->fsid, &fattr->fsid)) {
+				inode->i_op = &nfs_mountpoint_inode_operations;
+				inode->i_fop = NULL;
+			}
 		} else if (S_ISLNK(inode->i_mode))
 			inode->i_op = &nfs_symlink_inode_operations;
 		else
@@ -1650,6 +1661,141 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
  * File system information
  */
 
+/*
+ * nfs_path - reconstruct the path given an arbitrary dentry
+ * @base - arbitrary string to prepend to the path
+ * @dentry - pointer to dentry
+ * @buffer - result buffer
+ * @buflen - length of buffer
+ *
+ * Helper function for constructing the path from the
+ * root dentry to an arbitrary hashed dentry.
+ *
+ * This is mainly for use in figuring out the path on the
+ * server side when automounting on top of an existing partition.
+ */
+static char *nfs_path(const char *base, const struct dentry *dentry,
+		      char *buffer, ssize_t buflen)
+{
+	char *end = buffer+buflen;
+	int namelen;
+
+	*--end = '\0';
+	buflen--;
+	spin_lock(&dcache_lock);
+	while (!IS_ROOT(dentry)) {
+		namelen = dentry->d_name.len;
+		buflen -= namelen + 1;
+		if (buflen < 0)
+			goto Elong;
+		end -= namelen;
+		memcpy(end, dentry->d_name.name, namelen);
+		*--end = '/';
+		dentry = dentry->d_parent;
+	}
+	spin_unlock(&dcache_lock);
+	namelen = strlen(base);
+	/* Strip off excess slashes in base string */
+	while (namelen > 0 && base[namelen - 1] == '/')
+		namelen--;
+	buflen -= namelen;
+	if (buflen < 0)
+		goto Elong;
+	end -= namelen;
+	memcpy(end, base, namelen);
+	return end;
+Elong:
+	return ERR_PTR(-ENAMETOOLONG);
+}
+
+struct nfs_clone_mount {
+	const struct super_block *sb;
+	const struct dentry *dentry;
+	struct nfs_fh *fh;
+	struct nfs_fattr *fattr;
+};
+
+static struct super_block *nfs_clone_generic_sb(struct nfs_clone_mount *data,
+		struct super_block *(*clone_client)(struct nfs_server *, struct nfs_clone_mount *))
+{
+	struct nfs_server *server;
+	struct nfs_server *parent = NFS_SB(data->sb);
+	struct super_block *sb = ERR_PTR(-EINVAL);
+	void *err = ERR_PTR(-ENOMEM);
+	struct inode *root_inode;
+	struct nfs_fsinfo fsinfo;
+	int len;
+
+	server = kmalloc(sizeof(struct nfs_server), GFP_KERNEL);
+	if (server == NULL)
+		goto out_err;
+	memcpy(server, parent, sizeof(*server));
+	len = strlen(parent->hostname) + 1;
+	server->hostname = kmalloc(len, GFP_KERNEL);
+	if (server->hostname == NULL)
+		goto free_server;
+	memcpy(server->hostname, parent->hostname, len);
+	server->fsid = data->fattr->fsid;
+	nfs_copy_fh(&server->fh, data->fh);
+	if (rpciod_up() != 0)
+		goto free_hostname;
+
+	sb = clone_client(server, data);
+	if (IS_ERR((err = sb)) || sb->s_root)
+		goto kill_rpciod;
+
+	sb->s_op = data->sb->s_op;
+	sb->s_blocksize = data->sb->s_blocksize;
+	sb->s_blocksize_bits = data->sb->s_blocksize_bits;
+	sb->s_maxbytes = data->sb->s_maxbytes;
+
+	server->client_sys = server->client_acl = ERR_PTR(-EINVAL);
+	err = ERR_PTR(-ENOMEM);
+	server->io_stats = nfs_alloc_iostats();
+	if (server->io_stats == NULL)
+		goto out_deactivate;
+
+	server->client = rpc_clone_client(parent->client);
+	if (IS_ERR((err = server->client)))
+		goto out_deactivate;
+	if (!IS_ERR(parent->client_sys)) {
+		server->client_sys = rpc_clone_client(parent->client_sys);
+		if (IS_ERR((err = server->client_sys)))
+			goto out_deactivate;
+	}
+	if (!IS_ERR(parent->client_acl)) {
+		server->client_acl = rpc_clone_client(parent->client_acl);
+		if (IS_ERR((err = server->client_acl)))
+			goto out_deactivate;
+	}
+	root_inode = nfs_fhget(sb, data->fh, data->fattr);
+	if (!root_inode)
+		goto out_deactivate;
+	sb->s_root = d_alloc_root(root_inode);
+	if (!sb->s_root)
+		goto out_put_root;
+	fsinfo.fattr = data->fattr;
+	if (NFS_PROTO(root_inode)->fsinfo(server, data->fh, &fsinfo) == 0)
+		nfs_super_set_maxbytes(sb, fsinfo.maxfilesize);
+	sb->s_root->d_op = server->rpc_ops->dentry_ops;
+	sb->s_flags |= MS_ACTIVE;
+	return sb;
+out_put_root:
+	iput(root_inode);
+out_deactivate:
+	up_write(&sb->s_umount);
+	deactivate_super(sb);
+	return (struct super_block *)err;
+kill_rpciod:
+	rpciod_down();
+free_hostname:
+	kfree(server->hostname);
+free_server:
+	kfree(server);
+out_err:
+	return (struct super_block *)err;
+}
+
 static int nfs_set_super(struct super_block *s, void *data)
 {
 	s->s_fs_info = data;
@@ -1807,6 +1953,31 @@ static struct file_system_type nfs_fs_type = {
 	.fs_flags	= FS_ODD_RENAME|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
 };
 
+static struct super_block *nfs_clone_client(struct nfs_server *server, struct nfs_clone_mount *data)
+{
+	struct super_block *sb;
+
+	sb = sget(&nfs_fs_type, nfs_compare_super, nfs_set_super, server);
+	if (!IS_ERR(sb) && sb->s_root == NULL && !(server->flags & NFS_MOUNT_NONLM))
+		lockd_up();
+	return sb;
+}
+
+static struct super_block *nfs_clone_nfs_sb(struct file_system_type *fs_type,
+		int flags, const char *dev_name, void *raw_data)
+{
+	struct nfs_clone_mount *data = raw_data;
+	return nfs_clone_generic_sb(data, nfs_clone_client);
+}
+
+static struct file_system_type clone_nfs_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "nfs",
+	.get_sb		= nfs_clone_nfs_sb,
+	.kill_sb	= nfs_kill_super,
+	.fs_flags	= FS_ODD_RENAME|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
+};
+
 #ifdef CONFIG_NFS_V4
 
 static void nfs4_clear_inode(struct inode *);
@@ -2156,6 +2327,75 @@ static int param_set_idmap_timeout(const char *val, struct kernel_param *kp)
 module_param_call(idmap_cache_timeout, param_set_idmap_timeout, param_get_int,
 		 &nfs_idmap_cache_timeout, 0644);
 
+/* Constructs the SERVER-side path */
+static inline char *nfs4_path(const struct dentry *dentry, char *buffer, ssize_t buflen)
+{
+	return nfs_path(NFS_SB(dentry->d_sb)->mnt_path, dentry, buffer, buflen);
+}
+
+static inline char *nfs4_dup_path(const struct dentry *dentry)
+{
+	char *page = (char *) __get_free_page(GFP_USER);
+	char *path;
+
+	path = nfs4_path(dentry, page, PAGE_SIZE);
+	if (!IS_ERR(path)) {
+		int len = PAGE_SIZE + page - path;
+		char *tmp = path;
+
+		path = kmalloc(len, GFP_KERNEL);
+		if (path)
+			memcpy(path, tmp, len);
+		else
+			path = ERR_PTR(-ENOMEM);
+	}
+	free_page((unsigned long)page);
+	return path;
+}
+
+static struct super_block *nfs4_clone_client(struct nfs_server *server, struct nfs_clone_mount *data)
+{
+	const struct dentry *dentry = data->dentry;
+	struct nfs4_client *clp = server->nfs4_state;
+	struct super_block *sb;
+
+	server->mnt_path = nfs4_dup_path(dentry);
+	if (IS_ERR(server->mnt_path)) {
+		sb = (struct super_block *)server->mnt_path;
+		goto err;
+	}
+	sb = sget(&nfs4_fs_type, nfs4_compare_super, nfs_set_super, server);
+	if (IS_ERR(sb) || sb->s_root)
+		goto free_path;
+	nfs4_server_capabilities(server, &server->fh);
+
+	down_write(&clp->cl_sem);
+	atomic_inc(&clp->cl_count);
+	list_add_tail(&server->nfs4_siblings, &clp->cl_superblocks);
+	up_write(&clp->cl_sem);
+	return sb;
+free_path:
+	kfree(server->mnt_path);
+err:
+	server->mnt_path = NULL;
+	return sb;
+}
+
+static struct super_block *nfs_clone_nfs4_sb(struct file_system_type *fs_type,
+		int flags, const char *dev_name, void *raw_data)
+{
+	struct nfs_clone_mount *data = raw_data;
+	return nfs_clone_generic_sb(data, nfs4_clone_client);
+}
+
+static struct file_system_type clone_nfs4_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "nfs",
+	.get_sb		= nfs_clone_nfs4_sb,
+	.kill_sb	= nfs4_kill_super,
+	.fs_flags	= FS_ODD_RENAME|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
+};
+
 #define nfs4_init_once(nfsi) \
 	do { \
 		INIT_LIST_HEAD(&(nfsi)->open_states); \
@@ -2183,12 +2423,69 @@ static inline void unregister_nfs4fs(void)
 	nfs_unregister_sysctl();
 }
 #else
+#define nfs4_clone_client(a,b) ERR_PTR(-EINVAL)
 #define nfs4_init_once(nfsi) \
 	do { } while (0)
 #define register_nfs4fs() (0)
 #define unregister_nfs4fs()
 #endif
 
+static inline char *nfs_devname(const struct vfsmount *mnt_parent,
+			 const struct dentry *dentry,
+			 char *buffer, ssize_t buflen)
+{
+	return nfs_path(mnt_parent->mnt_devname, dentry, buffer, buflen);
+}
+
+/**
+ * nfs_do_submount - set up mountpoint when crossing a filesystem boundary
+ * @mnt_parent - mountpoint of parent directory
+ * @dentry - parent directory
+ * @fh - filehandle for new root dentry
+ * @fattr - attributes for new root inode
+ *
+ */
+struct vfsmount *nfs_do_submount(const struct vfsmount *mnt_parent,
+		const struct dentry *dentry, struct nfs_fh *fh,
+		struct nfs_fattr *fattr)
+{
+	struct nfs_clone_mount mountdata = {
+		.sb = mnt_parent->mnt_sb,
+		.dentry = dentry,
+		.fh = fh,
+		.fattr = fattr,
+	};
+	struct vfsmount *mnt = ERR_PTR(-ENOMEM);
+	char *page = (char *) __get_free_page(GFP_USER);
+	char *devname;
+
+	dprintk("%s: submounting on %s/%s\n", __FUNCTION__,
+			dentry->d_parent->d_name.name,
+			dentry->d_name.name);
+	if (page == NULL)
+		goto out;
+	devname = nfs_devname(mnt_parent, dentry, page, PAGE_SIZE);
+	mnt = (struct vfsmount *)devname;
+	if (IS_ERR(devname))
+		goto free_page;
+	switch (NFS_SB(mnt_parent->mnt_sb)->rpc_ops->version) {
+		case 2:
+		case 3:
+			mnt = vfs_kern_mount(&clone_nfs_fs_type, 0, devname, &mountdata);
+			break;
+		case 4:
+			mnt = vfs_kern_mount(&clone_nfs4_fs_type, 0, devname, &mountdata);
+			break;
+		default:
+			BUG();
+	}
+free_page:
+	free_page((unsigned long)page);
+out:
+	dprintk("%s: done\n", __FUNCTION__);
+	return mnt;
+}
+
 extern int nfs_init_nfspagecache(void);
 extern void nfs_destroy_nfspagecache(void);
 extern int nfs_init_readpagecache(void);
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
new file mode 100644
index 000000000000..a155505c36f1
--- /dev/null
+++ b/fs/nfs/namespace.c
@@ -0,0 +1,89 @@
+/*
+ * linux/fs/nfs/namespace.c
+ *
+ * Copyright (C) 2005 Trond Myklebust <Trond.Myklebust@netapp.com>
+ *
+ * NFS namespace
+ */
+
+#include <linux/config.h>
+
+#include <linux/dcache.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/nfs_fs.h>
+#include <linux/string.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/vfs.h>
+
+#define NFSDBG_FACILITY		NFSDBG_VFS
+
+/*
+ * nfs_follow_mountpoint - handle crossing a mountpoint on the server
+ * @dentry - dentry of mountpoint
+ * @nd - nameidata info
+ *
+ * When we encounter a mountpoint on the server, we want to set up
+ * a mountpoint on the client too, to prevent inode numbers from
+ * colliding, and to allow "df" to work properly.
+ * On NFSv4, we also want to allow for the fact that different
+ * filesystems may be migrated to different servers in a failover
+ * situation, and that different filesystems may want to use
+ * different security flavours.
+ */
+static void * nfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
+{
+	struct vfsmount *mnt;
+	struct nfs_server *server = NFS_SERVER(dentry->d_inode);
+	struct dentry *parent;
+	struct nfs_fh fh;
+	struct nfs_fattr fattr;
+	int err;
+
+	BUG_ON(IS_ROOT(dentry));
+	dprintk("%s: enter\n", __FUNCTION__);
+	dput(nd->dentry);
+	nd->dentry = dget(dentry);
+	if (d_mountpoint(nd->dentry))
+		goto out_follow;
+	/* Look it up again */
+	parent = dget_parent(nd->dentry);
+	err = server->rpc_ops->lookup(parent->d_inode, &nd->dentry->d_name, &fh, &fattr);
+	dput(parent);
+	if (err != 0)
+		goto out_err;
+
+	mnt = nfs_do_submount(nd->mnt, nd->dentry, &fh, &fattr);
+	err = PTR_ERR(mnt);
+	if (IS_ERR(mnt))
+		goto out_err;
+
+	mntget(mnt);
+	err = do_add_mount(mnt, nd, nd->mnt->mnt_flags, NULL);
+	if (err < 0) {
+		mntput(mnt);
+		if (err == -EBUSY)
+			goto out_follow;
+		goto out_err;
+	}
+	mntput(nd->mnt);
+	dput(nd->dentry);
+	nd->mnt = mnt;
+	nd->dentry = dget(mnt->mnt_root);
+out:
+	dprintk("%s: done, returned %d\n", __FUNCTION__, err);
+	return ERR_PTR(err);
+out_err:
+	path_release(nd);
+	goto out;
+out_follow:
+	while(d_mountpoint(nd->dentry) && follow_down(&nd->mnt, &nd->dentry))
+		;
+	err = 0;
+	goto out;
+}
+
+struct inode_operations nfs_mountpoint_inode_operations = {
+	.follow_link	= nfs_follow_mountpoint,
+	.getattr	= nfs_getattr,
+};
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 0f5e4e7cddec..307832fd1a49 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -217,6 +217,7 @@ extern int nfs4_proc_renew(struct nfs4_client *, struct rpc_cred *);
 extern int nfs4_do_close(struct inode *inode, struct nfs4_state *state);
 extern struct dentry *nfs4_atomic_open(struct inode *, struct dentry *, struct nameidata *);
 extern int nfs4_open_revalidate(struct inode *, struct dentry *, int, struct nameidata *);
+extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle);
 
 extern struct nfs4_state_recovery_ops nfs4_reboot_recovery_ops;
 extern struct nfs4_state_recovery_ops nfs4_network_partition_recovery_ops;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index ef4c6cccf958..308407205e6c 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1331,7 +1331,7 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f
 	return status;
 }
 
-static int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle)
+int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle)
 {
 	struct nfs4_exception exception = { };
 	int err;
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 6763a0089ee4..0ce8704732c2 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -313,6 +313,10 @@ extern void nfs_end_data_update(struct inode *);
 extern struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx);
 extern void put_nfs_open_context(struct nfs_open_context *ctx);
 extern struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_cred *cred, int mode);
+extern struct vfsmount *nfs_do_submount(const struct vfsmount *mnt_parent,
+					const struct dentry *dentry,
+					struct nfs_fh *fh,
+					struct nfs_fattr *fattr);
 
 /* linux/net/ipv4/ipconfig.c: trims ip addr off front of name, too. */
 extern u32 root_nfs_parse_addr(char *name); /*__init*/
@@ -398,6 +402,11 @@ extern void nfs_unregister_sysctl(void);
 #define nfs_unregister_sysctl() do { } while(0)
 #endif
 
+/*
+ * linux/fs/nfs/namespace.c
+ */
+extern struct inode_operations nfs_mountpoint_inode_operations;
+
 /*
  * linux/fs/nfs/unlink.c
  */
-- 
cgit v1.2.3


From 51d8fa6a109589d522c18a8e9bf3fb167a91b1bc Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 9 Jun 2006 09:34:20 -0400
Subject: NFS: Add timeout to submounts

Make automounted partitions expire using the mark_mounts_for_expiry()
function. The timeout is controlled via a sysctl.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/inode.c         |  3 +++
 fs/nfs/namespace.c     | 25 ++++++++++++++++++++++++-
 fs/nfs/sysctl.c        | 10 ++++++++++
 include/linux/nfs_fs.h |  3 +++
 4 files changed, 40 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 47167ab64f5b..3eea556d8f59 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -167,6 +167,7 @@ static void nfs_umount_begin(struct vfsmount *vfsmnt, int flags)
 	struct nfs_server *server;
 	struct rpc_clnt	*rpc;
 
+	shrink_submounts(vfsmnt, &nfs_automount_list);
 	if (!(flags & MNT_FORCE))
 		return;
 	/* -EIO all pending I/O */
@@ -1943,6 +1944,7 @@ static void nfs_kill_super(struct super_block *s)
 	nfs_free_iostats(server->io_stats);
 	kfree(server->hostname);
 	kfree(server);
+	nfs_release_automount_timer();
 }
 
 static struct file_system_type nfs_fs_type = {
@@ -2288,6 +2290,7 @@ static void nfs4_kill_super(struct super_block *sb)
 	nfs_free_iostats(server->io_stats);
 	kfree(server->hostname);
 	kfree(server);
+	nfs_release_automount_timer();
 }
 
 static struct file_system_type nfs4_fs_type = {
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index a155505c36f1..e426516c1116 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -18,6 +18,11 @@
 
 #define NFSDBG_FACILITY		NFSDBG_VFS
 
+LIST_HEAD(nfs_automount_list);
+static void nfs_expire_automounts(void *list);
+static DECLARE_WORK(nfs_automount_task, nfs_expire_automounts, &nfs_automount_list);
+int nfs_mountpoint_expiry_timeout = 500 * HZ;
+
 /*
  * nfs_follow_mountpoint - handle crossing a mountpoint on the server
  * @dentry - dentry of mountpoint
@@ -59,7 +64,7 @@ static void * nfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
 		goto out_err;
 
 	mntget(mnt);
-	err = do_add_mount(mnt, nd, nd->mnt->mnt_flags, NULL);
+	err = do_add_mount(mnt, nd, nd->mnt->mnt_flags|MNT_SHRINKABLE, &nfs_automount_list);
 	if (err < 0) {
 		mntput(mnt);
 		if (err == -EBUSY)
@@ -70,6 +75,7 @@ static void * nfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
 	dput(nd->dentry);
 	nd->mnt = mnt;
 	nd->dentry = dget(mnt->mnt_root);
+	schedule_delayed_work(&nfs_automount_task, nfs_mountpoint_expiry_timeout);
 out:
 	dprintk("%s: done, returned %d\n", __FUNCTION__, err);
 	return ERR_PTR(err);
@@ -87,3 +93,20 @@ struct inode_operations nfs_mountpoint_inode_operations = {
 	.follow_link	= nfs_follow_mountpoint,
 	.getattr	= nfs_getattr,
 };
+
+static void nfs_expire_automounts(void *data)
+{
+	struct list_head *list = (struct list_head *)data;
+
+	mark_mounts_for_expiry(list);
+	if (!list_empty(list))
+		schedule_delayed_work(&nfs_automount_task, nfs_mountpoint_expiry_timeout);
+}
+
+void nfs_release_automount_timer(void)
+{
+	if (list_empty(&nfs_automount_list)) {
+		cancel_delayed_work(&nfs_automount_task);
+		flush_scheduled_work();
+	}
+}
diff --git a/fs/nfs/sysctl.c b/fs/nfs/sysctl.c
index 4c486eb867ca..db61e51bb154 100644
--- a/fs/nfs/sysctl.c
+++ b/fs/nfs/sysctl.c
@@ -12,6 +12,7 @@
 #include <linux/module.h>
 #include <linux/nfs4.h>
 #include <linux/nfs_idmap.h>
+#include <linux/nfs_fs.h>
 
 #include "callback.h"
 
@@ -46,6 +47,15 @@ static ctl_table nfs_cb_sysctls[] = {
 		.strategy = &sysctl_jiffies,
 	},
 #endif
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "nfs_mountpoint_timeout",
+		.data		= &nfs_mountpoint_expiry_timeout,
+		.maxlen		= sizeof(nfs_mountpoint_expiry_timeout),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_jiffies,
+		.strategy	= &sysctl_jiffies,
+	},
 	{ .ctl_name = 0 }
 };
 
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 0ce8704732c2..a34b3ee443f1 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -405,7 +405,10 @@ extern void nfs_unregister_sysctl(void);
 /*
  * linux/fs/nfs/namespace.c
  */
+extern struct list_head nfs_automount_list;
 extern struct inode_operations nfs_mountpoint_inode_operations;
+extern int nfs_mountpoint_expiry_timeout;
+extern void nfs_release_automount_timer(void);
 
 /*
  * linux/fs/nfs/unlink.c
-- 
cgit v1.2.3


From 8b23ea7bedb8b45a5bb56745fa3ff11018acf04e Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 9 Jun 2006 09:34:21 -0400
Subject: RPC: Allow struc xdr_stream to read the page section of an xdr_buf

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/sunrpc/xdr.h |  1 +
 net/sunrpc/xdr.c           | 28 ++++++++++++++++++++++++++--
 2 files changed, 27 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h
index 84c35d42d250..e6d3d349506c 100644
--- a/include/linux/sunrpc/xdr.h
+++ b/include/linux/sunrpc/xdr.h
@@ -194,6 +194,7 @@ extern void xdr_write_pages(struct xdr_stream *xdr, struct page **pages,
 extern void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, uint32_t *p);
 extern uint32_t *xdr_inline_decode(struct xdr_stream *xdr, size_t nbytes);
 extern void xdr_read_pages(struct xdr_stream *xdr, unsigned int len);
+extern void xdr_enter_page(struct xdr_stream *xdr, unsigned int len);
 
 #endif /* __KERNEL__ */
 
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index ca4bfa57e116..49174f0d0a3e 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -568,8 +568,7 @@ EXPORT_SYMBOL(xdr_inline_decode);
  *
  * Moves data beyond the current pointer position from the XDR head[] buffer
  * into the page list. Any data that lies beyond current position + "len"
- * bytes is moved into the XDR tail[]. The current pointer is then
- * repositioned at the beginning of the XDR tail.
+ * bytes is moved into the XDR tail[].
  */
 void xdr_read_pages(struct xdr_stream *xdr, unsigned int len)
 {
@@ -606,6 +605,31 @@ void xdr_read_pages(struct xdr_stream *xdr, unsigned int len)
 }
 EXPORT_SYMBOL(xdr_read_pages);
 
+/**
+ * xdr_enter_page - decode data from the XDR page
+ * @xdr: pointer to xdr_stream struct
+ * @len: number of bytes of page data
+ *
+ * Moves data beyond the current pointer position from the XDR head[] buffer
+ * into the page list. Any data that lies beyond current position + "len"
+ * bytes is moved into the XDR tail[]. The current pointer is then
+ * repositioned at the beginning of the first XDR page.
+ */
+void xdr_enter_page(struct xdr_stream *xdr, unsigned int len)
+{
+	char * kaddr = page_address(xdr->buf->pages[0]);
+	xdr_read_pages(xdr, len);
+	/*
+	 * Position current pointer at beginning of tail, and
+	 * set remaining message length.
+	 */
+	if (len > PAGE_CACHE_SIZE - xdr->buf->page_base)
+		len = PAGE_CACHE_SIZE - xdr->buf->page_base;
+	xdr->p = (uint32_t *)(kaddr + xdr->buf->page_base);
+	xdr->end = (uint32_t *)((char *)xdr->p + len);
+}
+EXPORT_SYMBOL(xdr_enter_page);
+
 static struct kvec empty_iov = {.iov_base = NULL, .iov_len = 0};
 
 void
-- 
cgit v1.2.3


From 683b57b435326eb512c7305892683b6205669448 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 9 Jun 2006 09:34:22 -0400
Subject: NFSv4: Implement the fs_locations function call

NFSv4 allows for the fact that filesystems may be replicated across
several servers or that they may be migrated to a backup server in case of
failure of the primary server.
fs_locations is an NFSv4 operation for retrieving information about the
location of migrated and/or replicated filesystems.

Based on an initial implementation by Jiaying Zhang <jiayingz@citi.umich.edu>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4_fs.h        |   2 +
 fs/nfs/nfs4proc.c       |  29 +++++++++++++
 fs/nfs/nfs4xdr.c        | 112 +++++++++++++++++++++++++++++++++++++++++++++++-
 include/linux/nfs4.h    |   1 +
 include/linux/nfs_xdr.h |  24 +++++++++++
 5 files changed, 166 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 307832fd1a49..5b7651171215 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -218,6 +218,8 @@ extern int nfs4_do_close(struct inode *inode, struct nfs4_state *state);
 extern struct dentry *nfs4_atomic_open(struct inode *, struct dentry *, struct nameidata *);
 extern int nfs4_open_revalidate(struct inode *, struct dentry *, int, struct nameidata *);
 extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle);
+extern int nfs4_proc_fs_locations(struct inode *dir, struct dentry *dentry,
+		struct nfs_fs_locations *fs_locations, struct page *page);
 
 extern struct nfs4_state_recovery_ops nfs4_reboot_recovery_ops;
 extern struct nfs4_state_recovery_ops nfs4_network_partition_recovery_ops;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 308407205e6c..768514dc0c4c 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -3570,6 +3570,35 @@ ssize_t nfs4_listxattr(struct dentry *dentry, char *buf, size_t buflen)
 	return len;
 }
 
+int nfs4_proc_fs_locations(struct inode *dir, struct dentry *dentry,
+		struct nfs_fs_locations *fs_locations, struct page *page)
+{
+	struct nfs_server *server = NFS_SERVER(dir);
+	u32 bitmask[2] = {
+		[0] = server->attr_bitmask[0] | FATTR4_WORD0_FS_LOCATIONS,
+		[1] = server->attr_bitmask[1],
+	};
+	struct nfs4_fs_locations_arg args = {
+		.dir_fh = NFS_FH(dir),
+		.name = &dentry->d_name,
+		.page = page,
+		.bitmask = bitmask,
+	};
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_FS_LOCATIONS],
+		.rpc_argp = &args,
+		.rpc_resp = &fs_locations,
+	};
+	int status;
+
+	dprintk("%s: start\n", __FUNCTION__);
+	fs_locations->fattr.valid = 0;
+	fs_locations->server = server;
+	status = rpc_call_sync(server->client, &msg, 0);
+	dprintk("%s: returned status = %d\n", __FUNCTION__, status);
+	return status;
+}
+
 struct nfs4_state_recovery_ops nfs4_reboot_recovery_ops = {
 	.recover_open	= nfs4_open_reclaim,
 	.recover_lock	= nfs4_lock_reclaim,
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 0d5794675944..7add3137b6b6 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -411,6 +411,15 @@ static int nfs_stat_to_errno(int);
 #define NFS4_dec_setacl_sz	(compound_decode_hdr_maxsz + \
 				decode_putfh_maxsz + \
 				op_decode_hdr_maxsz + nfs4_fattr_bitmap_maxsz)
+#define NFS4_enc_fs_locations_sz \
+				(compound_encode_hdr_maxsz + \
+				 encode_putfh_maxsz + \
+				 encode_getattr_maxsz)
+#define NFS4_dec_fs_locations_sz \
+				(compound_decode_hdr_maxsz + \
+				 decode_putfh_maxsz + \
+				 op_decode_hdr_maxsz + \
+				 nfs4_fattr_bitmap_maxsz)
 
 static struct {
 	unsigned int	mode;
@@ -2002,6 +2011,38 @@ out:
 	return status;
 }
 
+/*
+ * Encode FS_LOCATIONS request
+ */
+static int nfs4_xdr_enc_fs_locations(struct rpc_rqst *req, uint32_t *p, struct nfs4_fs_locations_arg *args)
+{
+	struct xdr_stream xdr;
+	struct compound_hdr hdr = {
+		.nops = 3,
+	};
+	struct rpc_auth *auth = req->rq_task->tk_auth;
+	int replen;
+	int status;
+
+	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+	encode_compound_hdr(&xdr, &hdr);
+	if ((status = encode_putfh(&xdr, args->dir_fh)) != 0)
+		goto out;
+	if ((status = encode_lookup(&xdr, args->name)) != 0)
+		goto out;
+	if ((status = encode_getfattr(&xdr, args->bitmask)) != 0)
+		goto out;
+	/* set up reply
+	 *   toplevel_status + taglen + rescount + OP_PUTFH + status
+	 *   + OP_LOOKUP + status + OP_GETATTR + status = 7
+	 */
+	replen = (RPC_REPHDRSIZE + auth->au_rslack + 7) << 2;
+	xdr_inline_pages(&req->rq_rcv_buf, replen, &args->page,
+			0, PAGE_SIZE);
+out:
+	return status;
+}
+
 /*
  * START OF "GENERIC" DECODE ROUTINES.
  *   These may look a little ugly since they are imported from a "generic"
@@ -2036,7 +2077,7 @@ out:
 	} \
 } while (0)
 
-static int decode_opaque_inline(struct xdr_stream *xdr, uint32_t *len, char **string)
+static int decode_opaque_inline(struct xdr_stream *xdr, unsigned int *len, char **string)
 {
 	uint32_t *p;
 
@@ -2087,7 +2128,7 @@ static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected)
 static int decode_ace(struct xdr_stream *xdr, void *ace, struct nfs4_client *clp)
 {
 	uint32_t *p;
-	uint32_t strlen;
+	unsigned int strlen;
 	char *str;
 
 	READ_BUF(12);
@@ -2336,6 +2377,45 @@ static int decode_attr_files_total(struct xdr_stream *xdr, uint32_t *bitmap, uin
 	return status;
 }
 
+static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fs_locations *res)
+{
+	int n;
+	uint32_t *p;
+	int status = -EIO;
+
+	if (unlikely(bitmap[0] & (FATTR4_WORD0_FS_LOCATIONS -1U)))
+		goto out;
+	status = 0;
+	if (unlikely(!(bitmap[0] & FATTR4_WORD0_FS_LOCATIONS)))
+		goto out;
+	status = decode_opaque_inline(xdr, &res->fs_pathlen, &res->fs_path);
+	if (unlikely(status != 0))
+		goto out;
+	READ_BUF(4);
+	READ32(n);
+	if (n <= 0)
+		goto out_eio;
+	res->nlocations = 0;
+	while (res->nlocations < n) {
+		struct nfs_fs_location *loc = &res->locations[res->nlocations];
+
+		status = decode_opaque_inline(xdr, &loc->serverlen, &loc->server);
+		if (unlikely(status != 0))
+			goto out_eio;
+		status = decode_opaque_inline(xdr, &loc->rootpathlen, &loc->rootpath);
+		if (unlikely(status != 0))
+			goto out_eio;
+		if (res->nlocations < NFS_FS_LOCATIONS_MAXENTRIES)
+			res->nlocations++;
+	}
+out:
+	dprintk("%s: fs_locations done, error = %d\n", __FUNCTION__, status);
+	return status;
+out_eio:
+	status = -EIO;
+	goto out;
+}
+
 static int decode_attr_maxfilesize(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
 {
 	uint32_t *p;
@@ -2867,6 +2947,10 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, cons
 		goto xdr_error;
 	if ((status = decode_attr_fileid(xdr, bitmap, &fattr->fileid)) != 0)
 		goto xdr_error;
+	if ((status = decode_attr_fs_locations(xdr, bitmap, container_of(fattr,
+						struct nfs_fs_locations,
+						fattr))) != 0)
+		goto xdr_error;
 	if ((status = decode_attr_mode(xdr, bitmap, &fattr->mode)) != 0)
 		goto xdr_error;
 	fattr->mode |= fmode;
@@ -4210,6 +4294,29 @@ out:
 	return status;
 }
 
+/*
+ * FS_LOCATIONS request
+ */
+static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req, uint32_t *p, struct nfs_fs_locations *res)
+{
+	struct xdr_stream xdr;
+	struct compound_hdr hdr;
+	int status;
+
+	xdr_init_decode(&xdr, &req->rq_rcv_buf, p);
+	status = decode_compound_hdr(&xdr, &hdr);
+	if (status != 0)
+		goto out;
+	if ((status = decode_putfh(&xdr)) != 0)
+		goto out;
+	if ((status = decode_lookup(&xdr)) != 0)
+		goto out;
+	xdr_enter_page(&xdr, PAGE_SIZE);
+	status = decode_getfattr(&xdr, &res->fattr, res->server);
+out:
+	return status;
+}
+
 uint32_t *nfs4_decode_dirent(uint32_t *p, struct nfs_entry *entry, int plus)
 {
 	uint32_t bitmap[2] = {0};
@@ -4381,6 +4488,7 @@ struct rpc_procinfo	nfs4_procedures[] = {
   PROC(DELEGRETURN,	enc_delegreturn, dec_delegreturn),
   PROC(GETACL,		enc_getacl,	dec_getacl),
   PROC(SETACL,		enc_setacl,	dec_setacl),
+  PROC(FS_LOCATIONS,	enc_fs_locations, dec_fs_locations),
 };
 
 struct rpc_version		nfs_version4 = {
diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h
index 0c1c306cdaec..1477fc857f6b 100644
--- a/include/linux/nfs4.h
+++ b/include/linux/nfs4.h
@@ -384,6 +384,7 @@ enum {
 	NFSPROC4_CLNT_DELEGRETURN,
 	NFSPROC4_CLNT_GETACL,
 	NFSPROC4_CLNT_SETACL,
+	NFSPROC4_CLNT_FS_LOCATIONS,
 };
 
 #endif
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 95682f7d738a..15a20b815302 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -679,6 +679,30 @@ struct nfs4_server_caps_res {
 	u32				has_symlinks;
 };
 
+struct nfs_fs_location {
+	unsigned int serverlen;
+	char * server;
+	unsigned int rootpathlen;
+	char * rootpath;
+};
+
+#define NFS_FS_LOCATIONS_MAXENTRIES 10
+struct nfs_fs_locations {
+	struct nfs_fattr fattr;
+	const struct nfs_server *server;
+	unsigned int fs_pathlen;
+	char * fs_path;
+	int nlocations;
+	struct nfs_fs_location locations[NFS_FS_LOCATIONS_MAXENTRIES];
+};
+
+struct nfs4_fs_locations_arg {
+	const struct nfs_fh *dir_fh;
+	const struct qstr *name;
+	struct page *page;
+	const u32 *bitmask;
+};
+
 #endif /* CONFIG_NFS_V4 */
 
 struct nfs_page;
-- 
cgit v1.2.3


From 7aaa0b3bd4d215d9ce4d62b6c2043a63ba650f93 Mon Sep 17 00:00:00 2001
From: Manoj Naik <manoj@almaden.ibm.com>
Date: Fri, 9 Jun 2006 09:34:23 -0400
Subject: NFSv4: convert fs-locations-components to conform to RFC3530

Use component4-style formats for decoding list of servers and pathnames in
fs_locations.

Signed-off-by: Manoj Naik <manoj@almaden.ibm.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4_fs.h        |  2 +-
 fs/nfs/nfs4proc.c       |  4 +--
 fs/nfs/nfs4xdr.c        | 80 +++++++++++++++++++++++++++++++++++++++++++------
 include/linux/nfs_xdr.h | 30 ++++++++++++-------
 4 files changed, 94 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 5b7651171215..22a5f838ea58 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -219,7 +219,7 @@ extern struct dentry *nfs4_atomic_open(struct inode *, struct dentry *, struct n
 extern int nfs4_open_revalidate(struct inode *, struct dentry *, int, struct nameidata *);
 extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle);
 extern int nfs4_proc_fs_locations(struct inode *dir, struct dentry *dentry,
-		struct nfs_fs_locations *fs_locations, struct page *page);
+		struct nfs4_fs_locations *fs_locations, struct page *page);
 
 extern struct nfs4_state_recovery_ops nfs4_reboot_recovery_ops;
 extern struct nfs4_state_recovery_ops nfs4_network_partition_recovery_ops;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 768514dc0c4c..043223a0eda6 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -3571,7 +3571,7 @@ ssize_t nfs4_listxattr(struct dentry *dentry, char *buf, size_t buflen)
 }
 
 int nfs4_proc_fs_locations(struct inode *dir, struct dentry *dentry,
-		struct nfs_fs_locations *fs_locations, struct page *page)
+		struct nfs4_fs_locations *fs_locations, struct page *page)
 {
 	struct nfs_server *server = NFS_SERVER(dir);
 	u32 bitmask[2] = {
@@ -3587,7 +3587,7 @@ int nfs4_proc_fs_locations(struct inode *dir, struct dentry *dentry,
 	struct rpc_message msg = {
 		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_FS_LOCATIONS],
 		.rpc_argp = &args,
-		.rpc_resp = &fs_locations,
+		.rpc_resp = fs_locations,
 	};
 	int status;
 
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 7add3137b6b6..f6a1ea7df374 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -2377,7 +2377,43 @@ static int decode_attr_files_total(struct xdr_stream *xdr, uint32_t *bitmap, uin
 	return status;
 }
 
-static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fs_locations *res)
+static int decode_pathname(struct xdr_stream *xdr, struct nfs4_pathname *path)
+{
+	int n;
+	uint32_t *p;
+	int status = 0;
+
+	READ_BUF(4);
+	READ32(n);
+	if (n <= 0)
+		goto out_eio;
+	dprintk("path ");
+	path->ncomponents = 0;
+	while (path->ncomponents < n) {
+		struct nfs4_string *component = &path->components[path->ncomponents];
+		status = decode_opaque_inline(xdr, &component->len, &component->data);
+		if (unlikely(status != 0))
+			goto out_eio;
+		if (path->ncomponents != n)
+			dprintk("/");
+		dprintk("%s", component->data);
+		if (path->ncomponents < NFS4_PATHNAME_MAXCOMPONENTS)
+			path->ncomponents++;
+		else {
+			dprintk("cannot parse %d components in path\n", n);
+			goto out_eio;
+		}
+	}
+out:
+	dprintk("\n");
+	return status;
+out_eio:
+	dprintk(" status %d", status);
+	status = -EIO;
+	goto out;
+}
+
+static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs4_fs_locations *res)
 {
 	int n;
 	uint32_t *p;
@@ -2388,7 +2424,8 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st
 	status = 0;
 	if (unlikely(!(bitmap[0] & FATTR4_WORD0_FS_LOCATIONS)))
 		goto out;
-	status = decode_opaque_inline(xdr, &res->fs_pathlen, &res->fs_path);
+	dprintk("%s: fsroot ", __FUNCTION__);
+	status = decode_pathname(xdr, &res->fs_path);
 	if (unlikely(status != 0))
 		goto out;
 	READ_BUF(4);
@@ -2397,15 +2434,40 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st
 		goto out_eio;
 	res->nlocations = 0;
 	while (res->nlocations < n) {
-		struct nfs_fs_location *loc = &res->locations[res->nlocations];
+		int m;
+		struct nfs4_fs_location *loc = &res->locations[res->nlocations];
 
-		status = decode_opaque_inline(xdr, &loc->serverlen, &loc->server);
-		if (unlikely(status != 0))
+		READ_BUF(4);
+		READ32(m);
+		if (m <= 0)
 			goto out_eio;
-		status = decode_opaque_inline(xdr, &loc->rootpathlen, &loc->rootpath);
+
+		loc->nservers = 0;
+		dprintk("%s: servers ", __FUNCTION__);
+		while (loc->nservers < m) {
+			struct nfs4_string *server = &loc->servers[loc->nservers];
+			status = decode_opaque_inline(xdr, &server->len, &server->data);
+			if (unlikely(status != 0))
+				goto out_eio;
+			dprintk("%s ", server->data);
+			if (loc->nservers < NFS4_FS_LOCATION_MAXSERVERS)
+				loc->nservers++;
+			else {
+				int i;
+				dprintk("%s: using first %d of %d servers returned for location %d\n", __FUNCTION__, NFS4_FS_LOCATION_MAXSERVERS, m, res->nlocations);
+				for (i = loc->nservers; i < m; i++) {
+					int len;
+					char *data;
+					status = decode_opaque_inline(xdr, &len, &data);
+					if (unlikely(status != 0))
+						goto out_eio;
+				}
+			}
+		}
+		status = decode_pathname(xdr, &loc->rootpath);
 		if (unlikely(status != 0))
 			goto out_eio;
-		if (res->nlocations < NFS_FS_LOCATIONS_MAXENTRIES)
+		if (res->nlocations < NFS4_FS_LOCATIONS_MAXENTRIES)
 			res->nlocations++;
 	}
 out:
@@ -2948,7 +3010,7 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, cons
 	if ((status = decode_attr_fileid(xdr, bitmap, &fattr->fileid)) != 0)
 		goto xdr_error;
 	if ((status = decode_attr_fs_locations(xdr, bitmap, container_of(fattr,
-						struct nfs_fs_locations,
+						struct nfs4_fs_locations,
 						fattr))) != 0)
 		goto xdr_error;
 	if ((status = decode_attr_mode(xdr, bitmap, &fattr->mode)) != 0)
@@ -4297,7 +4359,7 @@ out:
 /*
  * FS_LOCATIONS request
  */
-static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req, uint32_t *p, struct nfs_fs_locations *res)
+static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req, uint32_t *p, struct nfs4_fs_locations *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 15a20b815302..d6eea8348728 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -679,21 +679,31 @@ struct nfs4_server_caps_res {
 	u32				has_symlinks;
 };
 
-struct nfs_fs_location {
-	unsigned int serverlen;
-	char * server;
-	unsigned int rootpathlen;
-	char * rootpath;
+struct nfs4_string {
+	unsigned int len;
+	char *data;
 };
 
-#define NFS_FS_LOCATIONS_MAXENTRIES 10
-struct nfs_fs_locations {
+#define NFS4_PATHNAME_MAXCOMPONENTS 512
+struct nfs4_pathname {
+	unsigned int ncomponents;
+	struct nfs4_string components[NFS4_PATHNAME_MAXCOMPONENTS];
+};
+
+#define NFS4_FS_LOCATION_MAXSERVERS 10
+struct nfs4_fs_location {
+	unsigned int nservers;
+	struct nfs4_string servers[NFS4_FS_LOCATION_MAXSERVERS];
+	struct nfs4_pathname rootpath;
+};
+
+#define NFS4_FS_LOCATIONS_MAXENTRIES 10
+struct nfs4_fs_locations {
 	struct nfs_fattr fattr;
 	const struct nfs_server *server;
-	unsigned int fs_pathlen;
-	char * fs_path;
+	struct nfs4_pathname fs_path;
 	int nlocations;
-	struct nfs_fs_location locations[NFS_FS_LOCATIONS_MAXENTRIES];
+	struct nfs4_fs_location locations[NFS4_FS_LOCATIONS_MAXENTRIES];
 };
 
 struct nfs4_fs_locations_arg {
-- 
cgit v1.2.3


From 9cdb3883c38f883436a84c2353a4cf964ff890a2 Mon Sep 17 00:00:00 2001
From: Manoj Naik <manoj@almaden.ibm.com>
Date: Fri, 9 Jun 2006 09:34:28 -0400
Subject: NFSv4: Ensure client submounts when following a referral

Set up mountpoint when hitting a referral on moved error by getting
fs_locations.

Signed-off-by: Manoj Naik <manoj@almaden.ibm.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/inode.c         | 270 ++++++++++++++++++++++++++++++++++++++++++++++++-
 include/linux/nfs_fs.h |   2 +
 2 files changed, 269 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index ebdab885c475..0d8302e59d69 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -36,6 +36,8 @@
 #include <linux/mount.h>
 #include <linux/nfs_idmap.h>
 #include <linux/vfs.h>
+#include <linux/inet.h>
+#include <linux/nfs_xdr.h>
 
 #include <asm/system.h>
 #include <asm/uaccess.h>
@@ -1714,6 +1716,10 @@ struct nfs_clone_mount {
 	const struct dentry *dentry;
 	struct nfs_fh *fh;
 	struct nfs_fattr *fattr;
+	char *hostname;
+	char *mnt_path;
+	struct sockaddr_in *addr;
+	rpc_authflavor_t authflavor;
 };
 
 static struct super_block *nfs_clone_generic_sb(struct nfs_clone_mount *data,
@@ -1724,17 +1730,19 @@ static struct super_block *nfs_clone_generic_sb(struct nfs_clone_mount *data,
 	struct nfs_server *parent = NFS_SB(data->sb);
 	struct super_block *sb = ERR_PTR(-EINVAL);
 	void *err = ERR_PTR(-ENOMEM);
+	char *hostname;
 	int len;
 
 	server = kmalloc(sizeof(struct nfs_server), GFP_KERNEL);
 	if (server == NULL)
 		goto out_err;
 	memcpy(server, parent, sizeof(*server));
-	len = strlen(parent->hostname) + 1;
+	hostname = (data->hostname != NULL) ? data->hostname : parent->hostname;
+	len = strlen(hostname) + 1;
 	server->hostname = kmalloc(len, GFP_KERNEL);
 	if (server->hostname == NULL)
 		goto free_server;
-	memcpy(server->hostname, parent->hostname, len);
+	memcpy(server->hostname, hostname, len);
 	if (rpciod_up() != 0)
 		goto free_hostname;
 
@@ -2458,7 +2466,8 @@ static inline void unregister_nfs4fs(void)
 	nfs_unregister_sysctl();
 }
 #else
-#define nfs4_clone_client(a,b) ERR_PTR(-EINVAL)
+#define nfs4_fill_sb(a,b)	ERR_PTR(-EINVAL)
+#define nfs4_fill_super(a,b)	ERR_PTR(-EINVAL)
 #define nfs4_init_once(nfsi) \
 	do { } while (0)
 #define register_nfs4fs() (0)
@@ -2521,6 +2530,261 @@ out:
 	return mnt;
 }
 
+/* Check if fs_root is valid */
+static inline char *nfs4_pathname_string(struct nfs4_pathname *pathname, char *buffer, ssize_t buflen)
+{
+	char *end = buffer + buflen;
+	int n;
+
+	*--end = '\0';
+	buflen--;
+
+	n = pathname->ncomponents;
+	while (--n >= 0) {
+		struct nfs4_string *component = &pathname->components[n];
+		buflen -= component->len + 1;
+		if (buflen < 0)
+			goto Elong;
+		end -= component->len;
+		memcpy(end, component->data, component->len);
+		*--end = '/';
+	}
+	return end;
+Elong:
+	return ERR_PTR(-ENAMETOOLONG);
+}
+
+/* Check if the string represents a "valid" IPv4 address */
+static inline int valid_ipaddr4(const char *buf)
+{
+	int rc, count, in[4];
+
+	rc = sscanf(buf, "%d.%d.%d.%d", &in[0], &in[1], &in[2], &in[3]);
+	if (rc != 4)
+		return -EINVAL;
+	for (count = 0; count < 4; count++) {
+		if (in[count] > 255)
+			return -EINVAL;
+	}
+	return 0;
+}
+
+static struct super_block *nfs4_referral_sb(struct nfs_server *server, struct nfs_clone_mount *data)
+{
+	struct super_block *sb = ERR_PTR(-ENOMEM);
+	int len;
+
+	len = strlen(data->mnt_path) + 1;
+	server->mnt_path = kmalloc(len, GFP_KERNEL);
+	if (server->mnt_path == NULL)
+		goto err;
+	memcpy(server->mnt_path, data->mnt_path, len);
+	memcpy(&server->addr, data->addr, sizeof(struct sockaddr_in));
+
+	sb = sget(&nfs4_fs_type, nfs4_compare_super, nfs_set_super, server);
+	if (IS_ERR(sb) || sb->s_root)
+		goto free_path;
+	return sb;
+free_path:
+	kfree(server->mnt_path);
+err:
+	server->mnt_path = NULL;
+	return sb;
+}
+
+static struct nfs_server *nfs4_referral_server(struct super_block *sb, struct nfs_clone_mount *data)
+{
+	struct nfs_server *server = NFS_SB(sb);
+	struct rpc_timeout timeparms;
+	int proto, timeo, retrans;
+	void *err;
+
+	proto = IPPROTO_TCP;
+	/* Since we are following a referral and there may be alternatives,
+	   set the timeouts and retries to low values */
+	timeo = 2;
+	retrans = 1;
+	nfs_init_timeout_values(&timeparms, proto, timeo, retrans);
+
+	server->client = nfs4_create_client(server, &timeparms, proto, data->authflavor);
+	if (IS_ERR((err = server->client)))
+		goto out_err;
+
+	sb->s_time_gran = 1;
+	sb->s_op = &nfs4_sops;
+	err = ERR_PTR(nfs_sb_init(sb, data->authflavor));
+	if (!IS_ERR(err))
+		return server;
+out_err:
+	return (struct nfs_server *)err;
+}
+
+static struct super_block *nfs_referral_nfs4_sb(struct file_system_type *fs_type,
+		int flags, const char *dev_name, void *raw_data)
+{
+	struct nfs_clone_mount *data = raw_data;
+	return nfs_clone_generic_sb(data, nfs4_referral_sb, nfs4_referral_server);
+}
+
+static struct file_system_type nfs_referral_nfs4_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "nfs4",
+	.get_sb		= nfs_referral_nfs4_sb,
+	.kill_sb	= nfs4_kill_super,
+	.fs_flags	= FS_ODD_RENAME|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
+};
+
+/**
+ * nfs_follow_referral - set up mountpoint when hitting a referral on moved error
+ * @mnt_parent - mountpoint of parent directory
+ * @dentry - parent directory
+ * @fspath - fs path returned in fs_locations
+ * @mntpath - mount path to new server
+ * @hostname - hostname of new server
+ * @addr - host addr of new server
+ *
+ */
+struct vfsmount *nfs_follow_referral(const struct vfsmount *mnt_parent,
+	     const struct dentry *dentry, struct nfs4_fs_locations *locations)
+{
+	struct vfsmount *mnt = ERR_PTR(-ENOENT);
+	struct nfs_clone_mount mountdata = {
+		.sb = mnt_parent->mnt_sb,
+		.dentry = dentry,
+		.authflavor = NFS_SB(mnt_parent->mnt_sb)->client->cl_auth->au_flavor,
+	};
+	char *page, *page2;
+	char *path, *fs_path;
+	char *devname;
+	int loc, s;
+
+	if (locations == NULL || locations->nlocations <= 0)
+		goto out;
+
+	dprintk("%s: referral at %s/%s\n", __FUNCTION__,
+		dentry->d_parent->d_name.name, dentry->d_name.name);
+
+	/* Ensure fs path is a prefix of current dentry path */
+	page = (char *) __get_free_page(GFP_USER);
+	if (page == NULL)
+		goto out;
+	page2 = (char *) __get_free_page(GFP_USER);
+	if (page2 == NULL)
+		goto out;
+
+	path = nfs4_path(dentry, page, PAGE_SIZE);
+	if (IS_ERR(path))
+		goto out_free;
+
+	fs_path = nfs4_pathname_string(&locations->fs_path, page2, PAGE_SIZE);
+	if (IS_ERR(fs_path))
+		goto out_free;
+
+	if (strncmp(path, fs_path, strlen(fs_path)) != 0) {
+		dprintk("%s: path %s does not begin with fsroot %s\n", __FUNCTION__, path, fs_path);
+		goto out_free;
+	}
+
+	devname = nfs_devname(mnt_parent, dentry, page, PAGE_SIZE);
+	if (IS_ERR(devname)) {
+		mnt = (struct vfsmount *)devname;
+		goto out_free;
+	}
+
+	loc = 0;
+	while (loc < locations->nlocations && IS_ERR(mnt)) {
+		struct nfs4_fs_location *location = &locations->locations[loc];
+		char *mnt_path;
+
+		if (location == NULL || location->nservers <= 0 ||
+		    location->rootpath.ncomponents == 0) {
+			loc++;
+			continue;
+		}
+
+		mnt_path = nfs4_pathname_string(&location->rootpath, page2, PAGE_SIZE);
+		if (IS_ERR(mnt_path)) {
+			loc++;
+			continue;
+		}
+		mountdata.mnt_path = mnt_path;
+
+		s = 0;
+		while (s < location->nservers) {
+			struct sockaddr_in addr = {};
+
+			if (location->servers[s].len <= 0 ||
+			    valid_ipaddr4(location->servers[s].data) < 0) {
+				s++;
+				continue;
+			}
+
+			mountdata.hostname = location->servers[s].data;
+			addr.sin_addr.s_addr = in_aton(mountdata.hostname);
+			addr.sin_family = AF_INET;
+			addr.sin_port = htons(NFS_PORT);
+			mountdata.addr = &addr;
+
+			mnt = vfs_kern_mount(&nfs_referral_nfs4_fs_type, 0, devname, &mountdata);
+			if (!IS_ERR(mnt)) {
+				break;
+			}
+			s++;
+		}
+		loc++;
+	}
+
+out_free:
+	free_page((unsigned long)page);
+	free_page((unsigned long)page2);
+out:
+	dprintk("%s: done\n", __FUNCTION__);
+	return mnt;
+}
+
+/*
+ * nfs_do_refmount - handle crossing a referral on server
+ * @dentry - dentry of referral
+ * @nd - nameidata info
+ *
+ */
+struct vfsmount *nfs_do_refmount(const struct vfsmount *mnt_parent, struct dentry *dentry)
+{
+	struct vfsmount *mnt = ERR_PTR(-ENOENT);
+	struct dentry *parent;
+	struct nfs4_fs_locations *fs_locations = NULL;
+	struct page *page;
+	int err;
+
+	/* BUG_ON(IS_ROOT(dentry)); */
+	dprintk("%s: enter\n", __FUNCTION__);
+
+	page = alloc_page(GFP_KERNEL);
+	if (page == NULL)
+		goto out;
+
+	fs_locations = kmalloc(sizeof(struct nfs4_fs_locations), GFP_KERNEL);
+	if (fs_locations == NULL)
+		goto out_free;
+
+	/* Get locations */
+	parent = dget_parent(dentry);
+	dprintk("%s: getting locations for %s/%s\n", __FUNCTION__, parent->d_name.name, dentry->d_name.name);
+	err = nfs4_proc_fs_locations(parent->d_inode, dentry, fs_locations, page);
+	dput(parent);
+	if (err != 0 || fs_locations->nlocations <= 0 ||
+	    fs_locations->fs_path.ncomponents <= 0)
+		goto out_free;
+
+	mnt = nfs_follow_referral(mnt_parent, dentry, fs_locations);
+out_free:
+	__free_page(page);
+	kfree(fs_locations);
+out:
+	dprintk("%s: done\n", __FUNCTION__);
+	return mnt;
+}
+
 extern int nfs_init_nfspagecache(void);
 extern void nfs_destroy_nfspagecache(void);
 extern int nfs_init_readpagecache(void);
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index a34b3ee443f1..09271b10f9a8 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -317,6 +317,8 @@ extern struct vfsmount *nfs_do_submount(const struct vfsmount *mnt_parent,
 					const struct dentry *dentry,
 					struct nfs_fh *fh,
 					struct nfs_fattr *fattr);
+extern struct vfsmount *nfs_do_refmount(const struct vfsmount *mnt_parent,
+					struct dentry *dentry);
 
 /* linux/net/ipv4/ipconfig.c: trims ip addr off front of name, too. */
 extern u32 root_nfs_parse_addr(char *name); /*__init*/
-- 
cgit v1.2.3


From 6b97fd3da1eab2cc490cfe884c7d4956522eaf8b Mon Sep 17 00:00:00 2001
From: Manoj Naik <manoj@almaden.ibm.com>
Date: Fri, 9 Jun 2006 09:34:29 -0400
Subject: NFSv4: Follow a referral

Respond to a moved error on NFS lookup by setting up the referral.
Note: We don't actually follow the referral during lookup/getattr, but
later when we detect fsid mismatch in inode revalidation (similar to the
processing done for cloning submounts). Referrals will have fake attributes
until they are actually followed or traversed.

Signed-off-by: Manoj Naik <manoj@almaden.ibm.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/inode.c          |  5 ++++-
 fs/nfs/namespace.c      |  9 ++++++++-
 fs/nfs/nfs4proc.c       | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/nfs_fs.h  |  1 +
 include/linux/nfs_xdr.h |  1 +
 5 files changed, 60 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 0d8302e59d69..ee13cb01b56e 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -888,7 +888,10 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
 				set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_FLAGS(inode));
 			/* Deal with crossing mountpoints */
 			if (!nfs_fsid_equal(&NFS_SB(sb)->fsid, &fattr->fsid)) {
-				inode->i_op = &nfs_mountpoint_inode_operations;
+				if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL)
+					inode->i_op = &nfs_referral_inode_operations;
+				else
+					inode->i_op = &nfs_mountpoint_inode_operations;
 				inode->i_fop = NULL;
 			}
 		} else if (S_ISLNK(inode->i_mode))
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index e426516c1116..8ca44b7b25c3 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -58,7 +58,10 @@ static void * nfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
 	if (err != 0)
 		goto out_err;
 
-	mnt = nfs_do_submount(nd->mnt, nd->dentry, &fh, &fattr);
+	if (fattr.valid & NFS_ATTR_FATTR_V4_REFERRAL)
+		mnt = nfs_do_refmount(nd->mnt, nd->dentry);
+	else
+		mnt = nfs_do_submount(nd->mnt, nd->dentry, &fh, &fattr);
 	err = PTR_ERR(mnt);
 	if (IS_ERR(mnt))
 		goto out_err;
@@ -94,6 +97,10 @@ struct inode_operations nfs_mountpoint_inode_operations = {
 	.getattr	= nfs_getattr,
 };
 
+struct inode_operations nfs_referral_inode_operations = {
+	.follow_link	= nfs_follow_mountpoint,
+};
+
 static void nfs_expire_automounts(void *data)
 {
 	struct list_head *list = (struct list_head *)data;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 90ee21a07b3e..3300e35d74ad 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1462,6 +1462,50 @@ out:
 	return nfs4_map_errors(status);
 }
 
+/*
+ * Get locations and (maybe) other attributes of a referral.
+ * Note that we'll actually follow the referral later when
+ * we detect fsid mismatch in inode revalidation
+ */
+static int nfs4_get_referral(struct inode *dir, struct qstr *name, struct nfs_fattr *fattr, struct nfs_fh *fhandle)
+{
+	int status = -ENOMEM;
+	struct page *page = NULL;
+	struct nfs4_fs_locations *locations = NULL;
+	struct dentry dentry = {};
+
+	page = alloc_page(GFP_KERNEL);
+	if (page == NULL)
+		goto out;
+	locations = kmalloc(sizeof(struct nfs4_fs_locations), GFP_KERNEL);
+	if (locations == NULL)
+		goto out;
+
+	dentry.d_name.name = name->name;
+	dentry.d_name.len = name->len;
+	status = nfs4_proc_fs_locations(dir, &dentry, locations, page);
+	if (status != 0)
+		goto out;
+	/* Make sure server returned a different fsid for the referral */
+	if (nfs_fsid_equal(&NFS_SERVER(dir)->fsid, &locations->fattr.fsid)) {
+		dprintk("%s: server did not return a different fsid for a referral at %s\n", __FUNCTION__, name->name);
+		status = -EIO;
+		goto out;
+	}
+
+	memcpy(fattr, &locations->fattr, sizeof(struct nfs_fattr));
+	fattr->valid |= NFS_ATTR_FATTR_V4_REFERRAL;
+	if (!fattr->mode)
+		fattr->mode = S_IFDIR;
+	memset(fhandle, 0, sizeof(struct nfs_fh));
+out:
+	if (page)
+		__free_page(page);
+	if (locations)
+		kfree(locations);
+	return status;
+}
+
 static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr)
 {
 	struct nfs4_getattr_arg args = {
@@ -1566,6 +1610,8 @@ static int _nfs4_proc_lookup(struct inode *dir, struct qstr *name,
 	
 	dprintk("NFS call  lookup %s\n", name->name);
 	status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
+	if (status == -NFS4ERR_MOVED)
+		status = nfs4_get_referral(dir, name, fattr, fhandle);
 	dprintk("NFS reply lookup: %d\n", status);
 	return status;
 }
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 09271b10f9a8..152798949113 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -409,6 +409,7 @@ extern void nfs_unregister_sysctl(void);
  */
 extern struct list_head nfs_automount_list;
 extern struct inode_operations nfs_mountpoint_inode_operations;
+extern struct inode_operations nfs_referral_inode_operations;
 extern int nfs_mountpoint_expiry_timeout;
 extern void nfs_release_automount_timer(void);
 
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index d6eea8348728..7c7320fa51aa 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -63,6 +63,7 @@ struct nfs_fattr {
 #define NFS_ATTR_FATTR		0x0002		/* post-op attributes */
 #define NFS_ATTR_FATTR_V3	0x0004		/* NFSv3 attributes */
 #define NFS_ATTR_FATTR_V4	0x0008		/* NFSv4 change attribute */
+#define NFS_ATTR_FATTR_V4_REFERRAL	0x0010		/* NFSv4 referral */
 
 /*
  * Info on the file system
-- 
cgit v1.2.3


From 3134cbec5e172c3a86e2c3ef4af34b6cfd380bfa Mon Sep 17 00:00:00 2001
From: Marc Eshel <eshel@almaden.ibm.com>
Date: Fri, 9 Jun 2006 09:40:20 -0400
Subject: locks.c: add the fl_owner to nlm_compare_locks

Add the fl_owner to NLM compare locks. Since two different client can
present the same pid to the server it is not enough to distinguish locks
from different clients. The fl_owner field is a pointer to the struct
nlm_host which is unique for each client.

Signed-off-by: Marc Eshel <eshel@almaden.ibm.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/lockd/lockd.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h
index 995f89dc8c04..112936fcda80 100644
--- a/include/linux/lockd/lockd.h
+++ b/include/linux/lockd/lockd.h
@@ -220,6 +220,7 @@ static __inline__ int
 nlm_compare_locks(const struct file_lock *fl1, const struct file_lock *fl2)
 {
 	return	fl1->fl_pid   == fl2->fl_pid
+	     && fl1->fl_owner == fl2->fl_owner
 	     && fl1->fl_start == fl2->fl_start
 	     && fl1->fl_end   == fl2->fl_end
 	     &&(fl1->fl_type  == fl2->fl_type || fl2->fl_type == F_UNLCK);
-- 
cgit v1.2.3


From 5046791417dcac1ba126b77b8062af15a2f0b8e1 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 9 Jun 2006 09:40:24 -0400
Subject: NLM: sem to mutex conversion

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/lockd/host.c             | 8 ++++----
 include/linux/lockd/lockd.h | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index 729ac427d359..5242743c9403 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -112,7 +112,7 @@ nlm_lookup_host(int server, struct sockaddr_in *sin,
 	host->h_version    = version;
 	host->h_proto      = proto;
 	host->h_rpcclnt    = NULL;
-	init_MUTEX(&host->h_sema);
+	mutex_init(&host->h_mutex);
 	host->h_nextrebind = jiffies + NLM_HOST_REBIND;
 	host->h_expires    = jiffies + NLM_HOST_EXPIRE;
 	atomic_set(&host->h_count, 1);
@@ -172,7 +172,7 @@ nlm_bind_host(struct nlm_host *host)
 			(unsigned)ntohl(host->h_addr.sin_addr.s_addr));
 
 	/* Lock host handle */
-	down(&host->h_sema);
+	mutex_lock(&host->h_mutex);
 
 	/* If we've already created an RPC client, check whether
 	 * RPC rebind is required
@@ -204,12 +204,12 @@ nlm_bind_host(struct nlm_host *host)
 		host->h_rpcclnt = clnt;
 	}
 
-	up(&host->h_sema);
+	mutex_unlock(&host->h_mutex);
 	return clnt;
 
 forgetit:
 	printk("lockd: couldn't create RPC handle for %s\n", host->h_name);
-	up(&host->h_sema);
+	mutex_unlock(&host->h_mutex);
 	return NULL;
 }
 
diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h
index 112936fcda80..a6c1a33e5ae3 100644
--- a/include/linux/lockd/lockd.h
+++ b/include/linux/lockd/lockd.h
@@ -54,7 +54,7 @@ struct nlm_host {
 	u32			h_nsmstate;	/* true remote NSM state */
 	u32			h_pidcount;	/* Pseudopids */
 	atomic_t		h_count;	/* reference count */
-	struct semaphore	h_sema;		/* mutex for pmap binding */
+	struct mutex		h_mutex;	/* mutex for pmap binding */
 	unsigned long		h_nextrebind;	/* next portmap call */
 	unsigned long		h_expires;	/* eligible for GC */
 	struct list_head	h_lockowners;	/* Lockowners for the client */
-- 
cgit v1.2.3


From 28df955a2ad484d602314b30183ea8496a9aa34a Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 9 Jun 2006 09:40:27 -0400
Subject: NLM: Fix reclaim races

Currently it is possible for a task to remove its locks at the same time as
the NLM recovery thread is trying to recover them. This quickly leads to an
Oops.
Protect the locks using an rw semaphore while they are being recovered.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/lockd/clntlock.c         | 39 +++++++++++++++++++++++++--------------
 fs/lockd/clntproc.c         | 14 +++++++++++++-
 fs/lockd/host.c             |  1 +
 include/linux/lockd/lockd.h |  1 +
 4 files changed, 40 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index bce744468708..52774feab93f 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -147,11 +147,10 @@ u32 nlmclnt_grant(const struct sockaddr_in *addr, const struct nlm_lock *lock)
  * Someone has sent us an SM_NOTIFY. Ensure we bind to the new port number,
  * that we mark locks for reclaiming, and that we bump the pseudo NSM state.
  */
-static inline
-void nlmclnt_prepare_reclaim(struct nlm_host *host, u32 newstate)
+static void nlmclnt_prepare_reclaim(struct nlm_host *host)
 {
+	down_write(&host->h_rwsem);
 	host->h_monitored = 0;
-	host->h_nsmstate = newstate;
 	host->h_state++;
 	host->h_nextrebind = 0;
 	nlm_rebind_host(host);
@@ -164,6 +163,13 @@ void nlmclnt_prepare_reclaim(struct nlm_host *host, u32 newstate)
 	dprintk("NLM: reclaiming locks for host %s", host->h_name);
 }
 
+static void nlmclnt_finish_reclaim(struct nlm_host *host)
+{
+	host->h_reclaiming = 0;
+	up_write(&host->h_rwsem);
+	dprintk("NLM: done reclaiming locks for host %s", host->h_name);
+}
+
 /*
  * Reclaim all locks on server host. We do this by spawning a separate
  * reclaimer thread.
@@ -171,12 +177,10 @@ void nlmclnt_prepare_reclaim(struct nlm_host *host, u32 newstate)
 void
 nlmclnt_recovery(struct nlm_host *host, u32 newstate)
 {
-	if (host->h_reclaiming++) {
-		if (host->h_nsmstate == newstate)
-			return;
-		nlmclnt_prepare_reclaim(host, newstate);
-	} else {
-		nlmclnt_prepare_reclaim(host, newstate);
+	if (host->h_nsmstate == newstate)
+		return;
+	host->h_nsmstate = newstate;
+	if (!host->h_reclaiming++) {
 		nlm_get_host(host);
 		__module_get(THIS_MODULE);
 		if (kernel_thread(reclaimer, host, CLONE_KERNEL) < 0)
@@ -190,6 +194,7 @@ reclaimer(void *ptr)
 	struct nlm_host	  *host = (struct nlm_host *) ptr;
 	struct nlm_wait	  *block;
 	struct file_lock *fl, *next;
+	u32 nsmstate;
 
 	daemonize("%s-reclaim", host->h_name);
 	allow_signal(SIGKILL);
@@ -199,19 +204,25 @@ reclaimer(void *ptr)
 	lock_kernel();
 	lockd_up();
 
+	nlmclnt_prepare_reclaim(host);
 	/* First, reclaim all locks that have been marked. */
 restart:
+	nsmstate = host->h_nsmstate;
 	list_for_each_entry_safe(fl, next, &host->h_reclaim, fl_u.nfs_fl.list) {
 		list_del_init(&fl->fl_u.nfs_fl.list);
 
 		if (signalled())
 			continue;
-		if (nlmclnt_reclaim(host, fl) == 0)
-			list_add_tail(&fl->fl_u.nfs_fl.list, &host->h_granted);
-		goto restart;
+		if (nlmclnt_reclaim(host, fl) != 0)
+			continue;
+		list_add_tail(&fl->fl_u.nfs_fl.list, &host->h_granted);
+		if (host->h_nsmstate != nsmstate) {
+			/* Argh! The server rebooted again! */
+			list_splice_init(&host->h_granted, &host->h_reclaim);
+			goto restart;
+		}
 	}
-
-	host->h_reclaiming = 0;
+	nlmclnt_finish_reclaim(host);
 
 	/* Now, wake up all processes that sleep on a blocked lock */
 	list_for_each_entry(block, &nlm_blocked, b_list) {
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index f96e38155b5c..4db62098d3f4 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -508,7 +508,10 @@ nlmclnt_lock(struct nlm_rqst *req, struct file_lock *fl)
 	}
 
 	block = nlmclnt_prepare_block(host, fl);
+again:
 	for(;;) {
+		/* Reboot protection */
+		fl->fl_u.nfs_fl.state = host->h_state;
 		status = nlmclnt_call(req, NLMPROC_LOCK);
 		if (status < 0)
 			goto out_unblock;
@@ -531,10 +534,16 @@ nlmclnt_lock(struct nlm_rqst *req, struct file_lock *fl)
 	}
 
 	if (resp->status == NLM_LCK_GRANTED) {
-		fl->fl_u.nfs_fl.state = host->h_state;
+		down_read(&host->h_rwsem);
+		/* Check whether or not the server has rebooted */
+		if (fl->fl_u.nfs_fl.state != host->h_state) {
+			up_read(&host->h_rwsem);
+			goto again;
+		}
 		fl->fl_flags |= FL_SLEEP;
 		/* Ensure the resulting lock will get added to granted list */
 		do_vfs_lock(fl);
+		up_read(&host->h_rwsem);
 	}
 	status = nlm_stat_to_errno(resp->status);
 out_unblock:
@@ -596,6 +605,7 @@ nlmclnt_reclaim(struct nlm_host *host, struct file_lock *fl)
 static int
 nlmclnt_unlock(struct nlm_rqst *req, struct file_lock *fl)
 {
+	struct nlm_host	*host = req->a_host;
 	struct nlm_res	*resp = &req->a_res;
 	int		status;
 
@@ -604,7 +614,9 @@ nlmclnt_unlock(struct nlm_rqst *req, struct file_lock *fl)
 	 * request, or to deny it with NLM_LCK_DENIED_GRACE_PERIOD. In either
 	 * case, we want to unlock.
 	 */
+	down_read(&host->h_rwsem);
 	do_vfs_lock(fl);
+	up_read(&host->h_rwsem);
 
 	if (req->a_flags & RPC_TASK_ASYNC)
 		return nlm_async_call(req, NLMPROC_UNLOCK, &nlmclnt_unlock_ops);
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index 5242743c9403..38b0e8a1aec0 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -117,6 +117,7 @@ nlm_lookup_host(int server, struct sockaddr_in *sin,
 	host->h_expires    = jiffies + NLM_HOST_EXPIRE;
 	atomic_set(&host->h_count, 1);
 	init_waitqueue_head(&host->h_gracewait);
+	init_rwsem(&host->h_rwsem);
 	host->h_state      = 0;			/* pseudo NSM state */
 	host->h_nsmstate   = 0;			/* real NSM state */
 	host->h_server	   = server;
diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h
index a6c1a33e5ae3..6b2684763fc7 100644
--- a/include/linux/lockd/lockd.h
+++ b/include/linux/lockd/lockd.h
@@ -50,6 +50,7 @@ struct nlm_host {
 				h_killed     : 1,
 				h_monitored  : 1;
 	wait_queue_head_t	h_gracewait;	/* wait while reclaiming */
+	struct rw_semaphore	h_rwsem;	/* Reboot recovery lock */
 	u32			h_state;	/* pseudo-state counter */
 	u32			h_nsmstate;	/* true remote NSM state */
 	u32			h_pidcount;	/* Pseudopids */
-- 
cgit v1.2.3


From b817f6feff4a565b08f0e699a5790b4008b8f494 Mon Sep 17 00:00:00 2001
From: Sam Ravnborg <sam@mars.ravnborg.org>
Date: Fri, 9 Jun 2006 21:53:55 +0200
Subject: kbuild: check license compatibility when building modules

Modules that uses GPL symbols can no longer be build with kbuild,
the build will fail during the modpost step.
When a GPL-incompatible module uses a EXPORT_SYMBOL_GPL_FUTURE symbol
then warn during modpost so author are actually notified.

The actual license compatibility check is shared with the kernel
to make sure it is in sync.

Patch originally from: Andreas Gruenbacher <agruen@suse.de> and
Ram Pai <linuxram@us.ibm.com>

Signed-off-by: Sam Ravnborg <sam@ravnborg.org>
---
 include/linux/license.h | 14 ++++++++++
 kernel/module.c         | 11 +-------
 scripts/mod/modpost.c   | 71 +++++++++++++++++++++++++++++++++++++++++++++++--
 scripts/mod/modpost.h   |  1 +
 4 files changed, 85 insertions(+), 12 deletions(-)
 create mode 100644 include/linux/license.h

(limited to 'include/linux')

diff --git a/include/linux/license.h b/include/linux/license.h
new file mode 100644
index 000000000000..decdbf43cb5c
--- /dev/null
+++ b/include/linux/license.h
@@ -0,0 +1,14 @@
+#ifndef __LICENSE_H
+#define __LICENSE_H
+
+static inline int license_is_gpl_compatible(const char *license)
+{
+	return (strcmp(license, "GPL") == 0
+		|| strcmp(license, "GPL v2") == 0
+		|| strcmp(license, "GPL and additional rights") == 0
+		|| strcmp(license, "Dual BSD/GPL") == 0
+		|| strcmp(license, "Dual MIT/GPL") == 0
+		|| strcmp(license, "Dual MPL/GPL") == 0);
+}
+
+#endif
diff --git a/kernel/module.c b/kernel/module.c
index bbe04862e1b0..690381508d09 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -43,6 +43,7 @@
 #include <asm/uaccess.h>
 #include <asm/semaphore.h>
 #include <asm/cacheflush.h>
+#include <linux/license.h>
 
 #if 0
 #define DEBUGP printk
@@ -1248,16 +1249,6 @@ static void layout_sections(struct module *mod,
 	}
 }
 
-static inline int license_is_gpl_compatible(const char *license)
-{
-	return (strcmp(license, "GPL") == 0
-		|| strcmp(license, "GPL v2") == 0
-		|| strcmp(license, "GPL and additional rights") == 0
-		|| strcmp(license, "Dual BSD/GPL") == 0
-		|| strcmp(license, "Dual MIT/GPL") == 0
-		|| strcmp(license, "Dual MPL/GPL") == 0);
-}
-
 static void set_license(struct module *mod, const char *license)
 {
 	if (!license)
diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c
index ba2e4fc2af20..baa4d83d29a8 100644
--- a/scripts/mod/modpost.c
+++ b/scripts/mod/modpost.c
@@ -13,6 +13,7 @@
 
 #include <ctype.h>
 #include "modpost.h"
+#include "../../include/linux/license.h"
 
 /* Are we using CONFIG_MODVERSIONS? */
 int modversions = 0;
@@ -99,6 +100,7 @@ static struct module *new_module(char *modname)
 
 	/* add to list */
 	mod->name = p;
+	mod->gpl_compatible = -1;
 	mod->next = modules;
 	modules = mod;
 
@@ -493,13 +495,18 @@ static char *next_string(char *string, unsigned long *secsize)
 	return string;
 }
 
-static char *get_modinfo(void *modinfo, unsigned long modinfo_len,
-			 const char *tag)
+static char *get_next_modinfo(void *modinfo, unsigned long modinfo_len,
+			      const char *tag, char *info)
 {
 	char *p;
 	unsigned int taglen = strlen(tag);
 	unsigned long size = modinfo_len;
 
+	if (info) {
+		size -= info - (char *)modinfo;
+		modinfo = next_string(info, &size);
+	}
+
 	for (p = modinfo; p; p = next_string(p, &size)) {
 		if (strncmp(p, tag, taglen) == 0 && p[taglen] == '=')
 			return p + taglen + 1;
@@ -507,6 +514,13 @@ static char *get_modinfo(void *modinfo, unsigned long modinfo_len,
 	return NULL;
 }
 
+static char *get_modinfo(void *modinfo, unsigned long modinfo_len,
+			 const char *tag)
+
+{
+	return get_next_modinfo(modinfo, modinfo_len, tag, NULL);
+}
+
 /**
  * Test if string s ends in string sub
  * return 0 if match
@@ -981,6 +995,7 @@ static void read_symbols(char *modname)
 {
 	const char *symname;
 	char *version;
+	char *license;
 	struct module *mod;
 	struct elf_info info = { };
 	Elf_Sym *sym;
@@ -996,6 +1011,18 @@ static void read_symbols(char *modname)
 		mod->skip = 1;
 	}
 
+	license = get_modinfo(info.modinfo, info.modinfo_len, "license");
+	while (license) {
+		if (license_is_gpl_compatible(license))
+			mod->gpl_compatible = 1;
+		else {
+			mod->gpl_compatible = 0;
+			break;
+		}
+		license = get_next_modinfo(info.modinfo, info.modinfo_len,
+					   "license", license);
+	}
+
 	for (sym = info.symtab_start; sym < info.symtab_stop; sym++) {
 		symname = info.strtab + sym->st_name;
 
@@ -1052,6 +1079,40 @@ void buf_write(struct buffer *buf, const char *s, int len)
 	buf->pos += len;
 }
 
+void check_license(struct module *mod)
+{
+	struct symbol *s, *exp;
+
+	for (s = mod->unres; s; s = s->next) {
+		if (mod->gpl_compatible == 1) {
+			/* GPL-compatible modules may use all symbols */
+			continue;
+		}
+		exp = find_symbol(s->name);
+		if (!exp || exp->module == mod)
+			continue;
+		const char *basename = strrchr(mod->name, '/');
+		if (basename)
+			basename++;
+		switch (exp->export) {
+			case export_gpl:
+				fatal("modpost: GPL-incompatible module %s "
+				      "uses GPL-only symbol '%s'\n",
+				 basename ? basename : mod->name,
+				exp->name);
+				break;
+			case export_gpl_future:
+				warn("modpost: GPL-incompatible module %s "
+				      "uses future GPL-only symbol '%s'\n",
+				      basename ? basename : mod->name,
+				      exp->name);
+				break;
+			case export_plain: /* ignore */ break;
+			case export_unknown: /* ignore */ break;
+		}
+        }
+}
+
 /**
  * Header for the generated file
  **/
@@ -1325,6 +1386,12 @@ int main(int argc, char **argv)
 		read_symbols(argv[optind++]);
 	}
 
+	for (mod = modules; mod; mod = mod->next) {
+		if (mod->skip)
+			continue;
+		check_license(mod);
+	}
+
 	for (mod = modules; mod; mod = mod->next) {
 		if (mod->skip)
 			continue;
diff --git a/scripts/mod/modpost.h b/scripts/mod/modpost.h
index f7ee3a3fde14..2b00c6062844 100644
--- a/scripts/mod/modpost.h
+++ b/scripts/mod/modpost.h
@@ -100,6 +100,7 @@ buf_write(struct buffer *buf, const char *s, int len);
 struct module {
 	struct module *next;
 	const char *name;
+	int gpl_compatible;
 	struct symbol *unres;
 	int seen;
 	int skip;
-- 
cgit v1.2.3


From 8d7feac3c7504425aaf61dc7d804685a6b89ee43 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sat, 10 Jun 2006 18:37:19 +0200
Subject: [SCSI] remove RQ_SCSI_* flags

The RQ_SCSI_* flags are a vestiage of a long past history.  The EH code
still sets them but we never make use of that information.  The other
users is pluto.c which never had a chance to work but needs to be kept
compiling to keep Davem happy, so copy over the definition there.

We could probably get rid of RQ_ACTIVE/RQ_INACTIVE aswell with some
work, there's only two more or less bogus looking uses in ubd and scsi.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: James Bottomley <James.Bottomley@SteelEye.com>
---
 drivers/scsi/pluto.c      | 3 +++
 drivers/scsi/scsi_error.c | 3 ---
 include/linux/blkdev.h    | 3 ---
 3 files changed, 3 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/scsi/pluto.c b/drivers/scsi/pluto.c
index 46624ab9c3d2..83a671799934 100644
--- a/drivers/scsi/pluto.c
+++ b/drivers/scsi/pluto.c
@@ -27,6 +27,9 @@
 
 #include <linux/module.h>
 
+#define RQ_SCSI_BUSY		0xffff
+#define RQ_SCSI_DONE		0xfffe
+
 /* #define PLUTO_DEBUG */
 
 #define pluto_printk printk ("PLUTO %s: ", fc->name); printk
diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c
index 471a43102923..1c7d993fa8ad 100644
--- a/drivers/scsi/scsi_error.c
+++ b/drivers/scsi/scsi_error.c
@@ -452,7 +452,6 @@ static int scsi_send_eh_cmnd(struct scsi_cmnd *scmd, int timeout)
 			(sdev->lun << 5 & 0xe0);
 
 	shost->eh_action = &done;
-	scmd->request->rq_status = RQ_SCSI_BUSY;
 
 	spin_lock_irqsave(shost->host_lock, flags);
 	scsi_log_send(scmd);
@@ -461,7 +460,6 @@ static int scsi_send_eh_cmnd(struct scsi_cmnd *scmd, int timeout)
 
 	timeleft = wait_for_completion_timeout(&done, timeout);
 
-	scmd->request->rq_status = RQ_SCSI_DONE;
 	shost->eh_action = NULL;
 
 	scsi_log_completion(scmd, SUCCESS);
@@ -1657,7 +1655,6 @@ scsi_reset_provider(struct scsi_device *dev, int flag)
 
 	scmd->request = &req;
 	memset(&scmd->eh_timeout, 0, sizeof(scmd->eh_timeout));
-	scmd->request->rq_status      	= RQ_SCSI_BUSY;
 
 	memset(&scmd->cmnd, '\0', sizeof(scmd->cmnd));
     
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 59e1259b1c40..c889c459fd1b 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -439,9 +439,6 @@ struct request_queue
 
 #define RQ_INACTIVE		(-1)
 #define RQ_ACTIVE		1
-#define RQ_SCSI_BUSY		0xffff
-#define RQ_SCSI_DONE		0xfffe
-#define RQ_SCSI_DISCONNECTING	0xffe0
 
 #define QUEUE_FLAG_CLUSTER	0	/* cluster several segments into 1 */
 #define QUEUE_FLAG_QUEUED	1	/* uses generic tag queueing */
-- 
cgit v1.2.3


From 9a9c77dc4c4eed9dfb74080e768c0b3c9d905496 Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Sun, 11 Jun 2006 11:19:00 +0900
Subject: [PATCH] libata: cosmetic change in struct ata_port

Cosmetic change in struct ata_port.

Signed-off-by: Tejun Heo <htejun@gmail.com>
---
 include/linux/libata.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/libata.h b/include/linux/libata.h
index a2a33a902917..39e6b77de1a9 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -520,7 +520,8 @@ struct ata_port {
 	struct ata_host_set	*host_set;
 	struct device 		*dev;
 
-	struct work_struct	port_task, hotplug_task;
+	struct work_struct	port_task;
+	struct work_struct	hotplug_task;
 
 	unsigned int		hsm_task_state;
 
-- 
cgit v1.2.3


From 3057ac3c1a992ee135cbb7b7d1a12e58d81f0739 Mon Sep 17 00:00:00 2001
From: "zhao, forrest" <forrest.zhao@intel.com>
Date: Mon, 12 Jun 2006 12:01:34 +0800
Subject: [PATCH] Snoop SET FEATURES - WRITE CACHE ENABLE/DISABLE command(v5)

This patch makes libata snoop 'SET FEATURES - WRITE CACHE
ENABLE/DISABLE' command, executing requisite revalidation processes
to update cached data.

Signed-off-by: Forrest Zhao <forrest.zhao@intel.com>
Signed-off-by: Jeff Garzik <jeff@garzik.org>
---
 drivers/scsi/libata-core.c |  1 +
 drivers/scsi/libata-eh.c   |  3 +++
 drivers/scsi/libata-scsi.c | 36 ++++++++++++++++++++++++++++++++++++
 drivers/scsi/libata.h      |  1 +
 include/linux/ata.h        |  3 +++
 5 files changed, 44 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c
index 51b3a0ddb238..ddaad1a4f1f1 100644
--- a/drivers/scsi/libata-core.c
+++ b/drivers/scsi/libata-core.c
@@ -5194,6 +5194,7 @@ static void ata_host_init(struct ata_port *ap, struct Scsi_Host *host,
 
 	INIT_WORK(&ap->port_task, NULL, NULL);
 	INIT_WORK(&ap->hotplug_task, ata_scsi_hotplug, ap);
+	INIT_WORK(&ap->scsi_rescan_task, ata_scsi_dev_rescan, ap);
 	INIT_LIST_HEAD(&ap->eh_done_q);
 	init_waitqueue_head(&ap->eh_wait_q);
 
diff --git a/drivers/scsi/libata-eh.c b/drivers/scsi/libata-eh.c
index 6285257a890d..f82799e22588 100644
--- a/drivers/scsi/libata-eh.c
+++ b/drivers/scsi/libata-eh.c
@@ -1554,6 +1554,9 @@ static int ata_eh_revalidate_and_attach(struct ata_port *ap,
 			if (rc)
 				break;
 
+			/* schedule the scsi_rescan_device() here */
+			queue_work(ata_aux_wq, &(ap->scsi_rescan_task));
+
 			ehc->i.action &= ~ATA_EH_REVALIDATE;
 		} else if (dev->class == ATA_DEV_UNKNOWN &&
 			   ehc->tries[dev->devno] &&
diff --git a/drivers/scsi/libata-scsi.c b/drivers/scsi/libata-scsi.c
index 5f90d8e33546..45a49be65042 100644
--- a/drivers/scsi/libata-scsi.c
+++ b/drivers/scsi/libata-scsi.c
@@ -1306,6 +1306,17 @@ static void ata_scsi_qc_complete(struct ata_queued_cmd *qc)
 	u8 *cdb = cmd->cmnd;
  	int need_sense = (qc->err_mask != 0);
 
+	/* We snoop the SET_FEATURES - Write Cache ON/OFF command, and
+	 * schedule EH_REVALIDATE operation to update the IDENTIFY DEVICE
+	 * cache
+	 */
+	if (!need_sense && (qc->tf.command == ATA_CMD_SET_FEATURES) &&
+	    ((qc->tf.feature == SETFEATURES_WC_ON) ||
+	     (qc->tf.feature == SETFEATURES_WC_OFF))) {
+		qc->ap->eh_info.action |= ATA_EH_REVALIDATE;
+		ata_port_schedule_eh(qc->ap);
+	}
+
 	/* For ATA pass thru (SAT) commands, generate a sense block if
 	 * user mandated it or if there's an error.  Note that if we
 	 * generate because the user forced us to, a check condition
@@ -2992,3 +3003,28 @@ static int ata_scsi_user_scan(struct Scsi_Host *shost, unsigned int channel,
 
 	return rc;
 }
+
+/**
+ *      ata_scsi_dev_rescan - initiate scsi_rescan_device()
+ *      @data: Pointer to ATA port to perform scsi_rescan_device()
+ *
+ *      After ATA pass thru (SAT) commands are executed successfully,
+ *      libata need to propagate the changes to SCSI layer.
+ *
+ *      LOCKING:
+ *      Kernel thread context (may sleep).
+ */
+void ata_scsi_dev_rescan(void *data)
+{
+	struct ata_port *ap = data;
+	struct ata_device *dev;
+	unsigned int i;
+
+	for (i = 0; i < ATA_MAX_DEVICES; i++) {
+		dev = &ap->device[i];
+
+		if (ata_dev_enabled(dev))
+			scsi_rescan_device(&(dev->sdev->sdev_gendev));
+	}
+}
+
diff --git a/drivers/scsi/libata.h b/drivers/scsi/libata.h
index 1dd496f1f7ac..bdd488897096 100644
--- a/drivers/scsi/libata.h
+++ b/drivers/scsi/libata.h
@@ -104,6 +104,7 @@ extern void ata_scsi_rbuf_fill(struct ata_scsi_args *args,
                         unsigned int (*actor) (struct ata_scsi_args *args,
                                            u8 *rbuf, unsigned int buflen));
 extern void ata_schedule_scsi_eh(struct Scsi_Host *shost);
+extern void ata_scsi_dev_rescan(void *data);
 
 /* libata-eh.c */
 extern enum scsi_eh_timer_return ata_scsi_timed_out(struct scsi_cmnd *cmd);
diff --git a/include/linux/ata.h b/include/linux/ata.h
index c494e1c0531e..3671af869696 100644
--- a/include/linux/ata.h
+++ b/include/linux/ata.h
@@ -181,6 +181,9 @@ enum {
 	XFER_PIO_0		= 0x08,
 	XFER_PIO_SLOW		= 0x00,
 
+	SETFEATURES_WC_ON	= 0x02, /* Enable write cache */
+	SETFEATURES_WC_OFF	= 0x82, /* Disable write cache */
+
 	/* ATAPI stuff */
 	ATAPI_PKT_DMA		= (1 << 0),
 	ATAPI_DMADIR		= (1 << 2),	/* ATAPI data dir:
-- 
cgit v1.2.3


From 3b01b8af2414b6684051da4a1507dfacdbf24f86 Mon Sep 17 00:00:00 2001
From: Jeff Garzik <jeff@garzik.org>
Date: Mon, 12 Jun 2006 00:22:04 -0400
Subject: libata: fix build, by adding required workqueue member to port struct

---
 include/linux/libata.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/libata.h b/include/linux/libata.h
index 39e6b77de1a9..61eea5795d5a 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -522,6 +522,7 @@ struct ata_port {
 
 	struct work_struct	port_task;
 	struct work_struct	hotplug_task;
+	struct work_struct	scsi_rescan_task;
 
 	unsigned int		hsm_task_state;
 
-- 
cgit v1.2.3


From f0eb62b81dd16bfc4034916418c3406ba20011e1 Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Mon, 12 Jun 2006 23:05:38 +0900
Subject: [PATCH] libata: add host_set->next for legacy two host_sets case,
 take #3

For a legacy ATA controller, libata registers two separate host sets.
There was no connection between the two hosts making it impossible to
traverse all ports related to the controller.  This patch adds
host_set->next which points to the second host_set and makes
ata_pci_remove_one() remove all associated host_sets.

* On device removal, all ports hanging off the device are properly
  detached.  Prior to this patch, ports on the first host_set weren't
  detached casuing oops on driver unloading.

* On device removal, both host_sets are properly freed

This will also be used by new power management code to suspend and
resume all ports of a controller.  host_set/port representation will
be improved to handle legacy controllers better and this host_set
linking will go away with it.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Signed-off-by: Jeff Garzik <jeff@garzik.org>
---
 drivers/scsi/libata-bmdma.c | 15 +++++++++++++--
 drivers/scsi/libata-core.c  |  4 ++++
 include/linux/libata.h      |  3 ++-
 3 files changed, 19 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/scsi/libata-bmdma.c b/drivers/scsi/libata-bmdma.c
index 4bc05371737c..13fab97c840e 100644
--- a/drivers/scsi/libata-bmdma.c
+++ b/drivers/scsi/libata-bmdma.c
@@ -1076,10 +1076,21 @@ int ata_pci_init_one (struct pci_dev *pdev, struct ata_port_info **port_info,
 
 	/* FIXME: check ata_device_add return */
 	if (legacy_mode) {
-		if (legacy_mode & (1 << 0))
+		struct device *dev = &pdev->dev;
+		struct ata_host_set *host_set = NULL;
+
+		if (legacy_mode & (1 << 0)) {
 			ata_device_add(probe_ent);
-		if (legacy_mode & (1 << 1))
+			host_set = dev_get_drvdata(dev);
+		}
+
+		if (legacy_mode & (1 << 1)) {
 			ata_device_add(probe_ent2);
+			if (host_set) {
+				host_set->next = dev_get_drvdata(dev);
+				dev_set_drvdata(dev, host_set);
+			}
+		}
 	} else
 		ata_device_add(probe_ent);
 
diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c
index 014855e5a43a..d73cb3672d6a 100644
--- a/drivers/scsi/libata-core.c
+++ b/drivers/scsi/libata-core.c
@@ -5621,8 +5621,12 @@ void ata_pci_remove_one (struct pci_dev *pdev)
 {
 	struct device *dev = pci_dev_to_dev(pdev);
 	struct ata_host_set *host_set = dev_get_drvdata(dev);
+	struct ata_host_set *host_set2 = host_set->next;
 
 	ata_host_set_remove(host_set);
+	if (host_set2)
+		ata_host_set_remove(host_set2);
+
 	pci_release_regions(pdev);
 	pci_disable_device(pdev);
 	dev_set_drvdata(dev, NULL);
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 61eea5795d5a..f03b8664af11 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -356,7 +356,8 @@ struct ata_host_set {
 	unsigned long		flags;
 	int			simplex_claimed;	/* Keep seperate in case we
 							   ever need to do this locked */
-	struct ata_port *	ports[0];
+	struct ata_host_set	*next;		/* for legacy mode */
+	struct ata_port		*ports[0];
 };
 
 struct ata_queued_cmd {
-- 
cgit v1.2.3


From 783ed81ff39d3f938a6b2efd09fbad96e41e5c1f Mon Sep 17 00:00:00 2001
From: "Artem B. Bityutskiy" <dedekind@sauron.oktetlabs.ru>
Date: Wed, 14 Jun 2006 19:53:44 +0400
Subject: [MTD] assume mtd->writesize is 1 for NOR flashes

Signed-off-by: Artem B. Bityitskiy
---
 drivers/mtd/chips/amd_flash.c       | 1 +
 drivers/mtd/chips/cfi_cmdset_0002.c | 1 +
 drivers/mtd/chips/sharp.c           | 1 +
 drivers/mtd/devices/lart.c          | 1 +
 drivers/mtd/devices/m25p80.c        | 1 +
 drivers/mtd/devices/mtdram.c        | 1 +
 drivers/mtd/mtdcore.c               | 1 +
 include/linux/mtd/mtd.h             | 9 ++++++---
 8 files changed, 13 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/chips/amd_flash.c b/drivers/mtd/chips/amd_flash.c
index 9e466509a23d..16eaca69fb5a 100644
--- a/drivers/mtd/chips/amd_flash.c
+++ b/drivers/mtd/chips/amd_flash.c
@@ -730,6 +730,7 @@ static struct mtd_info *amd_flash_probe(struct map_info *map)
 		offset += dev_size;
 	}
 	mtd->type = MTD_NORFLASH;
+	mtd->writesize = 1;
 	mtd->flags = MTD_CAP_NORFLASH;
 	mtd->name = map->name;
 	mtd->erase = amd_flash_erase;
diff --git a/drivers/mtd/chips/cfi_cmdset_0002.c b/drivers/mtd/chips/cfi_cmdset_0002.c
index 3aeb0c79e714..1e01ad38b26e 100644
--- a/drivers/mtd/chips/cfi_cmdset_0002.c
+++ b/drivers/mtd/chips/cfi_cmdset_0002.c
@@ -236,6 +236,7 @@ struct mtd_info *cfi_cmdset_0002(struct map_info *map, int primary)
 	mtd->resume  = cfi_amdstd_resume;
 	mtd->flags   = MTD_CAP_NORFLASH;
 	mtd->name    = map->name;
+	mtd->writesize = 1;
 
 	if (cfi->cfi_mode==CFI_MODE_CFI){
 		unsigned char bootloc;
diff --git a/drivers/mtd/chips/sharp.c b/drivers/mtd/chips/sharp.c
index 3cc0b23c5865..967abbecdff9 100644
--- a/drivers/mtd/chips/sharp.c
+++ b/drivers/mtd/chips/sharp.c
@@ -140,6 +140,7 @@ static struct mtd_info *sharp_probe(struct map_info *map)
 	mtd->suspend = sharp_suspend;
 	mtd->resume = sharp_resume;
 	mtd->flags = MTD_CAP_NORFLASH;
+	mtd->writesize = 1;
 	mtd->name = map->name;
 
 	memset(sharp, 0, sizeof(*sharp));
diff --git a/drivers/mtd/devices/lart.c b/drivers/mtd/devices/lart.c
index 29b0ddaa324e..4ea50a1dda85 100644
--- a/drivers/mtd/devices/lart.c
+++ b/drivers/mtd/devices/lart.c
@@ -635,6 +635,7 @@ int __init lart_flash_init (void)
    printk ("%s: This looks like a LART board to me.\n",module_name);
    mtd.name = module_name;
    mtd.type = MTD_NORFLASH;
+   mtd.writesize = 1;
    mtd.flags = MTD_CAP_NORFLASH;
    mtd.size = FLASH_BLOCKSIZE_PARAM * FLASH_NUMBLOCKS_16m_PARAM + FLASH_BLOCKSIZE_MAIN * FLASH_NUMBLOCKS_16m_MAIN;
    mtd.erasesize = FLASH_BLOCKSIZE_MAIN;
diff --git a/drivers/mtd/devices/m25p80.c b/drivers/mtd/devices/m25p80.c
index 04e65d5dae00..a8466141e914 100644
--- a/drivers/mtd/devices/m25p80.c
+++ b/drivers/mtd/devices/m25p80.c
@@ -465,6 +465,7 @@ static int __devinit m25p_probe(struct spi_device *spi)
 		flash->mtd.name = spi->dev.bus_id;
 
 	flash->mtd.type = MTD_NORFLASH;
+	flash->mtd.writesize = 1;
 	flash->mtd.flags = MTD_CAP_NORFLASH;
 	flash->mtd.size = info->sector_size * info->n_sectors;
 	flash->mtd.erasesize = info->sector_size;
diff --git a/drivers/mtd/devices/mtdram.c b/drivers/mtd/devices/mtdram.c
index f284c9670be8..8ab4b931215f 100644
--- a/drivers/mtd/devices/mtdram.c
+++ b/drivers/mtd/devices/mtdram.c
@@ -106,6 +106,7 @@ int mtdram_init_device(struct mtd_info *mtd, void *mapped_address,
 	mtd->type = MTD_GENERIC_TYPE;
 	mtd->flags = MTD_CAP_RAM;
 	mtd->size = size;
+	mtd->writesize = 1;
 	mtd->erasesize = MTDRAM_ERASE_SIZE;
 	mtd->priv = mapped_address;
 
diff --git a/drivers/mtd/mtdcore.c b/drivers/mtd/mtdcore.c
index 49bc9fdcb88b..16a952dd486a 100644
--- a/drivers/mtd/mtdcore.c
+++ b/drivers/mtd/mtdcore.c
@@ -47,6 +47,7 @@ int add_mtd_device(struct mtd_info *mtd)
 {
 	int i;
 
+	BUG_ON(mtd->writesize == 0);
 	mutex_lock(&mtd_table_mutex);
 
 	for (i=0; i < MAX_MTD_DEVICES; i++)
diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h
index 9536567d041b..e1d2a3d56546 100644
--- a/include/linux/mtd/mtd.h
+++ b/include/linux/mtd/mtd.h
@@ -110,9 +110,12 @@ struct mtd_info {
 	 * information below if they desire
 	 */
 	u_int32_t erasesize;
-	/* Smallest availlable size for writing to the device.  For NAND,
-	 * this is the page size, for some NOR chips, the size of ECC
-	 * covered blocks.
+	/* Minimal writable flash unit size. In case of NOR flash it is 1 (even
+	 * though individual bits can be cleared), in case of NAND flash it is
+	 * one NAND page (or half, or one-fourths of it), in case of ECC-ed NOR
+	 * it is of ECC block size, etc. It is illegal to have writesize = 0.
+	 * Any driver registering a struct mtd_info must ensure a writesize of
+	 * 1 or larger.
 	 */
 	u_int32_t writesize;
 
-- 
cgit v1.2.3


From afefc4158f3c8529e4bb99c1dc119fd792bac220 Mon Sep 17 00:00:00 2001
From: Andrew Victor <andrew@sanpeople.com>
Date: Mon, 19 Jun 2006 19:53:19 +0100
Subject: [ARM] 3592/1: AT91RM9200 Serial driver update

Patch from Andrew Victor

This patch includes a number of updates to the AT91RM9200 serial driver.

Changes include:
1. Conversion to a platform_driver.  [Ivan Kokshaysky]
2. Replaced all references to AT91RM9200 with AT91.  This driver can now
also be used for the AT91SAM9216.
3. Allow TIOCM_LOOP to configure local loopback mode.
4. Cleaned up the 'read_status_mask' usage and interrupt handler code.
[Chip Coldwell]
5. Suspend/resume support.  [David Brownell]

There are a few 'unused variable' warning when compiling this - I
removed the new DMA support to keep this first patch simpler.

Signed-off-by: Andrew Victor <andrew@sanpeople.com>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 drivers/serial/Kconfig                   |  17 +-
 drivers/serial/at91_serial.c             | 463 +++++++++++++++++++------------
 include/asm-arm/mach/serial_at91.h       |  34 +++
 include/asm-arm/mach/serial_at91rm9200.h |  36 ---
 include/linux/serial_core.h              |   4 +-
 5 files changed, 325 insertions(+), 229 deletions(-)
 create mode 100644 include/asm-arm/mach/serial_at91.h
 delete mode 100644 include/asm-arm/mach/serial_at91rm9200.h

(limited to 'include/linux')

diff --git a/drivers/serial/Kconfig b/drivers/serial/Kconfig
index 7d22dc0478d3..5ea778fc1caa 100644
--- a/drivers/serial/Kconfig
+++ b/drivers/serial/Kconfig
@@ -300,21 +300,22 @@ config SERIAL_AMBA_PL011_CONSOLE
 	  kernel at boot time.)
 
 config SERIAL_AT91
-	bool "AT91RM9200 serial port support"
-	depends on ARM && ARCH_AT91RM9200
+	bool "AT91RM9200 / AT91SAM9261 serial port support"
+	depends on ARM && (ARCH_AT91RM9200 || ARCH_AT91SAM9261)
 	select SERIAL_CORE
 	help
-	  This enables the driver for the on-chip UARTs of the AT91RM9200
-	  processor.
+	  This enables the driver for the on-chip UARTs of the Atmel
+	  AT91RM9200 and AT91SAM926 processor.
 
 config SERIAL_AT91_CONSOLE
-	bool "Support for console on AT91RM9200 serial port"
+	bool "Support for console on AT91RM9200 / AT91SAM9261 serial port"
 	depends on SERIAL_AT91=y
 	select SERIAL_CORE_CONSOLE
 	help
-	  Say Y here if you wish to use a UART on the AT91RM9200 as the system
-	  console (the system console is the device which receives all kernel
-	  messages and warnings and which allows logins in single user mode).
+	  Say Y here if you wish to use a UART on the Atmel AT91RM9200 or
+	  AT91SAM9261 as the system console (the system console is the device
+	  which receives all kernel messages and warnings and which allows
+	  logins in single user mode).
 
 config SERIAL_AT91_TTYAT
 	bool "Install as device ttyAT0-4 instead of ttyS0-4"
diff --git a/drivers/serial/at91_serial.c b/drivers/serial/at91_serial.c
index 6547fe0cef96..db5b25fafed4 100644
--- a/drivers/serial/at91_serial.c
+++ b/drivers/serial/at91_serial.c
@@ -2,7 +2,6 @@
  *  linux/drivers/char/at91_serial.c
  *
  *  Driver for Atmel AT91RM9200 Serial ports
- *
  *  Copyright (C) 2003 Rick Bronson
  *
  *  Based on drivers/char/serial_sa1100.c, by Deep Blue Solutions Ltd.
@@ -30,17 +29,19 @@
 #include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/serial.h>
+#include <linux/clk.h>
 #include <linux/console.h>
 #include <linux/sysrq.h>
 #include <linux/tty_flip.h>
+#include <linux/platform_device.h>
 
 #include <asm/io.h>
 
 #include <asm/arch/at91rm9200_usart.h>
-#include <asm/mach/serial_at91rm9200.h>
+#include <asm/arch/at91rm9200_pdc.h>
+#include <asm/mach/serial_at91.h>
 #include <asm/arch/board.h>
-#include <asm/arch/pio.h>
-
+#include <asm/arch/system.h>
 
 #if defined(CONFIG_SERIAL_AT91_CONSOLE) && defined(CONFIG_MAGIC_SYSRQ)
 #define SUPPORT_SYSRQ
@@ -67,7 +68,6 @@
 
 #endif
 
-#define AT91_VA_BASE_DBGU	((unsigned long) AT91_VA_BASE_SYS + AT91_DBGU)
 #define AT91_ISR_PASS_LIMIT	256
 
 #define UART_PUT_CR(port,v)	writel(v, (port)->membase + AT91_US_CR)
@@ -87,16 +87,33 @@
 
  /* PDC registers */
 #define UART_PUT_PTCR(port,v)	writel(v, (port)->membase + AT91_PDC_PTCR)
+#define UART_GET_PTSR(port)	readl((port)->membase + AT91_PDC_PTSR)
+
 #define UART_PUT_RPR(port,v)	writel(v, (port)->membase + AT91_PDC_RPR)
+#define UART_GET_RPR(port)	readl((port)->membase + AT91_PDC_RPR)
 #define UART_PUT_RCR(port,v)	writel(v, (port)->membase + AT91_PDC_RCR)
-#define UART_GET_RCR(port)	readl((port)->membase + AT91_PDC_RCR)
 #define UART_PUT_RNPR(port,v)	writel(v, (port)->membase + AT91_PDC_RNPR)
 #define UART_PUT_RNCR(port,v)	writel(v, (port)->membase + AT91_PDC_RNCR)
 
+#define UART_PUT_TPR(port,v)	writel(v, (port)->membase + AT91_PDC_TPR)
+#define UART_PUT_TCR(port,v)	writel(v, (port)->membase + AT91_PDC_TCR)
+//#define UART_PUT_TNPR(port,v)	writel(v, (port)->membase + AT91_PDC_TNPR)
+//#define UART_PUT_TNCR(port,v)	writel(v, (port)->membase + AT91_PDC_TNCR)
 
 static int (*at91_open)(struct uart_port *);
 static void (*at91_close)(struct uart_port *);
 
+/*
+ * We wrap our port structure around the generic uart_port.
+ */
+struct at91_uart_port {
+	struct uart_port	uart;		/* uart */
+	struct clk		*clk;		/* uart clock */
+	unsigned short		suspended;	/* is port suspended? */
+};
+
+static struct at91_uart_port at91_ports[AT91_NR_UART];
+
 #ifdef SUPPORT_SYSRQ
 static struct console at91_console;
 #endif
@@ -115,16 +132,19 @@ static u_int at91_tx_empty(struct uart_port *port)
 static void at91_set_mctrl(struct uart_port *port, u_int mctrl)
 {
 	unsigned int control = 0;
+	unsigned int mode;
 
-	/*
-	 * Errata #39: RTS0 is not internally connected to PA21.  We need to drive
-	 *  the pin manually.
-	 */
-	if (port->mapbase == AT91_VA_BASE_US0) {
-		if (mctrl & TIOCM_RTS)
-			at91_sys_write(AT91_PIOA + PIO_CODR, AT91_PA21_RTS0);
-		else
-			at91_sys_write(AT91_PIOA + PIO_SODR, AT91_PA21_RTS0);
+	if (arch_identify() == ARCH_ID_AT91RM9200) {
+		/*
+		 * AT91RM9200 Errata #39: RTS0 is not internally connected to PA21.
+		 *  We need to drive the pin manually.
+		 */
+		if (port->mapbase == AT91_BASE_US0) {
+			if (mctrl & TIOCM_RTS)
+				at91_sys_write(AT91_PIOA + PIO_CODR, AT91_PA21_RTS0);
+			else
+				at91_sys_write(AT91_PIOA + PIO_SODR, AT91_PA21_RTS0);
+		}
 	}
 
 	if (mctrl & TIOCM_RTS)
@@ -137,7 +157,15 @@ static void at91_set_mctrl(struct uart_port *port, u_int mctrl)
 	else
 		control |= AT91_US_DTRDIS;
 
-	UART_PUT_CR(port,control);
+	UART_PUT_CR(port, control);
+
+	/* Local loopback mode? */
+	mode = UART_GET_MR(port) & ~AT91_US_CHMODE;
+	if (mctrl & TIOCM_LOOP)
+		mode |= AT91_US_CHMODE_LOC_LOOP;
+	else
+		mode |= AT91_US_CHMODE_NORMAL;
+	UART_PUT_MR(port, mode);
 }
 
 /*
@@ -169,8 +197,9 @@ static u_int at91_get_mctrl(struct uart_port *port)
  */
 static void at91_stop_tx(struct uart_port *port)
 {
+	struct at91_uart_port *at91_port = (struct at91_uart_port *) port;
+
 	UART_PUT_IDR(port, AT91_US_TXRDY);
-	port->read_status_mask &= ~AT91_US_TXRDY;
 }
 
 /*
@@ -178,7 +207,8 @@ static void at91_stop_tx(struct uart_port *port)
  */
 static void at91_start_tx(struct uart_port *port)
 {
-	port->read_status_mask |= AT91_US_TXRDY;
+	struct at91_uart_port *at91_port = (struct at91_uart_port *) port;
+
 	UART_PUT_IER(port, AT91_US_TXRDY);
 }
 
@@ -187,6 +217,8 @@ static void at91_start_tx(struct uart_port *port)
  */
 static void at91_stop_rx(struct uart_port *port)
 {
+	struct at91_uart_port *at91_port = (struct at91_uart_port *) port;
+
 	UART_PUT_IDR(port, AT91_US_RXRDY);
 }
 
@@ -195,7 +227,6 @@ static void at91_stop_rx(struct uart_port *port)
  */
 static void at91_enable_ms(struct uart_port *port)
 {
-	port->read_status_mask |= (AT91_US_RIIC | AT91_US_DSRIC | AT91_US_DCDIC | AT91_US_CTSIC);
 	UART_PUT_IER(port, AT91_US_RIIC | AT91_US_DSRIC | AT91_US_DCDIC | AT91_US_CTSIC);
 }
 
@@ -218,8 +249,8 @@ static void at91_rx_chars(struct uart_port *port, struct pt_regs *regs)
 	struct tty_struct *tty = port->info->tty;
 	unsigned int status, ch, flg;
 
-	status = UART_GET_CSR(port) & port->read_status_mask;
-	while (status & (AT91_US_RXRDY)) {
+	status = UART_GET_CSR(port);
+	while (status & AT91_US_RXRDY) {
 		ch = UART_GET_CHAR(port);
 
 		port->icount.rx++;
@@ -230,40 +261,38 @@ static void at91_rx_chars(struct uart_port *port, struct pt_regs *regs)
 		 * note that the error handling code is
 		 * out of the main execution path
 		 */
-		if (unlikely(status & (AT91_US_PARE | AT91_US_FRAME | AT91_US_OVRE))) {
+		if (unlikely(status & (AT91_US_PARE | AT91_US_FRAME | AT91_US_OVRE | AT91_US_RXBRK))) {
 			UART_PUT_CR(port, AT91_US_RSTSTA);	/* clear error */
-			if (status & (AT91_US_PARE))
+			if (status & AT91_US_RXBRK) {
+				status &= ~(AT91_US_PARE | AT91_US_FRAME);	/* ignore side-effect */
+				port->icount.brk++;
+				if (uart_handle_break(port))
+					goto ignore_char;
+			}
+			if (status & AT91_US_PARE)
 				port->icount.parity++;
-			if (status & (AT91_US_FRAME))
+			if (status & AT91_US_FRAME)
 				port->icount.frame++;
-			if (status & (AT91_US_OVRE))
+			if (status & AT91_US_OVRE)
 				port->icount.overrun++;
 
-			if (status & AT91_US_PARE)
+			status &= port->read_status_mask;
+
+			if (status & AT91_US_RXBRK)
+				flg = TTY_BREAK;
+			else if (status & AT91_US_PARE)
 				flg = TTY_PARITY;
 			else if (status & AT91_US_FRAME)
 				flg = TTY_FRAME;
-			if (status & AT91_US_OVRE) {
-				/*
-				 * overrun does *not* affect the character
-				 * we read from the FIFO
-				 */
-				tty_insert_flip_char(tty, ch, flg);
-				ch = 0;
-				flg = TTY_OVERRUN;
-			}
-#ifdef SUPPORT_SYSRQ
-			port->sysrq = 0;
-#endif
 		}
 
 		if (uart_handle_sysrq_char(port, ch, regs))
 			goto ignore_char;
 
-		tty_insert_flip_char(tty, ch, flg);
+		uart_insert_char(port, status, AT91_US_OVRE, ch, flg);
 
 	ignore_char:
-		status = UART_GET_CSR(port) & port->read_status_mask;
+		status = UART_GET_CSR(port);
 	}
 
 	tty_flip_buffer_push(tty);
@@ -308,40 +337,35 @@ static void at91_tx_chars(struct uart_port *port)
 static irqreturn_t at91_interrupt(int irq, void *dev_id, struct pt_regs *regs)
 {
 	struct uart_port *port = dev_id;
+	struct at91_uart_port *at91_port = (struct at91_uart_port *) port;
 	unsigned int status, pending, pass_counter = 0;
 
 	status = UART_GET_CSR(port);
-	pending = status & port->read_status_mask;
-	if (pending) {
-		do {
-			if (pending & AT91_US_RXRDY)
-				at91_rx_chars(port, regs);
-
-			/* Clear the relevent break bits */
-			if (pending & AT91_US_RXBRK) {
-				UART_PUT_CR(port, AT91_US_RSTSTA);
-				port->icount.brk++;
-				uart_handle_break(port);
-			}
+	pending = status & UART_GET_IMR(port);
+	while (pending) {
+		/* Interrupt receive */
+		if (pending & AT91_US_RXRDY)
+			at91_rx_chars(port, regs);
+
+		// TODO: All reads to CSR will clear these interrupts!
+		if (pending & AT91_US_RIIC) port->icount.rng++;
+		if (pending & AT91_US_DSRIC) port->icount.dsr++;
+		if (pending & AT91_US_DCDIC)
+			uart_handle_dcd_change(port, !(status & AT91_US_DCD));
+		if (pending & AT91_US_CTSIC)
+			uart_handle_cts_change(port, !(status & AT91_US_CTS));
+		if (pending & (AT91_US_RIIC | AT91_US_DSRIC | AT91_US_DCDIC | AT91_US_CTSIC))
+			wake_up_interruptible(&port->info->delta_msr_wait);
+
+		/* Interrupt transmit */
+		if (pending & AT91_US_TXRDY)
+			at91_tx_chars(port);
+
+		if (pass_counter++ > AT91_ISR_PASS_LIMIT)
+			break;
 
-			// TODO: All reads to CSR will clear these interrupts!
-			if (pending & AT91_US_RIIC) port->icount.rng++;
-			if (pending & AT91_US_DSRIC) port->icount.dsr++;
-			if (pending & AT91_US_DCDIC)
-				uart_handle_dcd_change(port, !(status & AT91_US_DCD));
-			if (pending & AT91_US_CTSIC)
-				uart_handle_cts_change(port, !(status & AT91_US_CTS));
-			if (pending & (AT91_US_RIIC | AT91_US_DSRIC | AT91_US_DCDIC | AT91_US_CTSIC))
-				wake_up_interruptible(&port->info->delta_msr_wait);
-
-			if (pending & AT91_US_TXRDY)
-				at91_tx_chars(port);
-			if (pass_counter++ > AT91_ISR_PASS_LIMIT)
-				break;
-
-			status = UART_GET_CSR(port);
-			pending = status & port->read_status_mask;
-		} while (pending);
+		status = UART_GET_CSR(port);
+		pending = status & UART_GET_IMR(port);
 	}
 	return IRQ_HANDLED;
 }
@@ -351,6 +375,7 @@ static irqreturn_t at91_interrupt(int irq, void *dev_id, struct pt_regs *regs)
  */
 static int at91_startup(struct uart_port *port)
 {
+	struct at91_uart_port *at91_port = (struct at91_uart_port *) port;
 	int retval;
 
 	/*
@@ -381,14 +406,14 @@ static int at91_startup(struct uart_port *port)
 		}
 	}
 
-	port->read_status_mask = AT91_US_RXRDY | AT91_US_TXRDY | AT91_US_OVRE
-			| AT91_US_FRAME | AT91_US_PARE | AT91_US_RXBRK;
 	/*
 	 * Finally, enable the serial port
 	 */
 	UART_PUT_CR(port, AT91_US_RSTSTA | AT91_US_RSTRX);
 	UART_PUT_CR(port, AT91_US_TXEN | AT91_US_RXEN);		/* enable xmit & rcvr */
-	UART_PUT_IER(port, AT91_US_RXRDY);			/* do receive only */
+
+	UART_PUT_IER(port, AT91_US_RXRDY);		/* enable receive only */
+
 	return 0;
 }
 
@@ -397,6 +422,8 @@ static int at91_startup(struct uart_port *port)
  */
 static void at91_shutdown(struct uart_port *port)
 {
+	struct at91_uart_port *at91_port = (struct at91_uart_port *) port;
+
 	/*
 	 * Disable all interrupts, port and break condition.
 	 */
@@ -421,21 +448,22 @@ static void at91_shutdown(struct uart_port *port)
  */
 static void at91_serial_pm(struct uart_port *port, unsigned int state, unsigned int oldstate)
 {
+	struct at91_uart_port *at91_port = (struct at91_uart_port *) port;
+
 	switch (state) {
 		case 0:
 			/*
 			 * Enable the peripheral clock for this serial port.
 			 * This is called on uart_open() or a resume event.
 			 */
-			at91_sys_write(AT91_PMC_PCER, 1 << port->irq);
+			clk_enable(at91_port->clk);
 			break;
 		case 3:
 			/*
 			 * Disable the peripheral clock for this serial port.
 			 * This is called on uart_close() or a suspend event.
 			 */
-			if (port->irq != AT91_ID_SYS)			/* is this a shared clock? */
-				at91_sys_write(AT91_PMC_PCDR, 1 << port->irq);
+			clk_disable(at91_port->clk);
 			break;
 		default:
 			printk(KERN_ERR "at91_serial: unknown pm %d\n", state);
@@ -494,9 +522,9 @@ static void at91_set_termios(struct uart_port *port, struct termios * termios, s
 
 	spin_lock_irqsave(&port->lock, flags);
 
-	port->read_status_mask |= AT91_US_OVRE;
+	port->read_status_mask = AT91_US_OVRE;
 	if (termios->c_iflag & INPCK)
-		port->read_status_mask |= AT91_US_FRAME | AT91_US_PARE;
+		port->read_status_mask |= (AT91_US_FRAME | AT91_US_PARE);
 	if (termios->c_iflag & (BRKINT | PARMRK))
 		port->read_status_mask |= AT91_US_RXBRK;
 
@@ -552,7 +580,7 @@ static void at91_set_termios(struct uart_port *port, struct termios * termios, s
  */
 static const char *at91_type(struct uart_port *port)
 {
-	return (port->type == PORT_AT91RM9200) ? "AT91_SERIAL" : NULL;
+	return (port->type == PORT_AT91) ? "AT91_SERIAL" : NULL;
 }
 
 /*
@@ -560,8 +588,15 @@ static const char *at91_type(struct uart_port *port)
  */
 static void at91_release_port(struct uart_port *port)
 {
-	release_mem_region(port->mapbase,
-		(port->mapbase == AT91_VA_BASE_DBGU) ? 512 : SZ_16K);
+	struct platform_device *pdev = to_platform_device(port->dev);
+	int size = pdev->resource[0].end - pdev->resource[0].start + 1;
+
+	release_mem_region(port->mapbase, size);
+
+	if (port->flags & UPF_IOREMAP) {
+		iounmap(port->membase);
+		port->membase = NULL;
+	}
 }
 
 /*
@@ -569,10 +604,21 @@ static void at91_release_port(struct uart_port *port)
  */
 static int at91_request_port(struct uart_port *port)
 {
-	return request_mem_region(port->mapbase,
-		(port->mapbase == AT91_VA_BASE_DBGU) ? 512 : SZ_16K,
-		"at91_serial") != NULL ? 0 : -EBUSY;
+	struct platform_device *pdev = to_platform_device(port->dev);
+	int size = pdev->resource[0].end - pdev->resource[0].start + 1;
+
+	if (!request_mem_region(port->mapbase, size, "at91_serial"))
+		return -EBUSY;
+
+	if (port->flags & UPF_IOREMAP) {
+		port->membase = ioremap(port->mapbase, size);
+		if (port->membase == NULL) {
+			release_mem_region(port->mapbase, size);
+			return -ENOMEM;
+		}
+	}
 
+	return 0;
 }
 
 /*
@@ -581,7 +627,7 @@ static int at91_request_port(struct uart_port *port)
 static void at91_config_port(struct uart_port *port, int flags)
 {
 	if (flags & UART_CONFIG_TYPE) {
-		port->type = PORT_AT91RM9200;
+		port->type = PORT_AT91;
 		at91_request_port(port);
 	}
 }
@@ -592,7 +638,7 @@ static void at91_config_port(struct uart_port *port, int flags)
 static int at91_verify_port(struct uart_port *port, struct serial_struct *ser)
 {
 	int ret = 0;
-	if (ser->type != PORT_UNKNOWN && ser->type != PORT_AT91RM9200)
+	if (ser->type != PORT_UNKNOWN && ser->type != PORT_AT91)
 		ret = -EINVAL;
 	if (port->irq != ser->irq)
 		ret = -EINVAL;
@@ -624,33 +670,47 @@ static struct uart_ops at91_pops = {
 	.type		= at91_type,
 	.release_port	= at91_release_port,
 	.request_port	= at91_request_port,
-	.config_port 	= at91_config_port,
-	.verify_port 	= at91_verify_port,
+	.config_port	= at91_config_port,
+	.verify_port	= at91_verify_port,
 	.pm		= at91_serial_pm,
 };
 
-static struct uart_port at91_ports[AT91_NR_UART];
-
-void __init at91_init_ports(void)
+/*
+ * Configure the port from the platform device resource info.
+ */
+static void __devinit at91_init_port(struct at91_uart_port *at91_port, struct platform_device *pdev)
 {
-	static int first = 1;
-	int i;
-
-	if (!first)
-		return;
-	first = 0;
+	struct uart_port *port = &at91_port->uart;
+	struct at91_uart_data *data = pdev->dev.platform_data;
+
+	port->iotype	= UPIO_MEM;
+	port->flags     = UPF_BOOT_AUTOCONF;
+	port->ops	= &at91_pops;
+	port->fifosize  = 1;
+	port->line	= pdev->id;
+	port->dev	= &pdev->dev;
+
+	port->mapbase	= pdev->resource[0].start;
+	port->irq	= pdev->resource[1].start;
+
+	if (port->mapbase == AT91_VA_BASE_SYS + AT91_DBGU)		/* Part of system perpherals - already mapped */
+		port->membase = (void __iomem *) port->mapbase;
+	else {
+		port->flags	|= UPF_IOREMAP;
+		port->membase	= NULL;
+	}
 
-	for (i = 0; i < AT91_NR_UART; i++) {
-		at91_ports[i].iotype	= UPIO_MEM;
-		at91_ports[i].flags     = UPF_BOOT_AUTOCONF;
-		at91_ports[i].uartclk   = at91_master_clock;
-		at91_ports[i].ops	= &at91_pops;
-		at91_ports[i].fifosize  = 1;
-		at91_ports[i].line	= i;
- 	}
+	if (!at91_port->clk) {		/* for console, the clock could already be configured */
+		at91_port->clk = clk_get(&pdev->dev, "usart");
+		clk_enable(at91_port->clk);
+		port->uartclk = clk_get_rate(at91_port->clk);
+	}
 }
 
-void __init at91_register_uart_fns(struct at91rm9200_port_fns *fns)
+/*
+ * Register board-specific modem-control line handlers.
+ */
+void __init at91_register_uart_fns(struct at91_port_fns *fns)
 {
 	if (fns->enable_ms)
 		at91_pops.enable_ms = fns->enable_ms;
@@ -664,51 +724,6 @@ void __init at91_register_uart_fns(struct at91rm9200_port_fns *fns)
 	at91_pops.set_wake = fns->set_wake;
 }
 
-/*
- * Setup ports.
- */
-void __init at91_register_uart(int idx, int port)
-{
-	if ((idx < 0) || (idx >= AT91_NR_UART)) {
-		printk(KERN_ERR "%s: bad index number %d\n", __FUNCTION__, idx);
-		return;
-	}
-
-	switch (port) {
-	case 0:
-		at91_ports[idx].membase = (void __iomem *) AT91_VA_BASE_US0;
-		at91_ports[idx].mapbase = AT91_VA_BASE_US0;
-		at91_ports[idx].irq     = AT91_ID_US0;
-		AT91_CfgPIO_USART0();
-		break;
-	case 1:
-		at91_ports[idx].membase = (void __iomem *) AT91_VA_BASE_US1;
-		at91_ports[idx].mapbase = AT91_VA_BASE_US1;
-		at91_ports[idx].irq     = AT91_ID_US1;
-		AT91_CfgPIO_USART1();
-		break;
-	case 2:
-		at91_ports[idx].membase = (void __iomem *) AT91_VA_BASE_US2;
-		at91_ports[idx].mapbase = AT91_VA_BASE_US2;
-		at91_ports[idx].irq     = AT91_ID_US2;
-		AT91_CfgPIO_USART2();
-		break;
-	case 3:
-		at91_ports[idx].membase = (void __iomem *) AT91_VA_BASE_US3;
-		at91_ports[idx].mapbase = AT91_VA_BASE_US3;
-		at91_ports[idx].irq     = AT91_ID_US3;
-		AT91_CfgPIO_USART3();
-		break;
-	case 4:
-		at91_ports[idx].membase = (void __iomem *) AT91_VA_BASE_DBGU;
-		at91_ports[idx].mapbase = AT91_VA_BASE_DBGU;
-		at91_ports[idx].irq     = AT91_ID_SYS;
-		AT91_CfgPIO_DBGU();
-		break;
-	default:
-		printk(KERN_ERR  "%s : bad port number %d\n", __FUNCTION__, port);
-	}
-}
 
 #ifdef CONFIG_SERIAL_AT91_CONSOLE
 static void at91_console_putchar(struct uart_port *port, int ch)
@@ -723,7 +738,7 @@ static void at91_console_putchar(struct uart_port *port, int ch)
  */
 static void at91_console_write(struct console *co, const char *s, u_int count)
 {
-	struct uart_port *port = at91_ports + co->index;
+	struct uart_port *port = &at91_ports[co->index].uart;
 	unsigned int status, imr;
 
 	/*
@@ -778,23 +793,15 @@ static void __init at91_console_get_options(struct uart_port *port, int *baud, i
 
 static int __init at91_console_setup(struct console *co, char *options)
 {
-	struct uart_port *port;
+	struct uart_port *port = &at91_ports[co->index].uart;
 	int baud = 115200;
 	int bits = 8;
 	int parity = 'n';
 	int flow = 'n';
 
-	/*
-	 * Check whether an invalid uart number has been specified, and
-	 * if so, search for the first available port that does have
-	 * console support.
-	 */
-	port = uart_get_console(at91_ports, AT91_NR_UART, co);
+	if (port->membase == 0)		/* Port not initialized yet - delay setup */
+		return -ENODEV;
 
-	/*
-	 * Enable the serial console, in-case bootloader did not do it.
-	 */
-	at91_sys_write(AT91_PMC_PCER, 1 << port->irq);	/* enable clock */
 	UART_PUT_IDR(port, -1);				/* disable interrupts */
 	UART_PUT_CR(port, AT91_US_RSTSTA | AT91_US_RSTRX);
 	UART_PUT_CR(port, AT91_US_TXEN | AT91_US_RXEN);
@@ -821,23 +828,40 @@ static struct console at91_console = {
 
 #define AT91_CONSOLE_DEVICE	&at91_console
 
-static int  __init at91_console_init(void)
+/*
+ * Early console initialization (before VM subsystem initialized).
+ */
+static int __init at91_console_init(void)
 {
-	at91_init_ports();
+	if (at91_default_console_device) {
+		add_preferred_console(AT91_DEVICENAME, at91_default_console_device->id, NULL);
+		at91_init_port(&(at91_ports[at91_default_console_device->id]), at91_default_console_device);
+		register_console(&at91_console);
+	}
 
-	at91_console.index = at91_console_port;
-	register_console(&at91_console);
 	return 0;
 }
 console_initcall(at91_console_init);
 
+/*
+ * Late console initialization.
+ */
+static int __init at91_late_console_init(void)
+{
+	if (at91_default_console_device && !(at91_console.flags & CON_ENABLED))
+		register_console(&at91_console);
+
+	return 0;
+}
+core_initcall(at91_late_console_init);
+
 #else
 #define AT91_CONSOLE_DEVICE	NULL
 #endif
 
 static struct uart_driver at91_uart = {
 	.owner			= THIS_MODULE,
-	.driver_name		= AT91_DEVICENAME,
+	.driver_name		= "at91_serial",
 	.dev_name		= AT91_DEVICENAME,
 	.devfs_name		= AT91_DEVICENAME,
 	.major			= SERIAL_AT91_MAJOR,
@@ -846,33 +870,106 @@ static struct uart_driver at91_uart = {
 	.cons			= AT91_CONSOLE_DEVICE,
 };
 
-static int __init at91_serial_init(void)
+#ifdef CONFIG_PM
+static int at91_serial_suspend(struct platform_device *pdev, pm_message_t state)
 {
-	int ret, i;
+	struct uart_port *port = platform_get_drvdata(pdev);
+	struct at91_uart_port *at91_port = (struct at91_uart_port *) port;
+
+	if (device_may_wakeup(&pdev->dev) && !at91_suspend_entering_slow_clock())
+		enable_irq_wake(port->irq);
+	else {
+		disable_irq_wake(port->irq);
+		uart_suspend_port(&at91_uart, port);
+		at91_port->suspended = 1;
+	}
 
-	at91_init_ports();
+	return 0;
+}
 
-	ret = uart_register_driver(&at91_uart);
-	if (ret)
-		return ret;
+static int at91_serial_resume(struct platform_device *pdev)
+{
+	struct uart_port *port = platform_get_drvdata(pdev);
+	struct at91_uart_port *at91_port = (struct at91_uart_port *) port;
 
-	for (i = 0; i < AT91_NR_UART; i++) {
-		if (at91_serial_map[i] >= 0)
-			uart_add_one_port(&at91_uart, &at91_ports[i]);
+	if (at91_port->suspended) {
+		uart_resume_port(&at91_uart, port);
+		at91_port->suspended = 0;
 	}
 
 	return 0;
 }
+#else
+#define at91_serial_suspend NULL
+#define at91_serial_resume NULL
+#endif
 
-static void __exit at91_serial_exit(void)
+static int __devinit at91_serial_probe(struct platform_device *pdev)
 {
-	int i;
+	struct at91_uart_port *port;
+	int ret;
 
-	for (i = 0; i < AT91_NR_UART; i++) {
- 		if (at91_serial_map[i] >= 0)
-			uart_remove_one_port(&at91_uart, &at91_ports[i]);
-  	}
+	port = &at91_ports[pdev->id];
+	at91_init_port(port, pdev);
 
+	ret = uart_add_one_port(&at91_uart, &port->uart);
+	if (!ret) {
+		device_init_wakeup(&pdev->dev, 1);
+		platform_set_drvdata(pdev, port);
+	}
+
+	return ret;
+}
+
+static int __devexit at91_serial_remove(struct platform_device *pdev)
+{
+	struct uart_port *port = platform_get_drvdata(pdev);
+	struct at91_uart_port *at91_port = (struct at91_uart_port *) port;
+	int ret = 0;
+
+	clk_disable(at91_port->clk);
+	clk_put(at91_port->clk);
+
+	device_init_wakeup(&pdev->dev, 0);
+	platform_set_drvdata(pdev, NULL);
+
+	if (port) {
+		ret = uart_remove_one_port(&at91_uart, port);
+		kfree(port);
+	}
+
+	return ret;
+}
+
+static struct platform_driver at91_serial_driver = {
+	.probe		= at91_serial_probe,
+	.remove		= __devexit_p(at91_serial_remove),
+	.suspend	= at91_serial_suspend,
+	.resume		= at91_serial_resume,
+	.driver		= {
+		.name	= "at91_usart",
+		.owner	= THIS_MODULE,
+	},
+};
+
+static int __init at91_serial_init(void)
+{
+	int ret;
+
+	ret = uart_register_driver(&at91_uart);
+	if (ret)
+		return ret;
+
+	ret = platform_driver_register(&at91_serial_driver);
+	if (ret)
+		uart_unregister_driver(&at91_uart);
+
+	return ret;
+}
+
+static void __exit at91_serial_exit(void)
+{
+	platform_driver_unregister(&at91_serial_driver);
 	uart_unregister_driver(&at91_uart);
 }
 
diff --git a/include/asm-arm/mach/serial_at91.h b/include/asm-arm/mach/serial_at91.h
new file mode 100644
index 000000000000..18bc4e00e475
--- /dev/null
+++ b/include/asm-arm/mach/serial_at91.h
@@ -0,0 +1,34 @@
+/*
+ *  linux/include/asm-arm/mach/serial_at91.h
+ *
+ *  Based on serial_sa1100.h  by Nicolas Pitre
+ *
+ *  Copyright (C) 2002 ATMEL Rousset
+ *
+ *  Low level machine dependent UART functions.
+ */
+#include <linux/config.h>
+
+struct uart_port;
+
+/*
+ * This is a temporary structure for registering these
+ * functions; it is intended to be discarded after boot.
+ */
+struct at91_port_fns {
+	void	(*set_mctrl)(struct uart_port *, u_int);
+	u_int	(*get_mctrl)(struct uart_port *);
+	void	(*enable_ms)(struct uart_port *);
+	void	(*pm)(struct uart_port *, u_int, u_int);
+	int	(*set_wake)(struct uart_port *, u_int);
+	int	(*open)(struct uart_port *);
+	void	(*close)(struct uart_port *);
+};
+
+#if defined(CONFIG_SERIAL_AT91)
+void at91_register_uart_fns(struct at91_port_fns *fns);
+#else
+#define at91_register_uart_fns(fns) do { } while (0)
+#endif
+
+
diff --git a/include/asm-arm/mach/serial_at91rm9200.h b/include/asm-arm/mach/serial_at91rm9200.h
deleted file mode 100644
index 98f4b0cb883c..000000000000
--- a/include/asm-arm/mach/serial_at91rm9200.h
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- *  linux/include/asm-arm/mach/serial_at91rm9200.h
- *
- *  Based on serial_sa1100.h  by Nicolas Pitre
- *
- *  Copyright (C) 2002 ATMEL Rousset
- *
- *  Low level machine dependent UART functions.
- */
-#include <linux/config.h>
-
-struct uart_port;
-
-/*
- * This is a temporary structure for registering these
- * functions; it is intended to be discarded after boot.
- */
-struct at91rm9200_port_fns {
-	void	(*set_mctrl)(struct uart_port *, u_int);
-	u_int	(*get_mctrl)(struct uart_port *);
-	void	(*enable_ms)(struct uart_port *);
-	void	(*pm)(struct uart_port *, u_int, u_int);
-	int	(*set_wake)(struct uart_port *, u_int);
-	int	(*open)(struct uart_port *);
-	void	(*close)(struct uart_port *);
-};
-
-#if defined(CONFIG_SERIAL_AT91)
-void at91_register_uart_fns(struct at91rm9200_port_fns *fns);
-void at91_register_uart(int idx, int port);
-#else
-#define at91_register_uart_fns(fns) do { } while (0)
-#define at91_register_uart(idx,port) do { } while (0)
-#endif
-
-
diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h
index bd14858121ea..56c2a1db4a90 100644
--- a/include/linux/serial_core.h
+++ b/include/linux/serial_core.h
@@ -67,8 +67,8 @@
 /* Parisc type numbers. */
 #define PORT_MUX	48
 
-/* Atmel AT91RM9200 SoC */
-#define PORT_AT91RM9200 49
+/* Atmel AT91xxx SoC */
+#define PORT_AT91	49
 
 /* Macintosh Zilog type numbers */
 #define PORT_MAC_ZILOG	50	/* m68k : not yet implemented */
-- 
cgit v1.2.3


From 48d83325b61043e3bbd24dd37b9fe433744cf330 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Mon, 19 Jun 2006 23:57:59 -0700
Subject: [NET]: Prevent multiple qdisc runs

Having two or more qdisc_run's contend against each other is bad because
it can induce packet reordering if the packets have to be requeued.  It
appears that this is an unintended consequence of relinquinshing the queue
lock while transmitting.  That in turn is needed for devices that spend a
lot of time in their transmit routine.

There are no advantages to be had as devices with queues are inherently
single-threaded (the loopback device is not but then it doesn't have a
queue).

Even if you were to add a queue to a parallel virtual device (e.g., bolt
a tbf filter in front of an ipip tunnel device), you would still want to
process the queue in sequence to ensure that the packets are ordered
correctly.

The solution here is to steal a bit from net_device to prevent this.

BTW, as qdisc_restart is no longer used by anyone as a module inside the
kernel (IIRC it used to with netif_wake_queue), I have not exported the
new __qdisc_run function.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |  1 +
 include/net/pkt_sched.h   |  7 ++++---
 net/sched/sch_generic.c   | 11 +++++++++--
 3 files changed, 14 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index e432b743dda2..39919c882a25 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -233,6 +233,7 @@ enum netdev_state_t
 	__LINK_STATE_RX_SCHED,
 	__LINK_STATE_LINKWATCH_PENDING,
 	__LINK_STATE_DORMANT,
+	__LINK_STATE_QDISC_RUNNING,
 };
 
 
diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h
index b94d1ad92c4d..75b5b9333fc7 100644
--- a/include/net/pkt_sched.h
+++ b/include/net/pkt_sched.h
@@ -218,12 +218,13 @@ extern struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r,
 		struct rtattr *tab);
 extern void qdisc_put_rtab(struct qdisc_rate_table *tab);
 
-extern int qdisc_restart(struct net_device *dev);
+extern void __qdisc_run(struct net_device *dev);
 
 static inline void qdisc_run(struct net_device *dev)
 {
-	while (!netif_queue_stopped(dev) && qdisc_restart(dev) < 0)
-		/* NOTHING */;
+	if (!netif_queue_stopped(dev) &&
+	    !test_and_set_bit(__LINK_STATE_QDISC_RUNNING, &dev->state))
+		__qdisc_run(dev);
 }
 
 extern int tc_classify(struct sk_buff *skb, struct tcf_proto *tp,
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index b1e4c5e20ac7..d7aca8ef524a 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -90,7 +90,7 @@ void qdisc_unlock_tree(struct net_device *dev)
    NOTE: Called under dev->queue_lock with locally disabled BH.
 */
 
-int qdisc_restart(struct net_device *dev)
+static inline int qdisc_restart(struct net_device *dev)
 {
 	struct Qdisc *q = dev->qdisc;
 	struct sk_buff *skb;
@@ -179,6 +179,14 @@ requeue:
 	return q->q.qlen;
 }
 
+void __qdisc_run(struct net_device *dev)
+{
+	while (qdisc_restart(dev) < 0 && !netif_queue_stopped(dev))
+		/* NOTHING */;
+
+	clear_bit(__LINK_STATE_QDISC_RUNNING, &dev->state);
+}
+
 static void dev_watchdog(unsigned long arg)
 {
 	struct net_device *dev = (struct net_device *)arg;
@@ -620,6 +628,5 @@ EXPORT_SYMBOL(qdisc_create_dflt);
 EXPORT_SYMBOL(qdisc_alloc);
 EXPORT_SYMBOL(qdisc_destroy);
 EXPORT_SYMBOL(qdisc_reset);
-EXPORT_SYMBOL(qdisc_restart);
 EXPORT_SYMBOL(qdisc_lock_tree);
 EXPORT_SYMBOL(qdisc_unlock_tree);
-- 
cgit v1.2.3


From 90204e0b7b51e9f2a6905adca12dc331128602c7 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 1 Jun 2006 21:39:38 -0400
Subject: [PATCH] remove config.h from inotify.h

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/inotify.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/inotify.h b/include/linux/inotify.h
index 09e00433c78e..71aa1553ef38 100644
--- a/include/linux/inotify.h
+++ b/include/linux/inotify.h
@@ -67,7 +67,6 @@ struct inotify_event {
 
 #include <linux/dcache.h>
 #include <linux/fs.h>
-#include <linux/config.h>
 
 #ifdef CONFIG_INOTIFY
 
-- 
cgit v1.2.3


From 2d9048e201bfb67ba21f05e647b1286b8a4a5667 Mon Sep 17 00:00:00 2001
From: Amy Griffis <amy.griffis@hp.com>
Date: Thu, 1 Jun 2006 13:10:59 -0700
Subject: [PATCH] inotify (1/5): split kernel API from userspace support

The following series of patches introduces a kernel API for inotify,
making it possible for kernel modules to benefit from inotify's
mechanism for watching inodes.  With these patches, inotify will
maintain for each caller a list of watches (via an embedded struct
inotify_watch), where each inotify_watch is associated with a
corresponding struct inode.  The caller registers an event handler and
specifies for which filesystem events their event handler should be
called per inotify_watch.

Signed-off-by: Amy Griffis <amy.griffis@hp.com>
Acked-by: Robert Love <rml@novell.com>
Acked-by: John McCutchan <john@johnmccutchan.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/Kconfig              |  24 +-
 fs/Makefile             |   1 +
 fs/inotify.c            | 941 ++++++++++++------------------------------------
 fs/inotify_user.c       | 717 ++++++++++++++++++++++++++++++++++++
 include/linux/inotify.h |  76 ++++
 include/linux/sched.h   |   2 +-
 kernel/sysctl.c         |   4 +-
 kernel/user.c           |   2 +-
 8 files changed, 1046 insertions(+), 721 deletions(-)
 create mode 100644 fs/inotify_user.c

(limited to 'include/linux')

diff --git a/fs/Kconfig b/fs/Kconfig
index f9b5842c8d2d..74f11a23622d 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -393,18 +393,30 @@ config INOTIFY
 	bool "Inotify file change notification support"
 	default y
 	---help---
-	  Say Y here to enable inotify support and the associated system
-	  calls.  Inotify is a file change notification system and a
-	  replacement for dnotify.  Inotify fixes numerous shortcomings in
-	  dnotify and introduces several new features.  It allows monitoring
-	  of both files and directories via a single open fd.  Other features
-	  include multiple file events, one-shot support, and unmount
+	  Say Y here to enable inotify support.  Inotify is a file change
+	  notification system and a replacement for dnotify.  Inotify fixes
+	  numerous shortcomings in dnotify and introduces several new features
+	  including multiple file events, one-shot support, and unmount
 	  notification.
 
 	  For more information, see Documentation/filesystems/inotify.txt
 
 	  If unsure, say Y.
 
+config INOTIFY_USER
+	bool "Inotify support for userspace"
+	depends on INOTIFY
+	default y
+	---help---
+	  Say Y here to enable inotify support for userspace, including the
+	  associated system calls.  Inotify allows monitoring of both files and
+	  directories via a single open fd.  Events are read from the file
+	  descriptor, which is also select()- and poll()-able.
+
+	  For more information, see Documentation/filesystems/inotify.txt
+
+	  If unsure, say Y.
+
 config QUOTA
 	bool "Quota support"
 	help
diff --git a/fs/Makefile b/fs/Makefile
index 078d3d1191a5..d0ea6bfccf29 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -13,6 +13,7 @@ obj-y :=	open.o read_write.o file_table.o buffer.o  bio.o super.o \
 		ioprio.o pnode.o drop_caches.o splice.o sync.o
 
 obj-$(CONFIG_INOTIFY)		+= inotify.o
+obj-$(CONFIG_INOTIFY_USER)	+= inotify_user.o
 obj-$(CONFIG_EPOLL)		+= eventpoll.o
 obj-$(CONFIG_COMPAT)		+= compat.o compat_ioctl.o
 
diff --git a/fs/inotify.c b/fs/inotify.c
index 732ec4bd5774..a1bedf3975ca 100644
--- a/fs/inotify.c
+++ b/fs/inotify.c
@@ -5,7 +5,10 @@
  *	John McCutchan	<ttb@tentacle.dhs.org>
  *	Robert Love	<rml@novell.com>
  *
+ * Kernel API added by: Amy Griffis <amy.griffis@hp.com>
+ *
  * Copyright (C) 2005 John McCutchan
+ * Copyright 2006 Hewlett-Packard Development Company, L.P.
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of the GNU General Public License as published by the
@@ -20,35 +23,17 @@
 
 #include <linux/module.h>
 #include <linux/kernel.h>
-#include <linux/sched.h>
 #include <linux/spinlock.h>
 #include <linux/idr.h>
 #include <linux/slab.h>
 #include <linux/fs.h>
-#include <linux/file.h>
-#include <linux/mount.h>
-#include <linux/namei.h>
-#include <linux/poll.h>
 #include <linux/init.h>
 #include <linux/list.h>
 #include <linux/writeback.h>
 #include <linux/inotify.h>
-#include <linux/syscalls.h>
-
-#include <asm/ioctls.h>
 
 static atomic_t inotify_cookie;
 
-static kmem_cache_t *watch_cachep __read_mostly;
-static kmem_cache_t *event_cachep __read_mostly;
-
-static struct vfsmount *inotify_mnt __read_mostly;
-
-/* these are configurable via /proc/sys/fs/inotify/ */
-int inotify_max_user_instances __read_mostly;
-int inotify_max_user_watches __read_mostly;
-int inotify_max_queued_events __read_mostly;
-
 /*
  * Lock ordering:
  *
@@ -56,327 +41,108 @@ int inotify_max_queued_events __read_mostly;
  * iprune_mutex (synchronize shrink_icache_memory())
  * 	inode_lock (protects the super_block->s_inodes list)
  * 	inode->inotify_mutex (protects inode->inotify_watches and watches->i_list)
- * 		inotify_dev->mutex (protects inotify_device and watches->d_list)
+ * 		inotify_handle->mutex (protects inotify_handle and watches->h_list)
+ *
+ * The inode->inotify_mutex and inotify_handle->mutex and held during execution
+ * of a caller's event handler.  Thus, the caller must not hold any locks
+ * taken in their event handler while calling any of the published inotify
+ * interfaces.
  */
 
 /*
- * Lifetimes of the three main data structures--inotify_device, inode, and
+ * Lifetimes of the three main data structures--inotify_handle, inode, and
  * inotify_watch--are managed by reference count.
  *
- * inotify_device: Lifetime is from inotify_init() until release.  Additional
- * references can bump the count via get_inotify_dev() and drop the count via
- * put_inotify_dev().
+ * inotify_handle: Lifetime is from inotify_init() to inotify_destroy().
+ * Additional references can bump the count via get_inotify_handle() and drop
+ * the count via put_inotify_handle().
  *
- * inotify_watch: Lifetime is from create_watch() to destory_watch().
- * Additional references can bump the count via get_inotify_watch() and drop
- * the count via put_inotify_watch().
+ * inotify_watch: for inotify's purposes, lifetime is from inotify_add_watch()
+ * to remove_watch_no_event().  Additional references can bump the count via
+ * get_inotify_watch() and drop the count via put_inotify_watch().  The caller
+ * is reponsible for the final put after receiving IN_IGNORED, or when using
+ * IN_ONESHOT after receiving the first event.  Inotify does the final put if
+ * inotify_destroy() is called.
  *
  * inode: Pinned so long as the inode is associated with a watch, from
- * create_watch() to put_inotify_watch().
+ * inotify_add_watch() to the final put_inotify_watch().
  */
 
 /*
- * struct inotify_device - represents an inotify instance
+ * struct inotify_handle - represents an inotify instance
  *
  * This structure is protected by the mutex 'mutex'.
  */
-struct inotify_device {
-	wait_queue_head_t 	wq;		/* wait queue for i/o */
+struct inotify_handle {
 	struct idr		idr;		/* idr mapping wd -> watch */
 	struct mutex		mutex;		/* protects this bad boy */
-	struct list_head 	events;		/* list of queued events */
 	struct list_head	watches;	/* list of watches */
 	atomic_t		count;		/* reference count */
-	struct user_struct	*user;		/* user who opened this dev */
-	unsigned int		queue_size;	/* size of the queue (bytes) */
-	unsigned int		event_count;	/* number of pending events */
-	unsigned int		max_events;	/* maximum number of events */
 	u32			last_wd;	/* the last wd allocated */
+	const struct inotify_operations *in_ops; /* inotify caller operations */
 };
 
-/*
- * struct inotify_kernel_event - An inotify event, originating from a watch and
- * queued for user-space.  A list of these is attached to each instance of the
- * device.  In read(), this list is walked and all events that can fit in the
- * buffer are returned.
- *
- * Protected by dev->mutex of the device in which we are queued.
- */
-struct inotify_kernel_event {
-	struct inotify_event	event;	/* the user-space event */
-	struct list_head        list;	/* entry in inotify_device's list */
-	char			*name;	/* filename, if any */
-};
-
-/*
- * struct inotify_watch - represents a watch request on a specific inode
- *
- * d_list is protected by dev->mutex of the associated watch->dev.
- * i_list and mask are protected by inode->inotify_mutex of the associated inode.
- * dev, inode, and wd are never written to once the watch is created.
- */
-struct inotify_watch {
-	struct list_head	d_list;	/* entry in inotify_device's list */
-	struct list_head	i_list;	/* entry in inode's list */
-	atomic_t		count;	/* reference count */
-	struct inotify_device	*dev;	/* associated device */
-	struct inode		*inode;	/* associated inode */
-	s32 			wd;	/* watch descriptor */
-	u32			mask;	/* event mask for this watch */
-};
-
-#ifdef CONFIG_SYSCTL
-
-#include <linux/sysctl.h>
-
-static int zero;
-
-ctl_table inotify_table[] = {
-	{
-		.ctl_name	= INOTIFY_MAX_USER_INSTANCES,
-		.procname	= "max_user_instances",
-		.data		= &inotify_max_user_instances,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
-		.strategy	= &sysctl_intvec,
-		.extra1		= &zero,
-	},
-	{
-		.ctl_name	= INOTIFY_MAX_USER_WATCHES,
-		.procname	= "max_user_watches",
-		.data		= &inotify_max_user_watches,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
-		.strategy	= &sysctl_intvec,
-		.extra1		= &zero, 
-	},
-	{
-		.ctl_name	= INOTIFY_MAX_QUEUED_EVENTS,
-		.procname	= "max_queued_events",
-		.data		= &inotify_max_queued_events,
-		.maxlen		= sizeof(int),
-		.mode		= 0644, 
-		.proc_handler	= &proc_dointvec_minmax,
-		.strategy	= &sysctl_intvec, 
-		.extra1		= &zero
-	},
-	{ .ctl_name = 0 }
-};
-#endif /* CONFIG_SYSCTL */
-
-static inline void get_inotify_dev(struct inotify_device *dev)
+static inline void get_inotify_handle(struct inotify_handle *ih)
 {
-	atomic_inc(&dev->count);
+	atomic_inc(&ih->count);
 }
 
-static inline void put_inotify_dev(struct inotify_device *dev)
+static inline void put_inotify_handle(struct inotify_handle *ih)
 {
-	if (atomic_dec_and_test(&dev->count)) {
-		atomic_dec(&dev->user->inotify_devs);
-		free_uid(dev->user);
-		idr_destroy(&dev->idr);
-		kfree(dev);
+	if (atomic_dec_and_test(&ih->count)) {
+		idr_destroy(&ih->idr);
+		kfree(ih);
 	}
 }
 
-static inline void get_inotify_watch(struct inotify_watch *watch)
+/**
+ * get_inotify_watch - grab a reference to an inotify_watch
+ * @watch: watch to grab
+ */
+void get_inotify_watch(struct inotify_watch *watch)
 {
 	atomic_inc(&watch->count);
 }
+EXPORT_SYMBOL_GPL(get_inotify_watch);
 
-/*
+/**
  * put_inotify_watch - decrements the ref count on a given watch.  cleans up
- * the watch and its references if the count reaches zero.
+ * watch references if the count reaches zero.  inotify_watch is freed by
+ * inotify callers via the destroy_watch() op.
+ * @watch: watch to release
  */
-static inline void put_inotify_watch(struct inotify_watch *watch)
+void put_inotify_watch(struct inotify_watch *watch)
 {
 	if (atomic_dec_and_test(&watch->count)) {
-		put_inotify_dev(watch->dev);
-		iput(watch->inode);
-		kmem_cache_free(watch_cachep, watch);
-	}
-}
-
-/*
- * kernel_event - create a new kernel event with the given parameters
- *
- * This function can sleep.
- */
-static struct inotify_kernel_event * kernel_event(s32 wd, u32 mask, u32 cookie,
-						  const char *name)
-{
-	struct inotify_kernel_event *kevent;
-
-	kevent = kmem_cache_alloc(event_cachep, GFP_KERNEL);
-	if (unlikely(!kevent))
-		return NULL;
-
-	/* we hand this out to user-space, so zero it just in case */
-	memset(&kevent->event, 0, sizeof(struct inotify_event));
-
-	kevent->event.wd = wd;
-	kevent->event.mask = mask;
-	kevent->event.cookie = cookie;
-
-	INIT_LIST_HEAD(&kevent->list);
-
-	if (name) {
-		size_t len, rem, event_size = sizeof(struct inotify_event);
-
-		/*
-		 * We need to pad the filename so as to properly align an
-		 * array of inotify_event structures.  Because the structure is
-		 * small and the common case is a small filename, we just round
-		 * up to the next multiple of the structure's sizeof.  This is
-		 * simple and safe for all architectures.
-		 */
-		len = strlen(name) + 1;
-		rem = event_size - len;
-		if (len > event_size) {
-			rem = event_size - (len % event_size);
-			if (len % event_size == 0)
-				rem = 0;
-		}
-
-		kevent->name = kmalloc(len + rem, GFP_KERNEL);
-		if (unlikely(!kevent->name)) {
-			kmem_cache_free(event_cachep, kevent);
-			return NULL;
-		}
-		memcpy(kevent->name, name, len);
-		if (rem)
-			memset(kevent->name + len, 0, rem);		
-		kevent->event.len = len + rem;
-	} else {
-		kevent->event.len = 0;
-		kevent->name = NULL;
-	}
-
-	return kevent;
-}
-
-/*
- * inotify_dev_get_event - return the next event in the given dev's queue
- *
- * Caller must hold dev->mutex.
- */
-static inline struct inotify_kernel_event *
-inotify_dev_get_event(struct inotify_device *dev)
-{
-	return list_entry(dev->events.next, struct inotify_kernel_event, list);
-}
-
-/*
- * inotify_dev_queue_event - add a new event to the given device
- *
- * Caller must hold dev->mutex.  Can sleep (calls kernel_event()).
- */
-static void inotify_dev_queue_event(struct inotify_device *dev,
-				    struct inotify_watch *watch, u32 mask,
-				    u32 cookie, const char *name)
-{
-	struct inotify_kernel_event *kevent, *last;
-
-	/* coalescing: drop this event if it is a dupe of the previous */
-	last = inotify_dev_get_event(dev);
-	if (last && last->event.mask == mask && last->event.wd == watch->wd &&
-			last->event.cookie == cookie) {
-		const char *lastname = last->name;
-
-		if (!name && !lastname)
-			return;
-		if (name && lastname && !strcmp(lastname, name))
-			return;
-	}
-
-	/* the queue overflowed and we already sent the Q_OVERFLOW event */
-	if (unlikely(dev->event_count > dev->max_events))
-		return;
-
-	/* if the queue overflows, we need to notify user space */
-	if (unlikely(dev->event_count == dev->max_events))
-		kevent = kernel_event(-1, IN_Q_OVERFLOW, cookie, NULL);
-	else
-		kevent = kernel_event(watch->wd, mask, cookie, name);
-
-	if (unlikely(!kevent))
-		return;
-
-	/* queue the event and wake up anyone waiting */
-	dev->event_count++;
-	dev->queue_size += sizeof(struct inotify_event) + kevent->event.len;
-	list_add_tail(&kevent->list, &dev->events);
-	wake_up_interruptible(&dev->wq);
-}
-
-/*
- * remove_kevent - cleans up and ultimately frees the given kevent
- *
- * Caller must hold dev->mutex.
- */
-static void remove_kevent(struct inotify_device *dev,
-			  struct inotify_kernel_event *kevent)
-{
-	list_del(&kevent->list);
+		struct inotify_handle *ih = watch->ih;
 
-	dev->event_count--;
-	dev->queue_size -= sizeof(struct inotify_event) + kevent->event.len;
-
-	kfree(kevent->name);
-	kmem_cache_free(event_cachep, kevent);
-}
-
-/*
- * inotify_dev_event_dequeue - destroy an event on the given device
- *
- * Caller must hold dev->mutex.
- */
-static void inotify_dev_event_dequeue(struct inotify_device *dev)
-{
-	if (!list_empty(&dev->events)) {
-		struct inotify_kernel_event *kevent;
-		kevent = inotify_dev_get_event(dev);
-		remove_kevent(dev, kevent);
+		iput(watch->inode);
+		ih->in_ops->destroy_watch(watch);
+		put_inotify_handle(ih);
 	}
 }
+EXPORT_SYMBOL_GPL(put_inotify_watch);
 
 /*
- * inotify_dev_get_wd - returns the next WD for use by the given dev
+ * inotify_handle_get_wd - returns the next WD for use by the given handle
  *
- * Callers must hold dev->mutex.  This function can sleep.
+ * Callers must hold ih->mutex.  This function can sleep.
  */
-static int inotify_dev_get_wd(struct inotify_device *dev,
-			      struct inotify_watch *watch)
+static int inotify_handle_get_wd(struct inotify_handle *ih,
+				 struct inotify_watch *watch)
 {
 	int ret;
 
 	do {
-		if (unlikely(!idr_pre_get(&dev->idr, GFP_KERNEL)))
+		if (unlikely(!idr_pre_get(&ih->idr, GFP_KERNEL)))
 			return -ENOSPC;
-		ret = idr_get_new_above(&dev->idr, watch, dev->last_wd+1, &watch->wd);
+		ret = idr_get_new_above(&ih->idr, watch, ih->last_wd+1, &watch->wd);
 	} while (ret == -EAGAIN);
 
-	return ret;
-}
+	if (likely(!ret))
+		ih->last_wd = watch->wd;
 
-/*
- * find_inode - resolve a user-given path to a specific inode and return a nd
- */
-static int find_inode(const char __user *dirname, struct nameidata *nd,
-		      unsigned flags)
-{
-	int error;
-
-	error = __user_walk(dirname, flags, nd);
-	if (error)
-		return error;
-	/* you can only watch an inode if you have read permissions on it */
-	error = vfs_permission(nd, MAY_READ);
-	if (error) 
-		path_release(nd);
-	return error;
+	return ret;
 }
 
 /*
@@ -422,67 +188,18 @@ static void set_dentry_child_flags(struct inode *inode, int watched)
 }
 
 /*
- * create_watch - creates a watch on the given device.
- *
- * Callers must hold dev->mutex.  Calls inotify_dev_get_wd() so may sleep.
- * Both 'dev' and 'inode' (by way of nameidata) need to be pinned.
- */
-static struct inotify_watch *create_watch(struct inotify_device *dev,
-					  u32 mask, struct inode *inode)
-{
-	struct inotify_watch *watch;
-	int ret;
-
-	if (atomic_read(&dev->user->inotify_watches) >=
-			inotify_max_user_watches)
-		return ERR_PTR(-ENOSPC);
-
-	watch = kmem_cache_alloc(watch_cachep, GFP_KERNEL);
-	if (unlikely(!watch))
-		return ERR_PTR(-ENOMEM);
-
-	ret = inotify_dev_get_wd(dev, watch);
-	if (unlikely(ret)) {
-		kmem_cache_free(watch_cachep, watch);
-		return ERR_PTR(ret);
-	}
-
-	dev->last_wd = watch->wd;
-	watch->mask = mask;
-	atomic_set(&watch->count, 0);
-	INIT_LIST_HEAD(&watch->d_list);
-	INIT_LIST_HEAD(&watch->i_list);
-
-	/* save a reference to device and bump the count to make it official */
-	get_inotify_dev(dev);
-	watch->dev = dev;
-
-	/*
-	 * Save a reference to the inode and bump the ref count to make it
-	 * official.  We hold a reference to nameidata, which makes this safe.
-	 */
-	watch->inode = igrab(inode);
-
-	/* bump our own count, corresponding to our entry in dev->watches */
-	get_inotify_watch(watch);
-
-	atomic_inc(&dev->user->inotify_watches);
-
-	return watch;
-}
-
-/*
- * inotify_find_dev - find the watch associated with the given inode and dev
+ * inotify_find_handle - find the watch associated with the given inode and
+ * handle
  *
  * Callers must hold inode->inotify_mutex.
  */
-static struct inotify_watch *inode_find_dev(struct inode *inode,
-					    struct inotify_device *dev)
+static struct inotify_watch *inode_find_handle(struct inode *inode,
+					       struct inotify_handle *ih)
 {
 	struct inotify_watch *watch;
 
 	list_for_each_entry(watch, &inode->inotify_watches, i_list) {
-		if (watch->dev == dev)
+		if (watch->ih == ih)
 			return watch;
 	}
 
@@ -491,39 +208,34 @@ static struct inotify_watch *inode_find_dev(struct inode *inode,
 
 /*
  * remove_watch_no_event - remove_watch() without the IN_IGNORED event.
+ *
+ * Callers must hold both inode->inotify_mutex and ih->mutex.
  */
 static void remove_watch_no_event(struct inotify_watch *watch,
-				  struct inotify_device *dev)
+				  struct inotify_handle *ih)
 {
 	list_del(&watch->i_list);
-	list_del(&watch->d_list);
+	list_del(&watch->h_list);
 
 	if (!inotify_inode_watched(watch->inode))
 		set_dentry_child_flags(watch->inode, 0);
 
-	atomic_dec(&dev->user->inotify_watches);
-	idr_remove(&dev->idr, watch->wd);
-	put_inotify_watch(watch);
+	idr_remove(&ih->idr, watch->wd);
 }
 
 /*
- * remove_watch - Remove a watch from both the device and the inode.  Sends
- * the IN_IGNORED event to the given device signifying that the inode is no
- * longer watched.
- *
- * Callers must hold both inode->inotify_mutex and dev->mutex.  We drop a
- * reference to the inode before returning.
+ * remove_watch - Remove a watch from both the handle and the inode.  Sends
+ * the IN_IGNORED event signifying that the inode is no longer watched.
  *
- * The inode is not iput() so as to remain atomic.  If the inode needs to be
- * iput(), the call returns one.  Otherwise, it returns zero.
+ * Callers must hold both inode->inotify_mutex and ih->mutex.
  */
-static void remove_watch(struct inotify_watch *watch,struct inotify_device *dev)
+static void remove_watch(struct inotify_watch *watch, struct inotify_handle *ih)
 {
-	inotify_dev_queue_event(dev, watch, IN_IGNORED, 0, NULL);
-	remove_watch_no_event(watch, dev);
+	remove_watch_no_event(watch, ih);
+	ih->in_ops->handle_event(watch, watch->wd, IN_IGNORED, 0, NULL);
 }
 
-/* Kernel API */
+/* Kernel API for producing events */
 
 /*
  * inotify_d_instantiate - instantiate dcache entry for inode
@@ -576,14 +288,12 @@ void inotify_inode_queue_event(struct inode *inode, u32 mask, u32 cookie,
 	list_for_each_entry_safe(watch, next, &inode->inotify_watches, i_list) {
 		u32 watch_mask = watch->mask;
 		if (watch_mask & mask) {
-			struct inotify_device *dev = watch->dev;
-			get_inotify_watch(watch);
-			mutex_lock(&dev->mutex);
-			inotify_dev_queue_event(dev, watch, mask, cookie, name);
+			struct inotify_handle *ih= watch->ih;
+			mutex_lock(&ih->mutex);
 			if (watch_mask & IN_ONESHOT)
-				remove_watch_no_event(watch, dev);
-			mutex_unlock(&dev->mutex);
-			put_inotify_watch(watch);
+				remove_watch_no_event(watch, ih);
+			ih->in_ops->handle_event(watch, watch->wd, mask, cookie, name);
+			mutex_unlock(&ih->mutex);
 		}
 	}
 	mutex_unlock(&inode->inotify_mutex);
@@ -694,11 +404,12 @@ void inotify_unmount_inodes(struct list_head *list)
 		mutex_lock(&inode->inotify_mutex);
 		watches = &inode->inotify_watches;
 		list_for_each_entry_safe(watch, next_w, watches, i_list) {
-			struct inotify_device *dev = watch->dev;
-			mutex_lock(&dev->mutex);
-			inotify_dev_queue_event(dev, watch, IN_UNMOUNT,0,NULL);
-			remove_watch(watch, dev);
-			mutex_unlock(&dev->mutex);
+			struct inotify_handle *ih= watch->ih;
+			mutex_lock(&ih->mutex);
+			ih->in_ops->handle_event(watch, watch->wd, IN_UNMOUNT, 0,
+						 NULL);
+			remove_watch(watch, ih);
+			mutex_unlock(&ih->mutex);
 		}
 		mutex_unlock(&inode->inotify_mutex);
 		iput(inode);		
@@ -718,432 +429,240 @@ void inotify_inode_is_dead(struct inode *inode)
 
 	mutex_lock(&inode->inotify_mutex);
 	list_for_each_entry_safe(watch, next, &inode->inotify_watches, i_list) {
-		struct inotify_device *dev = watch->dev;
-		mutex_lock(&dev->mutex);
-		remove_watch(watch, dev);
-		mutex_unlock(&dev->mutex);
+		struct inotify_handle *ih = watch->ih;
+		mutex_lock(&ih->mutex);
+		remove_watch(watch, ih);
+		mutex_unlock(&ih->mutex);
 	}
 	mutex_unlock(&inode->inotify_mutex);
 }
 EXPORT_SYMBOL_GPL(inotify_inode_is_dead);
 
-/* Device Interface */
-
-static unsigned int inotify_poll(struct file *file, poll_table *wait)
-{
-	struct inotify_device *dev = file->private_data;
-	int ret = 0;
-
-	poll_wait(file, &dev->wq, wait);
-	mutex_lock(&dev->mutex);
-	if (!list_empty(&dev->events))
-		ret = POLLIN | POLLRDNORM;
-	mutex_unlock(&dev->mutex);
-
-	return ret;
-}
+/* Kernel Consumer API */
 
-static ssize_t inotify_read(struct file *file, char __user *buf,
-			    size_t count, loff_t *pos)
+/**
+ * inotify_init - allocate and initialize an inotify instance
+ * @ops: caller's inotify operations
+ */
+struct inotify_handle *inotify_init(const struct inotify_operations *ops)
 {
-	size_t event_size = sizeof (struct inotify_event);
-	struct inotify_device *dev;
-	char __user *start;
-	int ret;
-	DEFINE_WAIT(wait);
-
-	start = buf;
-	dev = file->private_data;
-
-	while (1) {
-		int events;
-
-		prepare_to_wait(&dev->wq, &wait, TASK_INTERRUPTIBLE);
+	struct inotify_handle *ih;
 
-		mutex_lock(&dev->mutex);
-		events = !list_empty(&dev->events);
-		mutex_unlock(&dev->mutex);
-		if (events) {
-			ret = 0;
-			break;
-		}
-
-		if (file->f_flags & O_NONBLOCK) {
-			ret = -EAGAIN;
-			break;
-		}
-
-		if (signal_pending(current)) {
-			ret = -EINTR;
-			break;
-		}
-
-		schedule();
-	}
-
-	finish_wait(&dev->wq, &wait);
-	if (ret)
-		return ret;
-
-	mutex_lock(&dev->mutex);
-	while (1) {
-		struct inotify_kernel_event *kevent;
-
-		ret = buf - start;
-		if (list_empty(&dev->events))
-			break;
-
-		kevent = inotify_dev_get_event(dev);
-		if (event_size + kevent->event.len > count)
-			break;
-
-		if (copy_to_user(buf, &kevent->event, event_size)) {
-			ret = -EFAULT;
-			break;
-		}
-		buf += event_size;
-		count -= event_size;
-
-		if (kevent->name) {
-			if (copy_to_user(buf, kevent->name, kevent->event.len)){
-				ret = -EFAULT;
-				break;
-			}
-			buf += kevent->event.len;
-			count -= kevent->event.len;
-		}
+	ih = kmalloc(sizeof(struct inotify_handle), GFP_KERNEL);
+	if (unlikely(!ih))
+		return ERR_PTR(-ENOMEM);
 
-		remove_kevent(dev, kevent);
-	}
-	mutex_unlock(&dev->mutex);
+	idr_init(&ih->idr);
+	INIT_LIST_HEAD(&ih->watches);
+	mutex_init(&ih->mutex);
+	ih->last_wd = 0;
+	ih->in_ops = ops;
+	atomic_set(&ih->count, 0);
+	get_inotify_handle(ih);
 
-	return ret;
+	return ih;
 }
+EXPORT_SYMBOL_GPL(inotify_init);
 
-static int inotify_release(struct inode *ignored, struct file *file)
+/**
+ * inotify_destroy - clean up and destroy an inotify instance
+ * @ih: inotify handle
+ */
+void inotify_destroy(struct inotify_handle *ih)
 {
-	struct inotify_device *dev = file->private_data;
-
 	/*
-	 * Destroy all of the watches on this device.  Unfortunately, not very
+	 * Destroy all of the watches for this handle. Unfortunately, not very
 	 * pretty.  We cannot do a simple iteration over the list, because we
 	 * do not know the inode until we iterate to the watch.  But we need to
-	 * hold inode->inotify_mutex before dev->mutex.  The following works.
+	 * hold inode->inotify_mutex before ih->mutex.  The following works.
 	 */
 	while (1) {
 		struct inotify_watch *watch;
 		struct list_head *watches;
 		struct inode *inode;
 
-		mutex_lock(&dev->mutex);
-		watches = &dev->watches;
+		mutex_lock(&ih->mutex);
+		watches = &ih->watches;
 		if (list_empty(watches)) {
-			mutex_unlock(&dev->mutex);
+			mutex_unlock(&ih->mutex);
 			break;
 		}
-		watch = list_entry(watches->next, struct inotify_watch, d_list);
+		watch = list_entry(watches->next, struct inotify_watch, h_list);
 		get_inotify_watch(watch);
-		mutex_unlock(&dev->mutex);
+		mutex_unlock(&ih->mutex);
 
 		inode = watch->inode;
 		mutex_lock(&inode->inotify_mutex);
-		mutex_lock(&dev->mutex);
+		mutex_lock(&ih->mutex);
 
 		/* make sure we didn't race with another list removal */
-		if (likely(idr_find(&dev->idr, watch->wd)))
-			remove_watch_no_event(watch, dev);
+		if (likely(idr_find(&ih->idr, watch->wd))) {
+			remove_watch_no_event(watch, ih);
+			put_inotify_watch(watch);
+		}
 
-		mutex_unlock(&dev->mutex);
+		mutex_unlock(&ih->mutex);
 		mutex_unlock(&inode->inotify_mutex);
 		put_inotify_watch(watch);
 	}
 
-	/* destroy all of the events on this device */
-	mutex_lock(&dev->mutex);
-	while (!list_empty(&dev->events))
-		inotify_dev_event_dequeue(dev);
-	mutex_unlock(&dev->mutex);
-
-	/* free this device: the put matching the get in inotify_init() */
-	put_inotify_dev(dev);
-
-	return 0;
+	/* free this handle: the put matching the get in inotify_init() */
+	put_inotify_handle(ih);
 }
+EXPORT_SYMBOL_GPL(inotify_destroy);
 
-/*
- * inotify_ignore - remove a given wd from this inotify instance.
+/**
+ * inotify_find_update_watch - find and update the mask of an existing watch
+ * @ih: inotify handle
+ * @inode: inode's watch to update
+ * @mask: mask of events to watch
  *
- * Can sleep.
+ * Caller must pin given inode (via nameidata).
  */
-static int inotify_ignore(struct inotify_device *dev, s32 wd)
+s32 inotify_find_update_watch(struct inotify_handle *ih, struct inode *inode,
+			      u32 mask)
 {
-	struct inotify_watch *watch;
-	struct inode *inode;
+	struct inotify_watch *old;
+	int mask_add = 0;
+	int ret;
 
-	mutex_lock(&dev->mutex);
-	watch = idr_find(&dev->idr, wd);
-	if (unlikely(!watch)) {
-		mutex_unlock(&dev->mutex);
+	if (mask & IN_MASK_ADD)
+		mask_add = 1;
+
+	/* don't allow invalid bits: we don't want flags set */
+	mask &= IN_ALL_EVENTS | IN_ONESHOT;
+	if (unlikely(!mask))
 		return -EINVAL;
-	}
-	get_inotify_watch(watch);
-	inode = watch->inode;
-	mutex_unlock(&dev->mutex);
 
 	mutex_lock(&inode->inotify_mutex);
-	mutex_lock(&dev->mutex);
-
-	/* make sure that we did not race */
-	if (likely(idr_find(&dev->idr, wd) == watch))
-		remove_watch(watch, dev);
-
-	mutex_unlock(&dev->mutex);
-	mutex_unlock(&inode->inotify_mutex);
-	put_inotify_watch(watch);
-
-	return 0;
-}
-
-static long inotify_ioctl(struct file *file, unsigned int cmd,
-			  unsigned long arg)
-{
-	struct inotify_device *dev;
-	void __user *p;
-	int ret = -ENOTTY;
+	mutex_lock(&ih->mutex);
 
-	dev = file->private_data;
-	p = (void __user *) arg;
-
-	switch (cmd) {
-	case FIONREAD:
-		ret = put_user(dev->queue_size, (int __user *) p);
-		break;
-	}
-
-	return ret;
-}
-
-static const struct file_operations inotify_fops = {
-	.poll           = inotify_poll,
-	.read           = inotify_read,
-	.release        = inotify_release,
-	.unlocked_ioctl = inotify_ioctl,
-	.compat_ioctl	= inotify_ioctl,
-};
-
-asmlinkage long sys_inotify_init(void)
-{
-	struct inotify_device *dev;
-	struct user_struct *user;
-	struct file *filp;	
-	int fd, ret;
-
-	fd = get_unused_fd();
-	if (fd < 0)
-		return fd;
-
-	filp = get_empty_filp();
-	if (!filp) {
-		ret = -ENFILE;
-		goto out_put_fd;
-	}
-
-	user = get_uid(current->user);
-	if (unlikely(atomic_read(&user->inotify_devs) >=
-			inotify_max_user_instances)) {
-		ret = -EMFILE;
-		goto out_free_uid;
-	}
-
-	dev = kmalloc(sizeof(struct inotify_device), GFP_KERNEL);
-	if (unlikely(!dev)) {
-		ret = -ENOMEM;
-		goto out_free_uid;
+	/*
+	 * Handle the case of re-adding a watch on an (inode,ih) pair that we
+	 * are already watching.  We just update the mask and return its wd.
+	 */
+	old = inode_find_handle(inode, ih);
+	if (unlikely(!old)) {
+		ret = -ENOENT;
+		goto out;
 	}
 
-	filp->f_op = &inotify_fops;
-	filp->f_vfsmnt = mntget(inotify_mnt);
-	filp->f_dentry = dget(inotify_mnt->mnt_root);
-	filp->f_mapping = filp->f_dentry->d_inode->i_mapping;
-	filp->f_mode = FMODE_READ;
-	filp->f_flags = O_RDONLY;
-	filp->private_data = dev;
-
-	idr_init(&dev->idr);
-	INIT_LIST_HEAD(&dev->events);
-	INIT_LIST_HEAD(&dev->watches);
-	init_waitqueue_head(&dev->wq);
-	mutex_init(&dev->mutex);
-	dev->event_count = 0;
-	dev->queue_size = 0;
-	dev->max_events = inotify_max_queued_events;
-	dev->user = user;
-	dev->last_wd = 0;
-	atomic_set(&dev->count, 0);
-
-	get_inotify_dev(dev);
-	atomic_inc(&user->inotify_devs);
-	fd_install(fd, filp);
-
-	return fd;
-out_free_uid:
-	free_uid(user);
-	put_filp(filp);
-out_put_fd:
-	put_unused_fd(fd);
+	if (mask_add)
+		old->mask |= mask;
+	else
+		old->mask = mask;
+	ret = old->wd;
+out:
+	mutex_unlock(&ih->mutex);
+	mutex_unlock(&inode->inotify_mutex);
 	return ret;
 }
+EXPORT_SYMBOL_GPL(inotify_find_update_watch);
 
-asmlinkage long sys_inotify_add_watch(int fd, const char __user *path, u32 mask)
+/**
+ * inotify_add_watch - add a watch to an inotify instance
+ * @ih: inotify handle
+ * @watch: caller allocated watch structure
+ * @inode: inode to watch
+ * @mask: mask of events to watch
+ *
+ * Caller must pin given inode (via nameidata).
+ * Caller must ensure it only calls inotify_add_watch() once per watch.
+ * Calls inotify_handle_get_wd() so may sleep.
+ */
+s32 inotify_add_watch(struct inotify_handle *ih, struct inotify_watch *watch,
+		      struct inode *inode, u32 mask)
 {
-	struct inotify_watch *watch, *old;
-	struct inode *inode;
-	struct inotify_device *dev;
-	struct nameidata nd;
-	struct file *filp;
-	int ret, fput_needed;
-	int mask_add = 0;
-	unsigned flags = 0;
-
-	filp = fget_light(fd, &fput_needed);
-	if (unlikely(!filp))
-		return -EBADF;
+	int ret = 0;
 
-	/* verify that this is indeed an inotify instance */
-	if (unlikely(filp->f_op != &inotify_fops)) {
-		ret = -EINVAL;
-		goto fput_and_out;
-	}
+	/* don't allow invalid bits: we don't want flags set */
+	mask &= IN_ALL_EVENTS | IN_ONESHOT;
+	if (unlikely(!mask))
+		return -EINVAL;
+	watch->mask = mask;
 
-	if (!(mask & IN_DONT_FOLLOW))
-		flags |= LOOKUP_FOLLOW;
-	if (mask & IN_ONLYDIR)
-		flags |= LOOKUP_DIRECTORY;
+	mutex_lock(&inode->inotify_mutex);
+	mutex_lock(&ih->mutex);
 
-	ret = find_inode(path, &nd, flags);
+	/* Initialize a new watch */
+	ret = inotify_handle_get_wd(ih, watch);
 	if (unlikely(ret))
-		goto fput_and_out;
-
-	/* inode held in place by reference to nd; dev by fget on fd */
-	inode = nd.dentry->d_inode;
-	dev = filp->private_data;
-
-	mutex_lock(&inode->inotify_mutex);
-	mutex_lock(&dev->mutex);
+		goto out;
+	ret = watch->wd;
 
-	if (mask & IN_MASK_ADD)
-		mask_add = 1;
+	atomic_set(&watch->count, 0);
+	INIT_LIST_HEAD(&watch->h_list);
+	INIT_LIST_HEAD(&watch->i_list);
 
-	/* don't let user-space set invalid bits: we don't want flags set */
-	mask &= IN_ALL_EVENTS | IN_ONESHOT;
-	if (unlikely(!mask)) {
-		ret = -EINVAL;
-		goto out;
-	}
+	/* save a reference to handle and bump the count to make it official */
+	get_inotify_handle(ih);
+	watch->ih = ih;
 
 	/*
-	 * Handle the case of re-adding a watch on an (inode,dev) pair that we
-	 * are already watching.  We just update the mask and return its wd.
+	 * Save a reference to the inode and bump the ref count to make it
+	 * official.  We hold a reference to nameidata, which makes this safe.
 	 */
-	old = inode_find_dev(inode, dev);
-	if (unlikely(old)) {
-		if (mask_add)
-			old->mask |= mask;
-		else
-			old->mask = mask;
-		ret = old->wd;
-		goto out;
-	}
+	watch->inode = igrab(inode);
 
-	watch = create_watch(dev, mask, inode);
-	if (unlikely(IS_ERR(watch))) {
-		ret = PTR_ERR(watch);
-		goto out;
-	}
+	get_inotify_watch(watch); /* initial get */
 
 	if (!inotify_inode_watched(inode))
 		set_dentry_child_flags(inode, 1);
 
-	/* Add the watch to the device's and the inode's list */
-	list_add(&watch->d_list, &dev->watches);
+	/* Add the watch to the handle's and the inode's list */
+	list_add(&watch->h_list, &ih->watches);
 	list_add(&watch->i_list, &inode->inotify_watches);
-	ret = watch->wd;
 out:
-	mutex_unlock(&dev->mutex);
+	mutex_unlock(&ih->mutex);
 	mutex_unlock(&inode->inotify_mutex);
-	path_release(&nd);
-fput_and_out:
-	fput_light(filp, fput_needed);
 	return ret;
 }
+EXPORT_SYMBOL_GPL(inotify_add_watch);
 
-asmlinkage long sys_inotify_rm_watch(int fd, u32 wd)
+/**
+ * inotify_rm_wd - remove a watch from an inotify instance
+ * @ih: inotify handle
+ * @wd: watch descriptor to remove
+ *
+ * Can sleep.
+ */
+int inotify_rm_wd(struct inotify_handle *ih, u32 wd)
 {
-	struct file *filp;
-	struct inotify_device *dev;
-	int ret, fput_needed;
-
-	filp = fget_light(fd, &fput_needed);
-	if (unlikely(!filp))
-		return -EBADF;
+	struct inotify_watch *watch;
+	struct inode *inode;
 
-	/* verify that this is indeed an inotify instance */
-	if (unlikely(filp->f_op != &inotify_fops)) {
-		ret = -EINVAL;
-		goto out;
+	mutex_lock(&ih->mutex);
+	watch = idr_find(&ih->idr, wd);
+	if (unlikely(!watch)) {
+		mutex_unlock(&ih->mutex);
+		return -EINVAL;
 	}
+	get_inotify_watch(watch);
+	inode = watch->inode;
+	mutex_unlock(&ih->mutex);
 
-	dev = filp->private_data;
-	ret = inotify_ignore(dev, wd);
+	mutex_lock(&inode->inotify_mutex);
+	mutex_lock(&ih->mutex);
 
-out:
-	fput_light(filp, fput_needed);
-	return ret;
-}
+	/* make sure that we did not race */
+	if (likely(idr_find(&ih->idr, wd) == watch))
+		remove_watch(watch, ih);
 
-static struct super_block *
-inotify_get_sb(struct file_system_type *fs_type, int flags,
-	       const char *dev_name, void *data)
-{
-    return get_sb_pseudo(fs_type, "inotify", NULL, 0xBAD1DEA);
-}
+	mutex_unlock(&ih->mutex);
+	mutex_unlock(&inode->inotify_mutex);
+	put_inotify_watch(watch);
 
-static struct file_system_type inotify_fs_type = {
-    .name           = "inotifyfs",
-    .get_sb         = inotify_get_sb,
-    .kill_sb        = kill_anon_super,
-};
+	return 0;
+}
+EXPORT_SYMBOL_GPL(inotify_rm_wd);
 
 /*
- * inotify_setup - Our initialization function.  Note that we cannnot return
- * error because we have compiled-in VFS hooks.  So an (unlikely) failure here
- * must result in panic().
+ * inotify_setup - core initialization function
  */
 static int __init inotify_setup(void)
 {
-	int ret;
-
-	ret = register_filesystem(&inotify_fs_type);
-	if (unlikely(ret))
-		panic("inotify: register_filesystem returned %d!\n", ret);
-
-	inotify_mnt = kern_mount(&inotify_fs_type);
-	if (IS_ERR(inotify_mnt))
-		panic("inotify: kern_mount ret %ld!\n", PTR_ERR(inotify_mnt));
-
-	inotify_max_queued_events = 16384;
-	inotify_max_user_instances = 128;
-	inotify_max_user_watches = 8192;
-
 	atomic_set(&inotify_cookie, 0);
 
-	watch_cachep = kmem_cache_create("inotify_watch_cache",
-					 sizeof(struct inotify_watch),
-					 0, SLAB_PANIC, NULL, NULL);
-	event_cachep = kmem_cache_create("inotify_event_cache",
-					 sizeof(struct inotify_kernel_event),
-					 0, SLAB_PANIC, NULL, NULL);
-
 	return 0;
 }
 
diff --git a/fs/inotify_user.c b/fs/inotify_user.c
new file mode 100644
index 000000000000..845dc79a4e9c
--- /dev/null
+++ b/fs/inotify_user.c
@@ -0,0 +1,717 @@
+/*
+ * fs/inotify_user.c - inotify support for userspace
+ *
+ * Authors:
+ *	John McCutchan	<ttb@tentacle.dhs.org>
+ *	Robert Love	<rml@novell.com>
+ *
+ * Copyright (C) 2005 John McCutchan
+ * Copyright 2006 Hewlett-Packard Development Company, L.P.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/poll.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/inotify.h>
+#include <linux/syscalls.h>
+
+#include <asm/ioctls.h>
+
+static kmem_cache_t *watch_cachep __read_mostly;
+static kmem_cache_t *event_cachep __read_mostly;
+
+static struct vfsmount *inotify_mnt __read_mostly;
+
+/* these are configurable via /proc/sys/fs/inotify/ */
+int inotify_max_user_instances __read_mostly;
+int inotify_max_user_watches __read_mostly;
+int inotify_max_queued_events __read_mostly;
+
+/*
+ * Lock ordering:
+ *
+ * inotify_dev->up_mutex (ensures we don't re-add the same watch)
+ * 	inode->inotify_mutex (protects inode's watch list)
+ * 		inotify_handle->mutex (protects inotify_handle's watch list)
+ * 			inotify_dev->ev_mutex (protects device's event queue)
+ */
+
+/*
+ * Lifetimes of the main data structures:
+ *
+ * inotify_device: Lifetime is managed by reference count, from
+ * sys_inotify_init() until release.  Additional references can bump the count
+ * via get_inotify_dev() and drop the count via put_inotify_dev().
+ *
+ * inotify_user_watch: Lifetime is from create_watch() to the receipt of an
+ * IN_IGNORED event from inotify, or when using IN_ONESHOT, to receipt of the
+ * first event, or to inotify_destroy().
+ */
+
+/*
+ * struct inotify_device - represents an inotify instance
+ *
+ * This structure is protected by the mutex 'mutex'.
+ */
+struct inotify_device {
+	wait_queue_head_t 	wq;		/* wait queue for i/o */
+	struct mutex		ev_mutex;	/* protects event queue */
+	struct mutex		up_mutex;	/* synchronizes watch updates */
+	struct list_head 	events;		/* list of queued events */
+	atomic_t		count;		/* reference count */
+	struct user_struct	*user;		/* user who opened this dev */
+	struct inotify_handle	*ih;		/* inotify handle */
+	unsigned int		queue_size;	/* size of the queue (bytes) */
+	unsigned int		event_count;	/* number of pending events */
+	unsigned int		max_events;	/* maximum number of events */
+};
+
+/*
+ * struct inotify_kernel_event - An inotify event, originating from a watch and
+ * queued for user-space.  A list of these is attached to each instance of the
+ * device.  In read(), this list is walked and all events that can fit in the
+ * buffer are returned.
+ *
+ * Protected by dev->ev_mutex of the device in which we are queued.
+ */
+struct inotify_kernel_event {
+	struct inotify_event	event;	/* the user-space event */
+	struct list_head        list;	/* entry in inotify_device's list */
+	char			*name;	/* filename, if any */
+};
+
+/*
+ * struct inotify_user_watch - our version of an inotify_watch, we add
+ * a reference to the associated inotify_device.
+ */
+struct inotify_user_watch {
+	struct inotify_device	*dev;	/* associated device */
+	struct inotify_watch	wdata;	/* inotify watch data */
+};
+
+#ifdef CONFIG_SYSCTL
+
+#include <linux/sysctl.h>
+
+static int zero;
+
+ctl_table inotify_table[] = {
+	{
+		.ctl_name	= INOTIFY_MAX_USER_INSTANCES,
+		.procname	= "max_user_instances",
+		.data		= &inotify_max_user_instances,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &zero,
+	},
+	{
+		.ctl_name	= INOTIFY_MAX_USER_WATCHES,
+		.procname	= "max_user_watches",
+		.data		= &inotify_max_user_watches,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &zero,
+	},
+	{
+		.ctl_name	= INOTIFY_MAX_QUEUED_EVENTS,
+		.procname	= "max_queued_events",
+		.data		= &inotify_max_queued_events,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &zero
+	},
+	{ .ctl_name = 0 }
+};
+#endif /* CONFIG_SYSCTL */
+
+static inline void get_inotify_dev(struct inotify_device *dev)
+{
+	atomic_inc(&dev->count);
+}
+
+static inline void put_inotify_dev(struct inotify_device *dev)
+{
+	if (atomic_dec_and_test(&dev->count)) {
+		atomic_dec(&dev->user->inotify_devs);
+		free_uid(dev->user);
+		kfree(dev);
+	}
+}
+
+/*
+ * free_inotify_user_watch - cleans up the watch and its references
+ */
+static void free_inotify_user_watch(struct inotify_watch *w)
+{
+	struct inotify_user_watch *watch;
+	struct inotify_device *dev;
+
+	watch = container_of(w, struct inotify_user_watch, wdata);
+	dev = watch->dev;
+
+	atomic_dec(&dev->user->inotify_watches);
+	put_inotify_dev(dev);
+	kmem_cache_free(watch_cachep, watch);
+}
+
+/*
+ * kernel_event - create a new kernel event with the given parameters
+ *
+ * This function can sleep.
+ */
+static struct inotify_kernel_event * kernel_event(s32 wd, u32 mask, u32 cookie,
+						  const char *name)
+{
+	struct inotify_kernel_event *kevent;
+
+	kevent = kmem_cache_alloc(event_cachep, GFP_KERNEL);
+	if (unlikely(!kevent))
+		return NULL;
+
+	/* we hand this out to user-space, so zero it just in case */
+	memset(&kevent->event, 0, sizeof(struct inotify_event));
+
+	kevent->event.wd = wd;
+	kevent->event.mask = mask;
+	kevent->event.cookie = cookie;
+
+	INIT_LIST_HEAD(&kevent->list);
+
+	if (name) {
+		size_t len, rem, event_size = sizeof(struct inotify_event);
+
+		/*
+		 * We need to pad the filename so as to properly align an
+		 * array of inotify_event structures.  Because the structure is
+		 * small and the common case is a small filename, we just round
+		 * up to the next multiple of the structure's sizeof.  This is
+		 * simple and safe for all architectures.
+		 */
+		len = strlen(name) + 1;
+		rem = event_size - len;
+		if (len > event_size) {
+			rem = event_size - (len % event_size);
+			if (len % event_size == 0)
+				rem = 0;
+		}
+
+		kevent->name = kmalloc(len + rem, GFP_KERNEL);
+		if (unlikely(!kevent->name)) {
+			kmem_cache_free(event_cachep, kevent);
+			return NULL;
+		}
+		memcpy(kevent->name, name, len);
+		if (rem)
+			memset(kevent->name + len, 0, rem);
+		kevent->event.len = len + rem;
+	} else {
+		kevent->event.len = 0;
+		kevent->name = NULL;
+	}
+
+	return kevent;
+}
+
+/*
+ * inotify_dev_get_event - return the next event in the given dev's queue
+ *
+ * Caller must hold dev->ev_mutex.
+ */
+static inline struct inotify_kernel_event *
+inotify_dev_get_event(struct inotify_device *dev)
+{
+	return list_entry(dev->events.next, struct inotify_kernel_event, list);
+}
+
+/*
+ * inotify_dev_queue_event - event handler registered with core inotify, adds
+ * a new event to the given device
+ *
+ * Can sleep (calls kernel_event()).
+ */
+static void inotify_dev_queue_event(struct inotify_watch *w, u32 wd, u32 mask,
+				    u32 cookie, const char *name)
+{
+	struct inotify_user_watch *watch;
+	struct inotify_device *dev;
+	struct inotify_kernel_event *kevent, *last;
+
+	watch = container_of(w, struct inotify_user_watch, wdata);
+	dev = watch->dev;
+
+	mutex_lock(&dev->ev_mutex);
+
+	/* we can safely put the watch as we don't reference it while
+	 * generating the event
+	 */
+	if (mask & IN_IGNORED || mask & IN_ONESHOT)
+		put_inotify_watch(w); /* final put */
+
+	/* coalescing: drop this event if it is a dupe of the previous */
+	last = inotify_dev_get_event(dev);
+	if (last && last->event.mask == mask && last->event.wd == wd &&
+			last->event.cookie == cookie) {
+		const char *lastname = last->name;
+
+		if (!name && !lastname)
+			goto out;
+		if (name && lastname && !strcmp(lastname, name))
+			goto out;
+	}
+
+	/* the queue overflowed and we already sent the Q_OVERFLOW event */
+	if (unlikely(dev->event_count > dev->max_events))
+		goto out;
+
+	/* if the queue overflows, we need to notify user space */
+	if (unlikely(dev->event_count == dev->max_events))
+		kevent = kernel_event(-1, IN_Q_OVERFLOW, cookie, NULL);
+	else
+		kevent = kernel_event(wd, mask, cookie, name);
+
+	if (unlikely(!kevent))
+		goto out;
+
+	/* queue the event and wake up anyone waiting */
+	dev->event_count++;
+	dev->queue_size += sizeof(struct inotify_event) + kevent->event.len;
+	list_add_tail(&kevent->list, &dev->events);
+	wake_up_interruptible(&dev->wq);
+
+out:
+	mutex_unlock(&dev->ev_mutex);
+}
+
+/*
+ * remove_kevent - cleans up and ultimately frees the given kevent
+ *
+ * Caller must hold dev->ev_mutex.
+ */
+static void remove_kevent(struct inotify_device *dev,
+			  struct inotify_kernel_event *kevent)
+{
+	list_del(&kevent->list);
+
+	dev->event_count--;
+	dev->queue_size -= sizeof(struct inotify_event) + kevent->event.len;
+
+	kfree(kevent->name);
+	kmem_cache_free(event_cachep, kevent);
+}
+
+/*
+ * inotify_dev_event_dequeue - destroy an event on the given device
+ *
+ * Caller must hold dev->ev_mutex.
+ */
+static void inotify_dev_event_dequeue(struct inotify_device *dev)
+{
+	if (!list_empty(&dev->events)) {
+		struct inotify_kernel_event *kevent;
+		kevent = inotify_dev_get_event(dev);
+		remove_kevent(dev, kevent);
+	}
+}
+
+/*
+ * find_inode - resolve a user-given path to a specific inode and return a nd
+ */
+static int find_inode(const char __user *dirname, struct nameidata *nd,
+		      unsigned flags)
+{
+	int error;
+
+	error = __user_walk(dirname, flags, nd);
+	if (error)
+		return error;
+	/* you can only watch an inode if you have read permissions on it */
+	error = vfs_permission(nd, MAY_READ);
+	if (error)
+		path_release(nd);
+	return error;
+}
+
+/*
+ * create_watch - creates a watch on the given device.
+ *
+ * Callers must hold dev->up_mutex.
+ */
+static int create_watch(struct inotify_device *dev, struct inode *inode,
+			u32 mask)
+{
+	struct inotify_user_watch *watch;
+	int ret;
+
+	if (atomic_read(&dev->user->inotify_watches) >=
+			inotify_max_user_watches)
+		return -ENOSPC;
+
+	watch = kmem_cache_alloc(watch_cachep, GFP_KERNEL);
+	if (unlikely(!watch))
+		return -ENOMEM;
+
+	/* save a reference to device and bump the count to make it official */
+	get_inotify_dev(dev);
+	watch->dev = dev;
+
+	atomic_inc(&dev->user->inotify_watches);
+
+	ret = inotify_add_watch(dev->ih, &watch->wdata, inode, mask);
+	if (ret < 0)
+		free_inotify_user_watch(&watch->wdata);
+
+	return ret;
+}
+
+/* Device Interface */
+
+static unsigned int inotify_poll(struct file *file, poll_table *wait)
+{
+	struct inotify_device *dev = file->private_data;
+	int ret = 0;
+
+	poll_wait(file, &dev->wq, wait);
+	mutex_lock(&dev->ev_mutex);
+	if (!list_empty(&dev->events))
+		ret = POLLIN | POLLRDNORM;
+	mutex_unlock(&dev->ev_mutex);
+
+	return ret;
+}
+
+static ssize_t inotify_read(struct file *file, char __user *buf,
+			    size_t count, loff_t *pos)
+{
+	size_t event_size = sizeof (struct inotify_event);
+	struct inotify_device *dev;
+	char __user *start;
+	int ret;
+	DEFINE_WAIT(wait);
+
+	start = buf;
+	dev = file->private_data;
+
+	while (1) {
+		int events;
+
+		prepare_to_wait(&dev->wq, &wait, TASK_INTERRUPTIBLE);
+
+		mutex_lock(&dev->ev_mutex);
+		events = !list_empty(&dev->events);
+		mutex_unlock(&dev->ev_mutex);
+		if (events) {
+			ret = 0;
+			break;
+		}
+
+		if (file->f_flags & O_NONBLOCK) {
+			ret = -EAGAIN;
+			break;
+		}
+
+		if (signal_pending(current)) {
+			ret = -EINTR;
+			break;
+		}
+
+		schedule();
+	}
+
+	finish_wait(&dev->wq, &wait);
+	if (ret)
+		return ret;
+
+	mutex_lock(&dev->ev_mutex);
+	while (1) {
+		struct inotify_kernel_event *kevent;
+
+		ret = buf - start;
+		if (list_empty(&dev->events))
+			break;
+
+		kevent = inotify_dev_get_event(dev);
+		if (event_size + kevent->event.len > count)
+			break;
+
+		if (copy_to_user(buf, &kevent->event, event_size)) {
+			ret = -EFAULT;
+			break;
+		}
+		buf += event_size;
+		count -= event_size;
+
+		if (kevent->name) {
+			if (copy_to_user(buf, kevent->name, kevent->event.len)){
+				ret = -EFAULT;
+				break;
+			}
+			buf += kevent->event.len;
+			count -= kevent->event.len;
+		}
+
+		remove_kevent(dev, kevent);
+	}
+	mutex_unlock(&dev->ev_mutex);
+
+	return ret;
+}
+
+static int inotify_release(struct inode *ignored, struct file *file)
+{
+	struct inotify_device *dev = file->private_data;
+
+	inotify_destroy(dev->ih);
+
+	/* destroy all of the events on this device */
+	mutex_lock(&dev->ev_mutex);
+	while (!list_empty(&dev->events))
+		inotify_dev_event_dequeue(dev);
+	mutex_unlock(&dev->ev_mutex);
+
+	/* free this device: the put matching the get in inotify_init() */
+	put_inotify_dev(dev);
+
+	return 0;
+}
+
+static long inotify_ioctl(struct file *file, unsigned int cmd,
+			  unsigned long arg)
+{
+	struct inotify_device *dev;
+	void __user *p;
+	int ret = -ENOTTY;
+
+	dev = file->private_data;
+	p = (void __user *) arg;
+
+	switch (cmd) {
+	case FIONREAD:
+		ret = put_user(dev->queue_size, (int __user *) p);
+		break;
+	}
+
+	return ret;
+}
+
+static const struct file_operations inotify_fops = {
+	.poll           = inotify_poll,
+	.read           = inotify_read,
+	.release        = inotify_release,
+	.unlocked_ioctl = inotify_ioctl,
+	.compat_ioctl	= inotify_ioctl,
+};
+
+static const struct inotify_operations inotify_user_ops = {
+	.handle_event	= inotify_dev_queue_event,
+	.destroy_watch	= free_inotify_user_watch,
+};
+
+asmlinkage long sys_inotify_init(void)
+{
+	struct inotify_device *dev;
+	struct inotify_handle *ih;
+	struct user_struct *user;
+	struct file *filp;
+	int fd, ret;
+
+	fd = get_unused_fd();
+	if (fd < 0)
+		return fd;
+
+	filp = get_empty_filp();
+	if (!filp) {
+		ret = -ENFILE;
+		goto out_put_fd;
+	}
+
+	user = get_uid(current->user);
+	if (unlikely(atomic_read(&user->inotify_devs) >=
+			inotify_max_user_instances)) {
+		ret = -EMFILE;
+		goto out_free_uid;
+	}
+
+	dev = kmalloc(sizeof(struct inotify_device), GFP_KERNEL);
+	if (unlikely(!dev)) {
+		ret = -ENOMEM;
+		goto out_free_uid;
+	}
+
+	ih = inotify_init(&inotify_user_ops);
+	if (unlikely(IS_ERR(ih))) {
+		ret = PTR_ERR(ih);
+		goto out_free_dev;
+	}
+	dev->ih = ih;
+
+	filp->f_op = &inotify_fops;
+	filp->f_vfsmnt = mntget(inotify_mnt);
+	filp->f_dentry = dget(inotify_mnt->mnt_root);
+	filp->f_mapping = filp->f_dentry->d_inode->i_mapping;
+	filp->f_mode = FMODE_READ;
+	filp->f_flags = O_RDONLY;
+	filp->private_data = dev;
+
+	INIT_LIST_HEAD(&dev->events);
+	init_waitqueue_head(&dev->wq);
+	mutex_init(&dev->ev_mutex);
+	mutex_init(&dev->up_mutex);
+	dev->event_count = 0;
+	dev->queue_size = 0;
+	dev->max_events = inotify_max_queued_events;
+	dev->user = user;
+	atomic_set(&dev->count, 0);
+
+	get_inotify_dev(dev);
+	atomic_inc(&user->inotify_devs);
+	fd_install(fd, filp);
+
+	return fd;
+out_free_dev:
+	kfree(dev);
+out_free_uid:
+	free_uid(user);
+	put_filp(filp);
+out_put_fd:
+	put_unused_fd(fd);
+	return ret;
+}
+
+asmlinkage long sys_inotify_add_watch(int fd, const char __user *path, u32 mask)
+{
+	struct inode *inode;
+	struct inotify_device *dev;
+	struct nameidata nd;
+	struct file *filp;
+	int ret, fput_needed;
+	unsigned flags = 0;
+
+	filp = fget_light(fd, &fput_needed);
+	if (unlikely(!filp))
+		return -EBADF;
+
+	/* verify that this is indeed an inotify instance */
+	if (unlikely(filp->f_op != &inotify_fops)) {
+		ret = -EINVAL;
+		goto fput_and_out;
+	}
+
+	if (!(mask & IN_DONT_FOLLOW))
+		flags |= LOOKUP_FOLLOW;
+	if (mask & IN_ONLYDIR)
+		flags |= LOOKUP_DIRECTORY;
+
+	ret = find_inode(path, &nd, flags);
+	if (unlikely(ret))
+		goto fput_and_out;
+
+	/* inode held in place by reference to nd; dev by fget on fd */
+	inode = nd.dentry->d_inode;
+	dev = filp->private_data;
+
+	mutex_lock(&dev->up_mutex);
+	ret = inotify_find_update_watch(dev->ih, inode, mask);
+	if (ret == -ENOENT)
+		ret = create_watch(dev, inode, mask);
+	mutex_unlock(&dev->up_mutex);
+
+	path_release(&nd);
+fput_and_out:
+	fput_light(filp, fput_needed);
+	return ret;
+}
+
+asmlinkage long sys_inotify_rm_watch(int fd, u32 wd)
+{
+	struct file *filp;
+	struct inotify_device *dev;
+	int ret, fput_needed;
+
+	filp = fget_light(fd, &fput_needed);
+	if (unlikely(!filp))
+		return -EBADF;
+
+	/* verify that this is indeed an inotify instance */
+	if (unlikely(filp->f_op != &inotify_fops)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	dev = filp->private_data;
+
+	/* we free our watch data when we get IN_IGNORED */
+	ret = inotify_rm_wd(dev->ih, wd);
+
+out:
+	fput_light(filp, fput_needed);
+	return ret;
+}
+
+static struct super_block *
+inotify_get_sb(struct file_system_type *fs_type, int flags,
+	       const char *dev_name, void *data)
+{
+    return get_sb_pseudo(fs_type, "inotify", NULL, 0xBAD1DEA);
+}
+
+static struct file_system_type inotify_fs_type = {
+    .name           = "inotifyfs",
+    .get_sb         = inotify_get_sb,
+    .kill_sb        = kill_anon_super,
+};
+
+/*
+ * inotify_user_setup - Our initialization function.  Note that we cannnot return
+ * error because we have compiled-in VFS hooks.  So an (unlikely) failure here
+ * must result in panic().
+ */
+static int __init inotify_user_setup(void)
+{
+	int ret;
+
+	ret = register_filesystem(&inotify_fs_type);
+	if (unlikely(ret))
+		panic("inotify: register_filesystem returned %d!\n", ret);
+
+	inotify_mnt = kern_mount(&inotify_fs_type);
+	if (IS_ERR(inotify_mnt))
+		panic("inotify: kern_mount ret %ld!\n", PTR_ERR(inotify_mnt));
+
+	inotify_max_queued_events = 16384;
+	inotify_max_user_instances = 128;
+	inotify_max_user_watches = 8192;
+
+	watch_cachep = kmem_cache_create("inotify_watch_cache",
+					 sizeof(struct inotify_user_watch),
+					 0, SLAB_PANIC, NULL, NULL);
+	event_cachep = kmem_cache_create("inotify_event_cache",
+					 sizeof(struct inotify_kernel_event),
+					 0, SLAB_PANIC, NULL, NULL);
+
+	return 0;
+}
+
+module_init(inotify_user_setup);
diff --git a/include/linux/inotify.h b/include/linux/inotify.h
index 71aa1553ef38..68b6e0127de4 100644
--- a/include/linux/inotify.h
+++ b/include/linux/inotify.h
@@ -68,8 +68,37 @@ struct inotify_event {
 #include <linux/dcache.h>
 #include <linux/fs.h>
 
+/*
+ * struct inotify_watch - represents a watch request on a specific inode
+ *
+ * h_list is protected by ih->mutex of the associated inotify_handle.
+ * i_list, mask are protected by inode->inotify_mutex of the associated inode.
+ * ih, inode, and wd are never written to once the watch is created.
+ *
+ * Callers must use the established inotify interfaces to access inotify_watch
+ * contents.  The content of this structure is private to the inotify
+ * implementation.
+ */
+struct inotify_watch {
+	struct list_head	h_list;	/* entry in inotify_handle's list */
+	struct list_head	i_list;	/* entry in inode's list */
+	atomic_t		count;	/* reference count */
+	struct inotify_handle	*ih;	/* associated inotify handle */
+	struct inode		*inode;	/* associated inode */
+	__s32			wd;	/* watch descriptor */
+	__u32			mask;	/* event mask for this watch */
+};
+
+struct inotify_operations {
+	void (*handle_event)(struct inotify_watch *, u32, u32, u32,
+			     const char *);
+	void (*destroy_watch)(struct inotify_watch *);
+};
+
 #ifdef CONFIG_INOTIFY
 
+/* Kernel API for producing events */
+
 extern void inotify_d_instantiate(struct dentry *, struct inode *);
 extern void inotify_d_move(struct dentry *);
 extern void inotify_inode_queue_event(struct inode *, __u32, __u32,
@@ -80,6 +109,18 @@ extern void inotify_unmount_inodes(struct list_head *);
 extern void inotify_inode_is_dead(struct inode *);
 extern u32 inotify_get_cookie(void);
 
+/* Kernel Consumer API */
+
+extern struct inotify_handle *inotify_init(const struct inotify_operations *);
+extern void inotify_destroy(struct inotify_handle *);
+extern __s32 inotify_find_update_watch(struct inotify_handle *, struct inode *,
+				       u32);
+extern __s32 inotify_add_watch(struct inotify_handle *, struct inotify_watch *,
+			       struct inode *, __u32);
+extern int inotify_rm_wd(struct inotify_handle *, __u32);
+extern void get_inotify_watch(struct inotify_watch *);
+extern void put_inotify_watch(struct inotify_watch *);
+
 #else
 
 static inline void inotify_d_instantiate(struct dentry *dentry,
@@ -116,6 +157,41 @@ static inline u32 inotify_get_cookie(void)
 	return 0;
 }
 
+static inline struct inotify_handle *inotify_init(const struct inotify_operations *ops)
+{
+	return ERR_PTR(-EOPNOTSUPP);
+}
+
+static inline void inotify_destroy(struct inotify_handle *ih)
+{
+}
+
+static inline __s32 inotify_find_update_watch(struct inotify_handle *ih,
+					      struct inode *inode, u32 mask)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline __s32 inotify_add_watch(struct inotify_handle *ih,
+				      struct inotify_watch *watch,
+				      struct inode *inode, __u32 mask)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline int inotify_rm_wd(struct inotify_handle *ih, __u32 wd)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline void get_inotify_watch(struct inotify_watch *watch)
+{
+}
+
+static inline void put_inotify_watch(struct inotify_watch *watch)
+{
+}
+
 #endif	/* CONFIG_INOTIFY */
 
 #endif	/* __KERNEL __ */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 29b7d4f87d20..864e5a70ff65 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -494,7 +494,7 @@ struct user_struct {
 	atomic_t processes;	/* How many processes does this user have? */
 	atomic_t files;		/* How many open files does this user have? */
 	atomic_t sigpending;	/* How many pending signals does this user have? */
-#ifdef CONFIG_INOTIFY
+#ifdef CONFIG_INOTIFY_USER
 	atomic_t inotify_watches; /* How many inotify watches does this user have? */
 	atomic_t inotify_devs;	/* How many inotify devs does this user have opened? */
 #endif
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index e82726faeeff..0d656e61621d 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -150,7 +150,7 @@ extern ctl_table random_table[];
 #ifdef CONFIG_UNIX98_PTYS
 extern ctl_table pty_table[];
 #endif
-#ifdef CONFIG_INOTIFY
+#ifdef CONFIG_INOTIFY_USER
 extern ctl_table inotify_table[];
 #endif
 
@@ -1028,7 +1028,7 @@ static ctl_table fs_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_doulongvec_minmax,
 	},
-#ifdef CONFIG_INOTIFY
+#ifdef CONFIG_INOTIFY_USER
 	{
 		.ctl_name	= FS_INOTIFY,
 		.procname	= "inotify",
diff --git a/kernel/user.c b/kernel/user.c
index 2116642f42c6..4b1eb745afa1 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -140,7 +140,7 @@ struct user_struct * alloc_uid(uid_t uid)
 		atomic_set(&new->processes, 0);
 		atomic_set(&new->files, 0);
 		atomic_set(&new->sigpending, 0);
-#ifdef CONFIG_INOTIFY
+#ifdef CONFIG_INOTIFY_USER
 		atomic_set(&new->inotify_watches, 0);
 		atomic_set(&new->inotify_devs, 0);
 #endif
-- 
cgit v1.2.3


From 7c29772288b7026504cfe75bfd90d40fbd1574bf Mon Sep 17 00:00:00 2001
From: Amy Griffis <amy.griffis@hp.com>
Date: Thu, 1 Jun 2006 13:11:01 -0700
Subject: [PATCH] inotify (2/5): add name's inode to event handler

When an inotify event includes a dentry name, also include the inode
associated with that name.

Signed-off-by: Amy Griffis <amy.griffis@hp.com>
Acked-by: Robert Love <rml@novell.com>
Acked-by: John McCutchan <john@johnmccutchan.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/inotify.c             | 13 ++++++++-----
 fs/inotify_user.c        |  3 ++-
 include/linux/fsnotify.h | 29 ++++++++++++++++-------------
 include/linux/inotify.h  |  7 ++++---
 4 files changed, 30 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/fs/inotify.c b/fs/inotify.c
index a1bedf3975ca..f25c21801fdc 100644
--- a/fs/inotify.c
+++ b/fs/inotify.c
@@ -232,7 +232,7 @@ static void remove_watch_no_event(struct inotify_watch *watch,
 static void remove_watch(struct inotify_watch *watch, struct inotify_handle *ih)
 {
 	remove_watch_no_event(watch, ih);
-	ih->in_ops->handle_event(watch, watch->wd, IN_IGNORED, 0, NULL);
+	ih->in_ops->handle_event(watch, watch->wd, IN_IGNORED, 0, NULL, NULL);
 }
 
 /* Kernel API for producing events */
@@ -275,9 +275,10 @@ void inotify_d_move(struct dentry *entry)
  * @mask: event mask describing this event
  * @cookie: cookie for synchronization, or zero
  * @name: filename, if any
+ * @n_inode: inode associated with name
  */
 void inotify_inode_queue_event(struct inode *inode, u32 mask, u32 cookie,
-			       const char *name)
+			       const char *name, struct inode *n_inode)
 {
 	struct inotify_watch *watch, *next;
 
@@ -292,7 +293,8 @@ void inotify_inode_queue_event(struct inode *inode, u32 mask, u32 cookie,
 			mutex_lock(&ih->mutex);
 			if (watch_mask & IN_ONESHOT)
 				remove_watch_no_event(watch, ih);
-			ih->in_ops->handle_event(watch, watch->wd, mask, cookie, name);
+			ih->in_ops->handle_event(watch, watch->wd, mask, cookie,
+						 name, n_inode);
 			mutex_unlock(&ih->mutex);
 		}
 	}
@@ -323,7 +325,8 @@ void inotify_dentry_parent_queue_event(struct dentry *dentry, u32 mask,
 	if (inotify_inode_watched(inode)) {
 		dget(parent);
 		spin_unlock(&dentry->d_lock);
-		inotify_inode_queue_event(inode, mask, cookie, name);
+		inotify_inode_queue_event(inode, mask, cookie, name,
+					  dentry->d_inode);
 		dput(parent);
 	} else
 		spin_unlock(&dentry->d_lock);
@@ -407,7 +410,7 @@ void inotify_unmount_inodes(struct list_head *list)
 			struct inotify_handle *ih= watch->ih;
 			mutex_lock(&ih->mutex);
 			ih->in_ops->handle_event(watch, watch->wd, IN_UNMOUNT, 0,
-						 NULL);
+						 NULL, NULL);
 			remove_watch(watch, ih);
 			mutex_unlock(&ih->mutex);
 		}
diff --git a/fs/inotify_user.c b/fs/inotify_user.c
index 845dc79a4e9c..8b83c7190067 100644
--- a/fs/inotify_user.c
+++ b/fs/inotify_user.c
@@ -253,7 +253,8 @@ inotify_dev_get_event(struct inotify_device *dev)
  * Can sleep (calls kernel_event()).
  */
 static void inotify_dev_queue_event(struct inotify_watch *w, u32 wd, u32 mask,
-				    u32 cookie, const char *name)
+				    u32 cookie, const char *name,
+				    struct inode *ignored)
 {
 	struct inotify_user_watch *watch;
 	struct inotify_device *dev;
diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
index 11438eff4d44..a9d30442448f 100644
--- a/include/linux/fsnotify.h
+++ b/include/linux/fsnotify.h
@@ -54,16 +54,18 @@ static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir,
 
 	if (isdir)
 		isdir = IN_ISDIR;
-	inotify_inode_queue_event(old_dir, IN_MOVED_FROM|isdir,cookie,old_name);
-	inotify_inode_queue_event(new_dir, IN_MOVED_TO|isdir, cookie, new_name);
+	inotify_inode_queue_event(old_dir, IN_MOVED_FROM|isdir,cookie,old_name,
+				  source);
+	inotify_inode_queue_event(new_dir, IN_MOVED_TO|isdir, cookie, new_name,
+				  source);
 
 	if (target) {
-		inotify_inode_queue_event(target, IN_DELETE_SELF, 0, NULL);
+		inotify_inode_queue_event(target, IN_DELETE_SELF, 0, NULL, NULL);
 		inotify_inode_is_dead(target);
 	}
 
 	if (source) {
-		inotify_inode_queue_event(source, IN_MOVE_SELF, 0, NULL);
+		inotify_inode_queue_event(source, IN_MOVE_SELF, 0, NULL, NULL);
 	}
 	audit_inode_child(old_name, source, old_dir->i_ino);
 	audit_inode_child(new_name, target, new_dir->i_ino);
@@ -85,7 +87,7 @@ static inline void fsnotify_nameremove(struct dentry *dentry, int isdir)
  */
 static inline void fsnotify_inoderemove(struct inode *inode)
 {
-	inotify_inode_queue_event(inode, IN_DELETE_SELF, 0, NULL);
+	inotify_inode_queue_event(inode, IN_DELETE_SELF, 0, NULL, NULL);
 	inotify_inode_is_dead(inode);
 }
 
@@ -95,7 +97,8 @@ static inline void fsnotify_inoderemove(struct inode *inode)
 static inline void fsnotify_create(struct inode *inode, struct dentry *dentry)
 {
 	inode_dir_notify(inode, DN_CREATE);
-	inotify_inode_queue_event(inode, IN_CREATE, 0, dentry->d_name.name);
+	inotify_inode_queue_event(inode, IN_CREATE, 0, dentry->d_name.name,
+				  dentry->d_inode);
 	audit_inode_child(dentry->d_name.name, dentry->d_inode, inode->i_ino);
 }
 
@@ -106,7 +109,7 @@ static inline void fsnotify_mkdir(struct inode *inode, struct dentry *dentry)
 {
 	inode_dir_notify(inode, DN_CREATE);
 	inotify_inode_queue_event(inode, IN_CREATE | IN_ISDIR, 0, 
-				  dentry->d_name.name);
+				  dentry->d_name.name, dentry->d_inode);
 	audit_inode_child(dentry->d_name.name, dentry->d_inode, inode->i_ino);
 }
 
@@ -123,7 +126,7 @@ static inline void fsnotify_access(struct dentry *dentry)
 
 	dnotify_parent(dentry, DN_ACCESS);
 	inotify_dentry_parent_queue_event(dentry, mask, 0, dentry->d_name.name);
-	inotify_inode_queue_event(inode, mask, 0, NULL);
+	inotify_inode_queue_event(inode, mask, 0, NULL, NULL);
 }
 
 /*
@@ -139,7 +142,7 @@ static inline void fsnotify_modify(struct dentry *dentry)
 
 	dnotify_parent(dentry, DN_MODIFY);
 	inotify_dentry_parent_queue_event(dentry, mask, 0, dentry->d_name.name);
-	inotify_inode_queue_event(inode, mask, 0, NULL);
+	inotify_inode_queue_event(inode, mask, 0, NULL, NULL);
 }
 
 /*
@@ -154,7 +157,7 @@ static inline void fsnotify_open(struct dentry *dentry)
 		mask |= IN_ISDIR;
 
 	inotify_dentry_parent_queue_event(dentry, mask, 0, dentry->d_name.name);
-	inotify_inode_queue_event(inode, mask, 0, NULL);	
+	inotify_inode_queue_event(inode, mask, 0, NULL, NULL);
 }
 
 /*
@@ -172,7 +175,7 @@ static inline void fsnotify_close(struct file *file)
 		mask |= IN_ISDIR;
 
 	inotify_dentry_parent_queue_event(dentry, mask, 0, name);
-	inotify_inode_queue_event(inode, mask, 0, NULL);
+	inotify_inode_queue_event(inode, mask, 0, NULL, NULL);
 }
 
 /*
@@ -187,7 +190,7 @@ static inline void fsnotify_xattr(struct dentry *dentry)
 		mask |= IN_ISDIR;
 
 	inotify_dentry_parent_queue_event(dentry, mask, 0, dentry->d_name.name);
-	inotify_inode_queue_event(inode, mask, 0, NULL);
+	inotify_inode_queue_event(inode, mask, 0, NULL, NULL);
 }
 
 /*
@@ -234,7 +237,7 @@ static inline void fsnotify_change(struct dentry *dentry, unsigned int ia_valid)
 	if (in_mask) {
 		if (S_ISDIR(inode->i_mode))
 			in_mask |= IN_ISDIR;
-		inotify_inode_queue_event(inode, in_mask, 0, NULL);
+		inotify_inode_queue_event(inode, in_mask, 0, NULL, NULL);
 		inotify_dentry_parent_queue_event(dentry, in_mask, 0,
 						  dentry->d_name.name);
 	}
diff --git a/include/linux/inotify.h b/include/linux/inotify.h
index 68b6e0127de4..e7899e7d83ad 100644
--- a/include/linux/inotify.h
+++ b/include/linux/inotify.h
@@ -91,7 +91,7 @@ struct inotify_watch {
 
 struct inotify_operations {
 	void (*handle_event)(struct inotify_watch *, u32, u32, u32,
-			     const char *);
+			     const char *, struct inode *);
 	void (*destroy_watch)(struct inotify_watch *);
 };
 
@@ -102,7 +102,7 @@ struct inotify_operations {
 extern void inotify_d_instantiate(struct dentry *, struct inode *);
 extern void inotify_d_move(struct dentry *);
 extern void inotify_inode_queue_event(struct inode *, __u32, __u32,
-				      const char *);
+				      const char *, struct inode *);
 extern void inotify_dentry_parent_queue_event(struct dentry *, __u32, __u32,
 					      const char *);
 extern void inotify_unmount_inodes(struct list_head *);
@@ -134,7 +134,8 @@ static inline void inotify_d_move(struct dentry *dentry)
 
 static inline void inotify_inode_queue_event(struct inode *inode,
 					     __u32 mask, __u32 cookie,
-					     const char *filename)
+					     const char *filename,
+					     struct inode *n_inode)
 {
 }
 
-- 
cgit v1.2.3


From a9dc971d3fdb857a2bcd6d53238125a2cd31d5f4 Mon Sep 17 00:00:00 2001
From: Amy Griffis <amy.griffis@hp.com>
Date: Thu, 1 Jun 2006 13:11:03 -0700
Subject: [PATCH] inotify (3/5): add interfaces to kernel API

Add inotify_init_watch() so caller can use inotify_watch refcounts
before calling inotify_add_watch().

Add inotify_find_watch() to find an existing watch for an (ih,inode)
pair.  This is similar to inotify_find_update_watch(), but does not
update the watch's mask if one is found.

Add inotify_rm_watch() to remove a watch via the watch pointer instead
of the watch descriptor.

Signed-off-by: Amy Griffis <amy.griffis@hp.com>
Acked-by: Robert Love <rml@novell.com>
Acked-by: John McCutchan <john@johnmccutchan.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/inotify.c            | 64 ++++++++++++++++++++++++++++++++++++++++++++-----
 fs/inotify_user.c       |  1 +
 include/linux/inotify.h | 20 ++++++++++++++++
 3 files changed, 79 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/fs/inotify.c b/fs/inotify.c
index f25c21801fdc..8477c4fbecb4 100644
--- a/fs/inotify.c
+++ b/fs/inotify.c
@@ -467,6 +467,19 @@ struct inotify_handle *inotify_init(const struct inotify_operations *ops)
 }
 EXPORT_SYMBOL_GPL(inotify_init);
 
+/**
+ * inotify_init_watch - initialize an inotify watch
+ * @watch: watch to initialize
+ */
+void inotify_init_watch(struct inotify_watch *watch)
+{
+	INIT_LIST_HEAD(&watch->h_list);
+	INIT_LIST_HEAD(&watch->i_list);
+	atomic_set(&watch->count, 0);
+	get_inotify_watch(watch); /* initial get */
+}
+EXPORT_SYMBOL_GPL(inotify_init_watch);
+
 /**
  * inotify_destroy - clean up and destroy an inotify instance
  * @ih: inotify handle
@@ -514,6 +527,37 @@ void inotify_destroy(struct inotify_handle *ih)
 }
 EXPORT_SYMBOL_GPL(inotify_destroy);
 
+/**
+ * inotify_find_watch - find an existing watch for an (ih,inode) pair
+ * @ih: inotify handle
+ * @inode: inode to watch
+ * @watchp: pointer to existing inotify_watch
+ *
+ * Caller must pin given inode (via nameidata).
+ */
+s32 inotify_find_watch(struct inotify_handle *ih, struct inode *inode,
+		       struct inotify_watch **watchp)
+{
+	struct inotify_watch *old;
+	int ret = -ENOENT;
+
+	mutex_lock(&inode->inotify_mutex);
+	mutex_lock(&ih->mutex);
+
+	old = inode_find_handle(inode, ih);
+	if (unlikely(old)) {
+		get_inotify_watch(old); /* caller must put watch */
+		*watchp = old;
+		ret = old->wd;
+	}
+
+	mutex_unlock(&ih->mutex);
+	mutex_unlock(&inode->inotify_mutex);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(inotify_find_watch);
+
 /**
  * inotify_find_update_watch - find and update the mask of an existing watch
  * @ih: inotify handle
@@ -593,10 +637,6 @@ s32 inotify_add_watch(struct inotify_handle *ih, struct inotify_watch *watch,
 		goto out;
 	ret = watch->wd;
 
-	atomic_set(&watch->count, 0);
-	INIT_LIST_HEAD(&watch->h_list);
-	INIT_LIST_HEAD(&watch->i_list);
-
 	/* save a reference to handle and bump the count to make it official */
 	get_inotify_handle(ih);
 	watch->ih = ih;
@@ -607,8 +647,6 @@ s32 inotify_add_watch(struct inotify_handle *ih, struct inotify_watch *watch,
 	 */
 	watch->inode = igrab(inode);
 
-	get_inotify_watch(watch); /* initial get */
-
 	if (!inotify_inode_watched(inode))
 		set_dentry_child_flags(inode, 1);
 
@@ -659,6 +697,20 @@ int inotify_rm_wd(struct inotify_handle *ih, u32 wd)
 }
 EXPORT_SYMBOL_GPL(inotify_rm_wd);
 
+/**
+ * inotify_rm_watch - remove a watch from an inotify instance
+ * @ih: inotify handle
+ * @watch: watch to remove
+ *
+ * Can sleep.
+ */
+int inotify_rm_watch(struct inotify_handle *ih,
+		     struct inotify_watch *watch)
+{
+	return inotify_rm_wd(ih, watch->wd);
+}
+EXPORT_SYMBOL_GPL(inotify_rm_watch);
+
 /*
  * inotify_setup - core initialization function
  */
diff --git a/fs/inotify_user.c b/fs/inotify_user.c
index 8b83c7190067..9e9931e2badd 100644
--- a/fs/inotify_user.c
+++ b/fs/inotify_user.c
@@ -380,6 +380,7 @@ static int create_watch(struct inotify_device *dev, struct inode *inode,
 
 	atomic_inc(&dev->user->inotify_watches);
 
+	inotify_init_watch(&watch->wdata);
 	ret = inotify_add_watch(dev->ih, &watch->wdata, inode, mask);
 	if (ret < 0)
 		free_inotify_user_watch(&watch->wdata);
diff --git a/include/linux/inotify.h b/include/linux/inotify.h
index e7899e7d83ad..e7e7fb7fc778 100644
--- a/include/linux/inotify.h
+++ b/include/linux/inotify.h
@@ -112,11 +112,15 @@ extern u32 inotify_get_cookie(void);
 /* Kernel Consumer API */
 
 extern struct inotify_handle *inotify_init(const struct inotify_operations *);
+extern void inotify_init_watch(struct inotify_watch *);
 extern void inotify_destroy(struct inotify_handle *);
+extern __s32 inotify_find_watch(struct inotify_handle *, struct inode *,
+				struct inotify_watch **);
 extern __s32 inotify_find_update_watch(struct inotify_handle *, struct inode *,
 				       u32);
 extern __s32 inotify_add_watch(struct inotify_handle *, struct inotify_watch *,
 			       struct inode *, __u32);
+extern int inotify_rm_watch(struct inotify_handle *, struct inotify_watch *);
 extern int inotify_rm_wd(struct inotify_handle *, __u32);
 extern void get_inotify_watch(struct inotify_watch *);
 extern void put_inotify_watch(struct inotify_watch *);
@@ -163,10 +167,20 @@ static inline struct inotify_handle *inotify_init(const struct inotify_operation
 	return ERR_PTR(-EOPNOTSUPP);
 }
 
+static inline void inotify_init_watch(struct inotify_watch *watch)
+{
+}
+
 static inline void inotify_destroy(struct inotify_handle *ih)
 {
 }
 
+static inline __s32 inotify_find_watch(struct inotify_handle *ih, struct inode *inode,
+				       struct inotify_watch **watchp)
+{
+	return -EOPNOTSUPP;
+}
+
 static inline __s32 inotify_find_update_watch(struct inotify_handle *ih,
 					      struct inode *inode, u32 mask)
 {
@@ -180,6 +194,12 @@ static inline __s32 inotify_add_watch(struct inotify_handle *ih,
 	return -EOPNOTSUPP;
 }
 
+static inline int inotify_rm_watch(struct inotify_handle *ih,
+				   struct inotify_watch *watch)
+{
+	return -EOPNOTSUPP;
+}
+
 static inline int inotify_rm_wd(struct inotify_handle *ih, __u32 wd)
 {
 	return -EOPNOTSUPP;
-- 
cgit v1.2.3


From 3ca10067f7f4bfa62a1b0edc84f590261fa02d75 Mon Sep 17 00:00:00 2001
From: Amy Griffis <amy.griffis@hp.com>
Date: Thu, 1 Jun 2006 13:11:05 -0700
Subject: [PATCH] inotify (4/5): allow watch removal from event handler

Allow callers to remove watches from their event handler via
inotify_remove_watch_locked().  This functionality can be used to
achieve IN_ONESHOT-like functionality for a subset of events in the
mask.

Signed-off-by: Amy Griffis <amy.griffis@hp.com>
Acked-by: Robert Love <rml@novell.com>
Acked-by: John McCutchan <john@johnmccutchan.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/inotify.c            | 23 ++++++++++++++---------
 include/linux/inotify.h |  7 +++++++
 2 files changed, 21 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/fs/inotify.c b/fs/inotify.c
index 8477c4fbecb4..723836a1f718 100644
--- a/fs/inotify.c
+++ b/fs/inotify.c
@@ -207,7 +207,7 @@ static struct inotify_watch *inode_find_handle(struct inode *inode,
 }
 
 /*
- * remove_watch_no_event - remove_watch() without the IN_IGNORED event.
+ * remove_watch_no_event - remove watch without the IN_IGNORED event.
  *
  * Callers must hold both inode->inotify_mutex and ih->mutex.
  */
@@ -223,17 +223,22 @@ static void remove_watch_no_event(struct inotify_watch *watch,
 	idr_remove(&ih->idr, watch->wd);
 }
 
-/*
- * remove_watch - Remove a watch from both the handle and the inode.  Sends
- * the IN_IGNORED event signifying that the inode is no longer watched.
+/**
+ * inotify_remove_watch_locked - Remove a watch from both the handle and the
+ * inode.  Sends the IN_IGNORED event signifying that the inode is no longer
+ * watched.  May be invoked from a caller's event handler.
+ * @ih: inotify handle associated with watch
+ * @watch: watch to remove
  *
  * Callers must hold both inode->inotify_mutex and ih->mutex.
  */
-static void remove_watch(struct inotify_watch *watch, struct inotify_handle *ih)
+void inotify_remove_watch_locked(struct inotify_handle *ih,
+				 struct inotify_watch *watch)
 {
 	remove_watch_no_event(watch, ih);
 	ih->in_ops->handle_event(watch, watch->wd, IN_IGNORED, 0, NULL, NULL);
 }
+EXPORT_SYMBOL_GPL(inotify_remove_watch_locked);
 
 /* Kernel API for producing events */
 
@@ -378,7 +383,7 @@ void inotify_unmount_inodes(struct list_head *list)
 
 		need_iput_tmp = need_iput;
 		need_iput = NULL;
-		/* In case the remove_watch() drops a reference. */
+		/* In case inotify_remove_watch_locked() drops a reference. */
 		if (inode != need_iput_tmp)
 			__iget(inode);
 		else
@@ -411,7 +416,7 @@ void inotify_unmount_inodes(struct list_head *list)
 			mutex_lock(&ih->mutex);
 			ih->in_ops->handle_event(watch, watch->wd, IN_UNMOUNT, 0,
 						 NULL, NULL);
-			remove_watch(watch, ih);
+			inotify_remove_watch_locked(ih, watch);
 			mutex_unlock(&ih->mutex);
 		}
 		mutex_unlock(&inode->inotify_mutex);
@@ -434,7 +439,7 @@ void inotify_inode_is_dead(struct inode *inode)
 	list_for_each_entry_safe(watch, next, &inode->inotify_watches, i_list) {
 		struct inotify_handle *ih = watch->ih;
 		mutex_lock(&ih->mutex);
-		remove_watch(watch, ih);
+		inotify_remove_watch_locked(ih, watch);
 		mutex_unlock(&ih->mutex);
 	}
 	mutex_unlock(&inode->inotify_mutex);
@@ -687,7 +692,7 @@ int inotify_rm_wd(struct inotify_handle *ih, u32 wd)
 
 	/* make sure that we did not race */
 	if (likely(idr_find(&ih->idr, wd) == watch))
-		remove_watch(watch, ih);
+		inotify_remove_watch_locked(ih, watch);
 
 	mutex_unlock(&ih->mutex);
 	mutex_unlock(&inode->inotify_mutex);
diff --git a/include/linux/inotify.h b/include/linux/inotify.h
index e7e7fb7fc778..d4f48c6402e6 100644
--- a/include/linux/inotify.h
+++ b/include/linux/inotify.h
@@ -122,6 +122,8 @@ extern __s32 inotify_add_watch(struct inotify_handle *, struct inotify_watch *,
 			       struct inode *, __u32);
 extern int inotify_rm_watch(struct inotify_handle *, struct inotify_watch *);
 extern int inotify_rm_wd(struct inotify_handle *, __u32);
+extern void inotify_remove_watch_locked(struct inotify_handle *,
+					struct inotify_watch *);
 extern void get_inotify_watch(struct inotify_watch *);
 extern void put_inotify_watch(struct inotify_watch *);
 
@@ -205,6 +207,11 @@ static inline int inotify_rm_wd(struct inotify_handle *ih, __u32 wd)
 	return -EOPNOTSUPP;
 }
 
+static inline void inotify_remove_watch_locked(struct inotify_handle *ih,
+					       struct inotify_watch *watch)
+{
+}
+
 static inline void get_inotify_watch(struct inotify_watch *watch)
 {
 }
-- 
cgit v1.2.3


From bc0f3b8ebba611291fdaa2864dbffd2d29336c64 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 22 May 2006 01:36:34 -0400
Subject: [PATCH] audit_panic() is audit-internal

... no need to provide a stub; note that extern is already gone from
include/linux/audit.h

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/audit.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/audit.h b/include/linux/audit.h
index b74c148f14e3..e65399bf2710 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -384,7 +384,6 @@ extern int  audit_receive_filter(int type, int pid, int uid, int seq,
 #define audit_log_hex(a,b,l) do { ; } while (0)
 #define audit_log_untrustedstring(a,s) do { ; } while (0)
 #define audit_log_d_path(b,p,d,v) do { ; } while (0)
-#define audit_panic(m) do { ; } while (0)
 #endif
 #endif
 #endif
-- 
cgit v1.2.3


From 473ae30bc7b1dda5c5791c773f95e9424ddfead9 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 26 Apr 2006 14:04:08 -0400
Subject: [PATCH] execve argument logging

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/exec.c             |  6 ++++++
 include/linux/audit.h |  6 +++++-
 kernel/audit.c        |  8 +++++---
 kernel/auditsc.c      | 51 +++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 67 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/fs/exec.c b/fs/exec.c
index 3a79d97ac234..d07858c0b7c4 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -49,6 +49,7 @@
 #include <linux/rmap.h>
 #include <linux/acct.h>
 #include <linux/cn_proc.h>
+#include <linux/audit.h>
 
 #include <asm/uaccess.h>
 #include <asm/mmu_context.h>
@@ -1085,6 +1086,11 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
 	/* kernel module loader fixup */
 	/* so we don't try to load run modprobe in kernel space. */
 	set_fs(USER_DS);
+
+	retval = audit_bprm(bprm);
+	if (retval)
+		return retval;
+
 	retval = -ENOENT;
 	for (try=0; try<2; try++) {
 		read_lock(&binfmt_lock);
diff --git a/include/linux/audit.h b/include/linux/audit.h
index e65399bf2710..1a221b65f7b7 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -83,6 +83,7 @@
 #define AUDIT_CONFIG_CHANGE	1305	/* Audit system configuration change */
 #define AUDIT_SOCKADDR		1306	/* sockaddr copied as syscall arg */
 #define AUDIT_CWD		1307	/* Current working directory */
+#define AUDIT_EXECVE		1309	/* execve arguments */
 #define AUDIT_IPC_SET_PERM	1311	/* IPC new permissions record type */
 
 #define AUDIT_AVC		1400	/* SE Linux avc denial or grant */
@@ -283,6 +284,7 @@ struct audit_buffer;
 struct audit_context;
 struct inode;
 struct netlink_skb_parms;
+struct linux_binprm;
 
 #define AUDITSC_INVALID 0
 #define AUDITSC_SUCCESS 1
@@ -322,6 +324,7 @@ extern int  audit_set_loginuid(struct task_struct *task, uid_t loginuid);
 extern uid_t audit_get_loginuid(struct audit_context *ctx);
 extern int audit_ipc_obj(struct kern_ipc_perm *ipcp);
 extern int audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode, struct kern_ipc_perm *ipcp);
+extern int audit_bprm(struct linux_binprm *bprm);
 extern int audit_socketcall(int nargs, unsigned long *args);
 extern int audit_sockaddr(int len, void *addr);
 extern int audit_avc_path(struct dentry *dentry, struct vfsmount *mnt);
@@ -342,6 +345,7 @@ extern int audit_set_macxattr(const char *name);
 #define audit_get_loginuid(c) ({ -1; })
 #define audit_ipc_obj(i) ({ 0; })
 #define audit_ipc_set_perm(q,u,g,m,i) ({ 0; })
+#define audit_bprm(p) ({ 0; })
 #define audit_socketcall(n,a) ({ 0; })
 #define audit_sockaddr(len, addr) ({ 0; })
 #define audit_avc_path(dentry, mnt) ({ 0; })
@@ -364,7 +368,7 @@ extern void		    audit_log_end(struct audit_buffer *ab);
 extern void		    audit_log_hex(struct audit_buffer *ab,
 					  const unsigned char *buf,
 					  size_t len);
-extern void		    audit_log_untrustedstring(struct audit_buffer *ab,
+extern const char *	    audit_log_untrustedstring(struct audit_buffer *ab,
 						      const char *string);
 extern void		    audit_log_d_path(struct audit_buffer *ab,
 					     const char *prefix,
diff --git a/kernel/audit.c b/kernel/audit.c
index bf74bf02aa4b..d09f131b111a 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -1026,18 +1026,20 @@ void audit_log_hex(struct audit_buffer *ab, const unsigned char *buf,
  * or a space. Unescaped strings will start and end with a double quote mark.
  * Strings that are escaped are printed in hex (2 digits per char).
  */
-void audit_log_untrustedstring(struct audit_buffer *ab, const char *string)
+const char *audit_log_untrustedstring(struct audit_buffer *ab, const char *string)
 {
 	const unsigned char *p = string;
+	size_t len = strlen(string);
 
 	while (*p) {
 		if (*p == '"' || *p < 0x21 || *p > 0x7f) {
-			audit_log_hex(ab, string, strlen(string));
-			return;
+			audit_log_hex(ab, string, len);
+			return string + len + 1;
 		}
 		p++;
 	}
 	audit_log_format(ab, "\"%s\"", string);
+	return p + 1;
 }
 
 /* This is a helper-function to print the escaped d_path */
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 1c03a4ed1b27..114f921979ec 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -59,6 +59,7 @@
 #include <linux/list.h>
 #include <linux/tty.h>
 #include <linux/selinux.h>
+#include <linux/binfmts.h>
 
 #include "audit.h"
 
@@ -110,6 +111,13 @@ struct audit_aux_data_ipcctl {
 	u32			osid;
 };
 
+struct audit_aux_data_execve {
+	struct audit_aux_data	d;
+	int argc;
+	int envc;
+	char mem[0];
+};
+
 struct audit_aux_data_socketcall {
 	struct audit_aux_data	d;
 	int			nargs;
@@ -667,6 +675,16 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 				kfree(ctx);
 			}
 			break; }
+		case AUDIT_EXECVE: {
+			struct audit_aux_data_execve *axi = (void *)aux;
+			int i;
+			const char *p;
+			for (i = 0, p = axi->mem; i < axi->argc; i++) {
+				audit_log_format(ab, "a%d=", i);
+				p = audit_log_untrustedstring(ab, p);
+				audit_log_format(ab, "\n");
+			}
+			break; }
 
 		case AUDIT_SOCKETCALL: {
 			int i;
@@ -1231,6 +1249,39 @@ int audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode,
 	return 0;
 }
 
+int audit_bprm(struct linux_binprm *bprm)
+{
+	struct audit_aux_data_execve *ax;
+	struct audit_context *context = current->audit_context;
+	unsigned long p, next;
+	void *to;
+
+	if (likely(!audit_enabled || !context))
+		return 0;
+
+	ax = kmalloc(sizeof(*ax) + PAGE_SIZE * MAX_ARG_PAGES - bprm->p,
+				GFP_KERNEL);
+	if (!ax)
+		return -ENOMEM;
+
+	ax->argc = bprm->argc;
+	ax->envc = bprm->envc;
+	for (p = bprm->p, to = ax->mem; p < MAX_ARG_PAGES*PAGE_SIZE; p = next) {
+		struct page *page = bprm->page[p / PAGE_SIZE];
+		void *kaddr = kmap(page);
+		next = (p + PAGE_SIZE) & ~(PAGE_SIZE - 1);
+		memcpy(to, kaddr + (p & (PAGE_SIZE - 1)), next - p);
+		to += next - p;
+		kunmap(page);
+	}
+
+	ax->d.type = AUDIT_EXECVE;
+	ax->d.next = context->aux;
+	context->aux = (void *)ax;
+	return 0;
+}
+
+
 /**
  * audit_socketcall - record audit data for sys_socketcall
  * @nargs: number of args
-- 
cgit v1.2.3


From e1396065e0489f98b35021b97907ab4edbfb24e1 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 25 May 2006 10:19:47 -0400
Subject: [PATCH] collect sid of those who send signals to auditd

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/audit.h |  3 +--
 kernel/audit.c        | 31 ++++++++++++++++++++-----------
 kernel/audit.h        | 11 +++++++++++
 kernel/auditsc.c      | 23 ++++++++++++-----------
 kernel/signal.c       |  2 +-
 5 files changed, 45 insertions(+), 25 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/audit.h b/include/linux/audit.h
index 1a221b65f7b7..1057e90bd3e3 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -278,6 +278,7 @@ struct audit_rule {		/* for AUDIT_LIST, AUDIT_ADD, and AUDIT_DEL */
 struct audit_sig_info {
 	uid_t		uid;
 	pid_t		pid;
+	char		ctx[0];
 };
 
 struct audit_buffer;
@@ -328,7 +329,6 @@ extern int audit_bprm(struct linux_binprm *bprm);
 extern int audit_socketcall(int nargs, unsigned long *args);
 extern int audit_sockaddr(int len, void *addr);
 extern int audit_avc_path(struct dentry *dentry, struct vfsmount *mnt);
-extern void audit_signal_info(int sig, struct task_struct *t);
 extern int audit_set_macxattr(const char *name);
 #else
 #define audit_alloc(t) ({ 0; })
@@ -349,7 +349,6 @@ extern int audit_set_macxattr(const char *name);
 #define audit_socketcall(n,a) ({ 0; })
 #define audit_sockaddr(len, addr) ({ 0; })
 #define audit_avc_path(dentry, mnt) ({ 0; })
-#define audit_signal_info(s,t) do { ; } while (0)
 #define audit_set_macxattr(n) do { ; } while (0)
 #endif
 
diff --git a/kernel/audit.c b/kernel/audit.c
index d09f131b111a..bb20922d08cc 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -89,6 +89,7 @@ static int	audit_backlog_wait_overflow = 0;
 /* The identity of the user shutting down the audit system. */
 uid_t		audit_sig_uid = -1;
 pid_t		audit_sig_pid = -1;
+u32		audit_sig_sid = 0;
 
 /* Records can be lost in several ways:
    0) [suppressed in audit_alloc]
@@ -479,7 +480,9 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 	struct audit_buffer	*ab;
 	u16			msg_type = nlh->nlmsg_type;
 	uid_t			loginuid; /* loginuid of sender */
-	struct audit_sig_info   sig_data;
+	struct audit_sig_info   *sig_data;
+	char			*ctx;
+	u32			len;
 
 	err = audit_netlink_ok(NETLINK_CB(skb).eff_cap, msg_type);
 	if (err)
@@ -531,12 +534,9 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 		if (status_get->mask & AUDIT_STATUS_PID) {
 			int old   = audit_pid;
 			if (sid) {
-				char *ctx = NULL;
-				u32 len;
-				int rc;
-				if ((rc = selinux_ctxid_to_string(
+				if ((err = selinux_ctxid_to_string(
 						sid, &ctx, &len)))
-					return rc;
+					return err;
 				else
 					audit_log(NULL, GFP_KERNEL,
 						AUDIT_CONFIG_CHANGE,
@@ -572,8 +572,6 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 						 "user pid=%d uid=%u auid=%u",
 						 pid, uid, loginuid);
 				if (sid) {
-					char *ctx = NULL;
-					u32 len;
 					if (selinux_ctxid_to_string(
 							sid, &ctx, &len)) {
 						audit_log_format(ab, 
@@ -612,10 +610,21 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 					   loginuid, sid);
 		break;
 	case AUDIT_SIGNAL_INFO:
-		sig_data.uid = audit_sig_uid;
-		sig_data.pid = audit_sig_pid;
+		err = selinux_ctxid_to_string(audit_sig_sid, &ctx, &len);
+		if (err)
+			return err;
+		sig_data = kmalloc(sizeof(*sig_data) + len, GFP_KERNEL);
+		if (!sig_data) {
+			kfree(ctx);
+			return -ENOMEM;
+		}
+		sig_data->uid = audit_sig_uid;
+		sig_data->pid = audit_sig_pid;
+		memcpy(sig_data->ctx, ctx, len);
+		kfree(ctx);
 		audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_SIGNAL_INFO, 
-				0, 0, &sig_data, sizeof(sig_data));
+				0, 0, sig_data, sizeof(*sig_data) + len);
+		kfree(sig_data);
 		break;
 	default:
 		err = -EINVAL;
diff --git a/kernel/audit.h b/kernel/audit.h
index 8948fc1e9e54..52cb1e31d522 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -101,3 +101,14 @@ struct audit_netlink_list {
 int audit_send_list(void *);
 
 extern int selinux_audit_rule_update(void);
+
+#ifdef CONFIG_AUDITSYSCALL
+extern void __audit_signal_info(int sig, struct task_struct *t);
+static inline void audit_signal_info(int sig, struct task_struct *t)
+{
+	if (unlikely(audit_pid && t->tgid == audit_pid))
+		__audit_signal_info(sig, t);
+}
+#else
+#define audit_signal_info(s,t)
+#endif
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 114f921979ec..4ca913daa7da 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1376,19 +1376,20 @@ int audit_avc_path(struct dentry *dentry, struct vfsmount *mnt)
  * If the audit subsystem is being terminated, record the task (pid)
  * and uid that is doing that.
  */
-void audit_signal_info(int sig, struct task_struct *t)
+void __audit_signal_info(int sig, struct task_struct *t)
 {
 	extern pid_t audit_sig_pid;
 	extern uid_t audit_sig_uid;
-
-	if (unlikely(audit_pid && t->tgid == audit_pid)) {
-		if (sig == SIGTERM || sig == SIGHUP) {
-			struct audit_context *ctx = current->audit_context;
-			audit_sig_pid = current->pid;
-			if (ctx)
-				audit_sig_uid = ctx->loginuid;
-			else
-				audit_sig_uid = current->uid;
-		}
+	extern u32 audit_sig_sid;
+
+	if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1) {
+		struct task_struct *tsk = current;
+		struct audit_context *ctx = tsk->audit_context;
+		audit_sig_pid = tsk->pid;
+		if (ctx)
+			audit_sig_uid = ctx->loginuid;
+		else
+			audit_sig_uid = tsk->uid;
+		selinux_get_task_sid(tsk, &audit_sig_sid);
 	}
 }
diff --git a/kernel/signal.c b/kernel/signal.c
index e5f8aea78ffe..1b3c921737e2 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -23,12 +23,12 @@
 #include <linux/syscalls.h>
 #include <linux/ptrace.h>
 #include <linux/signal.h>
-#include <linux/audit.h>
 #include <linux/capability.h>
 #include <asm/param.h>
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
 #include <asm/siginfo.h>
+#include "audit.h"	/* audit_signal_info() */
 
 /*
  * SLAB caches for signal bits.
-- 
cgit v1.2.3


From 3c66251e573219a0532a5a07381b2f60a412d9eb Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 6 May 2006 08:26:27 -0400
Subject: [PATCH] add filtering by ppid

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/audit.h | 1 +
 kernel/auditsc.c      | 4 ++++
 2 files changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/audit.h b/include/linux/audit.h
index 1057e90bd3e3..8f6424f2b604 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -152,6 +152,7 @@
 #define AUDIT_SE_TYPE	15	/* security label type */
 #define AUDIT_SE_SEN	16	/* security label sensitivity label */
 #define AUDIT_SE_CLR	17	/* security label clearance label */
+#define AUDIT_PPID	18
 
 				/* These are ONLY useful when checking
 				 * at syscall exit time (AUDIT_AT_EXIT). */
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 4fc3867fa25a..e4551659ad79 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -188,6 +188,10 @@ static int audit_filter_rules(struct task_struct *tsk,
 		case AUDIT_PID:
 			result = audit_comparator(tsk->pid, f->op, f->val);
 			break;
+		case AUDIT_PPID:
+			if (ctx)
+				result = audit_comparator(ctx->ppid, f->op, f->val);
+			break;
 		case AUDIT_UID:
 			result = audit_comparator(tsk->uid, f->op, f->val);
 			break;
-- 
cgit v1.2.3


From ac03221a4fdda9bfdabf99bcd129847f20fc1d80 Mon Sep 17 00:00:00 2001
From: Linda Knippers <linda.knippers@hp.com>
Date: Tue, 16 May 2006 22:03:48 -0400
Subject: [PATCH] update of IPC audit record cleanup

The following patch addresses most of the issues with the IPC_SET_PERM
records as described in:
https://www.redhat.com/archives/linux-audit/2006-May/msg00010.html
and addresses the comments I received on the record field names.

To summarize, I made the following changes:

1. Changed sys_msgctl() and semctl_down() so that an IPC_SET_PERM
   record is emitted in the failure case as well as the success case.
   This matches the behavior in sys_shmctl().  I could simplify the
   code in sys_msgctl() and semctl_down() slightly but it would mean
   that in some error cases we could get an IPC_SET_PERM record
   without an IPC record and that seemed odd.

2. No change to the IPC record type, given no feedback on the backward
   compatibility question.

3. Removed the qbytes field from the IPC record.  It wasn't being
   set and when audit_ipc_obj() is called from ipcperms(), the
   information isn't available.  If we want the information in the IPC
   record, more extensive changes will be necessary.  Since it only
   applies to message queues and it isn't really permission related, it
   doesn't seem worth it.

4. Removed the obj field from the IPC_SET_PERM record.  This means that
   the kern_ipc_perm argument is no longer needed.

5. Removed the spaces and renamed the IPC_SET_PERM field names.  Replaced iuid and
   igid fields with ouid and ogid in the IPC record.

I tested this with the lspp.22 kernel on an x86_64 box.  I believe it
applies cleanly on the latest kernel.

-- ljk

Signed-off-by: Linda Knippers <linda.knippers@hp.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/audit.h |  4 ++--
 ipc/msg.c             |  9 +++++----
 ipc/sem.c             |  8 +++++---
 ipc/shm.c             |  2 +-
 kernel/auditsc.c      | 22 +++++-----------------
 5 files changed, 18 insertions(+), 27 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/audit.h b/include/linux/audit.h
index 8f6424f2b604..da5f521be04b 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -325,7 +325,7 @@ extern void auditsc_get_stamp(struct audit_context *ctx,
 extern int  audit_set_loginuid(struct task_struct *task, uid_t loginuid);
 extern uid_t audit_get_loginuid(struct audit_context *ctx);
 extern int audit_ipc_obj(struct kern_ipc_perm *ipcp);
-extern int audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode, struct kern_ipc_perm *ipcp);
+extern int audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode);
 extern int audit_bprm(struct linux_binprm *bprm);
 extern int audit_socketcall(int nargs, unsigned long *args);
 extern int audit_sockaddr(int len, void *addr);
@@ -345,7 +345,7 @@ extern int audit_set_macxattr(const char *name);
 #define auditsc_get_stamp(c,t,s) do { BUG(); } while (0)
 #define audit_get_loginuid(c) ({ -1; })
 #define audit_ipc_obj(i) ({ 0; })
-#define audit_ipc_set_perm(q,u,g,m,i) ({ 0; })
+#define audit_ipc_set_perm(q,u,g,m) ({ 0; })
 #define audit_bprm(p) ({ 0; })
 #define audit_socketcall(n,a) ({ 0; })
 #define audit_sockaddr(len, addr) ({ 0; })
diff --git a/ipc/msg.c b/ipc/msg.c
index 7d1340ccb16b..00f015a092d2 100644
--- a/ipc/msg.c
+++ b/ipc/msg.c
@@ -454,6 +454,11 @@ asmlinkage long sys_msgctl (int msqid, int cmd, struct msqid_ds __user *buf)
 	err = audit_ipc_obj(ipcp);
 	if (err)
 		goto out_unlock_up;
+	if (cmd==IPC_SET) {
+		err = audit_ipc_set_perm(setbuf.qbytes, setbuf.uid, setbuf.gid, setbuf.mode);
+		if (err)
+			goto out_unlock_up;
+	}
 
 	err = -EPERM;
 	if (current->euid != ipcp->cuid && 
@@ -468,10 +473,6 @@ asmlinkage long sys_msgctl (int msqid, int cmd, struct msqid_ds __user *buf)
 	switch (cmd) {
 	case IPC_SET:
 	{
-		err = audit_ipc_set_perm(setbuf.qbytes, setbuf.uid, setbuf.gid, setbuf.mode, ipcp);
-		if (err)
-			goto out_unlock_up;
-
 		err = -EPERM;
 		if (setbuf.qbytes > msg_ctlmnb && !capable(CAP_SYS_RESOURCE))
 			goto out_unlock_up;
diff --git a/ipc/sem.c b/ipc/sem.c
index 7919f8ece6ba..fce0bc8b5ad6 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -828,6 +828,11 @@ static int semctl_down(int semid, int semnum, int cmd, int version, union semun
 	if (err)
 		goto out_unlock;
 
+	if (cmd == IPC_SET) {
+		err = audit_ipc_set_perm(0, setbuf.uid, setbuf.gid, setbuf.mode);
+		if (err)
+			goto out_unlock;
+	}
 	if (current->euid != ipcp->cuid && 
 	    current->euid != ipcp->uid && !capable(CAP_SYS_ADMIN)) {
 	    	err=-EPERM;
@@ -844,9 +849,6 @@ static int semctl_down(int semid, int semnum, int cmd, int version, union semun
 		err = 0;
 		break;
 	case IPC_SET:
-		err = audit_ipc_set_perm(0, setbuf.uid, setbuf.gid, setbuf.mode, ipcp);
-		if (err)
-			goto out_unlock;
 		ipcp->uid = setbuf.uid;
 		ipcp->gid = setbuf.gid;
 		ipcp->mode = (ipcp->mode & ~S_IRWXUGO)
diff --git a/ipc/shm.c b/ipc/shm.c
index 809896851902..4f133d24030f 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -643,7 +643,7 @@ asmlinkage long sys_shmctl (int shmid, int cmd, struct shmid_ds __user *buf)
 		err = audit_ipc_obj(&(shp->shm_perm));
 		if (err)
 			goto out_unlock_up;
-		err = audit_ipc_set_perm(0, setbuf.uid, setbuf.gid, setbuf.mode, &(shp->shm_perm));
+		err = audit_ipc_set_perm(0, setbuf.uid, setbuf.gid, setbuf.mode);
 		if (err)
 			goto out_unlock_up;
 		err=-EPERM;
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index e4551659ad79..fa4bf9625456 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -648,8 +648,8 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 		case AUDIT_IPC: {
 			struct audit_aux_data_ipcctl *axi = (void *)aux;
 			audit_log_format(ab, 
-				 " qbytes=%lx iuid=%u igid=%u mode=%x",
-				 axi->qbytes, axi->uid, axi->gid, axi->mode);
+				 "ouid=%u ogid=%u mode=%x",
+				 axi->uid, axi->gid, axi->mode);
 			if (axi->osid != 0) {
 				char *ctx = NULL;
 				u32 len;
@@ -667,21 +667,10 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 		case AUDIT_IPC_SET_PERM: {
 			struct audit_aux_data_ipcctl *axi = (void *)aux;
 			audit_log_format(ab,
-				" new qbytes=%lx new iuid=%u new igid=%u new mode=%x",
+				"qbytes=%lx ouid=%u ogid=%u mode=%x",
 				axi->qbytes, axi->uid, axi->gid, axi->mode);
-			if (axi->osid != 0) {
-				char *ctx = NULL;
-				u32 len;
-				if (selinux_ctxid_to_string(
-						axi->osid, &ctx, &len)) {
-					audit_log_format(ab, " osid=%u",
-							axi->osid);
-					call_panic = 1;
-				} else
-					audit_log_format(ab, " obj=%s", ctx);
-				kfree(ctx);
-			}
 			break; }
+
 		case AUDIT_EXECVE: {
 			struct audit_aux_data_execve *axi = (void *)aux;
 			int i;
@@ -1232,7 +1221,7 @@ int audit_ipc_obj(struct kern_ipc_perm *ipcp)
  *
  * Returns 0 for success or NULL context or < 0 on error.
  */
-int audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode, struct kern_ipc_perm *ipcp)
+int audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode)
 {
 	struct audit_aux_data_ipcctl *ax;
 	struct audit_context *context = current->audit_context;
@@ -1248,7 +1237,6 @@ int audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode,
 	ax->uid = uid;
 	ax->gid = gid;
 	ax->mode = mode;
-	selinux_get_ipc_sid(ipcp, &ax->osid);
 
 	ax->d.type = AUDIT_IPC_SET_PERM;
 	ax->d.next = context->aux;
-- 
cgit v1.2.3


From d8945bb51a2bb6623cfa36b9ff63594f46d513aa Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 18 May 2006 16:01:30 -0400
Subject: [PATCH] inline more audit helpers

pull checks for ->audit_context into inlined wrappers

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/audit.h | 24 +++++++++++++++++++++---
 kernel/auditsc.c      | 14 ++++----------
 2 files changed, 25 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/audit.h b/include/linux/audit.h
index da5f521be04b..4b62743b2e6d 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -301,11 +301,16 @@ extern void audit_syscall_entry(int arch,
 				int major, unsigned long a0, unsigned long a1,
 				unsigned long a2, unsigned long a3);
 extern void audit_syscall_exit(int failed, long return_code);
-extern void audit_getname(const char *name);
+extern void __audit_getname(const char *name);
 extern void audit_putname(const char *name);
 extern void __audit_inode(const char *name, const struct inode *inode, unsigned flags);
 extern void __audit_inode_child(const char *dname, const struct inode *inode,
 				unsigned long pino);
+static inline void audit_getname(const char *name)
+{
+	if (unlikely(current->audit_context))
+		__audit_getname(name);
+}
 static inline void audit_inode(const char *name, const struct inode *inode,
 			       unsigned flags) {
 	if (unlikely(current->audit_context))
@@ -324,13 +329,26 @@ extern void auditsc_get_stamp(struct audit_context *ctx,
 			      struct timespec *t, unsigned int *serial);
 extern int  audit_set_loginuid(struct task_struct *task, uid_t loginuid);
 extern uid_t audit_get_loginuid(struct audit_context *ctx);
-extern int audit_ipc_obj(struct kern_ipc_perm *ipcp);
-extern int audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode);
+extern int __audit_ipc_obj(struct kern_ipc_perm *ipcp);
+extern int __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode);
 extern int audit_bprm(struct linux_binprm *bprm);
 extern int audit_socketcall(int nargs, unsigned long *args);
 extern int audit_sockaddr(int len, void *addr);
 extern int audit_avc_path(struct dentry *dentry, struct vfsmount *mnt);
 extern int audit_set_macxattr(const char *name);
+
+static inline int audit_ipc_obj(struct kern_ipc_perm *ipcp)
+{
+	if (unlikely(current->audit_context))
+		return __audit_ipc_obj(ipcp);
+	return 0;
+}
+static inline int audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode)
+{
+	if (unlikely(current->audit_context))
+		return __audit_ipc_set_perm(qbytes, uid, gid, mode);
+	return 0;
+}
 #else
 #define audit_alloc(t) ({ 0; })
 #define audit_free(t) do { ; } while (0)
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index fa4bf9625456..05d31ee4f3dd 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -922,11 +922,11 @@ void audit_syscall_exit(int valid, long return_code)
  * Add a name to the list of audit names for this context.
  * Called from fs/namei.c:getname().
  */
-void audit_getname(const char *name)
+void __audit_getname(const char *name)
 {
 	struct audit_context *context = current->audit_context;
 
-	if (!context || IS_ERR(name) || !name)
+	if (IS_ERR(name) || !name)
 		return;
 
 	if (!context->in_syscall) {
@@ -1189,14 +1189,11 @@ uid_t audit_get_loginuid(struct audit_context *ctx)
  *
  * Returns 0 for success or NULL context or < 0 on error.
  */
-int audit_ipc_obj(struct kern_ipc_perm *ipcp)
+int __audit_ipc_obj(struct kern_ipc_perm *ipcp)
 {
 	struct audit_aux_data_ipcctl *ax;
 	struct audit_context *context = current->audit_context;
 
-	if (likely(!context))
-		return 0;
-
 	ax = kmalloc(sizeof(*ax), GFP_ATOMIC);
 	if (!ax)
 		return -ENOMEM;
@@ -1221,14 +1218,11 @@ int audit_ipc_obj(struct kern_ipc_perm *ipcp)
  *
  * Returns 0 for success or NULL context or < 0 on error.
  */
-int audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode)
+int __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode)
 {
 	struct audit_aux_data_ipcctl *ax;
 	struct audit_context *context = current->audit_context;
 
-	if (likely(!context))
-		return 0;
-
 	ax = kmalloc(sizeof(*ax), GFP_ATOMIC);
 	if (!ax)
 		return -ENOMEM;
-- 
cgit v1.2.3


From 20ca73bc792be9625af184cbec36e1372611d1c3 Mon Sep 17 00:00:00 2001
From: "George C. Wilson" <ltcgcw@us.ibm.com>
Date: Wed, 24 May 2006 16:09:55 -0500
Subject: [PATCH] Audit of POSIX Message Queue Syscalls v.2

This patch adds audit support to POSIX message queues.  It applies cleanly to
the lspp.b15 branch of Al Viro's git tree.  There are new auxiliary data
structures, and collection and emission routines in kernel/auditsc.c.  New hooks
in ipc/mqueue.c collect arguments from the syscalls.

I tested the patch by building the examples from the POSIX MQ library tarball.
Build them -lrt, not against the old MQ library in the tarball.  Here's the URL:
http://www.geocities.com/wronski12/posix_ipc/libmqueue-4.41.tar.gz
Do auditctl -a exit,always -S for mq_open, mq_timedsend, mq_timedreceive,
mq_notify, mq_getsetattr.  mq_unlink has no new hooks.  Please see the
corresponding userspace patch to get correct output from auditd for the new
record types.

[fixes folded]

Signed-off-by: George Wilson <ltcgcw@us.ibm.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/audit.h |  46 +++++++++
 ipc/mqueue.c          |  22 ++++
 kernel/auditsc.c      | 274 +++++++++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 341 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/audit.h b/include/linux/audit.h
index 4b62743b2e6d..7c8780b150e6 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -85,6 +85,10 @@
 #define AUDIT_CWD		1307	/* Current working directory */
 #define AUDIT_EXECVE		1309	/* execve arguments */
 #define AUDIT_IPC_SET_PERM	1311	/* IPC new permissions record type */
+#define AUDIT_MQ_OPEN		1312	/* POSIX MQ open record type */
+#define AUDIT_MQ_SENDRECV	1313	/* POSIX MQ send/receive record type */
+#define AUDIT_MQ_NOTIFY		1314	/* POSIX MQ notify record type */
+#define AUDIT_MQ_GETSETATTR	1315	/* POSIX MQ get/set attribute record type */
 
 #define AUDIT_AVC		1400	/* SE Linux avc denial or grant */
 #define AUDIT_SELINUX_ERR	1401	/* Internal SE Linux Errors */
@@ -287,6 +291,8 @@ struct audit_context;
 struct inode;
 struct netlink_skb_parms;
 struct linux_binprm;
+struct mq_attr;
+struct mqstat;
 
 #define AUDITSC_INVALID 0
 #define AUDITSC_SUCCESS 1
@@ -336,6 +342,11 @@ extern int audit_socketcall(int nargs, unsigned long *args);
 extern int audit_sockaddr(int len, void *addr);
 extern int audit_avc_path(struct dentry *dentry, struct vfsmount *mnt);
 extern int audit_set_macxattr(const char *name);
+extern int __audit_mq_open(int oflag, mode_t mode, struct mq_attr __user *u_attr);
+extern int __audit_mq_timedsend(mqd_t mqdes, size_t msg_len, unsigned int msg_prio, const struct timespec __user *u_abs_timeout);
+extern int __audit_mq_timedreceive(mqd_t mqdes, size_t msg_len, unsigned int __user *u_msg_prio, const struct timespec __user *u_abs_timeout);
+extern int __audit_mq_notify(mqd_t mqdes, const struct sigevent __user *u_notification);
+extern int __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat);
 
 static inline int audit_ipc_obj(struct kern_ipc_perm *ipcp)
 {
@@ -349,6 +360,36 @@ static inline int audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid,
 		return __audit_ipc_set_perm(qbytes, uid, gid, mode);
 	return 0;
 }
+static inline int audit_mq_open(int oflag, mode_t mode, struct mq_attr __user *u_attr)
+{
+	if (unlikely(current->audit_context))
+		return __audit_mq_open(oflag, mode, u_attr);
+	return 0;
+}
+static inline int audit_mq_timedsend(mqd_t mqdes, size_t msg_len, unsigned int msg_prio, const struct timespec __user *u_abs_timeout)
+{
+	if (unlikely(current->audit_context))
+		return __audit_mq_timedsend(mqdes, msg_len, msg_prio, u_abs_timeout);
+	return 0;
+}
+static inline int audit_mq_timedreceive(mqd_t mqdes, size_t msg_len, unsigned int __user *u_msg_prio, const struct timespec __user *u_abs_timeout)
+{
+	if (unlikely(current->audit_context))
+		return __audit_mq_timedreceive(mqdes, msg_len, u_msg_prio, u_abs_timeout);
+	return 0;
+}
+static inline int audit_mq_notify(mqd_t mqdes, const struct sigevent __user *u_notification)
+{
+	if (unlikely(current->audit_context))
+		return __audit_mq_notify(mqdes, u_notification);
+	return 0;
+}
+static inline int audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat)
+{
+	if (unlikely(current->audit_context))
+		return __audit_mq_getsetattr(mqdes, mqstat);
+	return 0;
+}
 #else
 #define audit_alloc(t) ({ 0; })
 #define audit_free(t) do { ; } while (0)
@@ -369,6 +410,11 @@ static inline int audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid,
 #define audit_sockaddr(len, addr) ({ 0; })
 #define audit_avc_path(dentry, mnt) ({ 0; })
 #define audit_set_macxattr(n) do { ; } while (0)
+#define audit_mq_open(o,m,a) ({ 0; })
+#define audit_mq_timedsend(d,l,p,t) ({ 0; })
+#define audit_mq_timedreceive(d,l,p,t) ({ 0; })
+#define audit_mq_notify(d,n) ({ 0; })
+#define audit_mq_getsetattr(d,s) ({ 0; })
 #endif
 
 #ifdef CONFIG_AUDIT
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index 41ecbd440fed..1511714a9585 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -8,6 +8,8 @@
  * Lockless receive & send, fd based notify:
  * 			    Manfred Spraul	    (manfred@colorfullife.com)
  *
+ * Audit:                   George Wilson           (ltcgcw@us.ibm.com)
+ *
  * This file is released under the GPL.
  */
 
@@ -24,6 +26,7 @@
 #include <linux/skbuff.h>
 #include <linux/netlink.h>
 #include <linux/syscalls.h>
+#include <linux/audit.h>
 #include <linux/signal.h>
 #include <linux/mutex.h>
 
@@ -657,6 +660,10 @@ asmlinkage long sys_mq_open(const char __user *u_name, int oflag, mode_t mode,
 	char *name;
 	int fd, error;
 
+	error = audit_mq_open(oflag, mode, u_attr);
+	if (error != 0)
+		return error;
+
 	if (IS_ERR(name = getname(u_name)))
 		return PTR_ERR(name);
 
@@ -814,6 +821,10 @@ asmlinkage long sys_mq_timedsend(mqd_t mqdes, const char __user *u_msg_ptr,
 	long timeout;
 	int ret;
 
+	ret = audit_mq_timedsend(mqdes, msg_len, msg_prio, u_abs_timeout);
+	if (ret != 0)
+		return ret;
+
 	if (unlikely(msg_prio >= (unsigned long) MQ_PRIO_MAX))
 		return -EINVAL;
 
@@ -896,6 +907,10 @@ asmlinkage ssize_t sys_mq_timedreceive(mqd_t mqdes, char __user *u_msg_ptr,
 	struct mqueue_inode_info *info;
 	struct ext_wait_queue wait;
 
+	ret = audit_mq_timedreceive(mqdes, msg_len, u_msg_prio, u_abs_timeout);
+	if (ret != 0)
+		return ret;
+
 	timeout = prepare_timeout(u_abs_timeout);
 
 	ret = -EBADF;
@@ -975,6 +990,10 @@ asmlinkage long sys_mq_notify(mqd_t mqdes,
 	struct mqueue_inode_info *info;
 	struct sk_buff *nc;
 
+	ret = audit_mq_notify(mqdes, u_notification);
+	if (ret != 0)
+		return ret;
+
 	nc = NULL;
 	sock = NULL;
 	if (u_notification != NULL) {
@@ -1115,6 +1134,9 @@ asmlinkage long sys_mq_getsetattr(mqd_t mqdes,
 	omqstat = info->attr;
 	omqstat.mq_flags = filp->f_flags & O_NONBLOCK;
 	if (u_mqstat) {
+		ret = audit_mq_getsetattr(mqdes, &mqstat);
+		if (ret != 0)
+			goto out;
 		if (mqstat.mq_flags & O_NONBLOCK)
 			filp->f_flags |= O_NONBLOCK;
 		else
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 4503c4663cf8..14e295a4121b 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -3,7 +3,7 @@
  *
  * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina.
  * Copyright 2005 Hewlett-Packard Development Company, L.P.
- * Copyright (C) 2005 IBM Corporation
+ * Copyright (C) 2005, 2006 IBM Corporation
  * All Rights Reserved.
  *
  * This program is free software; you can redistribute it and/or modify
@@ -29,6 +29,9 @@
  * this file -- see entry.S) is based on a GPL'd patch written by
  * okir@suse.de and Copyright 2003 SuSE Linux AG.
  *
+ * POSIX message queue support added by George Wilson <ltcgcw@us.ibm.com>,
+ * 2006.
+ *
  * The support of additional filter rules compares (>, <, >=, <=) was
  * added by Dustin Kirkland <dustin.kirkland@us.ibm.com>, 2005.
  *
@@ -49,6 +52,7 @@
 #include <linux/module.h>
 #include <linux/mount.h>
 #include <linux/socket.h>
+#include <linux/mqueue.h>
 #include <linux/audit.h>
 #include <linux/personality.h>
 #include <linux/time.h>
@@ -102,6 +106,33 @@ struct audit_aux_data {
 
 #define AUDIT_AUX_IPCPERM	0
 
+struct audit_aux_data_mq_open {
+	struct audit_aux_data	d;
+	int			oflag;
+	mode_t			mode;
+	struct mq_attr		attr;
+};
+
+struct audit_aux_data_mq_sendrecv {
+	struct audit_aux_data	d;
+	mqd_t			mqdes;
+	size_t			msg_len;
+	unsigned int		msg_prio;
+	struct timespec		abs_timeout;
+};
+
+struct audit_aux_data_mq_notify {
+	struct audit_aux_data	d;
+	mqd_t			mqdes;
+	struct sigevent 	notification;
+};
+
+struct audit_aux_data_mq_getsetattr {
+	struct audit_aux_data	d;
+	mqd_t			mqdes;
+	struct mq_attr 		mqstat;
+};
+
 struct audit_aux_data_ipcctl {
 	struct audit_aux_data	d;
 	struct ipc_perm		p;
@@ -644,6 +675,43 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 			continue; /* audit_panic has been called */
 
 		switch (aux->type) {
+		case AUDIT_MQ_OPEN: {
+			struct audit_aux_data_mq_open *axi = (void *)aux;
+			audit_log_format(ab,
+				"oflag=0x%x mode=%#o mq_flags=0x%lx mq_maxmsg=%ld "
+				"mq_msgsize=%ld mq_curmsgs=%ld",
+				axi->oflag, axi->mode, axi->attr.mq_flags,
+				axi->attr.mq_maxmsg, axi->attr.mq_msgsize,
+				axi->attr.mq_curmsgs);
+			break; }
+
+		case AUDIT_MQ_SENDRECV: {
+			struct audit_aux_data_mq_sendrecv *axi = (void *)aux;
+			audit_log_format(ab,
+				"mqdes=%d msg_len=%zd msg_prio=%u "
+				"abs_timeout_sec=%ld abs_timeout_nsec=%ld",
+				axi->mqdes, axi->msg_len, axi->msg_prio,
+				axi->abs_timeout.tv_sec, axi->abs_timeout.tv_nsec);
+			break; }
+
+		case AUDIT_MQ_NOTIFY: {
+			struct audit_aux_data_mq_notify *axi = (void *)aux;
+			audit_log_format(ab,
+				"mqdes=%d sigev_signo=%d",
+				axi->mqdes,
+				axi->notification.sigev_signo);
+			break; }
+
+		case AUDIT_MQ_GETSETATTR: {
+			struct audit_aux_data_mq_getsetattr *axi = (void *)aux;
+			audit_log_format(ab,
+				"mqdes=%d mq_flags=0x%lx mq_maxmsg=%ld mq_msgsize=%ld "
+				"mq_curmsgs=%ld ",
+				axi->mqdes,
+				axi->mqstat.mq_flags, axi->mqstat.mq_maxmsg,
+				axi->mqstat.mq_msgsize, axi->mqstat.mq_curmsgs);
+			break; }
+
 		case AUDIT_IPC: {
 			struct audit_aux_data_ipcctl *axi = (void *)aux;
 			audit_log_format(ab, 
@@ -1182,6 +1250,210 @@ uid_t audit_get_loginuid(struct audit_context *ctx)
 	return ctx ? ctx->loginuid : -1;
 }
 
+/**
+ * __audit_mq_open - record audit data for a POSIX MQ open
+ * @oflag: open flag
+ * @mode: mode bits
+ * @u_attr: queue attributes
+ *
+ * Returns 0 for success or NULL context or < 0 on error.
+ */
+int __audit_mq_open(int oflag, mode_t mode, struct mq_attr __user *u_attr)
+{
+	struct audit_aux_data_mq_open *ax;
+	struct audit_context *context = current->audit_context;
+
+	if (!audit_enabled)
+		return 0;
+
+	if (likely(!context))
+		return 0;
+
+	ax = kmalloc(sizeof(*ax), GFP_ATOMIC);
+	if (!ax)
+		return -ENOMEM;
+
+	if (u_attr != NULL) {
+		if (copy_from_user(&ax->attr, u_attr, sizeof(ax->attr))) {
+			kfree(ax);
+			return -EFAULT;
+		}
+	} else
+		memset(&ax->attr, 0, sizeof(ax->attr));
+
+	ax->oflag = oflag;
+	ax->mode = mode;
+
+	ax->d.type = AUDIT_MQ_OPEN;
+	ax->d.next = context->aux;
+	context->aux = (void *)ax;
+	return 0;
+}
+
+/**
+ * __audit_mq_timedsend - record audit data for a POSIX MQ timed send
+ * @mqdes: MQ descriptor
+ * @msg_len: Message length
+ * @msg_prio: Message priority
+ * @abs_timeout: Message timeout in absolute time
+ *
+ * Returns 0 for success or NULL context or < 0 on error.
+ */
+int __audit_mq_timedsend(mqd_t mqdes, size_t msg_len, unsigned int msg_prio,
+			const struct timespec __user *u_abs_timeout)
+{
+	struct audit_aux_data_mq_sendrecv *ax;
+	struct audit_context *context = current->audit_context;
+
+	if (!audit_enabled)
+		return 0;
+
+	if (likely(!context))
+		return 0;
+
+	ax = kmalloc(sizeof(*ax), GFP_ATOMIC);
+	if (!ax)
+		return -ENOMEM;
+
+	if (u_abs_timeout != NULL) {
+		if (copy_from_user(&ax->abs_timeout, u_abs_timeout, sizeof(ax->abs_timeout))) {
+			kfree(ax);
+			return -EFAULT;
+		}
+	} else
+		memset(&ax->abs_timeout, 0, sizeof(ax->abs_timeout));
+
+	ax->mqdes = mqdes;
+	ax->msg_len = msg_len;
+	ax->msg_prio = msg_prio;
+
+	ax->d.type = AUDIT_MQ_SENDRECV;
+	ax->d.next = context->aux;
+	context->aux = (void *)ax;
+	return 0;
+}
+
+/**
+ * __audit_mq_timedreceive - record audit data for a POSIX MQ timed receive
+ * @mqdes: MQ descriptor
+ * @msg_len: Message length
+ * @msg_prio: Message priority
+ * @abs_timeout: Message timeout in absolute time
+ *
+ * Returns 0 for success or NULL context or < 0 on error.
+ */
+int __audit_mq_timedreceive(mqd_t mqdes, size_t msg_len,
+				unsigned int __user *u_msg_prio,
+				const struct timespec __user *u_abs_timeout)
+{
+	struct audit_aux_data_mq_sendrecv *ax;
+	struct audit_context *context = current->audit_context;
+
+	if (!audit_enabled)
+		return 0;
+
+	if (likely(!context))
+		return 0;
+
+	ax = kmalloc(sizeof(*ax), GFP_ATOMIC);
+	if (!ax)
+		return -ENOMEM;
+
+	if (u_msg_prio != NULL) {
+		if (get_user(ax->msg_prio, u_msg_prio)) {
+			kfree(ax);
+			return -EFAULT;
+		}
+	} else
+		ax->msg_prio = 0;
+
+	if (u_abs_timeout != NULL) {
+		if (copy_from_user(&ax->abs_timeout, u_abs_timeout, sizeof(ax->abs_timeout))) {
+			kfree(ax);
+			return -EFAULT;
+		}
+	} else
+		memset(&ax->abs_timeout, 0, sizeof(ax->abs_timeout));
+
+	ax->mqdes = mqdes;
+	ax->msg_len = msg_len;
+
+	ax->d.type = AUDIT_MQ_SENDRECV;
+	ax->d.next = context->aux;
+	context->aux = (void *)ax;
+	return 0;
+}
+
+/**
+ * __audit_mq_notify - record audit data for a POSIX MQ notify
+ * @mqdes: MQ descriptor
+ * @u_notification: Notification event
+ *
+ * Returns 0 for success or NULL context or < 0 on error.
+ */
+
+int __audit_mq_notify(mqd_t mqdes, const struct sigevent __user *u_notification)
+{
+	struct audit_aux_data_mq_notify *ax;
+	struct audit_context *context = current->audit_context;
+
+	if (!audit_enabled)
+		return 0;
+
+	if (likely(!context))
+		return 0;
+
+	ax = kmalloc(sizeof(*ax), GFP_ATOMIC);
+	if (!ax)
+		return -ENOMEM;
+
+	if (u_notification != NULL) {
+		if (copy_from_user(&ax->notification, u_notification, sizeof(ax->notification))) {
+			kfree(ax);
+			return -EFAULT;
+		}
+	} else
+		memset(&ax->notification, 0, sizeof(ax->notification));
+
+	ax->mqdes = mqdes;
+
+	ax->d.type = AUDIT_MQ_NOTIFY;
+	ax->d.next = context->aux;
+	context->aux = (void *)ax;
+	return 0;
+}
+
+/**
+ * __audit_mq_getsetattr - record audit data for a POSIX MQ get/set attribute
+ * @mqdes: MQ descriptor
+ * @mqstat: MQ flags
+ *
+ * Returns 0 for success or NULL context or < 0 on error.
+ */
+int __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat)
+{
+	struct audit_aux_data_mq_getsetattr *ax;
+	struct audit_context *context = current->audit_context;
+
+	if (!audit_enabled)
+		return 0;
+
+	if (likely(!context))
+		return 0;
+
+	ax = kmalloc(sizeof(*ax), GFP_ATOMIC);
+	if (!ax)
+		return -ENOMEM;
+
+	ax->mqdes = mqdes;
+	ax->mqstat = *mqstat;
+
+	ax->d.type = AUDIT_MQ_GETSETATTR;
+	ax->d.next = context->aux;
+	context->aux = (void *)ax;
+	return 0;
+}
+
 /**
  * audit_ipc_obj - record audit data for ipc object
  * @ipcp: ipc permissions
-- 
cgit v1.2.3


From f368c07d7214a7c41dfceb76c8db473b850f0229 Mon Sep 17 00:00:00 2001
From: Amy Griffis <amy.griffis@hp.com>
Date: Fri, 7 Apr 2006 16:55:56 -0400
Subject: [PATCH] audit: path-based rules

In this implementation, audit registers inotify watches on the parent
directories of paths specified in audit rules.  When audit's inotify
event handler is called, it updates any affected rules based on the
filesystem event.  If the parent directory is renamed, removed, or its
filesystem is unmounted, audit removes all rules referencing that
inotify watch.

To keep things simple, this implementation limits location-based
auditing to the directory entries in an existing directory.  Given
a path-based rule for /foo/bar/passwd, the following table applies:

    passwd modified -- audit event logged
    passwd replaced -- audit event logged, rules list updated
    bar renamed     -- rule removed
    foo renamed     -- untracked, meaning that the rule now applies to
		       the new location

Audit users typically want to have many rules referencing filesystem
objects, which can significantly impact filtering performance.  This
patch also adds an inode-number-based rule hash to mitigate this
situation.

The patch is relative to the audit git tree:
http://kernel.org/git/?p=linux/kernel/git/viro/audit-current.git;a=summary
and uses the inotify kernel API:
http://lkml.org/lkml/2006/6/1/145

Signed-off-by: Amy Griffis <amy.griffis@hp.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/audit.h |   1 +
 init/Kconfig          |   3 +-
 kernel/audit.c        |  41 ++-
 kernel/audit.h        |  38 ++-
 kernel/auditfilter.c  | 785 +++++++++++++++++++++++++++++++++++++++++++++++---
 kernel/auditsc.c      | 124 +++++---
 6 files changed, 903 insertions(+), 89 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/audit.h b/include/linux/audit.h
index 7c8780b150e6..c78327507f4e 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -165,6 +165,7 @@
 #define AUDIT_INODE	102
 #define AUDIT_EXIT	103
 #define AUDIT_SUCCESS   104	/* exit >= 0; value ignored */
+#define AUDIT_WATCH	105
 
 #define AUDIT_ARG0      200
 #define AUDIT_ARG1      (AUDIT_ARG0+1)
diff --git a/init/Kconfig b/init/Kconfig
index 3b36a1d53656..c4d0fa655d5d 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -182,7 +182,8 @@ config AUDITSYSCALL
 	help
 	  Enable low-overhead system-call auditing infrastructure that
 	  can be used independently or with another kernel subsystem,
-	  such as SELinux.
+	  such as SELinux.  To use audit's filesystem watch feature, please
+	  ensure that INOTIFY is configured.
 
 config IKCONFIG
 	bool "Kernel .config support"
diff --git a/kernel/audit.c b/kernel/audit.c
index 0738a4b290e6..0fbf1c116363 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -56,6 +56,7 @@
 #include <linux/skbuff.h>
 #include <linux/netlink.h>
 #include <linux/selinux.h>
+#include <linux/inotify.h>
 
 #include "audit.h"
 
@@ -103,6 +104,12 @@ static atomic_t    audit_lost = ATOMIC_INIT(0);
 /* The netlink socket. */
 static struct sock *audit_sock;
 
+/* Inotify handle. */
+struct inotify_handle *audit_ih;
+
+/* Hash for inode-based rules */
+struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS];
+
 /* The audit_freelist is a list of pre-allocated audit buffers (if more
  * than AUDIT_MAXFREE are in use, the audit buffer is freed instead of
  * being placed on the freelist). */
@@ -115,10 +122,8 @@ static struct task_struct *kauditd_task;
 static DECLARE_WAIT_QUEUE_HEAD(kauditd_wait);
 static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait);
 
-/* The netlink socket is only to be read by 1 CPU, which lets us assume
- * that list additions and deletions never happen simultaneously in
- * auditsc.c */
-DEFINE_MUTEX(audit_netlink_mutex);
+/* Serialize requests from userspace. */
+static DEFINE_MUTEX(audit_cmd_mutex);
 
 /* AUDIT_BUFSIZ is the size of the temporary buffer used for formatting
  * audit records.  Since printk uses a 1024 byte buffer, this buffer
@@ -373,8 +378,8 @@ int audit_send_list(void *_dest)
 	struct sk_buff *skb;
 
 	/* wait for parent to finish and send an ACK */
-	mutex_lock(&audit_netlink_mutex);
-	mutex_unlock(&audit_netlink_mutex);
+	mutex_lock(&audit_cmd_mutex);
+	mutex_unlock(&audit_cmd_mutex);
 
 	while ((skb = __skb_dequeue(&dest->q)) != NULL)
 		netlink_unicast(audit_sock, skb, pid, 0);
@@ -665,20 +670,30 @@ static void audit_receive(struct sock *sk, int length)
 	struct sk_buff  *skb;
 	unsigned int qlen;
 
-	mutex_lock(&audit_netlink_mutex);
+	mutex_lock(&audit_cmd_mutex);
 
 	for (qlen = skb_queue_len(&sk->sk_receive_queue); qlen; qlen--) {
 		skb = skb_dequeue(&sk->sk_receive_queue);
 		audit_receive_skb(skb);
 		kfree_skb(skb);
 	}
-	mutex_unlock(&audit_netlink_mutex);
+	mutex_unlock(&audit_cmd_mutex);
 }
 
+#ifdef CONFIG_AUDITSYSCALL
+static const struct inotify_operations audit_inotify_ops = {
+	.handle_event	= audit_handle_ievent,
+	.destroy_watch	= audit_free_parent,
+};
+#endif
 
 /* Initialize audit support at boot time. */
 static int __init audit_init(void)
 {
+#ifdef CONFIG_AUDITSYSCALL
+	int i;
+#endif
+
 	printk(KERN_INFO "audit: initializing netlink socket (%s)\n",
 	       audit_default ? "enabled" : "disabled");
 	audit_sock = netlink_kernel_create(NETLINK_AUDIT, 0, audit_receive,
@@ -697,6 +712,16 @@ static int __init audit_init(void)
 	selinux_audit_set_callback(&selinux_audit_rule_update);
 
 	audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized");
+
+#ifdef CONFIG_AUDITSYSCALL
+	audit_ih = inotify_init(&audit_inotify_ops);
+	if (IS_ERR(audit_ih))
+		audit_panic("cannot initialize inotify handle");
+
+	for (i = 0; i < AUDIT_INODE_BUCKETS; i++)
+		INIT_LIST_HEAD(&audit_inode_hash[i]);
+#endif
+
 	return 0;
 }
 __initcall(audit_init);
diff --git a/kernel/audit.h b/kernel/audit.h
index 52cb1e31d522..58fa44cb8d01 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -19,7 +19,6 @@
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
-#include <linux/mutex.h>
 #include <linux/fs.h>
 #include <linux/audit.h>
 #include <linux/skbuff.h>
@@ -54,6 +53,18 @@ enum audit_state {
 };
 
 /* Rule lists */
+struct audit_parent;
+
+struct audit_watch {
+	atomic_t		count;	/* reference count */
+	char			*path;	/* insertion path */
+	dev_t			dev;	/* associated superblock device */
+	unsigned long		ino;	/* associated inode number */
+	struct audit_parent	*parent; /* associated parent */
+	struct list_head	wlist;	/* entry in parent->watches list */
+	struct list_head	rules;	/* associated rules */
+};
+
 struct audit_field {
 	u32				type;
 	u32				val;
@@ -71,6 +82,9 @@ struct audit_krule {
 	u32			buflen; /* for data alloc on list rules */
 	u32			field_count;
 	struct audit_field	*fields;
+	struct audit_field	*inode_f; /* quick access to an inode field */
+	struct audit_watch	*watch;	/* associated watch */
+	struct list_head	rlist;	/* entry in audit_watch.rules list */
 };
 
 struct audit_entry {
@@ -79,10 +93,18 @@ struct audit_entry {
 	struct audit_krule	rule;
 };
 
-
 extern int audit_pid;
-extern int audit_comparator(const u32 left, const u32 op, const u32 right);
 
+#define AUDIT_INODE_BUCKETS	32
+extern struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS];
+
+static inline int audit_hash_ino(u32 ino)
+{
+	return (ino & (AUDIT_INODE_BUCKETS-1));
+}
+
+extern int audit_comparator(const u32 left, const u32 op, const u32 right);
+extern int audit_compare_dname_path(const char *dname, const char *path);
 extern struct sk_buff *	    audit_make_reply(int pid, int seq, int type,
 					     int done, int multi,
 					     void *payload, int size);
@@ -91,7 +113,6 @@ extern void		    audit_send_reply(int pid, int seq, int type,
 					     void *payload, int size);
 extern void		    audit_log_lost(const char *message);
 extern void		    audit_panic(const char *message);
-extern struct mutex audit_netlink_mutex;
 
 struct audit_netlink_list {
 	int pid;
@@ -100,6 +121,10 @@ struct audit_netlink_list {
 
 int audit_send_list(void *);
 
+struct inotify_watch;
+extern void audit_free_parent(struct inotify_watch *);
+extern void audit_handle_ievent(struct inotify_watch *, u32, u32, u32,
+				const char *, struct inode *);
 extern int selinux_audit_rule_update(void);
 
 #ifdef CONFIG_AUDITSYSCALL
@@ -109,6 +134,11 @@ static inline void audit_signal_info(int sig, struct task_struct *t)
 	if (unlikely(audit_pid && t->tgid == audit_pid))
 		__audit_signal_info(sig, t);
 }
+extern enum audit_state audit_filter_inodes(struct task_struct *,
+					    struct audit_context *);
+extern void audit_set_auditable(struct audit_context *);
 #else
 #define audit_signal_info(s,t)
+#define audit_filter_inodes(t,c) AUDIT_DISABLED
+#define audit_set_auditable(c)
 #endif
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index df9503da40fb..03a6919103d4 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -22,13 +22,59 @@
 #include <linux/kernel.h>
 #include <linux/audit.h>
 #include <linux/kthread.h>
+#include <linux/mutex.h>
+#include <linux/fs.h>
+#include <linux/namei.h>
 #include <linux/netlink.h>
+#include <linux/sched.h>
+#include <linux/inotify.h>
 #include <linux/selinux.h>
 #include "audit.h"
 
-/* There are three lists of rules -- one to search at task creation
- * time, one to search at syscall entry time, and another to search at
- * syscall exit time. */
+/*
+ * Locking model:
+ *
+ * audit_filter_mutex:
+ * 		Synchronizes writes and blocking reads of audit's filterlist
+ * 		data.  Rcu is used to traverse the filterlist and access
+ * 		contents of structs audit_entry, audit_watch and opaque
+ * 		selinux rules during filtering.  If modified, these structures
+ * 		must be copied and replace their counterparts in the filterlist.
+ * 		An audit_parent struct is not accessed during filtering, so may
+ * 		be written directly provided audit_filter_mutex is held.
+ */
+
+/*
+ * Reference counting:
+ *
+ * audit_parent: lifetime is from audit_init_parent() to receipt of an IN_IGNORED
+ * 	event.  Each audit_watch holds a reference to its associated parent.
+ *
+ * audit_watch: if added to lists, lifetime is from audit_init_watch() to
+ * 	audit_remove_watch().  Additionally, an audit_watch may exist
+ * 	temporarily to assist in searching existing filter data.  Each
+ * 	audit_krule holds a reference to its associated watch.
+ */
+
+struct audit_parent {
+	struct list_head	ilist;	/* entry in inotify registration list */
+	struct list_head	watches; /* associated watches */
+	struct inotify_watch	wdata;	/* inotify watch data */
+	unsigned		flags;	/* status flags */
+};
+
+/*
+ * audit_parent status flags:
+ *
+ * AUDIT_PARENT_INVALID - set anytime rules/watches are auto-removed due to
+ * a filesystem event to ensure we're adding audit watches to a valid parent.
+ * Technically not needed for IN_DELETE_SELF or IN_UNMOUNT events, as we cannot
+ * receive them while we have nameidata, but must be used for IN_MOVE_SELF which
+ * we can receive while holding nameidata.
+ */
+#define AUDIT_PARENT_INVALID	0x001
+
+/* Audit filter lists, defined in <linux/audit.h> */
 struct list_head audit_filter_list[AUDIT_NR_FILTERS] = {
 	LIST_HEAD_INIT(audit_filter_list[0]),
 	LIST_HEAD_INIT(audit_filter_list[1]),
@@ -41,9 +87,53 @@ struct list_head audit_filter_list[AUDIT_NR_FILTERS] = {
 #endif
 };
 
+static DEFINE_MUTEX(audit_filter_mutex);
+
+/* Inotify handle */
+extern struct inotify_handle *audit_ih;
+
+/* Inotify events we care about. */
+#define AUDIT_IN_WATCH IN_MOVE|IN_CREATE|IN_DELETE|IN_DELETE_SELF|IN_MOVE_SELF
+
+void audit_free_parent(struct inotify_watch *i_watch)
+{
+	struct audit_parent *parent;
+
+	parent = container_of(i_watch, struct audit_parent, wdata);
+	WARN_ON(!list_empty(&parent->watches));
+	kfree(parent);
+}
+
+static inline void audit_get_watch(struct audit_watch *watch)
+{
+	atomic_inc(&watch->count);
+}
+
+static void audit_put_watch(struct audit_watch *watch)
+{
+	if (atomic_dec_and_test(&watch->count)) {
+		WARN_ON(watch->parent);
+		WARN_ON(!list_empty(&watch->rules));
+		kfree(watch->path);
+		kfree(watch);
+	}
+}
+
+static void audit_remove_watch(struct audit_watch *watch)
+{
+	list_del(&watch->wlist);
+	put_inotify_watch(&watch->parent->wdata);
+	watch->parent = NULL;
+	audit_put_watch(watch); /* match initial get */
+}
+
 static inline void audit_free_rule(struct audit_entry *e)
 {
 	int i;
+
+	/* some rules don't have associated watches */
+	if (e->rule.watch)
+		audit_put_watch(e->rule.watch);
 	if (e->rule.fields)
 		for (i = 0; i < e->rule.field_count; i++) {
 			struct audit_field *f = &e->rule.fields[i];
@@ -60,6 +150,50 @@ static inline void audit_free_rule_rcu(struct rcu_head *head)
 	audit_free_rule(e);
 }
 
+/* Initialize a parent watch entry. */
+static struct audit_parent *audit_init_parent(struct nameidata *ndp)
+{
+	struct audit_parent *parent;
+	s32 wd;
+
+	parent = kzalloc(sizeof(*parent), GFP_KERNEL);
+	if (unlikely(!parent))
+		return ERR_PTR(-ENOMEM);
+
+	INIT_LIST_HEAD(&parent->watches);
+	parent->flags = 0;
+
+	inotify_init_watch(&parent->wdata);
+	/* grab a ref so inotify watch hangs around until we take audit_filter_mutex */
+	get_inotify_watch(&parent->wdata);
+	wd = inotify_add_watch(audit_ih, &parent->wdata, ndp->dentry->d_inode,
+			       AUDIT_IN_WATCH);
+	if (wd < 0) {
+		audit_free_parent(&parent->wdata);
+		return ERR_PTR(wd);
+	}
+
+	return parent;
+}
+
+/* Initialize a watch entry. */
+static struct audit_watch *audit_init_watch(char *path)
+{
+	struct audit_watch *watch;
+
+	watch = kzalloc(sizeof(*watch), GFP_KERNEL);
+	if (unlikely(!watch))
+		return ERR_PTR(-ENOMEM);
+
+	INIT_LIST_HEAD(&watch->rules);
+	atomic_set(&watch->count, 1);
+	watch->path = path;
+	watch->dev = (dev_t)-1;
+	watch->ino = (unsigned long)-1;
+
+	return watch;
+}
+
 /* Initialize an audit filterlist entry. */
 static inline struct audit_entry *audit_init_entry(u32 field_count)
 {
@@ -107,6 +241,43 @@ static char *audit_unpack_string(void **bufp, size_t *remain, size_t len)
 	return str;
 }
 
+/* Translate an inode field to kernel respresentation. */
+static inline int audit_to_inode(struct audit_krule *krule,
+				 struct audit_field *f)
+{
+	if (krule->listnr != AUDIT_FILTER_EXIT ||
+	    krule->watch || krule->inode_f)
+		return -EINVAL;
+
+	krule->inode_f = f;
+	return 0;
+}
+
+/* Translate a watch string to kernel respresentation. */
+static int audit_to_watch(struct audit_krule *krule, char *path, int len,
+			  u32 op)
+{
+	struct audit_watch *watch;
+
+	if (!audit_ih)
+		return -EOPNOTSUPP;
+
+	if (path[0] != '/' || path[len-1] == '/' ||
+	    krule->listnr != AUDIT_FILTER_EXIT ||
+	    op & ~AUDIT_EQUAL ||
+	    krule->inode_f || krule->watch) /* 1 inode # per rule, for hash */
+		return -EINVAL;
+
+	watch = audit_init_watch(path);
+	if (unlikely(IS_ERR(watch)))
+		return PTR_ERR(watch);
+
+	audit_get_watch(watch);
+	krule->watch = watch;
+
+	return 0;
+}
+
 /* Common user-space to kernel rule translation. */
 static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule)
 {
@@ -161,6 +332,7 @@ exit_err:
 static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
 {
 	struct audit_entry *entry;
+	struct audit_field *f;
 	int err = 0;
 	int i;
 
@@ -175,14 +347,23 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
 		f->type = rule->fields[i] & ~(AUDIT_NEGATE|AUDIT_OPERATORS);
 		f->val = rule->values[i];
 
-		if (f->type & AUDIT_UNUSED_BITS ||
-		    f->type == AUDIT_SE_USER ||
-		    f->type == AUDIT_SE_ROLE ||
-		    f->type == AUDIT_SE_TYPE ||
-		    f->type == AUDIT_SE_SEN ||
-		    f->type == AUDIT_SE_CLR) {
-			err = -EINVAL;
+		err = -EINVAL;
+		if (f->type & AUDIT_UNUSED_BITS)
+			goto exit_free;
+
+		switch(f->type) {
+		case AUDIT_SE_USER:
+		case AUDIT_SE_ROLE:
+		case AUDIT_SE_TYPE:
+		case AUDIT_SE_SEN:
+		case AUDIT_SE_CLR:
+		case AUDIT_WATCH:
 			goto exit_free;
+		case AUDIT_INODE:
+			err = audit_to_inode(&entry->rule, f);
+			if (err)
+				goto exit_free;
+			break;
 		}
 
 		entry->rule.vers_ops = (f->op & AUDIT_OPERATORS) ? 2 : 1;
@@ -199,6 +380,18 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
 		}
 	}
 
+	f = entry->rule.inode_f;
+	if (f) {
+		switch(f->op) {
+		case AUDIT_NOT_EQUAL:
+			entry->rule.inode_f = NULL;
+		case AUDIT_EQUAL:
+			break;
+		default:
+			goto exit_free;
+		}
+	}
+
 exit_nofree:
 	return entry;
 
@@ -213,6 +406,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
 {
 	int err = 0;
 	struct audit_entry *entry;
+	struct audit_field *f;
 	void *bufp;
 	size_t remain = datasz - sizeof(struct audit_rule_data);
 	int i;
@@ -263,6 +457,35 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
 			} else
 				f->se_str = str;
 			break;
+		case AUDIT_WATCH:
+			str = audit_unpack_string(&bufp, &remain, f->val);
+			if (IS_ERR(str))
+				goto exit_free;
+			entry->rule.buflen += f->val;
+
+			err = audit_to_watch(&entry->rule, str, f->val, f->op);
+			if (err) {
+				kfree(str);
+				goto exit_free;
+			}
+			break;
+		case AUDIT_INODE:
+			err = audit_to_inode(&entry->rule, f);
+			if (err)
+				goto exit_free;
+			break;
+		}
+	}
+
+	f = entry->rule.inode_f;
+	if (f) {
+		switch(f->op) {
+		case AUDIT_NOT_EQUAL:
+			entry->rule.inode_f = NULL;
+		case AUDIT_EQUAL:
+			break;
+		default:
+			goto exit_free;
 		}
 	}
 
@@ -346,6 +569,10 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule)
 			data->buflen += data->values[i] =
 				audit_pack_string(&bufp, f->se_str);
 			break;
+		case AUDIT_WATCH:
+			data->buflen += data->values[i] =
+				audit_pack_string(&bufp, krule->watch->path);
+			break;
 		default:
 			data->values[i] = f->val;
 		}
@@ -381,6 +608,10 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b)
 			if (strcmp(a->fields[i].se_str, b->fields[i].se_str))
 				return 1;
 			break;
+		case AUDIT_WATCH:
+			if (strcmp(a->watch->path, b->watch->path))
+				return 1;
+			break;
 		default:
 			if (a->fields[i].val != b->fields[i].val)
 				return 1;
@@ -394,6 +625,32 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b)
 	return 0;
 }
 
+/* Duplicate the given audit watch.  The new watch's rules list is initialized
+ * to an empty list and wlist is undefined. */
+static struct audit_watch *audit_dupe_watch(struct audit_watch *old)
+{
+	char *path;
+	struct audit_watch *new;
+
+	path = kstrdup(old->path, GFP_KERNEL);
+	if (unlikely(!path))
+		return ERR_PTR(-ENOMEM);
+
+	new = audit_init_watch(path);
+	if (unlikely(IS_ERR(new))) {
+		kfree(path);
+		goto out;
+	}
+
+	new->dev = old->dev;
+	new->ino = old->ino;
+	get_inotify_watch(&old->parent->wdata);
+	new->parent = old->parent;
+
+out:
+	return new;
+}
+
 /* Duplicate selinux field information.  The se_rule is opaque, so must be
  * re-initialized. */
 static inline int audit_dupe_selinux_field(struct audit_field *df,
@@ -425,8 +682,11 @@ static inline int audit_dupe_selinux_field(struct audit_field *df,
 /* Duplicate an audit rule.  This will be a deep copy with the exception
  * of the watch - that pointer is carried over.  The selinux specific fields
  * will be updated in the copy.  The point is to be able to replace the old
- * rule with the new rule in the filterlist, then free the old rule. */
-static struct audit_entry *audit_dupe_rule(struct audit_krule *old)
+ * rule with the new rule in the filterlist, then free the old rule.
+ * The rlist element is undefined; list manipulations are handled apart from
+ * the initial copy. */
+static struct audit_entry *audit_dupe_rule(struct audit_krule *old,
+					   struct audit_watch *watch)
 {
 	u32 fcount = old->field_count;
 	struct audit_entry *entry;
@@ -445,6 +705,8 @@ static struct audit_entry *audit_dupe_rule(struct audit_krule *old)
 	for (i = 0; i < AUDIT_BITMASK_SIZE; i++)
 		new->mask[i] = old->mask[i];
 	new->buflen = old->buflen;
+	new->inode_f = old->inode_f;
+	new->watch = NULL;
 	new->field_count = old->field_count;
 	memcpy(new->fields, old->fields, sizeof(struct audit_field) * fcount);
 
@@ -466,21 +728,318 @@ static struct audit_entry *audit_dupe_rule(struct audit_krule *old)
 		}
 	}
 
+	if (watch) {
+		audit_get_watch(watch);
+		new->watch = watch;
+	}
+
 	return entry;
 }
 
-/* Add rule to given filterlist if not a duplicate.  Protected by
- * audit_netlink_mutex. */
+/* Update inode info in audit rules based on filesystem event. */
+static void audit_update_watch(struct audit_parent *parent,
+			       const char *dname, dev_t dev,
+			       unsigned long ino, unsigned invalidating)
+{
+	struct audit_watch *owatch, *nwatch, *nextw;
+	struct audit_krule *r, *nextr;
+	struct audit_entry *oentry, *nentry;
+	struct audit_buffer *ab;
+
+	mutex_lock(&audit_filter_mutex);
+	list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) {
+		if (audit_compare_dname_path(dname, owatch->path))
+			continue;
+
+		/* If the update involves invalidating rules, do the inode-based
+		 * filtering now, so we don't omit records. */
+		if (invalidating &&
+		    audit_filter_inodes(current, current->audit_context) == AUDIT_RECORD_CONTEXT)
+			audit_set_auditable(current->audit_context);
+
+		nwatch = audit_dupe_watch(owatch);
+		if (unlikely(IS_ERR(nwatch))) {
+			mutex_unlock(&audit_filter_mutex);
+			audit_panic("error updating watch, skipping");
+			return;
+		}
+		nwatch->dev = dev;
+		nwatch->ino = ino;
+
+		list_for_each_entry_safe(r, nextr, &owatch->rules, rlist) {
+
+			oentry = container_of(r, struct audit_entry, rule);
+			list_del(&oentry->rule.rlist);
+			list_del_rcu(&oentry->list);
+
+			nentry = audit_dupe_rule(&oentry->rule, nwatch);
+			if (unlikely(IS_ERR(nentry)))
+				audit_panic("error updating watch, removing");
+			else {
+				int h = audit_hash_ino((u32)ino);
+				list_add(&nentry->rule.rlist, &nwatch->rules);
+				list_add_rcu(&nentry->list, &audit_inode_hash[h]);
+			}
+
+			call_rcu(&oentry->rcu, audit_free_rule_rcu);
+		}
+
+		ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
+		audit_log_format(ab, "audit updated rules specifying watch=");
+		audit_log_untrustedstring(ab, owatch->path);
+		audit_log_format(ab, " with dev=%u ino=%lu\n", dev, ino);
+		audit_log_end(ab);
+
+		audit_remove_watch(owatch);
+		goto add_watch_to_parent; /* event applies to a single watch */
+	}
+	mutex_unlock(&audit_filter_mutex);
+	return;
+
+add_watch_to_parent:
+	list_add(&nwatch->wlist, &parent->watches);
+	mutex_unlock(&audit_filter_mutex);
+	return;
+}
+
+/* Remove all watches & rules associated with a parent that is going away. */
+static void audit_remove_parent_watches(struct audit_parent *parent)
+{
+	struct audit_watch *w, *nextw;
+	struct audit_krule *r, *nextr;
+	struct audit_entry *e;
+
+	mutex_lock(&audit_filter_mutex);
+	parent->flags |= AUDIT_PARENT_INVALID;
+	list_for_each_entry_safe(w, nextw, &parent->watches, wlist) {
+		list_for_each_entry_safe(r, nextr, &w->rules, rlist) {
+			e = container_of(r, struct audit_entry, rule);
+			list_del(&r->rlist);
+			list_del_rcu(&e->list);
+			call_rcu(&e->rcu, audit_free_rule_rcu);
+
+			audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
+				 "audit implicitly removed rule from list=%d\n",
+				  AUDIT_FILTER_EXIT);
+		}
+		audit_remove_watch(w);
+	}
+	mutex_unlock(&audit_filter_mutex);
+}
+
+/* Unregister inotify watches for parents on in_list.
+ * Generates an IN_IGNORED event. */
+static void audit_inotify_unregister(struct list_head *in_list)
+{
+	struct audit_parent *p, *n;
+
+	list_for_each_entry_safe(p, n, in_list, ilist) {
+		list_del(&p->ilist);
+		inotify_rm_watch(audit_ih, &p->wdata);
+		/* the put matching the get in audit_do_del_rule() */
+		put_inotify_watch(&p->wdata);
+	}
+}
+
+/* Find an existing audit rule.
+ * Caller must hold audit_filter_mutex to prevent stale rule data. */
+static struct audit_entry *audit_find_rule(struct audit_entry *entry,
+					   struct list_head *list)
+{
+	struct audit_entry *e, *found = NULL;
+	int h;
+
+	if (entry->rule.watch) {
+		/* we don't know the inode number, so must walk entire hash */
+		for (h = 0; h < AUDIT_INODE_BUCKETS; h++) {
+			list = &audit_inode_hash[h];
+			list_for_each_entry(e, list, list)
+				if (!audit_compare_rule(&entry->rule, &e->rule)) {
+					found = e;
+					goto out;
+				}
+		}
+		goto out;
+	}
+
+	list_for_each_entry(e, list, list)
+		if (!audit_compare_rule(&entry->rule, &e->rule)) {
+			found = e;
+			goto out;
+		}
+
+out:
+	return found;
+}
+
+/* Get path information necessary for adding watches. */
+static int audit_get_nd(char *path, struct nameidata **ndp,
+			struct nameidata **ndw)
+{
+	struct nameidata *ndparent, *ndwatch;
+	int err;
+
+	ndparent = kmalloc(sizeof(*ndparent), GFP_KERNEL);
+	if (unlikely(!ndparent))
+		return -ENOMEM;
+
+	ndwatch = kmalloc(sizeof(*ndwatch), GFP_KERNEL);
+	if (unlikely(!ndwatch)) {
+		kfree(ndparent);
+		return -ENOMEM;
+	}
+
+	err = path_lookup(path, LOOKUP_PARENT, ndparent);
+	if (err) {
+		kfree(ndparent);
+		kfree(ndwatch);
+		return err;
+	}
+
+	err = path_lookup(path, 0, ndwatch);
+	if (err) {
+		kfree(ndwatch);
+		ndwatch = NULL;
+	}
+
+	*ndp = ndparent;
+	*ndw = ndwatch;
+
+	return 0;
+}
+
+/* Release resources used for watch path information. */
+static void audit_put_nd(struct nameidata *ndp, struct nameidata *ndw)
+{
+	if (ndp) {
+		path_release(ndp);
+		kfree(ndp);
+	}
+	if (ndw) {
+		path_release(ndw);
+		kfree(ndw);
+	}
+}
+
+/* Associate the given rule with an existing parent inotify_watch.
+ * Caller must hold audit_filter_mutex. */
+static void audit_add_to_parent(struct audit_krule *krule,
+				struct audit_parent *parent)
+{
+	struct audit_watch *w, *watch = krule->watch;
+	int watch_found = 0;
+
+	list_for_each_entry(w, &parent->watches, wlist) {
+		if (strcmp(watch->path, w->path))
+			continue;
+
+		watch_found = 1;
+
+		/* put krule's and initial refs to temporary watch */
+		audit_put_watch(watch);
+		audit_put_watch(watch);
+
+		audit_get_watch(w);
+		krule->watch = watch = w;
+		break;
+	}
+
+	if (!watch_found) {
+		get_inotify_watch(&parent->wdata);
+		watch->parent = parent;
+
+		list_add(&watch->wlist, &parent->watches);
+	}
+	list_add(&krule->rlist, &watch->rules);
+}
+
+/* Find a matching watch entry, or add this one.
+ * Caller must hold audit_filter_mutex. */
+static int audit_add_watch(struct audit_krule *krule, struct nameidata *ndp,
+			   struct nameidata *ndw)
+{
+	struct audit_watch *watch = krule->watch;
+	struct inotify_watch *i_watch;
+	struct audit_parent *parent;
+	int ret = 0;
+
+	/* update watch filter fields */
+	if (ndw) {
+		watch->dev = ndw->dentry->d_inode->i_sb->s_dev;
+		watch->ino = ndw->dentry->d_inode->i_ino;
+	}
+
+	/* The audit_filter_mutex must not be held during inotify calls because
+	 * we hold it during inotify event callback processing.  If an existing
+	 * inotify watch is found, inotify_find_watch() grabs a reference before
+	 * returning.
+	 */
+	mutex_unlock(&audit_filter_mutex);
+
+	if (inotify_find_watch(audit_ih, ndp->dentry->d_inode, &i_watch) < 0) {
+		parent = audit_init_parent(ndp);
+		if (IS_ERR(parent)) {
+			/* caller expects mutex locked */
+			mutex_lock(&audit_filter_mutex);
+			return PTR_ERR(parent);
+		}
+	} else
+		parent = container_of(i_watch, struct audit_parent, wdata);
+
+	mutex_lock(&audit_filter_mutex);
+
+	/* parent was moved before we took audit_filter_mutex */
+	if (parent->flags & AUDIT_PARENT_INVALID)
+		ret = -ENOENT;
+	else
+		audit_add_to_parent(krule, parent);
+
+	/* match get in audit_init_parent or inotify_find_watch */
+	put_inotify_watch(&parent->wdata);
+	return ret;
+}
+
+/* Add rule to given filterlist if not a duplicate. */
 static inline int audit_add_rule(struct audit_entry *entry,
-				  struct list_head *list)
+				 struct list_head *list)
 {
 	struct audit_entry *e;
+	struct audit_field *inode_f = entry->rule.inode_f;
+	struct audit_watch *watch = entry->rule.watch;
+	struct nameidata *ndp, *ndw;
+	int h, err, putnd_needed = 0;
+
+	if (inode_f) {
+		h = audit_hash_ino(inode_f->val);
+		list = &audit_inode_hash[h];
+	}
+
+	mutex_lock(&audit_filter_mutex);
+	e = audit_find_rule(entry, list);
+	mutex_unlock(&audit_filter_mutex);
+	if (e) {
+		err = -EEXIST;
+		goto error;
+	}
 
-	/* Do not use the _rcu iterator here, since this is the only
-	 * addition routine. */
-	list_for_each_entry(e, list, list) {
-		if (!audit_compare_rule(&entry->rule, &e->rule))
-			return -EEXIST;
+	/* Avoid calling path_lookup under audit_filter_mutex. */
+	if (watch) {
+		err = audit_get_nd(watch->path, &ndp, &ndw);
+		if (err)
+			goto error;
+		putnd_needed = 1;
+	}
+
+	mutex_lock(&audit_filter_mutex);
+	if (watch) {
+		/* audit_filter_mutex is dropped and re-taken during this call */
+		err = audit_add_watch(&entry->rule, ndp, ndw);
+		if (err) {
+			mutex_unlock(&audit_filter_mutex);
+			goto error;
+		}
+		h = audit_hash_ino((u32)watch->ino);
+		list = &audit_inode_hash[h];
 	}
 
 	if (entry->rule.flags & AUDIT_FILTER_PREPEND) {
@@ -488,27 +1047,77 @@ static inline int audit_add_rule(struct audit_entry *entry,
 	} else {
 		list_add_tail_rcu(&entry->list, list);
 	}
+	mutex_unlock(&audit_filter_mutex);
 
-	return 0;
+	if (putnd_needed)
+		audit_put_nd(ndp, ndw);
+
+ 	return 0;
+
+error:
+	if (putnd_needed)
+		audit_put_nd(ndp, ndw);
+	if (watch)
+		audit_put_watch(watch); /* tmp watch, matches initial get */
+	return err;
 }
 
-/* Remove an existing rule from filterlist.  Protected by
- * audit_netlink_mutex. */
+/* Remove an existing rule from filterlist. */
 static inline int audit_del_rule(struct audit_entry *entry,
 				 struct list_head *list)
 {
 	struct audit_entry  *e;
+	struct audit_field *inode_f = entry->rule.inode_f;
+	struct audit_watch *watch, *tmp_watch = entry->rule.watch;
+	LIST_HEAD(inotify_list);
+	int h, ret = 0;
+
+	if (inode_f) {
+		h = audit_hash_ino(inode_f->val);
+		list = &audit_inode_hash[h];
+	}
 
-	/* Do not use the _rcu iterator here, since this is the only
-	 * deletion routine. */
-	list_for_each_entry(e, list, list) {
-		if (!audit_compare_rule(&entry->rule, &e->rule)) {
-			list_del_rcu(&e->list);
-			call_rcu(&e->rcu, audit_free_rule_rcu);
-			return 0;
+	mutex_lock(&audit_filter_mutex);
+	e = audit_find_rule(entry, list);
+	if (!e) {
+		mutex_unlock(&audit_filter_mutex);
+		ret = -ENOENT;
+		goto out;
+	}
+
+	watch = e->rule.watch;
+	if (watch) {
+		struct audit_parent *parent = watch->parent;
+
+		list_del(&e->rule.rlist);
+
+		if (list_empty(&watch->rules)) {
+			audit_remove_watch(watch);
+
+			if (list_empty(&parent->watches)) {
+				/* Put parent on the inotify un-registration
+				 * list.  Grab a reference before releasing
+				 * audit_filter_mutex, to be released in
+				 * audit_inotify_unregister(). */
+				list_add(&parent->ilist, &inotify_list);
+				get_inotify_watch(&parent->wdata);
+			}
 		}
 	}
-	return -ENOENT;		/* No matching rule */
+
+	list_del_rcu(&e->list);
+	call_rcu(&e->rcu, audit_free_rule_rcu);
+
+	mutex_unlock(&audit_filter_mutex);
+
+	if (!list_empty(&inotify_list))
+		audit_inotify_unregister(&inotify_list);
+
+out:
+	if (tmp_watch)
+		audit_put_watch(tmp_watch); /* match initial get */
+
+	return ret;
 }
 
 /* List rules using struct audit_rule.  Exists for backward
@@ -519,8 +1128,8 @@ static void audit_list(int pid, int seq, struct sk_buff_head *q)
 	struct audit_entry *entry;
 	int i;
 
-	/* The *_rcu iterators not needed here because we are
-	   always called with audit_netlink_mutex held. */
+	/* This is a blocking read, so use audit_filter_mutex instead of rcu
+	 * iterator to sync with list writers. */
 	for (i=0; i<AUDIT_NR_FILTERS; i++) {
 		list_for_each_entry(entry, &audit_filter_list[i], list) {
 			struct audit_rule *rule;
@@ -535,6 +1144,20 @@ static void audit_list(int pid, int seq, struct sk_buff_head *q)
 			kfree(rule);
 		}
 	}
+	for (i = 0; i < AUDIT_INODE_BUCKETS; i++) {
+		list_for_each_entry(entry, &audit_inode_hash[i], list) {
+			struct audit_rule *rule;
+
+			rule = audit_krule_to_rule(&entry->rule);
+			if (unlikely(!rule))
+				break;
+			skb = audit_make_reply(pid, seq, AUDIT_LIST, 0, 1,
+					 rule, sizeof(*rule));
+			if (skb)
+				skb_queue_tail(q, skb);
+			kfree(rule);
+		}
+	}
 	skb = audit_make_reply(pid, seq, AUDIT_LIST, 1, 1, NULL, 0);
 	if (skb)
 		skb_queue_tail(q, skb);
@@ -547,8 +1170,8 @@ static void audit_list_rules(int pid, int seq, struct sk_buff_head *q)
 	struct audit_entry *e;
 	int i;
 
-	/* The *_rcu iterators not needed here because we are
-	   always called with audit_netlink_mutex held. */
+	/* This is a blocking read, so use audit_filter_mutex instead of rcu
+	 * iterator to sync with list writers. */
 	for (i=0; i<AUDIT_NR_FILTERS; i++) {
 		list_for_each_entry(e, &audit_filter_list[i], list) {
 			struct audit_rule_data *data;
@@ -557,7 +1180,21 @@ static void audit_list_rules(int pid, int seq, struct sk_buff_head *q)
 			if (unlikely(!data))
 				break;
 			skb = audit_make_reply(pid, seq, AUDIT_LIST_RULES, 0, 1,
-					 data, sizeof(*data));
+					 data, sizeof(*data) + data->buflen);
+			if (skb)
+				skb_queue_tail(q, skb);
+			kfree(data);
+		}
+	}
+	for (i=0; i< AUDIT_INODE_BUCKETS; i++) {
+		list_for_each_entry(e, &audit_inode_hash[i], list) {
+			struct audit_rule_data *data;
+
+			data = audit_krule_to_data(&e->rule);
+			if (unlikely(!data))
+				break;
+			skb = audit_make_reply(pid, seq, AUDIT_LIST_RULES, 0, 1,
+					 data, sizeof(*data) + data->buflen);
 			if (skb)
 				skb_queue_tail(q, skb);
 			kfree(data);
@@ -602,10 +1239,12 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
 		dest->pid = pid;
 		skb_queue_head_init(&dest->q);
 
+		mutex_lock(&audit_filter_mutex);
 		if (type == AUDIT_LIST)
 			audit_list(pid, seq, &dest->q);
 		else
 			audit_list_rules(pid, seq, &dest->q);
+		mutex_unlock(&audit_filter_mutex);
 
 		tsk = kthread_run(audit_send_list, dest, "audit_send_list");
 		if (IS_ERR(tsk)) {
@@ -625,6 +1264,7 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
 
 		err = audit_add_rule(entry,
 				     &audit_filter_list[entry->rule.listnr]);
+
 		if (sid) {
 			char *ctx = NULL;
 			u32 len;
@@ -705,7 +1345,39 @@ int audit_comparator(const u32 left, const u32 op, const u32 right)
 	return 0;
 }
 
+/* Compare given dentry name with last component in given path,
+ * return of 0 indicates a match. */
+int audit_compare_dname_path(const char *dname, const char *path)
+{
+	int dlen, plen;
+	const char *p;
+
+	if (!dname || !path)
+		return 1;
+
+	dlen = strlen(dname);
+	plen = strlen(path);
+	if (plen < dlen)
+		return 1;
+
+	/* disregard trailing slashes */
+	p = path + plen - 1;
+	while ((*p == '/') && (p > path))
+		p--;
+
+	/* find last path component */
+	p = p - dlen + 1;
+	if (p < path)
+		return 1;
+	else if (p > path) {
+		if (*--p != '/')
+			return 1;
+		else
+			p++;
+	}
 
+	return strncmp(p, dname, dlen);
+}
 
 static int audit_filter_user_rules(struct netlink_skb_parms *cb,
 				   struct audit_krule *rule,
@@ -818,32 +1490,65 @@ static inline int audit_rule_has_selinux(struct audit_krule *rule)
 int selinux_audit_rule_update(void)
 {
 	struct audit_entry *entry, *n, *nentry;
+	struct audit_watch *watch;
 	int i, err = 0;
 
-	/* audit_netlink_mutex synchronizes the writers */
-	mutex_lock(&audit_netlink_mutex);
+	/* audit_filter_mutex synchronizes the writers */
+	mutex_lock(&audit_filter_mutex);
 
 	for (i = 0; i < AUDIT_NR_FILTERS; i++) {
 		list_for_each_entry_safe(entry, n, &audit_filter_list[i], list) {
 			if (!audit_rule_has_selinux(&entry->rule))
 				continue;
 
-			nentry = audit_dupe_rule(&entry->rule);
+			watch = entry->rule.watch;
+			nentry = audit_dupe_rule(&entry->rule, watch);
 			if (unlikely(IS_ERR(nentry))) {
 				/* save the first error encountered for the
 				 * return value */
 				if (!err)
 					err = PTR_ERR(nentry);
 				audit_panic("error updating selinux filters");
+				if (watch)
+					list_del(&entry->rule.rlist);
 				list_del_rcu(&entry->list);
 			} else {
+				if (watch) {
+					list_add(&nentry->rule.rlist,
+						 &watch->rules);
+					list_del(&entry->rule.rlist);
+				}
 				list_replace_rcu(&entry->list, &nentry->list);
 			}
 			call_rcu(&entry->rcu, audit_free_rule_rcu);
 		}
 	}
 
-	mutex_unlock(&audit_netlink_mutex);
+	mutex_unlock(&audit_filter_mutex);
 
 	return err;
 }
+
+/* Update watch data in audit rules based on inotify events. */
+void audit_handle_ievent(struct inotify_watch *i_watch, u32 wd, u32 mask,
+			 u32 cookie, const char *dname, struct inode *inode)
+{
+	struct audit_parent *parent;
+
+	parent = container_of(i_watch, struct audit_parent, wdata);
+
+	if (mask & (IN_CREATE|IN_MOVED_TO) && inode)
+		audit_update_watch(parent, dname, inode->i_sb->s_dev,
+				   inode->i_ino, 0);
+	else if (mask & (IN_DELETE|IN_MOVED_FROM))
+		audit_update_watch(parent, dname, (dev_t)-1, (unsigned long)-1, 1);
+	/* inotify automatically removes the watch and sends IN_IGNORED */
+	else if (mask & (IN_DELETE_SELF|IN_UNMOUNT))
+		audit_remove_parent_watches(parent);
+	/* inotify does not remove the watch, so remove it manually */
+	else if(mask & IN_MOVE_SELF) {
+		audit_remove_parent_watches(parent);
+		inotify_remove_watch_locked(audit_ih, i_watch);
+	} else if (mask & IN_IGNORED)
+		put_inotify_watch(i_watch);
+}
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 14e295a4121b..174a3f624892 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -200,12 +200,13 @@ struct audit_context {
 #endif
 };
 
-
+/* Determine if any context name data matches a rule's watch data */
 /* Compare a task_struct with an audit_rule.  Return 1 on match, 0
  * otherwise. */
 static int audit_filter_rules(struct task_struct *tsk,
 			      struct audit_krule *rule,
 			      struct audit_context *ctx,
+			      struct audit_names *name,
 			      enum audit_state *state)
 {
 	int i, j, need_sid = 1;
@@ -268,7 +269,10 @@ static int audit_filter_rules(struct task_struct *tsk,
 			}
 			break;
 		case AUDIT_DEVMAJOR:
-			if (ctx) {
+			if (name)
+				result = audit_comparator(MAJOR(name->dev),
+							  f->op, f->val);
+			else if (ctx) {
 				for (j = 0; j < ctx->name_count; j++) {
 					if (audit_comparator(MAJOR(ctx->names[j].dev),	f->op, f->val)) {
 						++result;
@@ -278,7 +282,10 @@ static int audit_filter_rules(struct task_struct *tsk,
 			}
 			break;
 		case AUDIT_DEVMINOR:
-			if (ctx) {
+			if (name)
+				result = audit_comparator(MINOR(name->dev),
+							  f->op, f->val);
+			else if (ctx) {
 				for (j = 0; j < ctx->name_count; j++) {
 					if (audit_comparator(MINOR(ctx->names[j].dev), f->op, f->val)) {
 						++result;
@@ -288,7 +295,10 @@ static int audit_filter_rules(struct task_struct *tsk,
 			}
 			break;
 		case AUDIT_INODE:
-			if (ctx) {
+			if (name)
+				result = (name->ino == f->val ||
+					  name->pino == f->val);
+			else if (ctx) {
 				for (j = 0; j < ctx->name_count; j++) {
 					if (audit_comparator(ctx->names[j].ino, f->op, f->val) ||
 					    audit_comparator(ctx->names[j].pino, f->op, f->val)) {
@@ -298,6 +308,12 @@ static int audit_filter_rules(struct task_struct *tsk,
 				}
 			}
 			break;
+		case AUDIT_WATCH:
+			if (name && rule->watch->ino != (unsigned long)-1)
+				result = (name->dev == rule->watch->dev &&
+					  (name->ino == rule->watch->ino ||
+					   name->pino == rule->watch->ino));
+			break;
 		case AUDIT_LOGINUID:
 			result = 0;
 			if (ctx)
@@ -354,7 +370,7 @@ static enum audit_state audit_filter_task(struct task_struct *tsk)
 
 	rcu_read_lock();
 	list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TASK], list) {
-		if (audit_filter_rules(tsk, &e->rule, NULL, &state)) {
+		if (audit_filter_rules(tsk, &e->rule, NULL, NULL, &state)) {
 			rcu_read_unlock();
 			return state;
 		}
@@ -384,8 +400,9 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk,
 		int bit  = AUDIT_BIT(ctx->major);
 
 		list_for_each_entry_rcu(e, list, list) {
-			if ((e->rule.mask[word] & bit) == bit
-					&& audit_filter_rules(tsk, &e->rule, ctx, &state)) {
+			if ((e->rule.mask[word] & bit) == bit &&
+			    audit_filter_rules(tsk, &e->rule, ctx, NULL,
+					       &state)) {
 				rcu_read_unlock();
 				return state;
 			}
@@ -395,6 +412,49 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk,
 	return AUDIT_BUILD_CONTEXT;
 }
 
+/* At syscall exit time, this filter is called if any audit_names[] have been
+ * collected during syscall processing.  We only check rules in sublists at hash
+ * buckets applicable to the inode numbers in audit_names[].
+ * Regarding audit_state, same rules apply as for audit_filter_syscall().
+ */
+enum audit_state audit_filter_inodes(struct task_struct *tsk,
+				     struct audit_context *ctx)
+{
+	int i;
+	struct audit_entry *e;
+	enum audit_state state;
+
+	if (audit_pid && tsk->tgid == audit_pid)
+		return AUDIT_DISABLED;
+
+	rcu_read_lock();
+	for (i = 0; i < ctx->name_count; i++) {
+		int word = AUDIT_WORD(ctx->major);
+		int bit  = AUDIT_BIT(ctx->major);
+		struct audit_names *n = &ctx->names[i];
+		int h = audit_hash_ino((u32)n->ino);
+		struct list_head *list = &audit_inode_hash[h];
+
+		if (list_empty(list))
+			continue;
+
+		list_for_each_entry_rcu(e, list, list) {
+			if ((e->rule.mask[word] & bit) == bit &&
+			    audit_filter_rules(tsk, &e->rule, ctx, n, &state)) {
+				rcu_read_unlock();
+				return state;
+			}
+		}
+	}
+	rcu_read_unlock();
+	return AUDIT_BUILD_CONTEXT;
+}
+
+void audit_set_auditable(struct audit_context *ctx)
+{
+	ctx->auditable = 1;
+}
+
 static inline struct audit_context *audit_get_context(struct task_struct *tsk,
 						      int return_valid,
 						      int return_code)
@@ -408,11 +468,20 @@ static inline struct audit_context *audit_get_context(struct task_struct *tsk,
 
 	if (context->in_syscall && !context->auditable) {
 		enum audit_state state;
+
 		state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_EXIT]);
+		if (state == AUDIT_RECORD_CONTEXT) {
+			context->auditable = 1;
+			goto get_context;
+		}
+
+		state = audit_filter_inodes(tsk, context);
 		if (state == AUDIT_RECORD_CONTEXT)
 			context->auditable = 1;
+
 	}
 
+get_context:
 	context->pid = tsk->pid;
 	context->ppid = sys_getppid();	/* sic.  tsk == current in all cases */
 	context->uid = tsk->uid;
@@ -1142,37 +1211,20 @@ void __audit_inode_child(const char *dname, const struct inode *inode,
 		return;
 
 	/* determine matching parent */
-	if (dname)
-		for (idx = 0; idx < context->name_count; idx++)
-			if (context->names[idx].pino == pino) {
-				const char *n;
-				const char *name = context->names[idx].name;
-				int dlen = strlen(dname);
-				int nlen = name ? strlen(name) : 0;
-
-				if (nlen < dlen)
-					continue;
-				
-				/* disregard trailing slashes */
-				n = name + nlen - 1;
-				while ((*n == '/') && (n > name))
-					n--;
-
-				/* find last path component */
-				n = n - dlen + 1;
-				if (n < name)
-					continue;
-				else if (n > name) {
-					if (*--n != '/')
-						continue;
-					else
-						n++;
-				}
+	if (!dname)
+		goto no_match;
+	for (idx = 0; idx < context->name_count; idx++)
+		if (context->names[idx].pino == pino) {
+			const char *name = context->names[idx].name;
 
-				if (strncmp(n, dname, dlen) == 0)
-					goto update_context;
-			}
+			if (!name)
+				continue;
+
+			if (audit_compare_dname_path(dname, name) == 0)
+				goto update_context;
+		}
 
+no_match:
 	/* catch-all in case match not found */
 	idx = context->name_count++;
 	context->names[idx].name  = NULL;
-- 
cgit v1.2.3


From 9c937dcc71021f2dbf78f904f03d962dd9bcc130 Mon Sep 17 00:00:00 2001
From: Amy Griffis <amy.griffis@hp.com>
Date: Thu, 8 Jun 2006 23:19:31 -0400
Subject: [PATCH] log more info for directory entry change events

When an audit event involves changes to a directory entry, include
a PATH record for the directory itself.  A few other notable changes:

    - fixed audit_inode_child() hooks in fsnotify_move()
    - removed unused flags arg from audit_inode()
    - added audit log routines for logging a portion of a string

Here's some sample output.

before patch:
type=SYSCALL msg=audit(1149821605.320:26): arch=40000003 syscall=39 success=yes exit=0 a0=bf8d3c7c a1=1ff a2=804e1b8 a3=bf8d3c7c items=1 ppid=739 pid=800 auid=0 uid=0 gid=0 euid=0 suid=0 fsuid=0 egid=0 sgid=0 fsgid=0 tty=ttyS0 comm="mkdir" exe="/bin/mkdir" subj=root:system_r:unconfined_t:s0-s0:c0.c255
type=CWD msg=audit(1149821605.320:26):  cwd="/root"
type=PATH msg=audit(1149821605.320:26): item=0 name="foo" parent=164068 inode=164010 dev=03:00 mode=040755 ouid=0 ogid=0 rdev=00:00 obj=root:object_r:user_home_t:s0

after patch:
type=SYSCALL msg=audit(1149822032.332:24): arch=40000003 syscall=39 success=yes exit=0 a0=bfdd9c7c a1=1ff a2=804e1b8 a3=bfdd9c7c items=2 ppid=714 pid=777 auid=0 uid=0 gid=0 euid=0 suid=0 fsuid=0 egid=0 sgid=0 fsgid=0 tty=ttyS0 comm="mkdir" exe="/bin/mkdir" subj=root:system_r:unconfined_t:s0-s0:c0.c255
type=CWD msg=audit(1149822032.332:24):  cwd="/root"
type=PATH msg=audit(1149822032.332:24): item=0 name="/root" inode=164068 dev=03:00 mode=040750 ouid=0 ogid=0 rdev=00:00 obj=root:object_r:user_home_dir_t:s0
type=PATH msg=audit(1149822032.332:24): item=1 name="foo" inode=164010 dev=03:00 mode=040755 ouid=0 ogid=0 rdev=00:00 obj=root:object_r:user_home_t:s0

Signed-off-by: Amy Griffis <amy.griffis@hp.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c               |   2 +-
 fs/open.c                |   4 +-
 fs/xattr.c               |   4 +-
 include/linux/audit.h    |  15 +++---
 include/linux/fsnotify.h |   3 +-
 kernel/audit.c           |  54 +++++++++++++++++++--
 kernel/audit.h           |   3 +-
 kernel/auditfilter.c     |   8 ++-
 kernel/auditsc.c         | 123 ++++++++++++++++++++++++++---------------------
 9 files changed, 142 insertions(+), 74 deletions(-)

(limited to 'include/linux')

diff --git a/fs/namei.c b/fs/namei.c
index d6e2ee251736..184fe4acf824 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1127,7 +1127,7 @@ out:
 	if (likely(retval == 0)) {
 		if (unlikely(current->audit_context && nd && nd->dentry &&
 				nd->dentry->d_inode))
-		audit_inode(name, nd->dentry->d_inode, flags);
+		audit_inode(name, nd->dentry->d_inode);
 	}
 out_fail:
 	return retval;
diff --git a/fs/open.c b/fs/open.c
index 317b7c7f38a7..4f178acd4c09 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -633,7 +633,7 @@ asmlinkage long sys_fchmod(unsigned int fd, mode_t mode)
 	dentry = file->f_dentry;
 	inode = dentry->d_inode;
 
-	audit_inode(NULL, inode, 0);
+	audit_inode(NULL, inode);
 
 	err = -EROFS;
 	if (IS_RDONLY(inode))
@@ -786,7 +786,7 @@ asmlinkage long sys_fchown(unsigned int fd, uid_t user, gid_t group)
 	if (file) {
 		struct dentry * dentry;
 		dentry = file->f_dentry;
-		audit_inode(NULL, dentry->d_inode, 0);
+		audit_inode(NULL, dentry->d_inode);
 		error = chown_common(dentry, user, group);
 		fput(file);
 	}
diff --git a/fs/xattr.c b/fs/xattr.c
index e416190f5e9c..c32f15b5f60f 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -242,7 +242,7 @@ sys_fsetxattr(int fd, char __user *name, void __user *value,
 	if (!f)
 		return error;
 	dentry = f->f_dentry;
-	audit_inode(NULL, dentry->d_inode, 0);
+	audit_inode(NULL, dentry->d_inode);
 	error = setxattr(dentry, name, value, size, flags);
 	fput(f);
 	return error;
@@ -469,7 +469,7 @@ sys_fremovexattr(int fd, char __user *name)
 	if (!f)
 		return error;
 	dentry = f->f_dentry;
-	audit_inode(NULL, dentry->d_inode, 0);
+	audit_inode(NULL, dentry->d_inode);
 	error = removexattr(dentry, name);
 	fput(f);
 	return error;
diff --git a/include/linux/audit.h b/include/linux/audit.h
index c78327507f4e..e1c1dbdf9efb 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -310,7 +310,7 @@ extern void audit_syscall_entry(int arch,
 extern void audit_syscall_exit(int failed, long return_code);
 extern void __audit_getname(const char *name);
 extern void audit_putname(const char *name);
-extern void __audit_inode(const char *name, const struct inode *inode, unsigned flags);
+extern void __audit_inode(const char *name, const struct inode *inode);
 extern void __audit_inode_child(const char *dname, const struct inode *inode,
 				unsigned long pino);
 static inline void audit_getname(const char *name)
@@ -318,10 +318,9 @@ static inline void audit_getname(const char *name)
 	if (unlikely(current->audit_context))
 		__audit_getname(name);
 }
-static inline void audit_inode(const char *name, const struct inode *inode,
-			       unsigned flags) {
+static inline void audit_inode(const char *name, const struct inode *inode) {
 	if (unlikely(current->audit_context))
-		__audit_inode(name, inode, flags);
+		__audit_inode(name, inode);
 }
 static inline void audit_inode_child(const char *dname, 
 				     const struct inode *inode, 
@@ -398,9 +397,9 @@ static inline int audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat)
 #define audit_syscall_exit(f,r) do { ; } while (0)
 #define audit_getname(n) do { ; } while (0)
 #define audit_putname(n) do { ; } while (0)
-#define __audit_inode(n,i,f) do { ; } while (0)
+#define __audit_inode(n,i) do { ; } while (0)
 #define __audit_inode_child(d,i,p) do { ; } while (0)
-#define audit_inode(n,i,f) do { ; } while (0)
+#define audit_inode(n,i) do { ; } while (0)
 #define audit_inode_child(d,i,p) do { ; } while (0)
 #define auditsc_get_stamp(c,t,s) do { BUG(); } while (0)
 #define audit_get_loginuid(c) ({ -1; })
@@ -435,6 +434,9 @@ extern void		    audit_log_hex(struct audit_buffer *ab,
 					  size_t len);
 extern const char *	    audit_log_untrustedstring(struct audit_buffer *ab,
 						      const char *string);
+extern const char *	    audit_log_n_untrustedstring(struct audit_buffer *ab,
+							size_t n,
+							const char *string);
 extern void		    audit_log_d_path(struct audit_buffer *ab,
 					     const char *prefix,
 					     struct dentry *dentry,
@@ -452,6 +454,7 @@ extern int  audit_receive_filter(int type, int pid, int uid, int seq,
 #define audit_log_end(b) do { ; } while (0)
 #define audit_log_hex(a,b,l) do { ; } while (0)
 #define audit_log_untrustedstring(a,s) do { ; } while (0)
+#define audit_log_n_untrustedstring(a,n,s) do { ; } while (0)
 #define audit_log_d_path(b,p,d,v) do { ; } while (0)
 #endif
 #endif
diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
index a9d30442448f..cc5dec70c32c 100644
--- a/include/linux/fsnotify.h
+++ b/include/linux/fsnotify.h
@@ -67,8 +67,7 @@ static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir,
 	if (source) {
 		inotify_inode_queue_event(source, IN_MOVE_SELF, 0, NULL, NULL);
 	}
-	audit_inode_child(old_name, source, old_dir->i_ino);
-	audit_inode_child(new_name, target, new_dir->i_ino);
+	audit_inode_child(new_name, source, new_dir->i_ino);
 }
 
 /*
diff --git a/kernel/audit.c b/kernel/audit.c
index 0fbf1c116363..7dfac7031bd7 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -1051,20 +1051,53 @@ void audit_log_hex(struct audit_buffer *ab, const unsigned char *buf,
 	skb_put(skb, len << 1); /* new string is twice the old string */
 }
 
+/*
+ * Format a string of no more than slen characters into the audit buffer,
+ * enclosed in quote marks.
+ */
+static void audit_log_n_string(struct audit_buffer *ab, size_t slen,
+			       const char *string)
+{
+	int avail, new_len;
+	unsigned char *ptr;
+	struct sk_buff *skb;
+
+	BUG_ON(!ab->skb);
+	skb = ab->skb;
+	avail = skb_tailroom(skb);
+	new_len = slen + 3;	/* enclosing quotes + null terminator */
+	if (new_len > avail) {
+		avail = audit_expand(ab, new_len);
+		if (!avail)
+			return;
+	}
+	ptr = skb->tail;
+	*ptr++ = '"';
+	memcpy(ptr, string, slen);
+	ptr += slen;
+	*ptr++ = '"';
+	*ptr = 0;
+	skb_put(skb, slen + 2);	/* don't include null terminator */
+}
+
 /**
- * audit_log_unstrustedstring - log a string that may contain random characters
+ * audit_log_n_unstrustedstring - log a string that may contain random characters
  * @ab: audit_buffer
+ * @len: lenth of string (not including trailing null)
  * @string: string to be logged
  *
  * This code will escape a string that is passed to it if the string
  * contains a control character, unprintable character, double quote mark,
  * or a space. Unescaped strings will start and end with a double quote mark.
  * Strings that are escaped are printed in hex (2 digits per char).
+ *
+ * The caller specifies the number of characters in the string to log, which may
+ * or may not be the entire string.
  */
-const char *audit_log_untrustedstring(struct audit_buffer *ab, const char *string)
+const char *audit_log_n_untrustedstring(struct audit_buffer *ab, size_t len,
+					const char *string)
 {
 	const unsigned char *p = string;
-	size_t len = strlen(string);
 
 	while (*p) {
 		if (*p == '"' || *p < 0x21 || *p > 0x7f) {
@@ -1073,10 +1106,23 @@ const char *audit_log_untrustedstring(struct audit_buffer *ab, const char *strin
 		}
 		p++;
 	}
-	audit_log_format(ab, "\"%s\"", string);
+	audit_log_n_string(ab, len, string);
 	return p + 1;
 }
 
+/**
+ * audit_log_unstrustedstring - log a string that may contain random characters
+ * @ab: audit_buffer
+ * @string: string to be logged
+ *
+ * Same as audit_log_n_unstrustedstring(), except that strlen is used to
+ * determine string length.
+ */
+const char *audit_log_untrustedstring(struct audit_buffer *ab, const char *string)
+{
+	return audit_log_n_untrustedstring(ab, strlen(string), string);
+}
+
 /* This is a helper-function to print the escaped d_path */
 void audit_log_d_path(struct audit_buffer *ab, const char *prefix,
 		      struct dentry *dentry, struct vfsmount *vfsmnt)
diff --git a/kernel/audit.h b/kernel/audit.h
index 58fa44cb8d01..8323e4132a33 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -104,7 +104,8 @@ static inline int audit_hash_ino(u32 ino)
 }
 
 extern int audit_comparator(const u32 left, const u32 op, const u32 right);
-extern int audit_compare_dname_path(const char *dname, const char *path);
+extern int audit_compare_dname_path(const char *dname, const char *path,
+				    int *dirlen);
 extern struct sk_buff *	    audit_make_reply(int pid, int seq, int type,
 					     int done, int multi,
 					     void *payload, int size);
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index a536f7148bcd..4c99d2c586ed 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -787,7 +787,7 @@ static void audit_update_watch(struct audit_parent *parent,
 
 	mutex_lock(&audit_filter_mutex);
 	list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) {
-		if (audit_compare_dname_path(dname, owatch->path))
+		if (audit_compare_dname_path(dname, owatch->path, NULL))
 			continue;
 
 		/* If the update involves invalidating rules, do the inode-based
@@ -1387,7 +1387,8 @@ int audit_comparator(const u32 left, const u32 op, const u32 right)
 
 /* Compare given dentry name with last component in given path,
  * return of 0 indicates a match. */
-int audit_compare_dname_path(const char *dname, const char *path)
+int audit_compare_dname_path(const char *dname, const char *path,
+			     int *dirlen)
 {
 	int dlen, plen;
 	const char *p;
@@ -1416,6 +1417,9 @@ int audit_compare_dname_path(const char *dname, const char *path)
 			p++;
 	}
 
+	/* return length of path's directory component */
+	if (dirlen)
+		*dirlen = p - path;
 	return strncmp(p, dname, dlen);
 }
 
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 174a3f624892..851ae0217e4b 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -82,6 +82,9 @@ extern int audit_enabled;
  * path_lookup. */
 #define AUDIT_NAMES_RESERVED 7
 
+/* Indicates that audit should log the full pathname. */
+#define AUDIT_NAME_FULL -1
+
 /* When fs/namei.c:getname() is called, we store the pointer in name and
  * we don't let putname() free it (instead we free all of the saved
  * pointers at syscall exit time).
@@ -89,8 +92,9 @@ extern int audit_enabled;
  * Further, in fs/namei.c:path_lookup() we store the inode and device. */
 struct audit_names {
 	const char	*name;
+	int		name_len;	/* number of name's characters to log */
+	unsigned	name_put;	/* call __putname() for this name */
 	unsigned long	ino;
-	unsigned long	pino;
 	dev_t		dev;
 	umode_t		mode;
 	uid_t		uid;
@@ -296,12 +300,10 @@ static int audit_filter_rules(struct task_struct *tsk,
 			break;
 		case AUDIT_INODE:
 			if (name)
-				result = (name->ino == f->val ||
-					  name->pino == f->val);
+				result = (name->ino == f->val);
 			else if (ctx) {
 				for (j = 0; j < ctx->name_count; j++) {
-					if (audit_comparator(ctx->names[j].ino, f->op, f->val) ||
-					    audit_comparator(ctx->names[j].pino, f->op, f->val)) {
+					if (audit_comparator(ctx->names[j].ino, f->op, f->val)) {
 						++result;
 						break;
 					}
@@ -311,8 +313,7 @@ static int audit_filter_rules(struct task_struct *tsk,
 		case AUDIT_WATCH:
 			if (name && rule->watch->ino != (unsigned long)-1)
 				result = (name->dev == rule->watch->dev &&
-					  (name->ino == rule->watch->ino ||
-					   name->pino == rule->watch->ino));
+					  name->ino == rule->watch->ino);
 			break;
 		case AUDIT_LOGINUID:
 			result = 0;
@@ -526,7 +527,7 @@ static inline void audit_free_names(struct audit_context *context)
 #endif
 
 	for (i = 0; i < context->name_count; i++) {
-		if (context->names[i].name)
+		if (context->names[i].name && context->names[i].name_put)
 			__putname(context->names[i].name);
 	}
 	context->name_count = 0;
@@ -850,8 +851,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 		}
 	}
 	for (i = 0; i < context->name_count; i++) {
-		unsigned long ino  = context->names[i].ino;
-		unsigned long pino = context->names[i].pino;
+		struct audit_names *n = &context->names[i];
 
 		ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH);
 		if (!ab)
@@ -859,33 +859,47 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 
 		audit_log_format(ab, "item=%d", i);
 
-		audit_log_format(ab, " name=");
-		if (context->names[i].name)
-			audit_log_untrustedstring(ab, context->names[i].name);
-		else
-			audit_log_format(ab, "(null)");
-
-		if (pino != (unsigned long)-1)
-			audit_log_format(ab, " parent=%lu",  pino);
-		if (ino != (unsigned long)-1)
-			audit_log_format(ab, " inode=%lu",  ino);
-		if ((pino != (unsigned long)-1) || (ino != (unsigned long)-1))
-			audit_log_format(ab, " dev=%02x:%02x mode=%#o" 
-					 " ouid=%u ogid=%u rdev=%02x:%02x", 
-					 MAJOR(context->names[i].dev), 
-					 MINOR(context->names[i].dev), 
-					 context->names[i].mode, 
-					 context->names[i].uid, 
-					 context->names[i].gid, 
-					 MAJOR(context->names[i].rdev), 
-					 MINOR(context->names[i].rdev));
-		if (context->names[i].osid != 0) {
+		if (n->name) {
+			switch(n->name_len) {
+			case AUDIT_NAME_FULL:
+				/* log the full path */
+				audit_log_format(ab, " name=");
+				audit_log_untrustedstring(ab, n->name);
+				break;
+			case 0:
+				/* name was specified as a relative path and the
+				 * directory component is the cwd */
+				audit_log_d_path(ab, " name=", context->pwd,
+						 context->pwdmnt);
+				break;
+			default:
+				/* log the name's directory component */
+				audit_log_format(ab, " name=");
+				audit_log_n_untrustedstring(ab, n->name_len,
+							    n->name);
+			}
+		} else
+			audit_log_format(ab, " name=(null)");
+
+		if (n->ino != (unsigned long)-1) {
+			audit_log_format(ab, " inode=%lu"
+					 " dev=%02x:%02x mode=%#o"
+					 " ouid=%u ogid=%u rdev=%02x:%02x",
+					 n->ino,
+					 MAJOR(n->dev),
+					 MINOR(n->dev),
+					 n->mode,
+					 n->uid,
+					 n->gid,
+					 MAJOR(n->rdev),
+					 MINOR(n->rdev));
+		}
+		if (n->osid != 0) {
 			char *ctx = NULL;
 			u32 len;
 			if (selinux_ctxid_to_string(
-				context->names[i].osid, &ctx, &len)) {
-				audit_log_format(ab, " osid=%u",
-						context->names[i].osid);
+				n->osid, &ctx, &len)) {
+				audit_log_format(ab, " osid=%u", n->osid);
 				call_panic = 2;
 			} else
 				audit_log_format(ab, " obj=%s", ctx);
@@ -1075,6 +1089,8 @@ void __audit_getname(const char *name)
 	}
 	BUG_ON(context->name_count >= AUDIT_NAMES);
 	context->names[context->name_count].name = name;
+	context->names[context->name_count].name_len = AUDIT_NAME_FULL;
+	context->names[context->name_count].name_put = 1;
 	context->names[context->name_count].ino  = (unsigned long)-1;
 	++context->name_count;
 	if (!context->pwd) {
@@ -1141,11 +1157,10 @@ static void audit_inode_context(int idx, const struct inode *inode)
  * audit_inode - store the inode and device from a lookup
  * @name: name being audited
  * @inode: inode being audited
- * @flags: lookup flags (as used in path_lookup())
  *
  * Called from fs/namei.c:path_lookup().
  */
-void __audit_inode(const char *name, const struct inode *inode, unsigned flags)
+void __audit_inode(const char *name, const struct inode *inode)
 {
 	int idx;
 	struct audit_context *context = current->audit_context;
@@ -1171,20 +1186,13 @@ void __audit_inode(const char *name, const struct inode *inode, unsigned flags)
 		++context->ino_count;
 #endif
 	}
+	context->names[idx].ino   = inode->i_ino;
 	context->names[idx].dev	  = inode->i_sb->s_dev;
 	context->names[idx].mode  = inode->i_mode;
 	context->names[idx].uid   = inode->i_uid;
 	context->names[idx].gid   = inode->i_gid;
 	context->names[idx].rdev  = inode->i_rdev;
 	audit_inode_context(idx, inode);
-	if ((flags & LOOKUP_PARENT) && (strcmp(name, "/") != 0) && 
-	    (strcmp(name, ".") != 0)) {
-		context->names[idx].ino   = (unsigned long)-1;
-		context->names[idx].pino  = inode->i_ino;
-	} else {
-		context->names[idx].ino   = inode->i_ino;
-		context->names[idx].pino  = (unsigned long)-1;
-	}
 }
 
 /**
@@ -1206,34 +1214,40 @@ void __audit_inode_child(const char *dname, const struct inode *inode,
 {
 	int idx;
 	struct audit_context *context = current->audit_context;
+	const char *found_name = NULL;
+	int dirlen = 0;
 
 	if (!context->in_syscall)
 		return;
 
 	/* determine matching parent */
 	if (!dname)
-		goto no_match;
+		goto update_context;
 	for (idx = 0; idx < context->name_count; idx++)
-		if (context->names[idx].pino == pino) {
+		if (context->names[idx].ino == pino) {
 			const char *name = context->names[idx].name;
 
 			if (!name)
 				continue;
 
-			if (audit_compare_dname_path(dname, name) == 0)
-				goto update_context;
+			if (audit_compare_dname_path(dname, name, &dirlen) == 0) {
+				context->names[idx].name_len = dirlen;
+				found_name = name;
+				break;
+			}
 		}
 
-no_match:
-	/* catch-all in case match not found */
+update_context:
 	idx = context->name_count++;
-	context->names[idx].name  = NULL;
-	context->names[idx].pino  = pino;
 #if AUDIT_DEBUG
 	context->ino_count++;
 #endif
+	/* Re-use the name belonging to the slot for a matching parent directory.
+	 * All names for this context are relinquished in audit_free_names() */
+	context->names[idx].name = found_name;
+	context->names[idx].name_len = AUDIT_NAME_FULL;
+	context->names[idx].name_put = 0;	/* don't call __putname() */
 
-update_context:
 	if (inode) {
 		context->names[idx].ino   = inode->i_ino;
 		context->names[idx].dev	  = inode->i_sb->s_dev;
@@ -1242,7 +1256,8 @@ update_context:
 		context->names[idx].gid   = inode->i_gid;
 		context->names[idx].rdev  = inode->i_rdev;
 		audit_inode_context(idx, inode);
-	}
+	} else
+		context->names[idx].ino   = (unsigned long)-1;
 }
 
 /**
-- 
cgit v1.2.3


From e05b59fe7927bc648ac3af3d59dc64a7ee6b22e2 Mon Sep 17 00:00:00 2001
From: Corey Minyard <minyard@acm.org>
Date: Wed, 19 Apr 2006 22:40:53 +0200
Subject: [WATCHDOG] Pre-Timeout flags

Some watchdog timers support the concept of a "pretimeout" which
occurs some time before the real timeout.  The pretimeout can
be delivered via an interrupt or NMI and can be used to panic
the system when it occurs (so you get useful information instead
of a blind reboot).

Signed-off-by: Corey Minyard <minyard@acm.org>
Signed-off-by: Wim Van Sebroeck <wim@iguana.be>
Signed-off-by: Andrew Morton <akpm@osdl.org>
---
 Documentation/watchdog/watchdog-api.txt | 30 +++++++++++++++++++++++++++++-
 include/linux/watchdog.h                |  3 +++
 2 files changed, 32 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/Documentation/watchdog/watchdog-api.txt b/Documentation/watchdog/watchdog-api.txt
index 21ed51173662..7dc2c1c6f779 100644
--- a/Documentation/watchdog/watchdog-api.txt
+++ b/Documentation/watchdog/watchdog-api.txt
@@ -110,7 +110,31 @@ current timeout using the GETTIMEOUT ioctl.
     ioctl(fd, WDIOC_GETTIMEOUT, &timeout);
     printf("The timeout was is %d seconds\n", timeout);
 
-Envinronmental monitoring:
+Pretimeouts:
+
+Some watchdog timers can be set to have a trigger go off before the
+actual time they will reset the system.  This can be done with an NMI,
+interrupt, or other mechanism.  This allows Linux to record useful
+information (like panic information and kernel coredumps) before it
+resets.
+
+    pretimeout = 10;
+    ioctl(fd, WDIOC_SETPRETIMEOUT, &pretimeout);
+
+Note that the pretimeout is the number of seconds before the time
+when the timeout will go off.  It is not the number of seconds until
+the pretimeout.  So, for instance, if you set the timeout to 60 seconds
+and the pretimeout to 10 seconds, the pretimout will go of in 50
+seconds.  Setting a pretimeout to zero disables it.
+
+There is also a get function for getting the pretimeout:
+
+    ioctl(fd, WDIOC_GETPRETIMEOUT, &timeout);
+    printf("The pretimeout was is %d seconds\n", timeout);
+
+Not all watchdog drivers will support a pretimeout.
+
+Environmental monitoring:
 
 All watchdog drivers are required return more information about the system,
 some do temperature, fan and power level monitoring, some can tell you
@@ -169,6 +193,10 @@ The watchdog saw a keepalive ping since it was last queried.
 
 	WDIOF_SETTIMEOUT	Can set/get the timeout
 
+The watchdog can do pretimeouts.
+
+	WDIOF_PRETIMEOUT	Pretimeout (in seconds), get/set
+
 
 For those drivers that return any bits set in the option field, the
 GETSTATUS and GETBOOTSTATUS ioctls can be used to ask for the current
diff --git a/include/linux/watchdog.h b/include/linux/watchdog.h
index 1192ed8f4fe8..a99c937f665e 100644
--- a/include/linux/watchdog.h
+++ b/include/linux/watchdog.h
@@ -28,6 +28,8 @@ struct watchdog_info {
 #define	WDIOC_KEEPALIVE		_IOR(WATCHDOG_IOCTL_BASE, 5, int)
 #define	WDIOC_SETTIMEOUT        _IOWR(WATCHDOG_IOCTL_BASE, 6, int)
 #define	WDIOC_GETTIMEOUT        _IOR(WATCHDOG_IOCTL_BASE, 7, int)
+#define	WDIOC_SETPRETIMEOUT	_IOWR(WATCHDOG_IOCTL_BASE, 8, int)
+#define	WDIOC_GETPRETIMEOUT	_IOR(WATCHDOG_IOCTL_BASE, 9, int)
 
 #define	WDIOF_UNKNOWN		-1	/* Unknown flag error */
 #define	WDIOS_UNKNOWN		-1	/* Unknown status error */
@@ -41,6 +43,7 @@ struct watchdog_info {
 #define WDIOF_POWEROVER		0x0040	/* Power over voltage */
 #define WDIOF_SETTIMEOUT	0x0080  /* Set timeout (in seconds) */
 #define WDIOF_MAGICCLOSE	0x0100	/* Supports magic close char */
+#define	WDIOF_PRETIMEOUT	0x0200  /* Pretimeout (in seconds), get/set */
 #define	WDIOF_KEEPALIVEPING	0x8000	/* Keep alive ping reply */
 
 #define	WDIOS_DISABLECARD	0x0001	/* Turn off the watchdog timer */
-- 
cgit v1.2.3


From 58b519f3e5e491d5a3e320dc525f58ac439bdde4 Mon Sep 17 00:00:00 2001
From: Wim Van Sebroeck <wim@iguana.be>
Date: Sun, 21 May 2006 12:48:44 +0200
Subject: [WATCHDOG] add WDIOC_GETTIMELEFT ioctl

Some watchdog drivers have the ability to report the remaining time
before the system will reboot. With the WDIOC_GETTIMELEFT ioctl
you can now read the time left before the watchdog would reboot
your system.

The following drivers support this new IOCTL:
i8xx_tco.c, pcwd_pci.c and pcwd_usb.c .

Signed-off-by: Wim Van Sebroeck <wim@iguana.be>
Signed-off-by: Andrew Morton <akpm@osdl.org>
---
 Documentation/watchdog/watchdog-api.txt |  9 +++++++++
 drivers/char/watchdog/i8xx_tco.c        | 28 +++++++++++++++++++++++++++-
 drivers/char/watchdog/pcwd_pci.c        | 30 +++++++++++++++++++++++++++++-
 drivers/char/watchdog/pcwd_usb.c        | 23 +++++++++++++++++++++++
 include/linux/watchdog.h                |  7 ++++---
 5 files changed, 92 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/watchdog/watchdog-api.txt b/Documentation/watchdog/watchdog-api.txt
index 7dc2c1c6f779..d738ec25eaa4 100644
--- a/Documentation/watchdog/watchdog-api.txt
+++ b/Documentation/watchdog/watchdog-api.txt
@@ -134,6 +134,15 @@ There is also a get function for getting the pretimeout:
 
 Not all watchdog drivers will support a pretimeout.
 
+Get the number of seconds before reboot:
+
+Some watchdog drivers have the ability to report the remaining time
+before the system will reboot. The WDIOC_GETTIMELEFT is the ioctl
+that returns the number of seconds before reboot.
+
+    ioctl(fd, WDIOC_GETTIMELEFT, &timeleft);
+    printf("The timeout was is %d seconds\n", timeleft);
+
 Environmental monitoring:
 
 All watchdog drivers are required return more information about the system,
diff --git a/drivers/char/watchdog/i8xx_tco.c b/drivers/char/watchdog/i8xx_tco.c
index fa2ba9ebe42a..bfbdbbf3c2f2 100644
--- a/drivers/char/watchdog/i8xx_tco.c
+++ b/drivers/char/watchdog/i8xx_tco.c
@@ -205,6 +205,23 @@ static int tco_timer_set_heartbeat (int t)
 	return 0;
 }
 
+static int tco_timer_get_timeleft (int *time_left)
+{
+	unsigned char val;
+
+	spin_lock(&tco_lock);
+
+	/* read the TCO Timer */
+	val = inb (TCO1_RLD);
+	val &= 0x3f;
+
+	spin_unlock(&tco_lock);
+
+	*time_left = (int)((val * 6) / 10);
+
+	return 0;
+}
+
 /*
  *	/dev/watchdog handling
  */
@@ -272,6 +289,7 @@ static int i8xx_tco_ioctl (struct inode *inode, struct file *file,
 {
 	int new_options, retval = -EINVAL;
 	int new_heartbeat;
+	int time_left;
 	void __user *argp = (void __user *)arg;
 	int __user *p = argp;
 	static struct watchdog_info ident = {
@@ -320,7 +338,7 @@ static int i8xx_tco_ioctl (struct inode *inode, struct file *file,
 				return -EFAULT;
 
 			if (tco_timer_set_heartbeat(new_heartbeat))
-			    return -EINVAL;
+				return -EINVAL;
 
 			tco_timer_keepalive ();
 			/* Fall */
@@ -329,6 +347,14 @@ static int i8xx_tco_ioctl (struct inode *inode, struct file *file,
 		case WDIOC_GETTIMEOUT:
 			return put_user(heartbeat, p);
 
+		case WDIOC_GETTIMELEFT:
+		{
+			if (tco_timer_get_timeleft(&time_left))
+				return -EINVAL;
+
+			return put_user(time_left, p);
+		}
+
 		default:
 			return -ENOIOCTLCMD;
 	}
diff --git a/drivers/char/watchdog/pcwd_pci.c b/drivers/char/watchdog/pcwd_pci.c
index 2451edbefece..1f40ecefbf72 100644
--- a/drivers/char/watchdog/pcwd_pci.c
+++ b/drivers/char/watchdog/pcwd_pci.c
@@ -21,7 +21,7 @@
  */
 
 /*
- *	A bells and whistles driver is available from: 
+ *	A bells and whistles driver is available from:
  *	http://www.kernel.org/pub/linux/kernel/people/wim/pcwd/pcwd_pci/
  *
  *	More info available at http://www.berkprod.com/ or http://www.pcwatchdog.com/
@@ -390,6 +390,24 @@ static int pcipcwd_get_temperature(int *temperature)
 	return 0;
 }
 
+static int pcipcwd_get_timeleft(int *time_left)
+{
+	int msb;
+	int lsb;
+
+	/* Read the time that's left before rebooting */
+	/* Note: if the board is not yet armed then we will read 0xFFFF */
+	send_command(CMD_READ_WATCHDOG_TIMEOUT, &msb, &lsb);
+
+	*time_left = (msb << 8) + lsb;
+
+	if (debug >= VERBOSE)
+		printk(KERN_DEBUG PFX "Time left before next reboot: %d\n",
+		       *time_left);
+
+	return 0;
+}
+
 /*
  *	/dev/watchdog handling
  */
@@ -512,6 +530,16 @@ static int pcipcwd_ioctl(struct inode *inode, struct file *file,
 		case WDIOC_GETTIMEOUT:
 			return put_user(heartbeat, p);
 
+		case WDIOC_GETTIMELEFT:
+		{
+			int time_left;
+
+			if (pcipcwd_get_timeleft(&time_left))
+				return -EFAULT;
+
+			return put_user(time_left, p);
+		}
+
 		default:
 			return -ENOIOCTLCMD;
 	}
diff --git a/drivers/char/watchdog/pcwd_usb.c b/drivers/char/watchdog/pcwd_usb.c
index 3fdfda9324fa..0d072bed501d 100644
--- a/drivers/char/watchdog/pcwd_usb.c
+++ b/drivers/char/watchdog/pcwd_usb.c
@@ -317,6 +317,19 @@ static int usb_pcwd_get_temperature(struct usb_pcwd_private *usb_pcwd, int *temp
 	return 0;
 }
 
+static int usb_pcwd_get_timeleft(struct usb_pcwd_private *usb_pcwd, int *time_left)
+{
+	unsigned char msb, lsb;
+
+	/* Read the time that's left before rebooting */
+	/* Note: if the board is not yet armed then we will read 0xFFFF */
+	usb_pcwd_send_command(usb_pcwd, CMD_READ_WATCHDOG_TIMEOUT, &msb, &lsb);
+
+	*time_left = (msb << 8) + lsb;
+
+	return 0;
+}
+
 /*
  *	/dev/watchdog handling
  */
@@ -422,6 +435,16 @@ static int usb_pcwd_ioctl(struct inode *inode, struct file *file,
 		case WDIOC_GETTIMEOUT:
 			return put_user(heartbeat, p);
 
+		case WDIOC_GETTIMELEFT:
+		{
+			int time_left;
+
+			if (usb_pcwd_get_timeleft(usb_pcwd_device, &time_left))
+				return -EFAULT;
+
+			return put_user(time_left, p);
+		}
+
 		default:
 			return -ENOIOCTLCMD;
 	}
diff --git a/include/linux/watchdog.h b/include/linux/watchdog.h
index a99c937f665e..011bcfeb9f09 100644
--- a/include/linux/watchdog.h
+++ b/include/linux/watchdog.h
@@ -30,6 +30,7 @@ struct watchdog_info {
 #define	WDIOC_GETTIMEOUT        _IOR(WATCHDOG_IOCTL_BASE, 7, int)
 #define	WDIOC_SETPRETIMEOUT	_IOWR(WATCHDOG_IOCTL_BASE, 8, int)
 #define	WDIOC_GETPRETIMEOUT	_IOR(WATCHDOG_IOCTL_BASE, 9, int)
+#define	WDIOC_GETTIMELEFT	_IOR(WATCHDOG_IOCTL_BASE, 10, int)
 
 #define	WDIOF_UNKNOWN		-1	/* Unknown flag error */
 #define	WDIOS_UNKNOWN		-1	/* Unknown status error */
@@ -40,9 +41,9 @@ struct watchdog_info {
 #define	WDIOF_EXTERN2		0x0008	/* External relay 2 */
 #define	WDIOF_POWERUNDER	0x0010	/* Power bad/power fault */
 #define	WDIOF_CARDRESET		0x0020	/* Card previously reset the CPU */
-#define WDIOF_POWEROVER		0x0040	/* Power over voltage */
-#define WDIOF_SETTIMEOUT	0x0080  /* Set timeout (in seconds) */
-#define WDIOF_MAGICCLOSE	0x0100	/* Supports magic close char */
+#define	WDIOF_POWEROVER		0x0040	/* Power over voltage */
+#define	WDIOF_SETTIMEOUT	0x0080  /* Set timeout (in seconds) */
+#define	WDIOF_MAGICCLOSE	0x0100	/* Supports magic close char */
 #define	WDIOF_PRETIMEOUT	0x0200  /* Pretimeout (in seconds), get/set */
 #define	WDIOF_KEEPALIVEPING	0x8000	/* Keep alive ping reply */
 
-- 
cgit v1.2.3


From 7bc3312bef4d6f220812500c0de7868fb7625a41 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@cruncher.tec.linutronix.de>
Date: Tue, 20 Jun 2006 20:05:05 +0200
Subject: [MTD] NAND: Fix breakage all over the place

Following problems are addressed:

- wrong status caused early break out of nand_wait()
- removed the bogus status check in nand_wait() which
  is a relict of the abandoned support for interrupted
  erase.
- status check moved to the correct place in read_oob
- oob support for syndrom based ecc with strange layouts
- use given offset in the AUTOOOB based oob operations

Partially based on a patch from Vitaly Vool <vwool@ru.mvista.com>
Thanks to Savin Zlobec <savin@epico.si> for tracking down the
status problem.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 drivers/mtd/mtdchar.c         |   8 +-
 drivers/mtd/nand/diskonchip.c |   2 +-
 drivers/mtd/nand/nand_base.c  | 297 ++++++++++++++++++++++++++++++------------
 include/linux/mtd/nand.h      |  12 +-
 4 files changed, 231 insertions(+), 88 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/mtdchar.c b/drivers/mtd/mtdchar.c
index 5dd0b8d72c8b..aa18d45b264b 100644
--- a/drivers/mtd/mtdchar.c
+++ b/drivers/mtd/mtdchar.c
@@ -504,12 +504,12 @@ static int mtd_ioctl(struct inode *inode, struct file *file,
 			return ret;
 
 		ops.len = buf.length;
-		ops.ooblen = mtd->oobsize;
+		ops.ooblen = buf.length;
 		ops.ooboffs = buf.start & (mtd->oobsize - 1);
 		ops.datbuf = NULL;
 		ops.mode = MTD_OOB_PLACE;
 
-		if (ops.ooboffs && ops.len > (ops.ooblen - ops.ooboffs))
+		if (ops.ooboffs && ops.len > (mtd->oobsize - ops.ooboffs))
 			return -EINVAL;
 
 		ops.oobbuf = kmalloc(buf.length, GFP_KERNEL);
@@ -553,12 +553,12 @@ static int mtd_ioctl(struct inode *inode, struct file *file,
 			return ret;
 
 		ops.len = buf.length;
-		ops.ooblen = mtd->oobsize;
+		ops.ooblen = buf.length;
 		ops.ooboffs = buf.start & (mtd->oobsize - 1);
 		ops.datbuf = NULL;
 		ops.mode = MTD_OOB_PLACE;
 
-		if (ops.ooboffs && ops.len > (ops.ooblen - ops.ooboffs))
+		if (ops.ooboffs && ops.len > (mtd->oobsize - ops.ooboffs))
 			return -EINVAL;
 
 		ops.oobbuf = kmalloc(buf.length, GFP_KERNEL);
diff --git a/drivers/mtd/nand/diskonchip.c b/drivers/mtd/nand/diskonchip.c
index 463e12ced1b3..6107f532855b 100644
--- a/drivers/mtd/nand/diskonchip.c
+++ b/drivers/mtd/nand/diskonchip.c
@@ -464,7 +464,7 @@ static void __init doc2000_count_chips(struct mtd_info *mtd)
 	printk(KERN_DEBUG "Detected %d chips per floor.\n", i);
 }
 
-static int doc200x_wait(struct mtd_info *mtd, struct nand_chip *this, int state)
+static int doc200x_wait(struct mtd_info *mtd, struct nand_chip *this)
 {
 	struct doc_priv *doc = this->priv;
 
diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c
index e74678e928cf..27083ed0a017 100644
--- a/drivers/mtd/nand/nand_base.c
+++ b/drivers/mtd/nand/nand_base.c
@@ -501,7 +501,6 @@ static void nand_command(struct mtd_info *mtd, unsigned int command,
 	case NAND_CMD_ERASE2:
 	case NAND_CMD_SEQIN:
 	case NAND_CMD_STATUS:
-		chip->cmd_ctrl(mtd, NAND_CMD_NONE, NAND_NCE);
 		return;
 
 	case NAND_CMD_RESET:
@@ -595,6 +594,7 @@ static void nand_command_lp(struct mtd_info *mtd, unsigned int command,
 	case NAND_CMD_ERASE1:
 	case NAND_CMD_ERASE2:
 	case NAND_CMD_SEQIN:
+	case NAND_CMD_RNDIN:
 	case NAND_CMD_STATUS:
 	case NAND_CMD_DEPLETE1:
 		return;
@@ -621,6 +621,14 @@ static void nand_command_lp(struct mtd_info *mtd, unsigned int command,
 		while (!(chip->read_byte(mtd) & NAND_STATUS_READY)) ;
 		return;
 
+	case NAND_CMD_RNDOUT:
+		/* No ready / busy check necessary */
+		chip->cmd_ctrl(mtd, NAND_CMD_RNDOUTSTART,
+			       NAND_NCE | NAND_CLE | NAND_CTRL_CHANGE);
+		chip->cmd_ctrl(mtd, NAND_CMD_NONE,
+			       NAND_NCE | NAND_CTRL_CHANGE);
+		return;
+
 	case NAND_CMD_READ0:
 		chip->cmd_ctrl(mtd, NAND_CMD_READSTART,
 			       NAND_NCE | NAND_CLE | NAND_CTRL_CHANGE);
@@ -689,18 +697,17 @@ nand_get_device(struct nand_chip *chip, struct mtd_info *mtd, int new_state)
  * nand_wait - [DEFAULT]  wait until the command is done
  * @mtd:	MTD device structure
  * @this:	NAND chip structure
- * @state:	state to select the max. timeout value
  *
  * Wait for command done. This applies to erase and program only
  * Erase can take up to 400ms and program up to 20ms according to
  * general NAND and SmartMedia specs
  *
 */
-static int nand_wait(struct mtd_info *mtd, struct nand_chip *chip, int state)
+static int nand_wait(struct mtd_info *mtd, struct nand_chip *chip)
 {
 
 	unsigned long timeo = jiffies;
-	int status;
+	int status, state = chip->state;
 
 	if (state == FL_ERASING)
 		timeo += (HZ * 400) / 1000;
@@ -719,10 +726,6 @@ static int nand_wait(struct mtd_info *mtd, struct nand_chip *chip, int state)
 		chip->cmdfunc(mtd, NAND_CMD_STATUS, -1, -1);
 
 	while (time_before(jiffies, timeo)) {
-		/* Check, if we were interrupted */
-		if (chip->state != state)
-			return 0;
-
 		if (chip->dev_ready) {
 			if (chip->dev_ready(mtd))
 				break;
@@ -909,12 +912,25 @@ static uint8_t *nand_transfer_oob(struct nand_chip *chip, uint8_t *oob,
 
 	case MTD_OOB_AUTO: {
 		struct nand_oobfree *free = chip->ecc.layout->oobfree;
-		size_t bytes;
+		uint32_t boffs = 0, roffs = ops->ooboffs;
+		size_t bytes = 0;
 
 		for(; free->length && len; free++, len -= bytes) {
-			bytes = min_t(size_t, len, free->length);
-
-			memcpy(oob, chip->oob_poi + free->offset, bytes);
+			/* Read request not from offset 0 ? */
+			if (unlikely(roffs)) {
+				if (roffs >= free->length) {
+					roffs -= free->length;
+					continue;
+				}
+				boffs = free->offset + roffs;
+				bytes = min_t(size_t, len,
+					      (free->length - roffs));
+				roffs = 0;
+			} else {
+				bytes = min_t(size_t, len, free->length);
+				boffs = free->offset;
+			}
+			memcpy(oob, chip->oob_poi + boffs, bytes);
 			oob += bytes;
 		}
 		return oob;
@@ -1083,6 +1099,145 @@ static int nand_read(struct mtd_info *mtd, loff_t from, size_t len,
 	return ret;
 }
 
+/**
+ * nand_read_oob_std - [REPLACABLE] the most common OOB data read function
+ * @mtd:	mtd info structure
+ * @chip:	nand chip info structure
+ * @page:	page number to read
+ * @sndcmd:	flag whether to issue read command or not
+ */
+static int nand_read_oob_std(struct mtd_info *mtd, struct nand_chip *chip,
+			     int page, int sndcmd)
+{
+	if (sndcmd) {
+		chip->cmdfunc(mtd, NAND_CMD_READOOB, 0, page);
+		sndcmd = 0;
+	}
+	chip->read_buf(mtd, chip->oob_poi, mtd->oobsize);
+	return sndcmd;
+}
+
+/**
+ * nand_read_oob_syndrome - [REPLACABLE] OOB data read function for HW ECC
+ *			    with syndromes
+ * @mtd:	mtd info structure
+ * @chip:	nand chip info structure
+ * @page:	page number to read
+ * @sndcmd:	flag whether to issue read command or not
+ */
+static int nand_read_oob_syndrome(struct mtd_info *mtd, struct nand_chip *chip,
+				  int page, int sndcmd)
+{
+	uint8_t *buf = chip->oob_poi;
+	int length = mtd->oobsize;
+	int chunk = chip->ecc.bytes + chip->ecc.prepad + chip->ecc.postpad;
+	int eccsize = chip->ecc.size;
+	uint8_t *bufpoi = buf;
+	int i, toread, sndrnd = 0, pos;
+
+	chip->cmdfunc(mtd, NAND_CMD_READ0, chip->ecc.size, page);
+	for (i = 0; i < chip->ecc.steps; i++) {
+		if (sndrnd) {
+			pos = eccsize + i * (eccsize + chunk);
+			if (mtd->writesize > 512)
+				chip->cmdfunc(mtd, NAND_CMD_RNDOUT, pos, -1);
+			else
+				chip->cmdfunc(mtd, NAND_CMD_READ0, pos, page);
+		} else
+			sndrnd = 1;
+		toread = min_t(int, length, chunk);
+		chip->read_buf(mtd, bufpoi, toread);
+		bufpoi += toread;
+		length -= toread;
+	}
+	if (length > 0)
+		chip->read_buf(mtd, bufpoi, length);
+
+	return 1;
+}
+
+/**
+ * nand_write_oob_std - [REPLACABLE] the most common OOB data write function
+ * @mtd:	mtd info structure
+ * @chip:	nand chip info structure
+ * @page:	page number to write
+ */
+static int nand_write_oob_std(struct mtd_info *mtd, struct nand_chip *chip,
+			      int page)
+{
+	int status = 0;
+	const uint8_t *buf = chip->oob_poi;
+	int length = mtd->oobsize;
+
+	chip->cmdfunc(mtd, NAND_CMD_SEQIN, mtd->writesize, page);
+	chip->write_buf(mtd, buf, length);
+	/* Send command to program the OOB data */
+	chip->cmdfunc(mtd, NAND_CMD_PAGEPROG, -1, -1);
+
+	status = chip->waitfunc(mtd, chip);
+
+	return status;
+}
+
+/**
+ * nand_write_oob_syndrome - [REPLACABLE] OOB data write function for HW ECC
+ *			     with syndrome - only for large page flash !
+ * @mtd:	mtd info structure
+ * @chip:	nand chip info structure
+ * @page:	page number to write
+ */
+static int nand_write_oob_syndrome(struct mtd_info *mtd,
+				   struct nand_chip *chip, int page)
+{
+	int chunk = chip->ecc.bytes + chip->ecc.prepad + chip->ecc.postpad;
+	int eccsize = chip->ecc.size, length = mtd->oobsize;
+	int i, len, pos, status = 0, sndcmd = 0, steps = chip->ecc.steps;
+	const uint8_t *bufpoi = chip->oob_poi;
+
+	/*
+	 * data-ecc-data-ecc ... ecc-oob
+	 * or
+	 * data-pad-ecc-pad-data-pad .... ecc-pad-oob
+	 */
+	if (!chip->ecc.prepad && !chip->ecc.postpad) {
+		pos = steps * (eccsize + chunk);
+		steps = 0;
+	} else
+		pos = eccsize + chunk;
+
+	chip->cmdfunc(mtd, NAND_CMD_SEQIN, pos, page);
+	for (i = 0; i < steps; i++) {
+		if (sndcmd) {
+			if (mtd->writesize <= 512) {
+				uint32_t fill = 0xFFFFFFFF;
+
+				len = eccsize;
+				while (len > 0) {
+					int num = min_t(int, len, 4);
+					chip->write_buf(mtd, (uint8_t *)&fill,
+							num);
+					len -= num;
+				}
+			} else {
+				pos = eccsize + i * (eccsize + chunk);
+				chip->cmdfunc(mtd, NAND_CMD_RNDIN, pos, -1);
+			}
+		} else
+			sndcmd = 1;
+		len = min_t(int, length, chunk);
+		chip->write_buf(mtd, bufpoi, len);
+		bufpoi += len;
+		length -= len;
+	}
+	if (length > 0)
+		chip->write_buf(mtd, bufpoi, length);
+
+	chip->cmdfunc(mtd, NAND_CMD_PAGEPROG, -1, -1);
+	status = chip->waitfunc(mtd, chip);
+
+	return status & NAND_STATUS_FAIL ? -EIO : 0;
+}
+
 /**
  * nand_do_read_oob - [Intern] NAND read out-of-band
  * @mtd:	MTD device structure
@@ -1094,11 +1249,11 @@ static int nand_read(struct mtd_info *mtd, loff_t from, size_t len,
 static int nand_do_read_oob(struct mtd_info *mtd, loff_t from,
 			    struct mtd_oob_ops *ops)
 {
-	int col, page, realpage, chipnr, sndcmd = 1;
+	int page, realpage, chipnr, sndcmd = 1;
 	struct nand_chip *chip = mtd->priv;
 	int blkcheck = (1 << (chip->phys_erase_shift - chip->page_shift)) - 1;
-	int direct, bytes, readlen = ops->len;
-	uint8_t *bufpoi, *buf = ops->oobbuf;
+	int readlen = ops->len;
+	uint8_t *buf = ops->oobbuf;
 
 	DEBUG(MTD_DEBUG_LEVEL3, "nand_read_oob: from = 0x%08Lx, len = %i\n",
 	      (unsigned long long)from, readlen);
@@ -1110,29 +1265,11 @@ static int nand_do_read_oob(struct mtd_info *mtd, loff_t from,
 	realpage = (int)(from >> chip->page_shift);
 	page = realpage & chip->pagemask;
 
-	if (ops->mode != MTD_OOB_AUTO) {
-		col = ops->ooboffs;
-		direct = 1;
-	} else {
-		col = 0;
-		direct = 0;
-	}
+	chip->oob_poi = chip->buffers.oobrbuf;
 
 	while(1) {
-		bytes = direct ? ops->ooblen : mtd->oobsize;
-		bufpoi = direct ? buf : chip->buffers.oobrbuf;
-
-		if (likely(sndcmd)) {
-			chip->cmdfunc(mtd, NAND_CMD_READOOB, col, page);
-			sndcmd = 0;
-		}
-
-		chip->read_buf(mtd, bufpoi, bytes);
-
-		if (unlikely(!direct))
-			buf = nand_transfer_oob(chip, buf, ops);
-		else
-			buf += ops->ooblen;
+		sndcmd = chip->ecc.read_oob(mtd, chip, page, sndcmd);
+		buf = nand_transfer_oob(chip, buf, ops);
 
 		readlen -= ops->ooblen;
 		if (!readlen)
@@ -1365,7 +1502,7 @@ static int nand_write_page(struct mtd_info *mtd, struct nand_chip *chip,
 	if (!cached || !(chip->options & NAND_CACHEPRG)) {
 
 		chip->cmdfunc(mtd, NAND_CMD_PAGEPROG, -1, -1);
-		status = chip->waitfunc(mtd, chip, FL_WRITING);
+		status = chip->waitfunc(mtd, chip);
 		/*
 		 * See if operation failed and additional status checks are
 		 * available
@@ -1378,7 +1515,7 @@ static int nand_write_page(struct mtd_info *mtd, struct nand_chip *chip,
 			return -EIO;
 	} else {
 		chip->cmdfunc(mtd, NAND_CMD_CACHEDPROG, -1, -1);
-		status = chip->waitfunc(mtd, chip, FL_WRITING);
+		status = chip->waitfunc(mtd, chip);
 	}
 
 #ifdef CONFIG_MTD_NAND_VERIFY_WRITE
@@ -1411,11 +1548,25 @@ static uint8_t *nand_fill_oob(struct nand_chip *chip, uint8_t *oob,
 
 	case MTD_OOB_AUTO: {
 		struct nand_oobfree *free = chip->ecc.layout->oobfree;
-		size_t bytes;
+		uint32_t boffs = 0, woffs = ops->ooboffs;
+		size_t bytes = 0;
 
 		for(; free->length && len; free++, len -= bytes) {
-			bytes = min_t(size_t, len, free->length);
-			memcpy(chip->oob_poi + free->offset, oob, bytes);
+			/* Write request not from offset 0 ? */
+			if (unlikely(woffs)) {
+				if (woffs >= free->length) {
+					woffs -= free->length;
+					continue;
+				}
+				boffs = free->offset + woffs;
+				bytes = min_t(size_t, len,
+					      (free->length - woffs));
+				woffs = 0;
+			} else {
+				bytes = min_t(size_t, len, free->length);
+				boffs = free->offset;
+			}
+			memcpy(chip->oob_poi + woffs, oob, bytes);
 			oob += bytes;
 		}
 		return oob;
@@ -1532,7 +1683,7 @@ static int nand_write(struct mtd_info *mtd, loff_t to, size_t len,
 	if (!len)
 		return 0;
 
-	nand_get_device(chip, mtd, FL_READING);
+	nand_get_device(chip, mtd, FL_WRITING);
 
 	chip->ops.len = len;
 	chip->ops.datbuf = (uint8_t *)buf;
@@ -1592,48 +1743,18 @@ static int nand_do_write_oob(struct mtd_info *mtd, loff_t to,
 	if (page == chip->pagebuf)
 		chip->pagebuf = -1;
 
-	if (ops->mode == MTD_OOB_AUTO || NAND_MUST_PAD(chip)) {
-		chip->oob_poi = chip->buffers.oobwbuf;
-		memset(chip->oob_poi, 0xff, mtd->oobsize);
-		nand_fill_oob(chip, ops->oobbuf, ops);
-		chip->cmdfunc(mtd, NAND_CMD_SEQIN, mtd->writesize,
-			      page & chip->pagemask);
-		chip->write_buf(mtd, chip->oob_poi, mtd->oobsize);
-		memset(chip->oob_poi, 0xff, mtd->oobsize);
-	} else {
-		chip->cmdfunc(mtd, NAND_CMD_SEQIN,
-			      mtd->writesize + ops->ooboffs,
-			      page & chip->pagemask);
-		chip->write_buf(mtd, ops->oobbuf, ops->len);
-	}
-
-	/* Send command to program the OOB data */
-	chip->cmdfunc(mtd, NAND_CMD_PAGEPROG, -1, -1);
+	chip->oob_poi = chip->buffers.oobwbuf;
+	memset(chip->oob_poi, 0xff, mtd->oobsize);
+	nand_fill_oob(chip, ops->oobbuf, ops);
+	status = chip->ecc.write_oob(mtd, chip, page & chip->pagemask);
+	memset(chip->oob_poi, 0xff, mtd->oobsize);
 
-	status = chip->waitfunc(mtd, chip, FL_WRITING);
+	if (status)
+		return status;
 
-	/* See if device thinks it succeeded */
-	if (status & NAND_STATUS_FAIL) {
-		DEBUG(MTD_DEBUG_LEVEL0, "nand_write_oob: "
-		      "Failed write, page 0x%08x\n", page);
-		return -EIO;
-	}
 	ops->retlen = ops->len;
 
-#ifdef CONFIG_MTD_NAND_VERIFY_WRITE
-	if (ops->mode != MTD_OOB_AUTO) {
-		/* Send command to read back the data */
-		chip->cmdfunc(mtd, NAND_CMD_READOOB, ops->ooboffs,
-			      page & chip->pagemask);
-
-		if (chip->verify_buf(mtd, ops->oobbuf, ops->len)) {
-			DEBUG(MTD_DEBUG_LEVEL0, "nand_write_oob: "
-			      "Failed write verify, page 0x%08x\n", page);
-			return -EIO;
-		}
-	}
-#endif
-		return 0;
+	return 0;
 }
 
 /**
@@ -1659,7 +1780,7 @@ static int nand_write_oob(struct mtd_info *mtd, loff_t to,
 		return -EINVAL;
 	}
 
-	nand_get_device(chip, mtd, FL_READING);
+	nand_get_device(chip, mtd, FL_WRITING);
 
 	switch(ops->mode) {
 	case MTD_OOB_PLACE:
@@ -1833,7 +1954,7 @@ int nand_erase_nand(struct mtd_info *mtd, struct erase_info *instr,
 
 		chip->erase_cmd(mtd, page & chip->pagemask);
 
-		status = chip->waitfunc(mtd, chip, FL_ERASING);
+		status = chip->waitfunc(mtd, chip);
 
 		/*
 		 * See if operation failed and additional status checks are
@@ -2265,6 +2386,10 @@ int nand_scan(struct mtd_info *mtd, int maxchips)
 			chip->ecc.read_page = nand_read_page_hwecc;
 		if (!chip->ecc.write_page)
 			chip->ecc.write_page = nand_write_page_hwecc;
+		if (!chip->ecc.read_oob)
+			chip->ecc.read_oob = nand_read_oob_std;
+		if (!chip->ecc.write_oob)
+			chip->ecc.write_oob = nand_write_oob_std;
 
 	case NAND_ECC_HW_SYNDROME:
 		if (!chip->ecc.calculate || !chip->ecc.correct ||
@@ -2278,6 +2403,10 @@ int nand_scan(struct mtd_info *mtd, int maxchips)
 			chip->ecc.read_page = nand_read_page_syndrome;
 		if (!chip->ecc.write_page)
 			chip->ecc.write_page = nand_write_page_syndrome;
+		if (!chip->ecc.read_oob)
+			chip->ecc.read_oob = nand_read_oob_syndrome;
+		if (!chip->ecc.write_oob)
+			chip->ecc.write_oob = nand_write_oob_syndrome;
 
 		if (mtd->writesize >= chip->ecc.size)
 			break;
@@ -2291,6 +2420,8 @@ int nand_scan(struct mtd_info *mtd, int maxchips)
 		chip->ecc.correct = nand_correct_data;
 		chip->ecc.read_page = nand_read_page_swecc;
 		chip->ecc.write_page = nand_write_page_swecc;
+		chip->ecc.read_oob = nand_read_oob_std;
+		chip->ecc.write_oob = nand_write_oob_std;
 		chip->ecc.size = 256;
 		chip->ecc.bytes = 3;
 		break;
@@ -2300,6 +2431,8 @@ int nand_scan(struct mtd_info *mtd, int maxchips)
 		       "This is not recommended !!\n");
 		chip->ecc.read_page = nand_read_page_raw;
 		chip->ecc.write_page = nand_write_page_raw;
+		chip->ecc.read_oob = nand_read_oob_std;
+		chip->ecc.write_oob = nand_write_oob_std;
 		chip->ecc.size = mtd->writesize;
 		chip->ecc.bytes = 0;
 		break;
diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h
index bf2ce68901f5..a30969eb9afe 100644
--- a/include/linux/mtd/nand.h
+++ b/include/linux/mtd/nand.h
@@ -63,18 +63,21 @@ extern void nand_release (struct mtd_info *mtd);
  */
 #define NAND_CMD_READ0		0
 #define NAND_CMD_READ1		1
+#define NAND_CMD_RNDOUT		5
 #define NAND_CMD_PAGEPROG	0x10
 #define NAND_CMD_READOOB	0x50
 #define NAND_CMD_ERASE1		0x60
 #define NAND_CMD_STATUS		0x70
 #define NAND_CMD_STATUS_MULTI	0x71
 #define NAND_CMD_SEQIN		0x80
+#define NAND_CMD_RNDIN		0x85
 #define NAND_CMD_READID		0x90
 #define NAND_CMD_ERASE2		0xd0
 #define NAND_CMD_RESET		0xff
 
 /* Extended commands for large page devices */
 #define NAND_CMD_READSTART	0x30
+#define NAND_CMD_RNDOUTSTART	0xE0
 #define NAND_CMD_CACHEDPROG	0x15
 
 /* Extended commands for AG-AND device */
@@ -250,6 +253,13 @@ struct nand_ecc_ctrl {
 	void			(*write_page)(struct mtd_info *mtd,
 					      struct nand_chip *chip,
 					      const uint8_t *buf);
+	int			(*read_oob)(struct mtd_info *mtd,
+					    struct nand_chip *chip,
+					    int page,
+					    int sndcmd);
+	int			(*write_oob)(struct mtd_info *mtd,
+					     struct nand_chip *chip,
+					     int page);
 };
 
 /**
@@ -339,7 +349,7 @@ struct nand_chip {
 				    unsigned int ctrl);
 	int		(*dev_ready)(struct mtd_info *mtd);
 	void		(*cmdfunc)(struct mtd_info *mtd, unsigned command, int column, int page_addr);
-	int		(*waitfunc)(struct mtd_info *mtd, struct nand_chip *this, int state);
+	int		(*waitfunc)(struct mtd_info *mtd, struct nand_chip *this);
 	void		(*erase_cmd)(struct mtd_info *mtd, int page);
 	int		(*scan_bbt)(struct mtd_info *mtd);
 	int		(*errstat)(struct mtd_info *mtd, struct nand_chip *this, int state, int status, int page);
-- 
cgit v1.2.3


From 22ae813b85df7c0b0fc7c8d6f336d6a9f566ff97 Mon Sep 17 00:00:00 2001
From: Brice Goglin <brice@myri.com>
Date: Tue, 20 Jun 2006 20:03:02 -0700
Subject: [PATCH] add __iowrite64_copy

Introduce __iowrite64_copy.  It will be used by the Myri-10G Ethernet
driver to post requests to the NIC.  This driver will be submitted soon.

__iowrite64_copy copies to I/O memory in units of 64 bits when possible (on
64 bit architectures).  It reverts to __iowrite32_copy on 32 bit
architectures.

Signed-off-by: Brice Goglin <brice@myri.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/io.h |  1 +
 lib/iomap_copy.c   | 28 ++++++++++++++++++++++++++++
 2 files changed, 29 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/io.h b/include/linux/io.h
index 85533ec5aaa1..420e2fdf26f6 100644
--- a/include/linux/io.h
+++ b/include/linux/io.h
@@ -21,5 +21,6 @@
 #include <asm/io.h>
 
 void __iowrite32_copy(void __iomem *to, const void *from, size_t count);
+void __iowrite64_copy(void __iomem *to, const void *from, size_t count);
 
 #endif /* _LINUX_IO_H */
diff --git a/lib/iomap_copy.c b/lib/iomap_copy.c
index 351045f4f63c..864fc5ea398c 100644
--- a/lib/iomap_copy.c
+++ b/lib/iomap_copy.c
@@ -40,3 +40,31 @@ void __attribute__((weak)) __iowrite32_copy(void __iomem *to,
 		__raw_writel(*src++, dst++);
 }
 EXPORT_SYMBOL_GPL(__iowrite32_copy);
+
+/**
+ * __iowrite64_copy - copy data to MMIO space, in 64-bit or 32-bit units
+ * @to: destination, in MMIO space (must be 64-bit aligned)
+ * @from: source (must be 64-bit aligned)
+ * @count: number of 64-bit quantities to copy
+ *
+ * Copy data from kernel space to MMIO space, in units of 32 or 64 bits at a
+ * time.  Order of access is not guaranteed, nor is a memory barrier
+ * performed afterwards.
+ */
+void __attribute__((weak)) __iowrite64_copy(void __iomem *to,
+					    const void *from,
+					    size_t count)
+{
+#ifdef CONFIG_64BIT
+	u64 __iomem *dst = to;
+	const u64 *src = from;
+	const u64 *end = src + count;
+
+	while (src < end)
+		__raw_writeq(*src++, dst++);
+#else
+	__iowrite32_copy(to, from, count * 2);
+#endif
+}
+
+EXPORT_SYMBOL_GPL(__iowrite64_copy);
-- 
cgit v1.2.3


From 1e92a550e80fef01ebcc0bcd0896109cdb986c72 Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@samba.org>
Date: Thu, 15 Jun 2006 14:11:22 +1000
Subject: [POWERPC] Fix mdelay badness on shared processor partitions

On partitioned PPC64 systems where a partition is given 1/10 of a
processor, we have seen mdelay() delaying for 10 times longer than it
should.  The reason is that the generic mdelay(n) does n delays of 1
millisecond each.  However, with 1/10 of a processor, we only get a
one-millisecond timeslice every 10ms.  Thus each 1 millisecond delay
loop ends up taking 10ms elapsed time.

The solution is just to use the PPC64 udelay function, which uses the
timebase to ensure that the delay is based on elapsed time rather than
how much processing time the partition has been given.  (Yes, the
generic mdelay uses the PPC64 udelay, but the problem is that the
start time gets reset every millisecond, and each time it gets reset
we lose another 9ms.)

Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Paul Mackerras <paulus@samba.org>
Acked-by: Andrew Morton <akpm@osdl.org>
---
 include/asm-powerpc/delay.h | 13 +++++++++++++
 include/linux/delay.h       |  5 +----
 2 files changed, 14 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/asm-powerpc/delay.h b/include/asm-powerpc/delay.h
index 057a60955474..f9200a65c632 100644
--- a/include/asm-powerpc/delay.h
+++ b/include/asm-powerpc/delay.h
@@ -17,5 +17,18 @@
 extern void __delay(unsigned long loops);
 extern void udelay(unsigned long usecs);
 
+/*
+ * On shared processor machines the generic implementation of mdelay can
+ * result in large errors. While each iteration of the loop inside mdelay
+ * is supposed to take 1ms, the hypervisor could sleep our partition for
+ * longer (eg 10ms). With the right timing these errors can add up.
+ *
+ * Since there is no 32bit overflow issue on 64bit kernels, just call
+ * udelay directly.
+ */
+#ifdef CONFIG_PPC64
+#define mdelay(n)	udelay((n) * 1000)
+#endif
+
 #endif /* __KERNEL__ */
 #endif /* _ASM_POWERPC_DELAY_H */
diff --git a/include/linux/delay.h b/include/linux/delay.h
index acb74865b973..17ddb55430ae 100644
--- a/include/linux/delay.h
+++ b/include/linux/delay.h
@@ -25,10 +25,7 @@ extern unsigned long loops_per_jiffy;
 #define MAX_UDELAY_MS	5
 #endif
 
-#ifdef notdef
-#define mdelay(n) (\
-	{unsigned long __ms=(n); while (__ms--) udelay(1000);})
-#else
+#ifndef mdelay
 #define mdelay(n) (\
 	(__builtin_constant_p(n) && (n)<=MAX_UDELAY_MS) ? udelay((n)*1000) : \
 	({unsigned long __ms=(n); while (__ms--) udelay(1000);}))
-- 
cgit v1.2.3


From c34b4c734482dda750deb6089521f7c891b48736 Mon Sep 17 00:00:00 2001
From: Brice Goglin <brice@myri.com>
Date: Tue, 9 May 2006 10:52:09 -0700
Subject: [PATCH] PCI: Add PCI_CAP_ID_VNDR

Add the vendor-specific extended capability PCI_CAP_ID_VNDR.  It will be
used by the Myri-10G Ethernet driver (will be submitted soon).

Signed-off-by: Brice Goglin <brice@myri.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 include/linux/pci_regs.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/pci_regs.h b/include/linux/pci_regs.h
index d27a78b71297..6bce4a240364 100644
--- a/include/linux/pci_regs.h
+++ b/include/linux/pci_regs.h
@@ -197,6 +197,7 @@
 #define  PCI_CAP_ID_CHSWP	0x06	/* CompactPCI HotSwap */
 #define  PCI_CAP_ID_PCIX	0x07	/* PCI-X */
 #define  PCI_CAP_ID_HT_IRQCONF	0x08	/* HyperTransport IRQ Configuration */
+#define  PCI_CAP_ID_VNDR	0x09	/* Vendor specific capability */
 #define  PCI_CAP_ID_SHPC 	0x0C	/* PCI Standard Hot-Plug Controller */
 #define  PCI_CAP_ID_EXP 	0x10	/* PCI Express */
 #define  PCI_CAP_ID_MSIX	0x11	/* MSI-X */
-- 
cgit v1.2.3


From 75acfecaa031c0e1bc412cee4fe58ba49ff3406c Mon Sep 17 00:00:00 2001
From: Kumar Gala <galak@kernel.crashing.org>
Date: Mon, 1 May 2006 10:43:46 -0500
Subject: [PATCH] PCI: Add pci_assign_resource_fixed -- allow fixed address
 assignments

PCI: Add pci_assign_resource_fixed -- allow fixed address assignments

On some embedded systems the PCI address for hotplug devices are not only
known a priori but are required to be at a given PCI address for other
master in the system to be able to access.

An example of such a system would be an FPGA which is setup from user space
after the system has booted.  The FPGA may be access by DSPs in the system
and those DSPs expect the FPGA at a fixed PCI address.

Added pci_assign_resource_fixed() as a way to allow assignment of the PCI
devices's BARs at fixed PCI addresses.

Signed-off-by: Kumar Gala <galak@kernel.crashing.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/pci/setup-res.c | 40 ++++++++++++++++++++++++++++++++++++++++
 include/linux/pci.h     |  1 +
 2 files changed, 41 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/pci/setup-res.c b/drivers/pci/setup-res.c
index ea9277b7f899..577f4b55c46d 100644
--- a/drivers/pci/setup-res.c
+++ b/drivers/pci/setup-res.c
@@ -155,6 +155,46 @@ int pci_assign_resource(struct pci_dev *dev, int resno)
 	return ret;
 }
 
+#ifdef CONFIG_EMBEDDED
+int pci_assign_resource_fixed(struct pci_dev *dev, int resno)
+{
+	struct pci_bus *bus = dev->bus;
+	struct resource *res = dev->resource + resno;
+	unsigned int type_mask;
+	int i, ret = -EBUSY;
+
+	type_mask = IORESOURCE_IO | IORESOURCE_MEM | IORESOURCE_PREFETCH;
+
+	for (i = 0; i < PCI_BUS_NUM_RESOURCES; i++) {
+		struct resource *r = bus->resource[i];
+		if (!r)
+			continue;
+
+		/* type_mask must match */
+		if ((res->flags ^ r->flags) & type_mask)
+			continue;
+
+		ret = request_resource(r, res);
+
+		if (ret == 0)
+			break;
+	}
+
+	if (ret) {
+		printk(KERN_ERR "PCI: Failed to allocate %s resource "
+				"#%d:%llx@%llx for %s\n",
+			res->flags & IORESOURCE_IO ? "I/O" : "mem",
+			resno, (unsigned long long)(res->end - res->start + 1),
+			(unsigned long long)res->start, pci_name(dev));
+	} else if (resno < PCI_BRIDGE_RESOURCES) {
+		pci_update_resource(dev, res, resno);
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(pci_assign_resource_fixed);
+#endif
+
 /* Sort resources by alignment */
 void __devinit
 pdev_sort_resources(struct pci_dev *dev, struct resource_list *head)
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 6c4bc773f7b7..b9eb9b021d6a 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -496,6 +496,7 @@ int pci_set_dma_mask(struct pci_dev *dev, u64 mask);
 int pci_set_consistent_dma_mask(struct pci_dev *dev, u64 mask);
 void pci_update_resource(struct pci_dev *dev, struct resource *res, int resno);
 int pci_assign_resource(struct pci_dev *dev, int i);
+int pci_assign_resource_fixed(struct pci_dev *dev, int i);
 void pci_restore_bars(struct pci_dev *dev);
 
 /* ROM control related routines */
-- 
cgit v1.2.3


From bd8481e1646d7649fa101ee57a5139b9da3c2436 Mon Sep 17 00:00:00 2001
From: Doug Thompson <norsk5@yahoo.com>
Date: Mon, 8 May 2006 17:06:09 -0700
Subject: [PATCH] PCI Bus Parity Status-broken hardware attribute, EDAC
 foundation

Currently, the EDAC (error detection and correction) modules that are in
the kernel contain some features that need to be moved. After some good
feedback on the PCI Parity detection code and interface
(http://www.ussg.iu.edu/hypermail/linux/kernel/0603.1/0897.html) this
patch ADDs an new attribute to the pci_dev structure: Namely the
'broken_parity_status' bit.

When set this indicates that the respective hardware generates false
positives of Parity errors.

The EDAC "blacklist" solution was inferior and will be removed in a
future patch.

Also in this patch is a PCI quirk.c entry for an Infiniband PCI-X card
which generates false positive parity errors.

I am requesting comments on this AND on the possibility of a exposing
this 'broken_parity_status' bit to userland via the PCI device sysfs
directory for devices. This access would allow for enabling of this
feature on new devices and for old devices that have their drivers
updated. (SLES 9 SP3 did this on an ATI motherboard video device). There
is a need to update such a PCI attribute between kernel releases.

This patch just adds a storage place for the attribute and a quirk entry
for a known bad PCI device. PCI Parity reaper/harvestor operations are
in EDAC itself and will be refactored to use this PCI attribute instead
of its own mechanisms (which are currently disabled) in the future.

Signed-off-by: Doug Thompson <norsk5@xmission.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/pci/quirks.c    | 11 +++++++++++
 include/linux/pci.h     |  1 +
 include/linux/pci_ids.h |  1 +
 3 files changed, 13 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c
index d378478612fb..ea48e969a12e 100644
--- a/drivers/pci/quirks.c
+++ b/drivers/pci/quirks.c
@@ -24,6 +24,17 @@
 #include <linux/acpi.h>
 #include "pci.h"
 
+/* The Mellanox Tavor device gives false positive parity errors
+ * Mark this device with a broken_parity_status, to allow
+ * PCI scanning code to "skip" this now blacklisted device.
+ */
+static void __devinit quirk_mellanox_tavor(struct pci_dev *dev)
+{
+	dev->broken_parity_status = 1;	/* This device gives false positives */
+}
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_MELLANOX,PCI_DEVICE_ID_MELLANOX_TAVOR,quirk_mellanox_tavor);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_MELLANOX,PCI_DEVICE_ID_MELLANOX_TAVOR_BRIDGE,quirk_mellanox_tavor);
+
 /* Deal with broken BIOS'es that neglect to enable passive release,
    which can cause problems in combination with the 82441FX/PPro MTRRs */
 static void __devinit quirk_passive_release(struct pci_dev *dev)
diff --git a/include/linux/pci.h b/include/linux/pci.h
index b9eb9b021d6a..91c37750cd34 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -162,6 +162,7 @@ struct pci_dev {
 	unsigned int	is_busmaster:1; /* device is busmaster */
 	unsigned int	no_msi:1;	/* device may not use msi */
 	unsigned int	block_ucfg_access:1;	/* userspace config space access is blocked */
+	unsigned int	broken_parity_status:1;	/* Device generates false positive parity */
 
 	u32		saved_config_space[16]; /* config space saved at suspend time */
 	struct hlist_head saved_cap_space;
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index bcfe9d4f56ae..3d197cdcfa3a 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -1946,6 +1946,7 @@
 
 #define PCI_VENDOR_ID_MELLANOX		0x15b3
 #define PCI_DEVICE_ID_MELLANOX_TAVOR	0x5a44
+#define PCI_DEVICE_ID_MELLANOX_TAVOR_BRIDGE	0x5a46
 #define PCI_DEVICE_ID_MELLANOX_ARBEL_COMPAT 0x6278
 #define PCI_DEVICE_ID_MELLANOX_ARBEL	0x6282
 #define PCI_DEVICE_ID_MELLANOX_SINAI_OLD 0x5e8c
-- 
cgit v1.2.3


From 74d0a988d3aa359b6b8a8536c8cb92cce02ca5d5 Mon Sep 17 00:00:00 2001
From: Brent Casavant <bcasavan@sgi.com>
Date: Wed, 10 May 2006 01:49:14 -0700
Subject: [PATCH] PCI: Move various PCI IDs to header file

Move various QLogic, Vitesse, and Intel storage controller PCI IDs to the
main header file.

Signed-off-by: Brent Casavant <bcasavan@sgi.com>
Acked-by: Jes Sorensen <jes@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/scsi/qla1280.c  | 24 ------------------------
 drivers/scsi/sata_vsc.c | 11 ++++++-----
 include/linux/pci_ids.h |  9 +++++++++
 3 files changed, 15 insertions(+), 29 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/scsi/qla1280.c b/drivers/scsi/qla1280.c
index 77bb2351500c..680f6063954b 100644
--- a/drivers/scsi/qla1280.c
+++ b/drivers/scsi/qla1280.c
@@ -397,30 +397,6 @@
 #include "ql1280_fw.h"
 #include "ql1040_fw.h"
 
-
-/*
- * Missing PCI ID's
- */
-#ifndef PCI_DEVICE_ID_QLOGIC_ISP1080
-#define PCI_DEVICE_ID_QLOGIC_ISP1080	0x1080
-#endif
-#ifndef PCI_DEVICE_ID_QLOGIC_ISP1240
-#define PCI_DEVICE_ID_QLOGIC_ISP1240	0x1240
-#endif
-#ifndef PCI_DEVICE_ID_QLOGIC_ISP1280
-#define PCI_DEVICE_ID_QLOGIC_ISP1280	0x1280
-#endif
-#ifndef PCI_DEVICE_ID_QLOGIC_ISP10160
-#define PCI_DEVICE_ID_QLOGIC_ISP10160	0x1016
-#endif
-#ifndef PCI_DEVICE_ID_QLOGIC_ISP12160
-#define PCI_DEVICE_ID_QLOGIC_ISP12160	0x1216
-#endif
-
-#ifndef PCI_VENDOR_ID_AMI
-#define PCI_VENDOR_ID_AMI               0x101e
-#endif
-
 #ifndef BITS_PER_LONG
 #error "BITS_PER_LONG not defined!"
 #endif
diff --git a/drivers/scsi/sata_vsc.c b/drivers/scsi/sata_vsc.c
index 8a29ce340b47..27d658704cf9 100644
--- a/drivers/scsi/sata_vsc.c
+++ b/drivers/scsi/sata_vsc.c
@@ -433,13 +433,14 @@ err_out:
 
 
 /*
- * 0x1725/0x7174 is the Vitesse VSC-7174
- * 0x8086/0x3200 is the Intel 31244, which is supposed to be identical
- * compatibility is untested as of yet
+ * Intel 31244 is supposed to be identical.
+ * Compatibility is untested as of yet.
  */
 static const struct pci_device_id vsc_sata_pci_tbl[] = {
-	{ 0x1725, 0x7174, PCI_ANY_ID, PCI_ANY_ID, 0x10600, 0xFFFFFF, 0 },
-	{ 0x8086, 0x3200, PCI_ANY_ID, PCI_ANY_ID, 0x10600, 0xFFFFFF, 0 },
+	{ PCI_VENDOR_ID_VITESSE, PCI_DEVICE_ID_VITESSE_VSC7174,
+	  PCI_ANY_ID, PCI_ANY_ID, 0x10600, 0xFFFFFF, 0 },
+	{ PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_GD31244,
+	  PCI_ANY_ID, PCI_ANY_ID, 0x10600, 0xFFFFFF, 0 },
 	{ }
 };
 
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 3d197cdcfa3a..e526e7b5ea47 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -848,7 +848,12 @@
 
 
 #define PCI_VENDOR_ID_QLOGIC		0x1077
+#define PCI_DEVICE_ID_QLOGIC_ISP10160	0x1016
 #define PCI_DEVICE_ID_QLOGIC_ISP1020	0x1020
+#define PCI_DEVICE_ID_QLOGIC_ISP1080	0x1080
+#define PCI_DEVICE_ID_QLOGIC_ISP12160	0x1216
+#define PCI_DEVICE_ID_QLOGIC_ISP1240	0x1240
+#define PCI_DEVICE_ID_QLOGIC_ISP1280	0x1280
 #define PCI_DEVICE_ID_QLOGIC_ISP2100	0x2100
 #define PCI_DEVICE_ID_QLOGIC_ISP2200	0x2200
 #define PCI_DEVICE_ID_QLOGIC_ISP2300	0x2300
@@ -1970,6 +1975,9 @@
 #define PCI_VENDOR_ID_NETCELL		0x169c
 #define PCI_DEVICE_ID_REVOLUTION	0x0044
 
+#define PCI_VENDOR_ID_VITESSE		0x1725
+#define PCI_DEVICE_ID_VITESSE_VSC7174	0x7174
+
 #define PCI_VENDOR_ID_LINKSYS		0x1737
 #define PCI_DEVICE_ID_LINKSYS_EG1064	0x1064
 
@@ -2149,6 +2157,7 @@
 #define PCI_DEVICE_ID_INTEL_ICH8_4	0x2815
 #define PCI_DEVICE_ID_INTEL_ICH8_5	0x283e
 #define PCI_DEVICE_ID_INTEL_ICH8_6	0x2850
+#define PCI_DEVICE_ID_INTEL_GD31244	0x3200
 #define PCI_DEVICE_ID_INTEL_82855PM_HB	0x3340
 #define PCI_DEVICE_ID_INTEL_82830_HB	0x3575
 #define PCI_DEVICE_ID_INTEL_82830_CGC	0x3577
-- 
cgit v1.2.3


From 99dc804d9bcc2c53f4c20c291bf4e185312a1a0c Mon Sep 17 00:00:00 2001
From: Shaohua Li <shaohua.li@intel.com>
Date: Fri, 26 May 2006 10:58:27 +0800
Subject: [PATCH] PCI: disable msi mode in pci_disable_device

Brice said the pci_save_msi_state breaks his driver in his special usage
(not in suspend/resume), as pci_save_msi_state will disable msi mode. In
his usage, pci_save_state will be called at runtime, and later (after
the device operates for some time and has an error) pci_restore_state
will be called.
In another hand, suspend/resume needs disable msi mode, as device should
stop working completely. This patch try to workaround this issue.
Drivers are expected call pci_disable_device in suspend time after
pci_save_state.

Signed-off-by: Shaohua Li <shaohua.li@intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/pci/msi.c   | 6 ++++--
 drivers/pci/pci.c   | 9 ++++++++-
 include/linux/pci.h | 2 ++
 3 files changed, 14 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index 9c69b6966e79..3ec558dc6523 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -453,9 +453,11 @@ static void enable_msi_mode(struct pci_dev *dev, int pos, int type)
 		/* Set enabled bits to single MSI & enable MSI_enable bit */
 		msi_enable(control, 1);
 		pci_write_config_word(dev, msi_control_reg(pos), control);
+		dev->msi_enabled = 1;
 	} else {
 		msix_enable(control);
 		pci_write_config_word(dev, msi_control_reg(pos), control);
+		dev->msix_enabled = 1;
 	}
     	if (pci_find_capability(dev, PCI_CAP_ID_EXP)) {
 		/* PCI Express Endpoint device detected */
@@ -472,9 +474,11 @@ void disable_msi_mode(struct pci_dev *dev, int pos, int type)
 		/* Set enabled bits to single MSI & enable MSI_enable bit */
 		msi_disable(control);
 		pci_write_config_word(dev, msi_control_reg(pos), control);
+		dev->msi_enabled = 0;
 	} else {
 		msix_disable(control);
 		pci_write_config_word(dev, msi_control_reg(pos), control);
+		dev->msix_enabled = 0;
 	}
     	if (pci_find_capability(dev, PCI_CAP_ID_EXP)) {
 		/* PCI Express Endpoint device detected */
@@ -549,7 +553,6 @@ int pci_save_msi_state(struct pci_dev *dev)
 		pci_read_config_dword(dev, pos + PCI_MSI_DATA_32, &cap[i++]);
 	if (control & PCI_MSI_FLAGS_MASKBIT)
 		pci_read_config_dword(dev, pos + PCI_MSI_MASK_BIT, &cap[i++]);
-	disable_msi_mode(dev, pos, PCI_CAP_ID_MSI);
 	save_state->cap_nr = PCI_CAP_ID_MSI;
 	pci_add_saved_cap(dev, save_state);
 	return 0;
@@ -639,7 +642,6 @@ int pci_save_msix_state(struct pci_dev *dev)
 	}
 	dev->irq = temp;
 
-	disable_msi_mode(dev, pos, PCI_CAP_ID_MSIX);
 	save_state->cap_nr = PCI_CAP_ID_MSIX;
 	pci_add_saved_cap(dev, save_state);
 	return 0;
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index aa480370ef10..d408a3c30426 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -551,7 +551,14 @@ void
 pci_disable_device(struct pci_dev *dev)
 {
 	u16 pci_command;
-	
+
+	if (dev->msi_enabled)
+		disable_msi_mode(dev, pci_find_capability(dev, PCI_CAP_ID_MSI),
+			PCI_CAP_ID_MSI);
+	if (dev->msix_enabled)
+		disable_msi_mode(dev, pci_find_capability(dev, PCI_CAP_ID_MSI),
+			PCI_CAP_ID_MSIX);
+
 	pci_read_config_word(dev, PCI_COMMAND, &pci_command);
 	if (pci_command & PCI_COMMAND_MASTER) {
 		pci_command &= ~PCI_COMMAND_MASTER;
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 91c37750cd34..62a8c22f5f60 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -163,6 +163,8 @@ struct pci_dev {
 	unsigned int	no_msi:1;	/* device may not use msi */
 	unsigned int	block_ucfg_access:1;	/* userspace config space access is blocked */
 	unsigned int	broken_parity_status:1;	/* Device generates false positive parity */
+	unsigned int 	msi_enabled:1;
+	unsigned int	msix_enabled:1;
 
 	u32		saved_config_space[16]; /* config space saved at suspend time */
 	struct hlist_head saved_cap_space;
-- 
cgit v1.2.3


From cf34a8e07f02c76f3f1232eecb681301a3d7b10b Mon Sep 17 00:00:00 2001
From: Brice Goglin <brice@myri.com>
Date: Tue, 13 Jun 2006 14:35:42 -0400
Subject: [PATCH] PCI: nVidia quirk to make AER PCI-E extended capability
 visible

The nVidia CK804 PCI-E chipset supports the AER extended capability
but sometimes fails to link it (with some BIOS or after a warm reboot).
It makes the AER cap invisible to pci_find_ext_capability().

The patch adds a quirk to set the missing bit that controls the
linking of the capability.
By the way, it removes the corresponding code in the myri10ge driver.

Signed-off-by: Brice Goglin <brice@myri.com>
Signed-off-by: Loic Prylli <loic@myri.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/pci/quirks.c    | 19 +++++++++++++++++++
 include/linux/pci_ids.h |  1 +
 2 files changed, 20 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c
index aea4d49bcce3..4364d793f73b 100644
--- a/drivers/pci/quirks.c
+++ b/drivers/pci/quirks.c
@@ -1499,6 +1499,25 @@ static void __devinit quirk_p64h2_1k_io(struct pci_dev *dev)
 }
 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL,	0x1460,		quirk_p64h2_1k_io);
 
+/* Under some circumstances, AER is not linked with extended capabilities.
+ * Force it to be linked by setting the corresponding control bit in the
+ * config space.
+ */
+static void __devinit quirk_nvidia_ck804_pcie_aer_ext_cap(struct pci_dev *dev)
+{
+	uint8_t b;
+	if (pci_read_config_byte(dev, 0xf41, &b) == 0) {
+		if (!(b & 0x20)) {
+			pci_write_config_byte(dev, 0xf41, b | 0x20);
+			printk(KERN_INFO
+			       "PCI: Linking AER extended capability on %s\n",
+			       pci_name(dev));
+		}
+	}
+}
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_NVIDIA,  PCI_DEVICE_ID_NVIDIA_CK804_PCIE,
+			quirk_nvidia_ck804_pcie_aer_ext_cap);
+
 EXPORT_SYMBOL(pcie_mch_quirk);
 #ifdef CONFIG_HOTPLUG
 EXPORT_SYMBOL(pci_fixup_device);
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index e526e7b5ea47..fd54a9d4c3d4 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -1023,6 +1023,7 @@
 #define PCI_DEVICE_ID_NVIDIA_NVENET_8		0x0056
 #define PCI_DEVICE_ID_NVIDIA_NVENET_9		0x0057
 #define PCI_DEVICE_ID_NVIDIA_CK804_AUDIO	0x0059
+#define PCI_DEVICE_ID_NVIDIA_CK804_PCIE		0x005d
 #define PCI_DEVICE_ID_NVIDIA_NFORCE2_SMBUS	0x0064
 #define PCI_DEVICE_ID_NVIDIA_NFORCE2_IDE	0x0065
 #define PCI_DEVICE_ID_NVIDIA_NVENET_2		0x0066
-- 
cgit v1.2.3


From 1cdcb6b43fda7424b7435dac8f80b2b5d8a48899 Mon Sep 17 00:00:00 2001
From: Hansjoerg Lipp <hjlipp@web.de>
Date: Sat, 22 Apr 2006 18:36:53 +0200
Subject: [PATCH] TTY: return class device pointer from tty_register_device()

Let tty_register_device() return a pointer to the class device it creates.
This allows registrants to add their own sysfs files under the class
device node.

Signed-off-by: Hansjoerg Lipp <hjlipp@web.de>
Signed-off-by: Tilman Schmidt <tilman@imap.cc>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/char/tty_io.c | 11 +++++++----
 include/linux/tty.h   |  4 +++-
 2 files changed, 10 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/char/tty_io.c b/drivers/char/tty_io.c
index a88b94a82b14..8b2a59969868 100644
--- a/drivers/char/tty_io.c
+++ b/drivers/char/tty_io.c
@@ -2961,12 +2961,14 @@ static struct class *tty_class;
  *	This field is optional, if there is no known struct device for this
  *	tty device it can be set to NULL safely.
  *
+ * Returns a pointer to the class device (or ERR_PTR(-EFOO) on error).
+ *
  * This call is required to be made to register an individual tty device if
  * the tty driver's flags have the TTY_DRIVER_NO_DEVFS bit set.  If that
  * bit is not set, this function should not be called.
  */
-void tty_register_device(struct tty_driver *driver, unsigned index,
-			 struct device *device)
+struct class_device *tty_register_device(struct tty_driver *driver,
+					 unsigned index, struct device *device)
 {
 	char name[64];
 	dev_t dev = MKDEV(driver->major, driver->minor_start) + index;
@@ -2974,7 +2976,7 @@ void tty_register_device(struct tty_driver *driver, unsigned index,
 	if (index >= driver->num) {
 		printk(KERN_ERR "Attempt to register invalid tty line number "
 		       " (%d).\n", index);
-		return;
+		return ERR_PTR(-EINVAL);
 	}
 
 	devfs_mk_cdev(dev, S_IFCHR | S_IRUSR | S_IWUSR,
@@ -2984,7 +2986,8 @@ void tty_register_device(struct tty_driver *driver, unsigned index,
 		pty_line_name(driver, index, name);
 	else
 		tty_line_name(driver, index, name);
-	class_device_create(tty_class, NULL, dev, device, "%s", name);
+
+	return class_device_create(tty_class, NULL, dev, device, "%s", name);
 }
 
 /**
diff --git a/include/linux/tty.h b/include/linux/tty.h
index e898eeb94166..cb35ca50a0a6 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -290,7 +290,9 @@ extern int tty_register_ldisc(int disc, struct tty_ldisc *new_ldisc);
 extern int tty_unregister_ldisc(int disc);
 extern int tty_register_driver(struct tty_driver *driver);
 extern int tty_unregister_driver(struct tty_driver *driver);
-extern void tty_register_device(struct tty_driver *driver, unsigned index, struct device *dev);
+extern struct class_device *tty_register_device(struct tty_driver *driver,
+						unsigned index,
+						struct device *dev);
 extern void tty_unregister_device(struct tty_driver *driver, unsigned index);
 extern int tty_read_raw_data(struct tty_struct *tty, unsigned char *bufp,
 			     int buflen);
-- 
cgit v1.2.3


From 1740757e8f94c6899705eb6f5434de9404992778 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@suse.de>
Date: Tue, 2 May 2006 16:59:59 +0200
Subject: [PATCH] Driver Core: remove unused exports

Cc: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/base/attribute_container.c | 8 --------
 drivers/base/base.h                | 2 ++
 drivers/base/bus.c                 | 6 ------
 drivers/base/class.c               | 6 ++----
 include/linux/device.h             | 8 --------
 5 files changed, 4 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/attribute_container.c b/drivers/base/attribute_container.c
index 2a7d7ae83e1e..22220733f76f 100644
--- a/drivers/base/attribute_container.c
+++ b/drivers/base/attribute_container.c
@@ -236,7 +236,6 @@ attribute_container_remove_device(struct device *dev,
 	}
 	up(&attribute_container_mutex);
 }
-EXPORT_SYMBOL_GPL(attribute_container_remove_device);
 
 /**
  * attribute_container_device_trigger - execute a trigger for each matching classdev
@@ -276,7 +275,6 @@ attribute_container_device_trigger(struct device *dev,
 	}
 	up(&attribute_container_mutex);
 }
-EXPORT_SYMBOL_GPL(attribute_container_device_trigger);
 
 /**
  * attribute_container_trigger - trigger a function for each matching container
@@ -304,7 +302,6 @@ attribute_container_trigger(struct device *dev,
 	}
 	up(&attribute_container_mutex);
 }
-EXPORT_SYMBOL_GPL(attribute_container_trigger);
 
 /**
  * attribute_container_add_attrs - add attributes
@@ -333,7 +330,6 @@ attribute_container_add_attrs(struct class_device *classdev)
 
 	return 0;
 }
-EXPORT_SYMBOL_GPL(attribute_container_add_attrs);
 
 /**
  * attribute_container_add_class_device - same function as class_device_add
@@ -352,7 +348,6 @@ attribute_container_add_class_device(struct class_device *classdev)
 		return error;
 	return attribute_container_add_attrs(classdev);
 }
-EXPORT_SYMBOL_GPL(attribute_container_add_class_device);
 
 /**
  * attribute_container_add_class_device_adapter - simple adapter for triggers
@@ -367,7 +362,6 @@ attribute_container_add_class_device_adapter(struct attribute_container *cont,
 {
 	return attribute_container_add_class_device(classdev);
 }
-EXPORT_SYMBOL_GPL(attribute_container_add_class_device_adapter);
 
 /**
  * attribute_container_remove_attrs - remove any attribute files
@@ -389,7 +383,6 @@ attribute_container_remove_attrs(struct class_device *classdev)
 	for (i = 0; attrs[i]; i++)
 		class_device_remove_file(classdev, attrs[i]);
 }
-EXPORT_SYMBOL_GPL(attribute_container_remove_attrs);
 
 /**
  * attribute_container_class_device_del - equivalent of class_device_del
@@ -405,7 +398,6 @@ attribute_container_class_device_del(struct class_device *classdev)
 	attribute_container_remove_attrs(classdev);
 	class_device_del(classdev);
 }
-EXPORT_SYMBOL_GPL(attribute_container_class_device_del);
 
 /**
  * attribute_container_find_class_device - find the corresponding class_device
diff --git a/drivers/base/base.h b/drivers/base/base.h
index bbbc2acd921c..122498aef50b 100644
--- a/drivers/base/base.h
+++ b/drivers/base/base.h
@@ -13,6 +13,8 @@ extern int attribute_container_init(void);
 extern int bus_add_device(struct device * dev);
 extern void bus_attach_device(struct device * dev);
 extern void bus_remove_device(struct device * dev);
+extern struct bus_type *get_bus(struct bus_type * bus);
+extern void put_bus(struct bus_type * bus);
 
 extern int bus_add_driver(struct device_driver *);
 extern void bus_remove_driver(struct device_driver *);
diff --git a/drivers/base/bus.c b/drivers/base/bus.c
index b27a6067e5a4..64ba9011d1a8 100644
--- a/drivers/base/bus.c
+++ b/drivers/base/bus.c
@@ -745,15 +745,9 @@ EXPORT_SYMBOL_GPL(bus_for_each_dev);
 EXPORT_SYMBOL_GPL(bus_find_device);
 EXPORT_SYMBOL_GPL(bus_for_each_drv);
 
-EXPORT_SYMBOL_GPL(bus_add_device);
-EXPORT_SYMBOL_GPL(bus_attach_device);
-EXPORT_SYMBOL_GPL(bus_remove_device);
 EXPORT_SYMBOL_GPL(bus_register);
 EXPORT_SYMBOL_GPL(bus_unregister);
 EXPORT_SYMBOL_GPL(bus_rescan_devices);
-EXPORT_SYMBOL_GPL(get_bus);
-EXPORT_SYMBOL_GPL(put_bus);
-EXPORT_SYMBOL_GPL(find_bus);
 
 EXPORT_SYMBOL_GPL(bus_create_file);
 EXPORT_SYMBOL_GPL(bus_remove_file);
diff --git a/drivers/base/class.c b/drivers/base/class.c
index 48ad5df72812..4b598be0e4b6 100644
--- a/drivers/base/class.c
+++ b/drivers/base/class.c
@@ -91,14 +91,14 @@ void class_remove_file(struct class * cls, const struct class_attribute * attr)
 		sysfs_remove_file(&cls->subsys.kset.kobj, &attr->attr);
 }
 
-struct class * class_get(struct class * cls)
+static struct class *class_get(struct class *cls)
 {
 	if (cls)
 		return container_of(subsys_get(&cls->subsys), struct class, subsys);
 	return NULL;
 }
 
-void class_put(struct class * cls)
+static void class_put(struct class * cls)
 {
 	if (cls)
 		subsys_put(&cls->subsys);
@@ -894,8 +894,6 @@ EXPORT_SYMBOL_GPL(class_create_file);
 EXPORT_SYMBOL_GPL(class_remove_file);
 EXPORT_SYMBOL_GPL(class_register);
 EXPORT_SYMBOL_GPL(class_unregister);
-EXPORT_SYMBOL_GPL(class_get);
-EXPORT_SYMBOL_GPL(class_put);
 EXPORT_SYMBOL_GPL(class_create);
 EXPORT_SYMBOL_GPL(class_destroy);
 
diff --git a/include/linux/device.h b/include/linux/device.h
index b2e5da2b637b..ade10dd6b779 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -60,11 +60,6 @@ extern void bus_unregister(struct bus_type * bus);
 
 extern void bus_rescan_devices(struct bus_type * bus);
 
-extern struct bus_type * get_bus(struct bus_type * bus);
-extern void put_bus(struct bus_type * bus);
-
-extern struct bus_type * find_bus(char * name);
-
 /* iterator helpers for buses */
 
 int bus_for_each_dev(struct bus_type * bus, struct device * start, void * data,
@@ -163,9 +158,6 @@ struct class {
 extern int class_register(struct class *);
 extern void class_unregister(struct class *);
 
-extern struct class * class_get(struct class *);
-extern void class_put(struct class *);
-
 
 struct class_attribute {
 	struct attribute	attr;
-- 
cgit v1.2.3


From 670dd90d81f60ef429cbba54ad235e9207f4d444 Mon Sep 17 00:00:00 2001
From: Shaohua Li <shaohua.li@intel.com>
Date: Mon, 8 May 2006 13:45:57 +0800
Subject: [PATCH] Driver Core: Allow sysdev_class have attributes

allow sysdev_class adding attribute. Next patch will use the new API to
add an attribute under /sys/device/system/cpu/.

Signed-off-by: Shaohua Li <shaohua.li@intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/base/sys.c     | 51 +++++++++++++++++++++++++++++++++++++++++++++++++-
 include/linux/sysdev.h | 18 +++++++++++++++++-
 2 files changed, 67 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/sys.c b/drivers/base/sys.c
index 6fc23ab127bd..6858178b3aff 100644
--- a/drivers/base/sys.c
+++ b/drivers/base/sys.c
@@ -80,10 +80,59 @@ void sysdev_remove_file(struct sys_device * s, struct sysdev_attribute * a)
 EXPORT_SYMBOL_GPL(sysdev_create_file);
 EXPORT_SYMBOL_GPL(sysdev_remove_file);
 
+#define to_sysdev_class(k) container_of(k, struct sysdev_class, kset.kobj)
+#define to_sysdev_class_attr(a) container_of(a, \
+	struct sysdev_class_attribute, attr)
+
+static ssize_t sysdev_class_show(struct kobject *kobj, struct attribute *attr,
+				 char *buffer)
+{
+	struct sysdev_class * class = to_sysdev_class(kobj);
+	struct sysdev_class_attribute *class_attr = to_sysdev_class_attr(attr);
+
+	if (class_attr->show)
+		return class_attr->show(class, buffer);
+	return -EIO;
+}
+
+static ssize_t sysdev_class_store(struct kobject *kobj, struct attribute *attr,
+				  const char *buffer, size_t count)
+{
+	struct sysdev_class * class = to_sysdev_class(kobj);
+	struct sysdev_class_attribute * class_attr = to_sysdev_class_attr(attr);
+
+	if (class_attr->store)
+		return class_attr->store(class, buffer, count);
+	return -EIO;
+}
+
+static struct sysfs_ops sysfs_class_ops = {
+	.show	= sysdev_class_show,
+	.store	= sysdev_class_store,
+};
+
+static struct kobj_type ktype_sysdev_class = {
+	.sysfs_ops	= &sysfs_class_ops,
+};
+
+int sysdev_class_create_file(struct sysdev_class *c,
+			     struct sysdev_class_attribute *a)
+{
+	return sysfs_create_file(&c->kset.kobj, &a->attr);
+}
+EXPORT_SYMBOL_GPL(sysdev_class_create_file);
+
+void sysdev_class_remove_file(struct sysdev_class *c,
+			      struct sysdev_class_attribute *a)
+{
+	sysfs_remove_file(&c->kset.kobj, &a->attr);
+}
+EXPORT_SYMBOL_GPL(sysdev_class_remove_file);
+
 /*
  * declare system_subsys
  */
-static decl_subsys(system, &ktype_sysdev, NULL);
+static decl_subsys(system, &ktype_sysdev_class, NULL);
 
 int sysdev_class_register(struct sysdev_class * cls)
 {
diff --git a/include/linux/sysdev.h b/include/linux/sysdev.h
index 2a4b432e1176..166a2e58c287 100644
--- a/include/linux/sysdev.h
+++ b/include/linux/sysdev.h
@@ -37,11 +37,27 @@ struct sysdev_class {
 	struct kset		kset;
 };
 
+struct sysdev_class_attribute {
+	struct attribute attr;
+	ssize_t (*show)(struct sysdev_class *, char *);
+	ssize_t (*store)(struct sysdev_class *, const char *, size_t);
+};
+
+#define SYSDEV_CLASS_ATTR(_name,_mode,_show,_store) 		\
+struct sysdev_class_attribute attr_##_name = { 			\
+	.attr = {.name = __stringify(_name), .mode = _mode },	\
+	.show	= _show,					\
+	.store	= _store,					\
+};
+
 
 extern int sysdev_class_register(struct sysdev_class *);
 extern void sysdev_class_unregister(struct sysdev_class *);
 
-
+extern int sysdev_class_create_file(struct sysdev_class *,
+	struct sysdev_class_attribute *);
+extern void sysdev_class_remove_file(struct sysdev_class *,
+	struct sysdev_class_attribute *);
 /**
  * Auxillary system device drivers.
  */
-- 
cgit v1.2.3


From 4039483fd3065920f035eed39ec59085421c0a4f Mon Sep 17 00:00:00 2001
From: Michael Holzheu <holzheu@de.ibm.com>
Date: Tue, 9 May 2006 12:53:49 +0200
Subject: [PATCH] Driver Core: Add /sys/hypervisor when needed

To have a home for all hypervisors, this patch creates /sys/hypervisor.
A new config option SYS_HYPERVISOR is introduced, which should to be set
by architecture dependent hypervisors (e.g. s390 or Xen).

Acked-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Michael Holzheu <holzheu@de.ibm.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/base/Kconfig      |  4 ++++
 drivers/base/Makefile     |  1 +
 drivers/base/base.h       |  5 +++++
 drivers/base/hypervisor.c | 19 +++++++++++++++++++
 drivers/base/init.c       |  1 +
 include/linux/kobject.h   |  2 ++
 6 files changed, 32 insertions(+)
 create mode 100644 drivers/base/hypervisor.c

(limited to 'include/linux')

diff --git a/drivers/base/Kconfig b/drivers/base/Kconfig
index f0eff3dac58d..80502dc6ed66 100644
--- a/drivers/base/Kconfig
+++ b/drivers/base/Kconfig
@@ -38,3 +38,7 @@ config DEBUG_DRIVER
 	  If you are unsure about this, say N here.
 
 endmenu
+
+config SYS_HYPERVISOR
+	bool
+	default n
diff --git a/drivers/base/Makefile b/drivers/base/Makefile
index e99471d3232b..659cde6c2fb9 100644
--- a/drivers/base/Makefile
+++ b/drivers/base/Makefile
@@ -9,6 +9,7 @@ obj-$(CONFIG_FW_LOADER)	+= firmware_class.o
 obj-$(CONFIG_NUMA)	+= node.o
 obj-$(CONFIG_MEMORY_HOTPLUG) += memory.o
 obj-$(CONFIG_SMP)	+= topology.o
+obj-$(CONFIG_SYS_HYPERVISOR) += hypervisor.o
 
 ifeq ($(CONFIG_DEBUG_DRIVER),y)
 EXTRA_CFLAGS += -DDEBUG
diff --git a/drivers/base/base.h b/drivers/base/base.h
index 122498aef50b..79115eff6e94 100644
--- a/drivers/base/base.h
+++ b/drivers/base/base.h
@@ -5,6 +5,11 @@ extern int devices_init(void);
 extern int buses_init(void);
 extern int classes_init(void);
 extern int firmware_init(void);
+#ifdef CONFIG_SYS_HYPERVISOR
+extern int hypervisor_init(void);
+#else
+static inline int hypervisor_init(void) { return 0; }
+#endif
 extern int platform_bus_init(void);
 extern int system_bus_init(void);
 extern int cpu_dev_init(void);
diff --git a/drivers/base/hypervisor.c b/drivers/base/hypervisor.c
new file mode 100644
index 000000000000..0c85e9d6a448
--- /dev/null
+++ b/drivers/base/hypervisor.c
@@ -0,0 +1,19 @@
+/*
+ * hypervisor.c - /sys/hypervisor subsystem.
+ *
+ * This file is released under the GPLv2
+ *
+ */
+
+#include <linux/kobject.h>
+#include <linux/device.h>
+
+#include "base.h"
+
+decl_subsys(hypervisor, NULL, NULL);
+EXPORT_SYMBOL_GPL(hypervisor_subsys);
+
+int __init hypervisor_init(void)
+{
+	return subsystem_register(&hypervisor_subsys);
+}
diff --git a/drivers/base/init.c b/drivers/base/init.c
index c648914b9cde..37138154f9e8 100644
--- a/drivers/base/init.c
+++ b/drivers/base/init.c
@@ -27,6 +27,7 @@ void __init driver_init(void)
 	buses_init();
 	classes_init();
 	firmware_init();
+	hypervisor_init();
 
 	/* These are also core pieces, but must come after the
 	 * core core pieces.
diff --git a/include/linux/kobject.h b/include/linux/kobject.h
index c187c53cecd0..2d229327959e 100644
--- a/include/linux/kobject.h
+++ b/include/linux/kobject.h
@@ -190,6 +190,8 @@ struct subsystem _varname##_subsys = { \
 
 /* The global /sys/kernel/ subsystem for people to chain off of */
 extern struct subsystem kernel_subsys;
+/* The global /sys/hypervisor/ subsystem  */
+extern struct subsystem hypervisor_subsys;
 
 /**
  * Helpers for setting the kset of registered objects.
-- 
cgit v1.2.3


From 23681e479129854305da1da32f7f1eaf635ef22c Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@suse.de>
Date: Wed, 14 Jun 2006 12:14:34 -0700
Subject: [PATCH] Driver core: allow struct device to have a dev_t

This is the first step in moving class_device to being replaced by
struct device.  It allows struct device to export a dev_t and makes it
easy to dynamically create and destroy struct device as long as they are
associated with a specific class.

Cc: Kay Sievers <kay.sievers@vrfy.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/base/class.c   |   1 +
 drivers/base/core.c    | 162 ++++++++++++++++++++++++++++++++++++++++++++++++-
 include/linux/device.h |  14 +++++
 3 files changed, 176 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/base/class.c b/drivers/base/class.c
index 41a8e0934e3a..50e841a33af0 100644
--- a/drivers/base/class.c
+++ b/drivers/base/class.c
@@ -142,6 +142,7 @@ int class_register(struct class * cls)
 	pr_debug("device class '%s': registering\n", cls->name);
 
 	INIT_LIST_HEAD(&cls->children);
+	INIT_LIST_HEAD(&cls->devices);
 	INIT_LIST_HEAD(&cls->interfaces);
 	init_MUTEX(&cls->sem);
 	error = kobject_set_name(&cls->subsys.kset.kobj, "%s", cls->name);
diff --git a/drivers/base/core.c b/drivers/base/core.c
index d5e15a03584e..252cf403f891 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -15,6 +15,7 @@
 #include <linux/module.h>
 #include <linux/slab.h>
 #include <linux/string.h>
+#include <linux/kdev_t.h>
 
 #include <asm/semaphore.h>
 
@@ -98,6 +99,8 @@ static int dev_uevent_filter(struct kset *kset, struct kobject *kobj)
 		struct device *dev = to_dev(kobj);
 		if (dev->bus)
 			return 1;
+		if (dev->class)
+			return 1;
 	}
 	return 0;
 }
@@ -106,7 +109,11 @@ static const char *dev_uevent_name(struct kset *kset, struct kobject *kobj)
 {
 	struct device *dev = to_dev(kobj);
 
-	return dev->bus->name;
+	if (dev->bus)
+		return dev->bus->name;
+	if (dev->class)
+		return dev->class->name;
+	return NULL;
 }
 
 static int dev_uevent(struct kset *kset, struct kobject *kobj, char **envp,
@@ -117,6 +124,16 @@ static int dev_uevent(struct kset *kset, struct kobject *kobj, char **envp,
 	int length = 0;
 	int retval = 0;
 
+	/* add the major/minor if present */
+	if (MAJOR(dev->devt)) {
+		add_uevent_var(envp, num_envp, &i,
+			       buffer, buffer_size, &length,
+			       "MAJOR=%u", MAJOR(dev->devt));
+		add_uevent_var(envp, num_envp, &i,
+			       buffer, buffer_size, &length,
+			       "MINOR=%u", MINOR(dev->devt));
+	}
+
 	/* add bus name of physical device */
 	if (dev->bus)
 		add_uevent_var(envp, num_envp, &i,
@@ -161,6 +178,12 @@ static ssize_t store_uevent(struct device *dev, struct device_attribute *attr,
 	return count;
 }
 
+static ssize_t show_dev(struct device *dev, struct device_attribute *attr,
+			char *buf)
+{
+	return print_dev_t(buf, dev->devt);
+}
+
 /*
  *	devices_subsys - structure to be registered with kobject core.
  */
@@ -231,6 +254,7 @@ void device_initialize(struct device *dev)
 	klist_init(&dev->klist_children, klist_children_get,
 		   klist_children_put);
 	INIT_LIST_HEAD(&dev->dma_pools);
+	INIT_LIST_HEAD(&dev->node);
 	init_MUTEX(&dev->sem);
 	device_init_wakeup(dev, 0);
 }
@@ -274,6 +298,31 @@ int device_add(struct device *dev)
 	dev->uevent_attr.store = store_uevent;
 	device_create_file(dev, &dev->uevent_attr);
 
+	if (MAJOR(dev->devt)) {
+		struct device_attribute *attr;
+		attr = kzalloc(sizeof(*attr), GFP_KERNEL);
+		if (!attr) {
+			error = -ENOMEM;
+			goto PMError;
+		}
+		attr->attr.name = "dev";
+		attr->attr.mode = S_IRUGO;
+		if (dev->driver)
+			attr->attr.owner = dev->driver->owner;
+		attr->show = show_dev;
+		error = device_create_file(dev, attr);
+		if (error) {
+			kfree(attr);
+			goto attrError;
+		}
+
+		dev->devt_attr = attr;
+	}
+
+	if (dev->class)
+		sysfs_create_link(&dev->class->subsys.kset.kobj, &dev->kobj,
+				  dev->bus_id);
+
 	if ((error = device_pm_add(dev)))
 		goto PMError;
 	if ((error = bus_add_device(dev)))
@@ -292,6 +341,11 @@ int device_add(struct device *dev)
  BusError:
 	device_pm_remove(dev);
  PMError:
+	if (dev->devt_attr) {
+		device_remove_file(dev, dev->devt_attr);
+		kfree(dev->devt_attr);
+	}
+ attrError:
 	kobject_uevent(&dev->kobj, KOBJ_REMOVE);
 	kobject_del(&dev->kobj);
  Error:
@@ -366,6 +420,10 @@ void device_del(struct device * dev)
 
 	if (parent)
 		klist_del(&dev->knode_parent);
+	if (dev->devt_attr)
+		device_remove_file(dev, dev->devt_attr);
+	if (dev->class)
+		sysfs_remove_link(&dev->class->subsys.kset.kobj, dev->bus_id);
 	device_remove_file(dev, &dev->uevent_attr);
 
 	/* Notify the platform of the removal, in case they
@@ -450,3 +508,105 @@ EXPORT_SYMBOL_GPL(put_device);
 
 EXPORT_SYMBOL_GPL(device_create_file);
 EXPORT_SYMBOL_GPL(device_remove_file);
+
+
+static void device_create_release(struct device *dev)
+{
+	pr_debug("%s called for %s\n", __FUNCTION__, dev->bus_id);
+	kfree(dev);
+}
+
+/**
+ * device_create - creates a device and registers it with sysfs
+ * @cs: pointer to the struct class that this device should be registered to.
+ * @parent: pointer to the parent struct device of this new device, if any.
+ * @dev: the dev_t for the char device to be added.
+ * @fmt: string for the class device's name
+ *
+ * This function can be used by char device classes.  A struct
+ * device will be created in sysfs, registered to the specified
+ * class.
+ * A "dev" file will be created, showing the dev_t for the device, if
+ * the dev_t is not 0,0.
+ * If a pointer to a parent struct device is passed in, the newly
+ * created struct device will be a child of that device in sysfs.  The
+ * pointer to the struct device will be returned from the call.  Any
+ * further sysfs files that might be required can be created using this
+ * pointer.
+ *
+ * Note: the struct class passed to this function must have previously
+ * been created with a call to class_create().
+ */
+struct device *device_create(struct class *class, struct device *parent,
+			     dev_t devt, char *fmt, ...)
+{
+	va_list args;
+	struct device *dev = NULL;
+	int retval = -ENODEV;
+
+	if (class == NULL || IS_ERR(class))
+		goto error;
+	if (parent == NULL) {
+		printk(KERN_WARNING "%s does not work yet for NULL parents\n", __FUNCTION__);
+		goto error;
+	}
+
+	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
+	if (!dev) {
+		retval = -ENOMEM;
+		goto error;
+	}
+
+	dev->devt = devt;
+	dev->class = class;
+	dev->parent = parent;
+	dev->release = device_create_release;
+
+	va_start(args, fmt);
+	vsnprintf(dev->bus_id, BUS_ID_SIZE, fmt, args);
+	va_end(args);
+	retval = device_register(dev);
+	if (retval)
+		goto error;
+
+	/* tie the class to the device */
+	down(&class->sem);
+	list_add_tail(&dev->node, &class->devices);
+	up(&class->sem);
+
+	return dev;
+
+error:
+	kfree(dev);
+	return ERR_PTR(retval);
+}
+EXPORT_SYMBOL_GPL(device_create);
+
+/**
+ * device_destroy - removes a device that was created with device_create()
+ * @class: the pointer to the struct class that this device was registered * with.
+ * @dev: the dev_t of the device that was previously registered.
+ *
+ * This call unregisters and cleans up a class device that was created with a
+ * call to class_device_create()
+ */
+void device_destroy(struct class *class, dev_t devt)
+{
+	struct device *dev = NULL;
+	struct device *dev_tmp;
+
+	down(&class->sem);
+	list_for_each_entry(dev_tmp, &class->devices, node) {
+		if (dev_tmp->devt == devt) {
+			dev = dev_tmp;
+			break;
+		}
+	}
+	up(&class->sem);
+
+	if (dev) {
+		list_del_init(&dev->node);
+		device_unregister(dev);
+	}
+}
+EXPORT_SYMBOL_GPL(device_destroy);
diff --git a/include/linux/device.h b/include/linux/device.h
index ade10dd6b779..b473f4278910 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -142,6 +142,7 @@ struct class {
 
 	struct subsystem	subsys;
 	struct list_head	children;
+	struct list_head	devices;
 	struct list_head	interfaces;
 	struct semaphore	sem;	/* locks both the children and interfaces lists */
 
@@ -305,6 +306,7 @@ struct device {
 	struct kobject kobj;
 	char	bus_id[BUS_ID_SIZE];	/* position on parent bus */
 	struct device_attribute uevent_attr;
+	struct device_attribute *devt_attr;
 
 	struct semaphore	sem;	/* semaphore to synchronize calls to
 					 * its driver.
@@ -332,6 +334,11 @@ struct device {
 	struct dma_coherent_mem	*dma_mem; /* internal for coherent mem
 					     override */
 
+	/* class_device migration path */
+	struct list_head	node;
+	struct class		*class;		/* optional*/
+	dev_t			devt;		/* dev_t, creates the sysfs "dev" */
+
 	void	(*release)(struct device * dev);
 };
 
@@ -373,6 +380,13 @@ extern int  device_attach(struct device * dev);
 extern void driver_attach(struct device_driver * drv);
 extern void device_reprobe(struct device *dev);
 
+/*
+ * Easy functions for dynamically creating devices on the fly
+ */
+extern struct device *device_create(struct class *cls, struct device *parent,
+				    dev_t devt, char *fmt, ...)
+				    __attribute__((format(printf,4,5)));
+extern void device_destroy(struct class *cls, dev_t devt);
 
 /*
  * Platform "fixup" functions - allow the platform to have their say
-- 
cgit v1.2.3


From 3e95637a48820ff8bedb33e6439def96ccff1de5 Mon Sep 17 00:00:00 2001
From: Alan Stern <stern@rowland.harvard.edu>
Date: Fri, 16 Jun 2006 17:10:48 -0400
Subject: [PATCH] Driver Core: Make dev_info and friends print the bus name if
 there is no driver

This patch (as721) makes dev_info and related macros print the device's
bus name if the device doesn't have a driver, instead of printing just a
blank.  If the device isn't on a bus either... well, then it does leave
a blank space.  But it will be easier for someone else to change if they
want.

Cc: Matthew Wilcox <matthew@wil.cx>
Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/base/core.c    | 16 ++++++++++++++++
 include/linux/device.h |  3 ++-
 2 files changed, 18 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/base/core.c b/drivers/base/core.c
index a979bc3f49a9..d0f84ff78776 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -29,6 +29,22 @@ int (*platform_notify_remove)(struct device * dev) = NULL;
  * sysfs bindings for devices.
  */
 
+/**
+ * dev_driver_string - Return a device's driver name, if at all possible
+ * @dev: struct device to get the name of
+ *
+ * Will return the device's driver's name if it is bound to a device.  If
+ * the device is not bound to a device, it will return the name of the bus
+ * it is attached to.  If it is not attached to a bus either, an empty
+ * string will be returned.
+ */
+const char *dev_driver_string(struct device *dev)
+{
+	return dev->driver ? dev->driver->name :
+			(dev->bus ? dev->bus->name : "");
+}
+EXPORT_SYMBOL_GPL(dev_driver_string);
+
 #define to_dev(obj) container_of(obj, struct device, kobj)
 #define to_dev_attr(_attr) container_of(_attr, struct device_attribute, attr)
 
diff --git a/include/linux/device.h b/include/linux/device.h
index b473f4278910..1e5f30da98bc 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -416,8 +416,9 @@ extern int firmware_register(struct subsystem *);
 extern void firmware_unregister(struct subsystem *);
 
 /* debugging and troubleshooting/diagnostic helpers. */
+extern const char *dev_driver_string(struct device *dev);
 #define dev_printk(level, dev, format, arg...)	\
-	printk(level "%s %s: " format , (dev)->driver ? (dev)->driver->name : "" , (dev)->bus_id , ## arg)
+	printk(level "%s %s: " format , dev_driver_string(dev) , (dev)->bus_id , ## arg)
 
 #ifdef DEBUG
 #define dev_dbg(dev, format, arg...)		\
-- 
cgit v1.2.3


From a5117ba7da37deb09df5eb802dace229b3fb1e9f Mon Sep 17 00:00:00 2001
From: Rene Herman <rene.herman@keyaccess.nl>
Date: Tue, 6 Jun 2006 23:54:02 +0200
Subject: [PATCH] Driver model: add ISA bus

During the recent "isa drivers using platform devices" discussion it was
pointed out that (ALSA) ISA drivers ran into the problem of not having
the option to fail driver load (device registration rather) upon not
finding their hardware due to a probe() error not being passed up
through the driver model. In the course of that, I suggested a seperate
ISA bus might be best; Russell King agreed and suggested this bus could
use the .match() method for the actual device discovery.

The attached does this. For this old non (generically) discoverable ISA
hardware only the driver itself can do discovery so as a difference with
the platform_bus, this isa_bus also distributes match() up to the driver.

As another difference: these devices only exist in the driver model due
to the driver creating them because it might want to drive them, meaning
that all device creation has been made internal as well.

The usage model this provides is nice, and has been acked from the ALSA
side by Takashi Iwai and Jaroslav Kysela. The ALSA driver module_init's
now (for oldisa-only drivers) become:

static int __init alsa_card_foo_init(void)
{
	return isa_register_driver(&snd_foo_isa_driver, SNDRV_CARDS);
}

static void __exit alsa_card_foo_exit(void)
{
	isa_unregister_driver(&snd_foo_isa_driver);
}

Quite like the other bus models therefore. This removes a lot of
duplicated init code from the ALSA ISA drivers.

The passed in isa_driver struct is the regular driver struct embedding a
struct device_driver, the normal probe/remove/shutdown/suspend/resume
callbacks, and as indicated that .match callback.

The "SNDRV_CARDS" you see being passed in is a "unsigned int ndev"
parameter, indicating how many devices to create and call our methods with.

The platform_driver callbacks are called with a platform_device param;
the isa_driver callbacks are being called with a "struct device *dev,
unsigned int id" pair directly -- with the device creation completely
internal to the bus it's much cleaner to not leak isa_dev's by passing
them in at all. The id is the only thing we ever want other then the
struct device * anyways, and it makes for nicer code in the callbacks as
well.

With this additional .match() callback ISA drivers have all options. If
ALSA would want to keep the old non-load behaviour, it could stick all
of the old .probe in .match, which would only keep them registered after
everything was found to be present and accounted for. If it wanted the
behaviour of always loading as it inadvertently did for a bit after the
changeover to platform devices, it could just not provide a .match() and
do everything in .probe() as before.

If it, as Takashi Iwai already suggested earlier as a way of following
the model from saner buses more closely, wants to load when a later bind
could conceivably succeed, it could use .match() for the prerequisites
(such as checking the user wants the card enabled and that port/irq/dma
values have been passed in) and .probe() for everything else. This is
the nicest model.

To the code...

This exports only two functions; isa_{,un}register_driver().

isa_register_driver() register's the struct device_driver, and then
loops over the passed in ndev creating devices and registering them.
This causes the bus match method to be called for them, which is:

int isa_bus_match(struct device *dev, struct device_driver *driver)
{
          struct isa_driver *isa_driver = to_isa_driver(driver);

          if (dev->platform_data == isa_driver) {
                  if (!isa_driver->match ||
                          isa_driver->match(dev, to_isa_dev(dev)->id))
                          return 1;
                  dev->platform_data = NULL;
          }
          return 0;
}

The first thing this does is check if this device is in fact one of this
driver's devices by seeing if the device's platform_data pointer is set
to this driver. Platform devices compare strings, but we don't need to
do that with everything being internal, so isa_register_driver() abuses
dev->platform_data as a isa_driver pointer which we can then check here.
I believe platform_data is available for this, but if rather not, moving
the isa_driver pointer to the private struct isa_dev is ofcourse fine as
well.

Then, if the the driver did not provide a .match, it matches. If it did,
the driver match() method is called to determine a match.

If it did _not_ match, dev->platform_data is reset to indicate this to
isa_register_driver which can then unregister the device again.

If during all this, there's any error, or no devices matched at all
everything is backed out again and the error, or -ENODEV, is returned.

isa_unregister_driver() just unregisters the matched devices and the
driver itself.

More global points/questions...

- I'm introducing include/linux/isa.h. It was available but is ofcourse
a somewhat generic name. Moving more isa stuff over to it in time is
ofcourse fine, so can I have it please? :)

- I'm using device_initcall() and added the isa.o (dependent on
CONFIG_ISA) after the base driver model things in the Makefile. Will
this do, or I really need to stick it in drivers/base/init.c, inside
#ifdef CONFIG_ISA? It's working fine.

Lastly -- I also looked, a bit, into integrating with PnP. "Old ISA"
could be another pnp_protocol, but this does not seem to be a good
match, largely due to the same reason platform_devices weren't -- the
devices do not have a life of their own outside the driver, meaning the
pnp_protocol {get,set}_resources callbacks would need to callback into
driver -- which again means you first need to _have_ that driver. Even
if there's clean way around that, you only end up inventing fake but
valid-form PnP IDs and generally catering to the PnP layer without any
practical advantages over this very simple isa_bus. The thing I also
suggested earlier about the user echoing values into /sys to set up the
hardware from userspace first is... well, cute, but a horrible idea from
a user standpoint.

Comments ofcourse appreciated. Hope it's okay. As said, the usage model
is nice at least.

Signed-off-by: Rene Herman <rene.herman@keyaccess.nl>
---
 drivers/base/Makefile |   1 +
 drivers/base/isa.c    | 180 ++++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/isa.h   |  28 ++++++++
 3 files changed, 209 insertions(+)
 create mode 100644 drivers/base/isa.c
 create mode 100644 include/linux/isa.h

(limited to 'include/linux')

diff --git a/drivers/base/Makefile b/drivers/base/Makefile
index 659cde6c2fb9..b539e5e75b56 100644
--- a/drivers/base/Makefile
+++ b/drivers/base/Makefile
@@ -5,6 +5,7 @@ obj-y			:= core.o sys.o bus.o dd.o \
 			   cpu.o firmware.o init.o map.o dmapool.o \
 			   attribute_container.o transport_class.o
 obj-y			+= power/
+obj-$(CONFIG_ISA)	+= isa.o
 obj-$(CONFIG_FW_LOADER)	+= firmware_class.o
 obj-$(CONFIG_NUMA)	+= node.o
 obj-$(CONFIG_MEMORY_HOTPLUG) += memory.o
diff --git a/drivers/base/isa.c b/drivers/base/isa.c
new file mode 100644
index 000000000000..d2222397a401
--- /dev/null
+++ b/drivers/base/isa.c
@@ -0,0 +1,180 @@
+/*
+ * ISA bus.
+ */
+
+#include <linux/device.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/isa.h>
+
+static struct device isa_bus = {
+	.bus_id		= "isa"
+};
+
+struct isa_dev {
+	struct device dev;
+	struct device *next;
+	unsigned int id;
+};
+
+#define to_isa_dev(x) container_of((x), struct isa_dev, dev)
+
+static int isa_bus_match(struct device *dev, struct device_driver *driver)
+{
+	struct isa_driver *isa_driver = to_isa_driver(driver);
+
+	if (dev->platform_data == isa_driver) {
+		if (!isa_driver->match ||
+			isa_driver->match(dev, to_isa_dev(dev)->id))
+			return 1;
+		dev->platform_data = NULL;
+	}
+	return 0;
+}
+
+static int isa_bus_probe(struct device *dev)
+{
+	struct isa_driver *isa_driver = dev->platform_data;
+
+	if (isa_driver->probe)
+		return isa_driver->probe(dev, to_isa_dev(dev)->id);
+
+	return 0;
+}
+
+static int isa_bus_remove(struct device *dev)
+{
+	struct isa_driver *isa_driver = dev->platform_data;
+
+	if (isa_driver->remove)
+		return isa_driver->remove(dev, to_isa_dev(dev)->id);
+
+	return 0;
+}
+
+static void isa_bus_shutdown(struct device *dev)
+{
+	struct isa_driver *isa_driver = dev->platform_data;
+
+	if (isa_driver->shutdown)
+		isa_driver->shutdown(dev, to_isa_dev(dev)->id);
+}
+
+static int isa_bus_suspend(struct device *dev, pm_message_t state)
+{
+	struct isa_driver *isa_driver = dev->platform_data;
+
+	if (isa_driver->suspend)
+		return isa_driver->suspend(dev, to_isa_dev(dev)->id, state);
+
+	return 0;
+}
+
+static int isa_bus_resume(struct device *dev)
+{
+	struct isa_driver *isa_driver = dev->platform_data;
+
+	if (isa_driver->resume)
+		return isa_driver->resume(dev, to_isa_dev(dev)->id);
+
+	return 0;
+}
+
+static struct bus_type isa_bus_type = {
+	.name		= "isa",
+	.match		= isa_bus_match,
+	.probe		= isa_bus_probe,
+	.remove		= isa_bus_remove,
+	.shutdown	= isa_bus_shutdown,
+	.suspend	= isa_bus_suspend,
+	.resume		= isa_bus_resume
+};
+
+static void isa_dev_release(struct device *dev)
+{
+	kfree(to_isa_dev(dev));
+}
+
+void isa_unregister_driver(struct isa_driver *isa_driver)
+{
+	struct device *dev = isa_driver->devices;
+
+	while (dev) {
+		struct device *tmp = to_isa_dev(dev)->next;
+		device_unregister(dev);
+		dev = tmp;
+	}
+	driver_unregister(&isa_driver->driver);
+}
+EXPORT_SYMBOL_GPL(isa_unregister_driver);
+
+int isa_register_driver(struct isa_driver *isa_driver, unsigned int ndev)
+{
+	int error;
+	unsigned int id;
+
+	isa_driver->driver.bus	= &isa_bus_type;
+	isa_driver->devices	= NULL;
+
+	error = driver_register(&isa_driver->driver);
+	if (error)
+		return error;
+
+	for (id = 0; id < ndev; id++) {
+		struct isa_dev *isa_dev;
+
+		isa_dev = kzalloc(sizeof *isa_dev, GFP_KERNEL);
+		if (!isa_dev) {
+			error = -ENOMEM;
+			break;
+		}
+
+		isa_dev->dev.parent	= &isa_bus;
+		isa_dev->dev.bus	= &isa_bus_type;
+
+		snprintf(isa_dev->dev.bus_id, BUS_ID_SIZE, "%s.%u",
+				isa_driver->driver.name, id);
+
+		isa_dev->dev.platform_data	= isa_driver;
+		isa_dev->dev.release		= isa_dev_release;
+		isa_dev->id			= id;
+
+		error = device_register(&isa_dev->dev);
+		if (error) {
+			put_device(&isa_dev->dev);
+			break;
+		}
+
+		if (isa_dev->dev.platform_data) {
+			isa_dev->next = isa_driver->devices;
+			isa_driver->devices = &isa_dev->dev;
+		} else
+			device_unregister(&isa_dev->dev);
+	}
+
+	if (!error && !isa_driver->devices)
+		error = -ENODEV;
+
+	if (error)
+		isa_unregister_driver(isa_driver);
+
+	return error;
+}
+EXPORT_SYMBOL_GPL(isa_register_driver);
+
+static int __init isa_bus_init(void)
+{
+	int error;
+
+	error = bus_register(&isa_bus_type);
+	if (!error) {
+		error = device_register(&isa_bus);
+		if (error)
+			bus_unregister(&isa_bus_type);
+	}
+	return error;
+}
+
+device_initcall(isa_bus_init);
diff --git a/include/linux/isa.h b/include/linux/isa.h
new file mode 100644
index 000000000000..1b855335cb11
--- /dev/null
+++ b/include/linux/isa.h
@@ -0,0 +1,28 @@
+/*
+ * ISA bus.
+ */
+
+#ifndef __LINUX_ISA_H
+#define __LINUX_ISA_H
+
+#include <linux/device.h>
+#include <linux/kernel.h>
+
+struct isa_driver {
+	int (*match)(struct device *, unsigned int);
+	int (*probe)(struct device *, unsigned int);
+	int (*remove)(struct device *, unsigned int);
+	void (*shutdown)(struct device *, unsigned int);
+	int (*suspend)(struct device *, unsigned int, pm_message_t);
+	int (*resume)(struct device *, unsigned int);
+
+	struct device_driver driver;
+	struct device *devices;
+};
+
+#define to_isa_driver(x) container_of((x), struct isa_driver, driver)
+
+int isa_register_driver(struct isa_driver *, unsigned int);
+void isa_unregister_driver(struct isa_driver *);
+
+#endif /* __LINUX_ISA_H */
-- 
cgit v1.2.3


From 782a7a632e4b0581ade665e3d89ee97c8db0f441 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@suse.de>
Date: Fri, 19 May 2006 13:20:20 -0700
Subject: [PATCH] USB: add usb_interrupt_msg() function for api completeness.

Really just a wrapper around usb_bulk_msg() but now it's documented
much better.

Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/usb/core/message.c | 31 +++++++++++++++++++++++++++++++
 include/linux/usb.h        |  2 ++
 2 files changed, 33 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/usb/core/message.c b/drivers/usb/core/message.c
index 08fb20f06f3e..b2f608b0538d 100644
--- a/drivers/usb/core/message.c
+++ b/drivers/usb/core/message.c
@@ -157,6 +157,37 @@ int usb_control_msg(struct usb_device *dev, unsigned int pipe, __u8 request, __u
 }
 
 
+/**
+ * usb_interrupt_msg - Builds an interrupt urb, sends it off and waits for completion
+ * @usb_dev: pointer to the usb device to send the message to
+ * @pipe: endpoint "pipe" to send the message to
+ * @data: pointer to the data to send
+ * @len: length in bytes of the data to send
+ * @actual_length: pointer to a location to put the actual length transferred in bytes
+ * @timeout: time in msecs to wait for the message to complete before
+ *	timing out (if 0 the wait is forever)
+ * Context: !in_interrupt ()
+ *
+ * This function sends a simple interrupt message to a specified endpoint and
+ * waits for the message to complete, or timeout.
+ *
+ * If successful, it returns 0, otherwise a negative error number.  The number
+ * of actual bytes transferred will be stored in the actual_length paramater.
+ *
+ * Don't use this function from within an interrupt context, like a bottom half
+ * handler.  If you need an asynchronous message, or need to send a message
+ * from within interrupt context, use usb_submit_urb() If a thread in your
+ * driver uses this call, make sure your disconnect() method can wait for it to
+ * complete.  Since you don't have a handle on the URB used, you can't cancel
+ * the request.
+ */
+int usb_interrupt_msg(struct usb_device *usb_dev, unsigned int pipe,
+		      void *data, int len, int *actual_length, int timeout)
+{
+	return usb_bulk_msg(usb_dev, pipe, data, len, actual_length, timeout);
+}
+EXPORT_SYMBOL_GPL(usb_interrupt_msg);
+
 /**
  *	usb_bulk_msg - Builds a bulk urb, sends it off and waits for completion
  *	@usb_dev: pointer to the usb device to send the message to
diff --git a/include/linux/usb.h b/include/linux/usb.h
index 1f492c0c7047..317ec9f28bce 100644
--- a/include/linux/usb.h
+++ b/include/linux/usb.h
@@ -1008,6 +1008,8 @@ void usb_buffer_unmap_sg (struct usb_device *dev, unsigned pipe,
 extern int usb_control_msg(struct usb_device *dev, unsigned int pipe,
 	__u8 request, __u8 requesttype, __u16 value, __u16 index,
 	void *data, __u16 size, int timeout);
+extern int usb_interrupt_msg(struct usb_device *usb_dev, unsigned int pipe,
+	void *data, int len, int *actual_length, int timeout);
 extern int usb_bulk_msg(struct usb_device *usb_dev, unsigned int pipe,
 	void *data, int len, int *actual_length,
 	int timeout);
-- 
cgit v1.2.3


From 79efa097e75018a2918155f343f0e08e61ee8a8c Mon Sep 17 00:00:00 2001
From: Alan Stern <stern@rowland.harvard.edu>
Date: Thu, 1 Jun 2006 13:33:42 -0400
Subject: [PATCH] usbcore: port reset for composite devices

This patch (as699) adds usb_reset_composite_device(), a routine for
sending a USB port reset to a device with multiple interfaces owned by
different drivers.  Drivers are notified about impending and completed
resets through two new methods in the usb_driver structure.

The patch modifieds the usbfs ioctl code to make it use the new routine
instead of usb_reset_device().  Follow-up patches will modify the hub,
usb-storage, and usbhid drivers so they can utilize this new API.

Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/usb/core/devio.c |  3 +-
 drivers/usb/core/hub.c   | 84 ++++++++++++++++++++++++++++++++++++++++++++++--
 drivers/usb/core/usb.c   |  1 +
 include/linux/usb.h      |  9 ++++++
 4 files changed, 92 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/core/devio.c b/drivers/usb/core/devio.c
index b04ede772f2c..df3fb57d71e6 100644
--- a/drivers/usb/core/devio.c
+++ b/drivers/usb/core/devio.c
@@ -823,8 +823,7 @@ static int proc_connectinfo(struct dev_state *ps, void __user *arg)
 
 static int proc_resetdevice(struct dev_state *ps)
 {
-	return usb_reset_device(ps->dev);
-
+	return usb_reset_composite_device(ps->dev, NULL);
 }
 
 static int proc_setintf(struct dev_state *ps, void __user *arg)
diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c
index f41c08946a52..37c67d7e8b84 100644
--- a/drivers/usb/core/hub.c
+++ b/drivers/usb/core/hub.c
@@ -3007,9 +3007,9 @@ static int config_descriptors_changed(struct usb_device *udev)
  * usb_reset_device - perform a USB port reset to reinitialize a device
  * @udev: device to reset (not in SUSPENDED or NOTATTACHED state)
  *
- * WARNING - don't reset any device unless drivers for all of its
- * interfaces are expecting that reset!  Maybe some driver->reset()
- * method should eventually help ensure sufficient cooperation.
+ * WARNING - don't use this routine to reset a composite device
+ * (one with multiple interfaces owned by separate drivers)!
+ * Use usb_reset_composite_device() instead.
  *
  * Do a port reset, reassign the device's address, and establish its
  * former operating configuration.  If the reset fails, or the device's
@@ -3125,3 +3125,81 @@ re_enumerate:
 	hub_port_logical_disconnect(parent_hub, port1);
 	return -ENODEV;
 }
+
+/**
+ * usb_reset_composite_device - warn interface drivers and perform a USB port reset
+ * @udev: device to reset (not in SUSPENDED or NOTATTACHED state)
+ * @iface: interface bound to the driver making the request (optional)
+ *
+ * Warns all drivers bound to registered interfaces (using their pre_reset
+ * method), performs the port reset, and then lets the drivers know that
+ * the reset is over (using their post_reset method).
+ *
+ * Return value is the same as for usb_reset_device().
+ *
+ * The caller must own the device lock.  For example, it's safe to use
+ * this from a driver probe() routine after downloading new firmware.
+ * For calls that might not occur during probe(), drivers should lock
+ * the device using usb_lock_device_for_reset().
+ *
+ * The interface locks are acquired during the pre_reset stage and released
+ * during the post_reset stage.  However if iface is not NULL and is
+ * currently being probed, we assume that the caller already owns its
+ * lock.
+ */
+int usb_reset_composite_device(struct usb_device *udev,
+		struct usb_interface *iface)
+{
+	int ret;
+	struct usb_host_config *config = udev->actconfig;
+
+	if (udev->state == USB_STATE_NOTATTACHED ||
+			udev->state == USB_STATE_SUSPENDED) {
+		dev_dbg(&udev->dev, "device reset not allowed in state %d\n",
+				udev->state);
+		return -EINVAL;
+	}
+
+	if (iface && iface->condition != USB_INTERFACE_BINDING)
+		iface = NULL;
+
+	if (config) {
+		int i;
+		struct usb_interface *cintf;
+		struct usb_driver *drv;
+
+		for (i = 0; i < config->desc.bNumInterfaces; ++i) {
+			cintf = config->interface[i];
+			if (cintf != iface)
+				down(&cintf->dev.sem);
+			if (device_is_registered(&cintf->dev) &&
+					cintf->dev.driver) {
+				drv = to_usb_driver(cintf->dev.driver);
+				if (drv->pre_reset)
+					(drv->pre_reset)(cintf);
+			}
+		}
+	}
+
+	ret = usb_reset_device(udev);
+
+	if (config) {
+		int i;
+		struct usb_interface *cintf;
+		struct usb_driver *drv;
+
+		for (i = config->desc.bNumInterfaces - 1; i >= 0; --i) {
+			cintf = config->interface[i];
+			if (device_is_registered(&cintf->dev) &&
+					cintf->dev.driver) {
+				drv = to_usb_driver(cintf->dev.driver);
+				if (drv->post_reset)
+					(drv->post_reset)(cintf);
+			}
+			if (cintf != iface)
+				up(&cintf->dev.sem);
+		}
+	}
+
+	return ret;
+}
diff --git a/drivers/usb/core/usb.c b/drivers/usb/core/usb.c
index b7fdc1cd134a..515310751303 100644
--- a/drivers/usb/core/usb.c
+++ b/drivers/usb/core/usb.c
@@ -1207,6 +1207,7 @@ EXPORT_SYMBOL(usb_ifnum_to_if);
 EXPORT_SYMBOL(usb_altnum_to_altsetting);
 
 EXPORT_SYMBOL(usb_reset_device);
+EXPORT_SYMBOL(usb_reset_composite_device);
 
 EXPORT_SYMBOL(__usb_get_extra_descriptor);
 
diff --git a/include/linux/usb.h b/include/linux/usb.h
index 317ec9f28bce..5ad30cefe7b2 100644
--- a/include/linux/usb.h
+++ b/include/linux/usb.h
@@ -386,6 +386,8 @@ extern int usb_lock_device_for_reset(struct usb_device *udev,
 
 /* USB port reset for device reinitialization */
 extern int usb_reset_device(struct usb_device *dev);
+extern int usb_reset_composite_device(struct usb_device *dev,
+		struct usb_interface *iface);
 
 extern struct usb_device *usb_find_device(u16 vendor_id, u16 product_id);
 
@@ -554,6 +556,10 @@ struct usb_dynids {
  *	do (or don't) show up otherwise in the filesystem.
  * @suspend: Called when the device is going to be suspended by the system.
  * @resume: Called when the device is being resumed by the system.
+ * @pre_reset: Called by usb_reset_composite_device() when the device
+ *	is about to be reset.
+ * @post_reset: Called by usb_reset_composite_device() after the device
+ *	has been reset.
  * @id_table: USB drivers use ID table to support hotplugging.
  *	Export this with MODULE_DEVICE_TABLE(usb,...).  This must be set
  *	or your driver's probe function will never get called.
@@ -592,6 +598,9 @@ struct usb_driver {
 	int (*suspend) (struct usb_interface *intf, pm_message_t message);
 	int (*resume) (struct usb_interface *intf);
 
+	void (*pre_reset) (struct usb_interface *intf);
+	void (*post_reset) (struct usb_interface *intf);
+
 	const struct usb_device_id *id_table;
 
 	struct usb_dynids dynids;
-- 
cgit v1.2.3


From a8c28f2389942bab376e39351d27525499630248 Mon Sep 17 00:00:00 2001
From: David Brownell <david-b@pacbell.net>
Date: Tue, 13 Jun 2006 09:57:47 -0700
Subject: [PATCH] USB: move <linux/usb_cdc.h> to <linux/usb/cdc.h>

This moves <linux/usb_cdc.h> to <linux/usb/cdc.h> to reduce some of the
clutter of usb header files.

Signed-off-by: David Brownell <dbrownell@users.sourceforge.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/usb/class/cdc-acm.c  |   2 +-
 drivers/usb/gadget/ether.c   |   2 +-
 drivers/usb/gadget/serial.c  |   2 +-
 drivers/usb/net/cdc_ether.c  |   2 +-
 drivers/usb/net/rndis_host.c |   2 +-
 drivers/usb/net/zaurus.c     |   2 +-
 include/linux/usb/cdc.h      | 205 +++++++++++++++++++++++++++++++++++++++++++
 include/linux/usb_cdc.h      | 205 -------------------------------------------
 8 files changed, 211 insertions(+), 211 deletions(-)
 create mode 100644 include/linux/usb/cdc.h
 delete mode 100644 include/linux/usb_cdc.h

(limited to 'include/linux')

diff --git a/drivers/usb/class/cdc-acm.c b/drivers/usb/class/cdc-acm.c
index 506aff60dac5..d41dc67ba4cc 100644
--- a/drivers/usb/class/cdc-acm.c
+++ b/drivers/usb/class/cdc-acm.c
@@ -63,7 +63,7 @@
 #include <linux/mutex.h>
 #include <asm/uaccess.h>
 #include <linux/usb.h>
-#include <linux/usb_cdc.h>
+#include <linux/usb/cdc.h>
 #include <asm/byteorder.h>
 #include <asm/unaligned.h>
 #include <linux/list.h>
diff --git a/drivers/usb/gadget/ether.c b/drivers/usb/gadget/ether.c
index fc4684096fcc..078daa026718 100644
--- a/drivers/usb/gadget/ether.c
+++ b/drivers/usb/gadget/ether.c
@@ -49,7 +49,7 @@
 #include <asm/unaligned.h>
 
 #include <linux/usb_ch9.h>
-#include <linux/usb_cdc.h>
+#include <linux/usb/cdc.h>
 #include <linux/usb_gadget.h>
 
 #include <linux/random.h>
diff --git a/drivers/usb/gadget/serial.c b/drivers/usb/gadget/serial.c
index e477edd681d3..9d6e1d295528 100644
--- a/drivers/usb/gadget/serial.c
+++ b/drivers/usb/gadget/serial.c
@@ -45,7 +45,7 @@
 #include <asm/uaccess.h>
 
 #include <linux/usb_ch9.h>
-#include <linux/usb_cdc.h>
+#include <linux/usb/cdc.h>
 #include <linux/usb_gadget.h>
 
 #include "gadget_chips.h"
diff --git a/drivers/usb/net/cdc_ether.c b/drivers/usb/net/cdc_ether.c
index 9c6c5b0b01ad..efd195b5912c 100644
--- a/drivers/usb/net/cdc_ether.c
+++ b/drivers/usb/net/cdc_ether.c
@@ -31,7 +31,7 @@
 #include <linux/workqueue.h>
 #include <linux/mii.h>
 #include <linux/usb.h>
-#include <linux/usb_cdc.h>
+#include <linux/usb/cdc.h>
 
 #include "usbnet.h"
 
diff --git a/drivers/usb/net/rndis_host.c b/drivers/usb/net/rndis_host.c
index 94ddfe16fdda..f551546d7521 100644
--- a/drivers/usb/net/rndis_host.c
+++ b/drivers/usb/net/rndis_host.c
@@ -30,7 +30,7 @@
 #include <linux/workqueue.h>
 #include <linux/mii.h>
 #include <linux/usb.h>
-#include <linux/usb_cdc.h>
+#include <linux/usb/cdc.h>
 
 #include "usbnet.h"
 
diff --git a/drivers/usb/net/zaurus.c b/drivers/usb/net/zaurus.c
index bf2035d329f4..813e470d0600 100644
--- a/drivers/usb/net/zaurus.c
+++ b/drivers/usb/net/zaurus.c
@@ -30,7 +30,7 @@
 #include <linux/mii.h>
 #include <linux/crc32.h>
 #include <linux/usb.h>
-#include <linux/usb_cdc.h>
+#include <linux/usb/cdc.h>
 
 #include "usbnet.h"
 
diff --git a/include/linux/usb/cdc.h b/include/linux/usb/cdc.h
new file mode 100644
index 000000000000..ba617c372455
--- /dev/null
+++ b/include/linux/usb/cdc.h
@@ -0,0 +1,205 @@
+/*
+ * USB Communications Device Class (CDC) definitions
+ *
+ * CDC says how to talk to lots of different types of network adapters,
+ * notably ethernet adapters and various modems.  It's used mostly with
+ * firmware based USB peripherals.
+ */
+
+#define USB_CDC_SUBCLASS_ACM			0x02
+#define USB_CDC_SUBCLASS_ETHERNET		0x06
+#define USB_CDC_SUBCLASS_WHCM			0x08
+#define USB_CDC_SUBCLASS_DMM			0x09
+#define USB_CDC_SUBCLASS_MDLM			0x0a
+#define USB_CDC_SUBCLASS_OBEX			0x0b
+
+#define USB_CDC_PROTO_NONE			0
+
+#define USB_CDC_ACM_PROTO_AT_V25TER		1
+#define USB_CDC_ACM_PROTO_AT_PCCA101		2
+#define USB_CDC_ACM_PROTO_AT_PCCA101_WAKE	3
+#define USB_CDC_ACM_PROTO_AT_GSM		4
+#define USB_CDC_ACM_PROTO_AT_3G			5
+#define USB_CDC_ACM_PROTO_AT_CDMA		6
+#define USB_CDC_ACM_PROTO_VENDOR		0xff
+
+/*-------------------------------------------------------------------------*/
+
+/*
+ * Class-Specific descriptors ... there are a couple dozen of them
+ */
+
+#define USB_CDC_HEADER_TYPE		0x00		/* header_desc */
+#define USB_CDC_CALL_MANAGEMENT_TYPE	0x01		/* call_mgmt_descriptor */
+#define USB_CDC_ACM_TYPE		0x02		/* acm_descriptor */
+#define USB_CDC_UNION_TYPE		0x06		/* union_desc */
+#define USB_CDC_COUNTRY_TYPE		0x07
+#define USB_CDC_NETWORK_TERMINAL_TYPE	0x0a		/* network_terminal_desc */
+#define USB_CDC_ETHERNET_TYPE		0x0f		/* ether_desc */
+#define USB_CDC_WHCM_TYPE		0x11
+#define USB_CDC_MDLM_TYPE		0x12		/* mdlm_desc */
+#define USB_CDC_MDLM_DETAIL_TYPE	0x13		/* mdlm_detail_desc */
+#define USB_CDC_DMM_TYPE		0x14
+#define USB_CDC_OBEX_TYPE		0x15
+
+/* "Header Functional Descriptor" from CDC spec  5.2.3.1 */
+struct usb_cdc_header_desc {
+	__u8	bLength;
+	__u8	bDescriptorType;
+	__u8	bDescriptorSubType;
+
+	__le16	bcdCDC;
+} __attribute__ ((packed));
+
+/* "Call Management Descriptor" from CDC spec  5.2.3.2 */
+struct usb_cdc_call_mgmt_descriptor {
+	__u8	bLength;
+	__u8	bDescriptorType;
+	__u8	bDescriptorSubType;
+
+	__u8	bmCapabilities;
+#define USB_CDC_CALL_MGMT_CAP_CALL_MGMT		0x01
+#define USB_CDC_CALL_MGMT_CAP_DATA_INTF		0x02
+
+	__u8	bDataInterface;
+} __attribute__ ((packed));
+
+/* "Abstract Control Management Descriptor" from CDC spec  5.2.3.3 */
+struct usb_cdc_acm_descriptor {
+	__u8	bLength;
+	__u8	bDescriptorType;
+	__u8	bDescriptorSubType;
+
+	__u8	bmCapabilities;
+} __attribute__ ((packed));
+
+/* "Union Functional Descriptor" from CDC spec 5.2.3.8 */
+struct usb_cdc_union_desc {
+	__u8	bLength;
+	__u8	bDescriptorType;
+	__u8	bDescriptorSubType;
+
+	__u8	bMasterInterface0;
+	__u8	bSlaveInterface0;
+	/* ... and there could be other slave interfaces */
+} __attribute__ ((packed));
+
+/* "Network Channel Terminal Functional Descriptor" from CDC spec 5.2.3.11 */
+struct usb_cdc_network_terminal_desc {
+	__u8	bLength;
+	__u8	bDescriptorType;
+	__u8	bDescriptorSubType;
+
+	__u8	bEntityId;
+	__u8	iName;
+	__u8	bChannelIndex;
+	__u8	bPhysicalInterface;
+} __attribute__ ((packed));
+
+/* "Ethernet Networking Functional Descriptor" from CDC spec 5.2.3.16 */
+struct usb_cdc_ether_desc {
+	__u8	bLength;
+	__u8	bDescriptorType;
+	__u8	bDescriptorSubType;
+
+	__u8	iMACAddress;
+	__le32	bmEthernetStatistics;
+	__le16	wMaxSegmentSize;
+	__le16	wNumberMCFilters;
+	__u8	bNumberPowerFilters;
+} __attribute__ ((packed));
+
+/* "MDLM Functional Descriptor" from CDC WMC spec 6.7.2.3 */
+struct usb_cdc_mdlm_desc {
+	__u8	bLength;
+	__u8	bDescriptorType;
+	__u8	bDescriptorSubType;
+
+	__le16	bcdVersion;
+	__u8	bGUID[16];
+} __attribute__ ((packed));
+
+/* "MDLM Detail Functional Descriptor" from CDC WMC spec 6.7.2.4 */
+struct usb_cdc_mdlm_detail_desc {
+	__u8	bLength;
+	__u8	bDescriptorType;
+	__u8	bDescriptorSubType;
+
+	/* type is associated with mdlm_desc.bGUID */
+	__u8	bGuidDescriptorType;
+	__u8	bDetailData[0];
+} __attribute__ ((packed));
+
+/*-------------------------------------------------------------------------*/
+
+/*
+ * Class-Specific Control Requests (6.2)
+ *
+ * section 3.6.2.1 table 4 has the ACM profile, for modems.
+ * section 3.8.2 table 10 has the ethernet profile.
+ *
+ * Microsoft's RNDIS stack for Ethernet is a vendor-specific CDC ACM variant,
+ * heavily dependent on the encapsulated (proprietary) command mechanism.
+ */
+
+#define USB_CDC_SEND_ENCAPSULATED_COMMAND	0x00
+#define USB_CDC_GET_ENCAPSULATED_RESPONSE	0x01
+#define USB_CDC_REQ_SET_LINE_CODING		0x20
+#define USB_CDC_REQ_GET_LINE_CODING		0x21
+#define USB_CDC_REQ_SET_CONTROL_LINE_STATE	0x22
+#define USB_CDC_REQ_SEND_BREAK			0x23
+#define USB_CDC_SET_ETHERNET_MULTICAST_FILTERS	0x40
+#define USB_CDC_SET_ETHERNET_PM_PATTERN_FILTER	0x41
+#define USB_CDC_GET_ETHERNET_PM_PATTERN_FILTER	0x42
+#define USB_CDC_SET_ETHERNET_PACKET_FILTER	0x43
+#define USB_CDC_GET_ETHERNET_STATISTIC		0x44
+
+/* Line Coding Structure from CDC spec 6.2.13 */
+struct usb_cdc_line_coding {
+	__le32	dwDTERate;
+	__u8	bCharFormat;
+#define USB_CDC_1_STOP_BITS			0
+#define USB_CDC_1_5_STOP_BITS			1
+#define USB_CDC_2_STOP_BITS			2
+
+	__u8	bParityType;
+#define USB_CDC_NO_PARITY			0
+#define USB_CDC_ODD_PARITY			1
+#define USB_CDC_EVEN_PARITY			2
+#define USB_CDC_MARK_PARITY			3
+#define USB_CDC_SPACE_PARITY			4
+
+	__u8	bDataBits;
+} __attribute__ ((packed));
+
+/* table 62; bits in multicast filter */
+#define	USB_CDC_PACKET_TYPE_PROMISCUOUS		(1 << 0)
+#define	USB_CDC_PACKET_TYPE_ALL_MULTICAST	(1 << 1) /* no filter */
+#define	USB_CDC_PACKET_TYPE_DIRECTED		(1 << 2)
+#define	USB_CDC_PACKET_TYPE_BROADCAST		(1 << 3)
+#define	USB_CDC_PACKET_TYPE_MULTICAST		(1 << 4) /* filtered */
+
+
+/*-------------------------------------------------------------------------*/
+
+/*
+ * Class-Specific Notifications (6.3) sent by interrupt transfers
+ *
+ * section 3.8.2 table 11 of the CDC spec lists Ethernet notifications
+ * section 3.6.2.1 table 5 specifies ACM notifications, accepted by RNDIS
+ * RNDIS also defines its own bit-incompatible notifications
+ */
+
+#define USB_CDC_NOTIFY_NETWORK_CONNECTION	0x00
+#define USB_CDC_NOTIFY_RESPONSE_AVAILABLE	0x01
+#define USB_CDC_NOTIFY_SERIAL_STATE		0x20
+#define USB_CDC_NOTIFY_SPEED_CHANGE		0x2a
+
+struct usb_cdc_notification {
+	__u8	bmRequestType;
+	__u8	bNotificationType;
+	__le16	wValue;
+	__le16	wIndex;
+	__le16	wLength;
+} __attribute__ ((packed));
+
diff --git a/include/linux/usb_cdc.h b/include/linux/usb_cdc.h
deleted file mode 100644
index ba617c372455..000000000000
--- a/include/linux/usb_cdc.h
+++ /dev/null
@@ -1,205 +0,0 @@
-/*
- * USB Communications Device Class (CDC) definitions
- *
- * CDC says how to talk to lots of different types of network adapters,
- * notably ethernet adapters and various modems.  It's used mostly with
- * firmware based USB peripherals.
- */
-
-#define USB_CDC_SUBCLASS_ACM			0x02
-#define USB_CDC_SUBCLASS_ETHERNET		0x06
-#define USB_CDC_SUBCLASS_WHCM			0x08
-#define USB_CDC_SUBCLASS_DMM			0x09
-#define USB_CDC_SUBCLASS_MDLM			0x0a
-#define USB_CDC_SUBCLASS_OBEX			0x0b
-
-#define USB_CDC_PROTO_NONE			0
-
-#define USB_CDC_ACM_PROTO_AT_V25TER		1
-#define USB_CDC_ACM_PROTO_AT_PCCA101		2
-#define USB_CDC_ACM_PROTO_AT_PCCA101_WAKE	3
-#define USB_CDC_ACM_PROTO_AT_GSM		4
-#define USB_CDC_ACM_PROTO_AT_3G			5
-#define USB_CDC_ACM_PROTO_AT_CDMA		6
-#define USB_CDC_ACM_PROTO_VENDOR		0xff
-
-/*-------------------------------------------------------------------------*/
-
-/*
- * Class-Specific descriptors ... there are a couple dozen of them
- */
-
-#define USB_CDC_HEADER_TYPE		0x00		/* header_desc */
-#define USB_CDC_CALL_MANAGEMENT_TYPE	0x01		/* call_mgmt_descriptor */
-#define USB_CDC_ACM_TYPE		0x02		/* acm_descriptor */
-#define USB_CDC_UNION_TYPE		0x06		/* union_desc */
-#define USB_CDC_COUNTRY_TYPE		0x07
-#define USB_CDC_NETWORK_TERMINAL_TYPE	0x0a		/* network_terminal_desc */
-#define USB_CDC_ETHERNET_TYPE		0x0f		/* ether_desc */
-#define USB_CDC_WHCM_TYPE		0x11
-#define USB_CDC_MDLM_TYPE		0x12		/* mdlm_desc */
-#define USB_CDC_MDLM_DETAIL_TYPE	0x13		/* mdlm_detail_desc */
-#define USB_CDC_DMM_TYPE		0x14
-#define USB_CDC_OBEX_TYPE		0x15
-
-/* "Header Functional Descriptor" from CDC spec  5.2.3.1 */
-struct usb_cdc_header_desc {
-	__u8	bLength;
-	__u8	bDescriptorType;
-	__u8	bDescriptorSubType;
-
-	__le16	bcdCDC;
-} __attribute__ ((packed));
-
-/* "Call Management Descriptor" from CDC spec  5.2.3.2 */
-struct usb_cdc_call_mgmt_descriptor {
-	__u8	bLength;
-	__u8	bDescriptorType;
-	__u8	bDescriptorSubType;
-
-	__u8	bmCapabilities;
-#define USB_CDC_CALL_MGMT_CAP_CALL_MGMT		0x01
-#define USB_CDC_CALL_MGMT_CAP_DATA_INTF		0x02
-
-	__u8	bDataInterface;
-} __attribute__ ((packed));
-
-/* "Abstract Control Management Descriptor" from CDC spec  5.2.3.3 */
-struct usb_cdc_acm_descriptor {
-	__u8	bLength;
-	__u8	bDescriptorType;
-	__u8	bDescriptorSubType;
-
-	__u8	bmCapabilities;
-} __attribute__ ((packed));
-
-/* "Union Functional Descriptor" from CDC spec 5.2.3.8 */
-struct usb_cdc_union_desc {
-	__u8	bLength;
-	__u8	bDescriptorType;
-	__u8	bDescriptorSubType;
-
-	__u8	bMasterInterface0;
-	__u8	bSlaveInterface0;
-	/* ... and there could be other slave interfaces */
-} __attribute__ ((packed));
-
-/* "Network Channel Terminal Functional Descriptor" from CDC spec 5.2.3.11 */
-struct usb_cdc_network_terminal_desc {
-	__u8	bLength;
-	__u8	bDescriptorType;
-	__u8	bDescriptorSubType;
-
-	__u8	bEntityId;
-	__u8	iName;
-	__u8	bChannelIndex;
-	__u8	bPhysicalInterface;
-} __attribute__ ((packed));
-
-/* "Ethernet Networking Functional Descriptor" from CDC spec 5.2.3.16 */
-struct usb_cdc_ether_desc {
-	__u8	bLength;
-	__u8	bDescriptorType;
-	__u8	bDescriptorSubType;
-
-	__u8	iMACAddress;
-	__le32	bmEthernetStatistics;
-	__le16	wMaxSegmentSize;
-	__le16	wNumberMCFilters;
-	__u8	bNumberPowerFilters;
-} __attribute__ ((packed));
-
-/* "MDLM Functional Descriptor" from CDC WMC spec 6.7.2.3 */
-struct usb_cdc_mdlm_desc {
-	__u8	bLength;
-	__u8	bDescriptorType;
-	__u8	bDescriptorSubType;
-
-	__le16	bcdVersion;
-	__u8	bGUID[16];
-} __attribute__ ((packed));
-
-/* "MDLM Detail Functional Descriptor" from CDC WMC spec 6.7.2.4 */
-struct usb_cdc_mdlm_detail_desc {
-	__u8	bLength;
-	__u8	bDescriptorType;
-	__u8	bDescriptorSubType;
-
-	/* type is associated with mdlm_desc.bGUID */
-	__u8	bGuidDescriptorType;
-	__u8	bDetailData[0];
-} __attribute__ ((packed));
-
-/*-------------------------------------------------------------------------*/
-
-/*
- * Class-Specific Control Requests (6.2)
- *
- * section 3.6.2.1 table 4 has the ACM profile, for modems.
- * section 3.8.2 table 10 has the ethernet profile.
- *
- * Microsoft's RNDIS stack for Ethernet is a vendor-specific CDC ACM variant,
- * heavily dependent on the encapsulated (proprietary) command mechanism.
- */
-
-#define USB_CDC_SEND_ENCAPSULATED_COMMAND	0x00
-#define USB_CDC_GET_ENCAPSULATED_RESPONSE	0x01
-#define USB_CDC_REQ_SET_LINE_CODING		0x20
-#define USB_CDC_REQ_GET_LINE_CODING		0x21
-#define USB_CDC_REQ_SET_CONTROL_LINE_STATE	0x22
-#define USB_CDC_REQ_SEND_BREAK			0x23
-#define USB_CDC_SET_ETHERNET_MULTICAST_FILTERS	0x40
-#define USB_CDC_SET_ETHERNET_PM_PATTERN_FILTER	0x41
-#define USB_CDC_GET_ETHERNET_PM_PATTERN_FILTER	0x42
-#define USB_CDC_SET_ETHERNET_PACKET_FILTER	0x43
-#define USB_CDC_GET_ETHERNET_STATISTIC		0x44
-
-/* Line Coding Structure from CDC spec 6.2.13 */
-struct usb_cdc_line_coding {
-	__le32	dwDTERate;
-	__u8	bCharFormat;
-#define USB_CDC_1_STOP_BITS			0
-#define USB_CDC_1_5_STOP_BITS			1
-#define USB_CDC_2_STOP_BITS			2
-
-	__u8	bParityType;
-#define USB_CDC_NO_PARITY			0
-#define USB_CDC_ODD_PARITY			1
-#define USB_CDC_EVEN_PARITY			2
-#define USB_CDC_MARK_PARITY			3
-#define USB_CDC_SPACE_PARITY			4
-
-	__u8	bDataBits;
-} __attribute__ ((packed));
-
-/* table 62; bits in multicast filter */
-#define	USB_CDC_PACKET_TYPE_PROMISCUOUS		(1 << 0)
-#define	USB_CDC_PACKET_TYPE_ALL_MULTICAST	(1 << 1) /* no filter */
-#define	USB_CDC_PACKET_TYPE_DIRECTED		(1 << 2)
-#define	USB_CDC_PACKET_TYPE_BROADCAST		(1 << 3)
-#define	USB_CDC_PACKET_TYPE_MULTICAST		(1 << 4) /* filtered */
-
-
-/*-------------------------------------------------------------------------*/
-
-/*
- * Class-Specific Notifications (6.3) sent by interrupt transfers
- *
- * section 3.8.2 table 11 of the CDC spec lists Ethernet notifications
- * section 3.6.2.1 table 5 specifies ACM notifications, accepted by RNDIS
- * RNDIS also defines its own bit-incompatible notifications
- */
-
-#define USB_CDC_NOTIFY_NETWORK_CONNECTION	0x00
-#define USB_CDC_NOTIFY_RESPONSE_AVAILABLE	0x01
-#define USB_CDC_NOTIFY_SERIAL_STATE		0x20
-#define USB_CDC_NOTIFY_SPEED_CHANGE		0x2a
-
-struct usb_cdc_notification {
-	__u8	bmRequestType;
-	__u8	bNotificationType;
-	__le16	wValue;
-	__le16	wIndex;
-	__le16	wLength;
-} __attribute__ ((packed));
-
-- 
cgit v1.2.3


From 325a4af60dc945bf2da9cbcdbabb276e312b297c Mon Sep 17 00:00:00 2001
From: David Brownell <david-b@pacbell.net>
Date: Tue, 13 Jun 2006 09:59:32 -0700
Subject: [PATCH] USB: move hardware-specific <linux/usb_*.h> to
 <linux/usb/*.h>

This moves header files for controller-specific platform data
from <linux/usb_XXX.h> to <linux/usb/XXX.h> to start reducing
some clutter.

Signed-off-by: David Brownell <dbrownell@users.sourceforge.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/usb/host/isp116x-hcd.c |  2 +-
 drivers/usb/host/sl811-hcd.c   |  2 +-
 drivers/usb/host/sl811_cs.c    |  2 +-
 include/linux/usb/isp116x.h    | 29 +++++++++++++++++++++++++++++
 include/linux/usb/sl811.h      | 26 ++++++++++++++++++++++++++
 include/linux/usb_isp116x.h    | 29 -----------------------------
 include/linux/usb_sl811.h      | 26 --------------------------
 7 files changed, 58 insertions(+), 58 deletions(-)
 create mode 100644 include/linux/usb/isp116x.h
 create mode 100644 include/linux/usb/sl811.h
 delete mode 100644 include/linux/usb_isp116x.h
 delete mode 100644 include/linux/usb_sl811.h

(limited to 'include/linux')

diff --git a/drivers/usb/host/isp116x-hcd.c b/drivers/usb/host/isp116x-hcd.c
index c5e224048efa..14386254c870 100644
--- a/drivers/usb/host/isp116x-hcd.c
+++ b/drivers/usb/host/isp116x-hcd.c
@@ -63,7 +63,7 @@
 #include <linux/init.h>
 #include <linux/list.h>
 #include <linux/usb.h>
-#include <linux/usb_isp116x.h>
+#include <linux/usb/isp116x.h>
 #include <linux/platform_device.h>
 
 #include <asm/io.h>
diff --git a/drivers/usb/host/sl811-hcd.c b/drivers/usb/host/sl811-hcd.c
index a92343052751..6b4bc3f2bd86 100644
--- a/drivers/usb/host/sl811-hcd.c
+++ b/drivers/usb/host/sl811-hcd.c
@@ -46,7 +46,7 @@
 #include <linux/list.h>
 #include <linux/interrupt.h>
 #include <linux/usb.h>
-#include <linux/usb_sl811.h>
+#include <linux/usb/sl811.h>
 #include <linux/platform_device.h>
 
 #include <asm/io.h>
diff --git a/drivers/usb/host/sl811_cs.c b/drivers/usb/host/sl811_cs.c
index 302aa1ec312f..54f554e0f0ad 100644
--- a/drivers/usb/host/sl811_cs.c
+++ b/drivers/usb/host/sl811_cs.c
@@ -27,7 +27,7 @@
 #include <pcmcia/cisreg.h>
 #include <pcmcia/ds.h>
 
-#include <linux/usb_sl811.h>
+#include <linux/usb/sl811.h>
 
 MODULE_AUTHOR("Botond Botyanszki");
 MODULE_DESCRIPTION("REX-CFU1U PCMCIA driver for 2.6");
diff --git a/include/linux/usb/isp116x.h b/include/linux/usb/isp116x.h
new file mode 100644
index 000000000000..436dd8a2b64a
--- /dev/null
+++ b/include/linux/usb/isp116x.h
@@ -0,0 +1,29 @@
+
+/*
+ * Board initialization code should put one of these into dev->platform_data
+ * and place the isp116x onto platform_bus.
+ */
+
+struct isp116x_platform_data {
+	/* Enable internal resistors on downstream ports */
+	unsigned sel15Kres:1;
+	/* On-chip overcurrent detection */
+	unsigned oc_enable:1;
+	/* INT output polarity */
+	unsigned int_act_high:1;
+	/* INT edge or level triggered */
+	unsigned int_edge_triggered:1;
+	/* Enable wakeup by devices on usb bus (e.g. wakeup
+	   by attachment/detachment or by device activity
+	   such as moving a mouse). When chosen, this option
+	   prevents stopping internal clock, increasing
+	   thereby power consumption in suspended state. */
+	unsigned remote_wakeup_enable:1;
+	/* Inter-io delay (ns). The chip is picky about access timings; it
+	   expects at least:
+	   150ns delay between consecutive accesses to DATA_REG,
+	   300ns delay between access to ADDR_REG and DATA_REG
+	   OE, WE MUST NOT be changed during these intervals
+	 */
+	void (*delay) (struct device * dev, int delay);
+};
diff --git a/include/linux/usb/sl811.h b/include/linux/usb/sl811.h
new file mode 100644
index 000000000000..397ee3b3d7f3
--- /dev/null
+++ b/include/linux/usb/sl811.h
@@ -0,0 +1,26 @@
+
+/*
+ * board initialization should put one of these into dev->platform_data
+ * and place the sl811hs onto platform_bus named "sl811-hcd".
+ */
+
+struct sl811_platform_data {
+	unsigned	can_wakeup:1;
+
+	/* given port_power, msec/2 after power on till power good */
+	u8		potpg;
+
+	/* mA/2 power supplied on this port (max = default = 250) */
+	u8		power;
+
+	/* sl811 relies on an external source of VBUS current */
+	void		(*port_power)(struct device *dev, int is_on);
+
+	/* pulse sl811 nRST (probably with a GPIO) */
+	void		(*reset)(struct device *dev);
+
+	// some boards need something like these:
+	// int		(*check_overcurrent)(struct device *dev);
+	// void		(*clock_enable)(struct device *dev, int is_on);
+};
+
diff --git a/include/linux/usb_isp116x.h b/include/linux/usb_isp116x.h
deleted file mode 100644
index 436dd8a2b64a..000000000000
--- a/include/linux/usb_isp116x.h
+++ /dev/null
@@ -1,29 +0,0 @@
-
-/*
- * Board initialization code should put one of these into dev->platform_data
- * and place the isp116x onto platform_bus.
- */
-
-struct isp116x_platform_data {
-	/* Enable internal resistors on downstream ports */
-	unsigned sel15Kres:1;
-	/* On-chip overcurrent detection */
-	unsigned oc_enable:1;
-	/* INT output polarity */
-	unsigned int_act_high:1;
-	/* INT edge or level triggered */
-	unsigned int_edge_triggered:1;
-	/* Enable wakeup by devices on usb bus (e.g. wakeup
-	   by attachment/detachment or by device activity
-	   such as moving a mouse). When chosen, this option
-	   prevents stopping internal clock, increasing
-	   thereby power consumption in suspended state. */
-	unsigned remote_wakeup_enable:1;
-	/* Inter-io delay (ns). The chip is picky about access timings; it
-	   expects at least:
-	   150ns delay between consecutive accesses to DATA_REG,
-	   300ns delay between access to ADDR_REG and DATA_REG
-	   OE, WE MUST NOT be changed during these intervals
-	 */
-	void (*delay) (struct device * dev, int delay);
-};
diff --git a/include/linux/usb_sl811.h b/include/linux/usb_sl811.h
deleted file mode 100644
index 4f2d012d7309..000000000000
--- a/include/linux/usb_sl811.h
+++ /dev/null
@@ -1,26 +0,0 @@
-
-/*
- * board initialization should put one of these into dev->platform_data
- * and place the sl811hs onto platform_bus named "sl811-hcd".
- */
-
-struct sl811_platform_data {
-	unsigned	can_wakeup:1;
-
-	/* given port_power, msec/2 after power on till power good */
-	u8		potpg;
-
-	/* mA/2 power supplied on this port (max = default = 250) */
-	u8		power;
-
-	/* sl811 relies on an external source of VBUS current */
-	void 		(*port_power)(struct device *dev, int is_on);
-
-	/* pulse sl811 nRST (probably with a GPIO) */
-	void 		(*reset)(struct device *dev);
-
-	// some boards need something like these:
- 	// int 		(*check_overcurrent)(struct device *dev);
- 	// void 	(*clock_enable)(struct device *dev, int is_on);
-};
-
-- 
cgit v1.2.3


From ae0dadcf0f912cbab2ac84caa437454620bf71b2 Mon Sep 17 00:00:00 2001
From: David Brownell <david-b@pacbell.net>
Date: Tue, 13 Jun 2006 10:04:34 -0700
Subject: [PATCH] USB: move <linux/usb_input.h> to <linux/usb/input.h>

Move <linux/usb_input.h> to <linux/usb/input.h> and remove some
redundant includes.

Signed-off-by: David Brownell <dbrownell@users.sourceforge.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/media/video/usbvideo/konicawc.c |  3 +--
 drivers/usb/input/acecad.c              |  4 +---
 drivers/usb/input/aiptek.c              |  4 +---
 drivers/usb/input/appletouch.c          |  4 +---
 drivers/usb/input/ati_remote.c          |  4 +---
 drivers/usb/input/ati_remote2.c         |  2 +-
 drivers/usb/input/hid-input.c           |  4 +---
 drivers/usb/input/itmtouch.c            |  4 +---
 drivers/usb/input/kbtab.c               |  5 +----
 drivers/usb/input/keyspan_remote.c      |  4 +---
 drivers/usb/input/mtouchusb.c           |  4 +---
 drivers/usb/input/powermate.c           |  4 +---
 drivers/usb/input/touchkitusb.c         |  4 +---
 drivers/usb/input/usbkbd.c              |  4 +---
 drivers/usb/input/usbmouse.c            |  4 +---
 drivers/usb/input/usbtouchscreen.c      |  2 +-
 drivers/usb/input/wacom.c               |  5 +----
 drivers/usb/input/xpad.c                |  4 +---
 drivers/usb/input/yealink.c             |  4 +---
 drivers/usb/storage/onetouch.c          |  3 +--
 include/linux/usb/input.h               | 25 +++++++++++++++++++++++++
 include/linux/usb_input.h               | 25 -------------------------
 22 files changed, 45 insertions(+), 81 deletions(-)
 create mode 100644 include/linux/usb/input.h
 delete mode 100644 include/linux/usb_input.h

(limited to 'include/linux')

diff --git a/drivers/media/video/usbvideo/konicawc.c b/drivers/media/video/usbvideo/konicawc.c
index c11f5d46b114..6f31ecc88843 100644
--- a/drivers/media/video/usbvideo/konicawc.c
+++ b/drivers/media/video/usbvideo/konicawc.c
@@ -15,8 +15,7 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/init.h>
-#include <linux/input.h>
-#include <linux/usb_input.h>
+#include <linux/usb/input.h>
 
 #include "usbvideo.h"
 
diff --git a/drivers/usb/input/acecad.c b/drivers/usb/input/acecad.c
index df29b8078b54..18c10e150ef3 100644
--- a/drivers/usb/input/acecad.c
+++ b/drivers/usb/input/acecad.c
@@ -27,11 +27,9 @@
 
 #include <linux/kernel.h>
 #include <linux/slab.h>
-#include <linux/input.h>
 #include <linux/module.h>
 #include <linux/init.h>
-#include <linux/usb.h>
-#include <linux/usb_input.h>
+#include <linux/usb/input.h>
 
 /*
  * Version Information
diff --git a/drivers/usb/input/aiptek.c b/drivers/usb/input/aiptek.c
index a6693b0d1c4c..b138dae2b055 100644
--- a/drivers/usb/input/aiptek.c
+++ b/drivers/usb/input/aiptek.c
@@ -73,11 +73,9 @@
 #include <linux/jiffies.h>
 #include <linux/kernel.h>
 #include <linux/slab.h>
-#include <linux/input.h>
 #include <linux/module.h>
 #include <linux/init.h>
-#include <linux/usb.h>
-#include <linux/usb_input.h>
+#include <linux/usb/input.h>
 #include <linux/sched.h>
 #include <asm/uaccess.h>
 #include <asm/unaligned.h>
diff --git a/drivers/usb/input/appletouch.c b/drivers/usb/input/appletouch.c
index 4eff8d7a79d4..36855062eacc 100644
--- a/drivers/usb/input/appletouch.c
+++ b/drivers/usb/input/appletouch.c
@@ -33,9 +33,7 @@
 #include <linux/init.h>
 #include <linux/slab.h>
 #include <linux/module.h>
-#include <linux/usb.h>
-#include <linux/input.h>
-#include <linux/usb_input.h>
+#include <linux/usb/input.h>
 
 /* Apple has powerbooks which have the keyboard with different Product IDs */
 #define APPLE_VENDOR_ID		0x05AC
diff --git a/drivers/usb/input/ati_remote.c b/drivers/usb/input/ati_remote.c
index 99f986cb6e95..07c8c0e665dd 100644
--- a/drivers/usb/input/ati_remote.c
+++ b/drivers/usb/input/ati_remote.c
@@ -92,9 +92,7 @@
 #include <linux/slab.h>
 #include <linux/module.h>
 #include <linux/moduleparam.h>
-#include <linux/input.h>
-#include <linux/usb.h>
-#include <linux/usb_input.h>
+#include <linux/usb/input.h>
 #include <linux/wait.h>
 #include <linux/jiffies.h>
 
diff --git a/drivers/usb/input/ati_remote2.c b/drivers/usb/input/ati_remote2.c
index ab1a1ae24be9..ea71de81ca6b 100644
--- a/drivers/usb/input/ati_remote2.c
+++ b/drivers/usb/input/ati_remote2.c
@@ -8,7 +8,7 @@
  * as published by the Free Software Foundation.
  */
 
-#include <linux/usb_input.h>
+#include <linux/usb/input.h>
 
 #define DRIVER_DESC    "ATI/Philips USB RF remote driver"
 #define DRIVER_VERSION "0.1"
diff --git a/drivers/usb/input/hid-input.c b/drivers/usb/input/hid-input.c
index 2f665195f4ac..028e1ad89f5d 100644
--- a/drivers/usb/input/hid-input.c
+++ b/drivers/usb/input/hid-input.c
@@ -29,9 +29,7 @@
 #include <linux/module.h>
 #include <linux/slab.h>
 #include <linux/kernel.h>
-#include <linux/input.h>
-#include <linux/usb.h>
-#include <linux/usb_input.h>
+#include <linux/usb/input.h>
 
 #undef DEBUG
 
diff --git a/drivers/usb/input/itmtouch.c b/drivers/usb/input/itmtouch.c
index 7618ae5c104f..5c570cc703f3 100644
--- a/drivers/usb/input/itmtouch.c
+++ b/drivers/usb/input/itmtouch.c
@@ -42,11 +42,9 @@
 #include <linux/config.h>
 #include <linux/kernel.h>
 #include <linux/slab.h>
-#include <linux/input.h>
 #include <linux/module.h>
 #include <linux/init.h>
-#include <linux/usb.h>
-#include <linux/usb_input.h>
+#include <linux/usb/input.h>
 
 /* only an 8 byte buffer necessary for a single packet */
 #define ITM_BUFSIZE			8
diff --git a/drivers/usb/input/kbtab.c b/drivers/usb/input/kbtab.c
index f6d5cead542b..604ade356ead 100644
--- a/drivers/usb/input/kbtab.c
+++ b/drivers/usb/input/kbtab.c
@@ -1,12 +1,9 @@
 #include <linux/kernel.h>
 #include <linux/slab.h>
-#include <linux/input.h>
 #include <linux/module.h>
 #include <linux/init.h>
-#include <linux/usb.h>
-#include <linux/usb_input.h>
+#include <linux/usb/input.h>
 #include <asm/unaligned.h>
-#include <asm/byteorder.h>
 
 /*
  * Version Information
diff --git a/drivers/usb/input/keyspan_remote.c b/drivers/usb/input/keyspan_remote.c
index 3d911976f378..70af985b5db9 100644
--- a/drivers/usb/input/keyspan_remote.c
+++ b/drivers/usb/input/keyspan_remote.c
@@ -18,9 +18,7 @@
 #include <linux/slab.h>
 #include <linux/module.h>
 #include <linux/moduleparam.h>
-#include <linux/input.h>
-#include <linux/usb.h>
-#include <linux/usb_input.h>
+#include <linux/usb/input.h>
 
 #define DRIVER_VERSION	"v0.1"
 #define DRIVER_AUTHOR	"Michael Downey <downey@zymeta.com>"
diff --git a/drivers/usb/input/mtouchusb.c b/drivers/usb/input/mtouchusb.c
index f018953a5485..4fdee4db0729 100644
--- a/drivers/usb/input/mtouchusb.c
+++ b/drivers/usb/input/mtouchusb.c
@@ -42,11 +42,9 @@
 #include <linux/config.h>
 #include <linux/kernel.h>
 #include <linux/slab.h>
-#include <linux/input.h>
 #include <linux/module.h>
 #include <linux/init.h>
-#include <linux/usb.h>
-#include <linux/usb_input.h>
+#include <linux/usb/input.h>
 
 #define MTOUCHUSB_MIN_XC                0x0
 #define MTOUCHUSB_MAX_RAW_XC            0x4000
diff --git a/drivers/usb/input/powermate.c b/drivers/usb/input/powermate.c
index fdf0f788062c..b3c0d0c3eae9 100644
--- a/drivers/usb/input/powermate.c
+++ b/drivers/usb/input/powermate.c
@@ -30,12 +30,10 @@
 
 #include <linux/kernel.h>
 #include <linux/slab.h>
-#include <linux/input.h>
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/spinlock.h>
-#include <linux/usb.h>
-#include <linux/usb_input.h>
+#include <linux/usb/input.h>
 
 #define POWERMATE_VENDOR	0x077d	/* Griffin Technology, Inc. */
 #define POWERMATE_PRODUCT_NEW	0x0410	/* Griffin PowerMate */
diff --git a/drivers/usb/input/touchkitusb.c b/drivers/usb/input/touchkitusb.c
index 697c5e573a11..da7b0bf51aff 100644
--- a/drivers/usb/input/touchkitusb.c
+++ b/drivers/usb/input/touchkitusb.c
@@ -27,11 +27,9 @@
 #include <linux/config.h>
 #include <linux/kernel.h>
 #include <linux/slab.h>
-#include <linux/input.h>
 #include <linux/module.h>
 #include <linux/init.h>
-#include <linux/usb.h>
-#include <linux/usb_input.h>
+#include <linux/usb/input.h>
 
 #define TOUCHKIT_MIN_XC			0x0
 #define TOUCHKIT_MAX_XC			0x07ff
diff --git a/drivers/usb/input/usbkbd.c b/drivers/usb/input/usbkbd.c
index 2f3edc26cb50..5067a6ae650f 100644
--- a/drivers/usb/input/usbkbd.c
+++ b/drivers/usb/input/usbkbd.c
@@ -29,10 +29,8 @@
 #include <linux/kernel.h>
 #include <linux/slab.h>
 #include <linux/module.h>
-#include <linux/input.h>
 #include <linux/init.h>
-#include <linux/usb.h>
-#include <linux/usb_input.h>
+#include <linux/usb/input.h>
 
 /*
  * Version Information
diff --git a/drivers/usb/input/usbmouse.c b/drivers/usb/input/usbmouse.c
index af526135d210..446935b671d9 100644
--- a/drivers/usb/input/usbmouse.c
+++ b/drivers/usb/input/usbmouse.c
@@ -28,11 +28,9 @@
 
 #include <linux/kernel.h>
 #include <linux/slab.h>
-#include <linux/input.h>
 #include <linux/module.h>
 #include <linux/init.h>
-#include <linux/usb.h>
-#include <linux/usb_input.h>
+#include <linux/usb/input.h>
 
 /*
  * Version Information
diff --git a/drivers/usb/input/usbtouchscreen.c b/drivers/usb/input/usbtouchscreen.c
index e9a07c1e905b..3b175aa482cd 100644
--- a/drivers/usb/input/usbtouchscreen.c
+++ b/drivers/usb/input/usbtouchscreen.c
@@ -39,7 +39,7 @@
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/usb.h>
-#include <linux/usb_input.h>
+#include <linux/usb/input.h>
 
 
 #define DRIVER_VERSION		"v0.3"
diff --git a/drivers/usb/input/wacom.c b/drivers/usb/input/wacom.c
index cf84c6096f29..369461a70b72 100644
--- a/drivers/usb/input/wacom.c
+++ b/drivers/usb/input/wacom.c
@@ -69,13 +69,10 @@
 
 #include <linux/kernel.h>
 #include <linux/slab.h>
-#include <linux/input.h>
 #include <linux/module.h>
 #include <linux/init.h>
-#include <linux/usb.h>
-#include <linux/usb_input.h>
+#include <linux/usb/input.h>
 #include <asm/unaligned.h>
-#include <asm/byteorder.h>
 
 /*
  * Version Information
diff --git a/drivers/usb/input/xpad.c b/drivers/usb/input/xpad.c
index e278489a80c6..cfd4a4e04334 100644
--- a/drivers/usb/input/xpad.c
+++ b/drivers/usb/input/xpad.c
@@ -56,13 +56,11 @@
 
 #include <linux/config.h>
 #include <linux/kernel.h>
-#include <linux/input.h>
 #include <linux/init.h>
 #include <linux/slab.h>
 #include <linux/module.h>
 #include <linux/smp_lock.h>
-#include <linux/usb.h>
-#include <linux/usb_input.h>
+#include <linux/usb/input.h>
 
 #define DRIVER_VERSION "v0.0.5"
 #define DRIVER_AUTHOR "Marko Friedemann <mfr@bmx-chemnitz.de>"
diff --git a/drivers/usb/input/yealink.c b/drivers/usb/input/yealink.c
index 37d2f0ba0319..24aedbb20f03 100644
--- a/drivers/usb/input/yealink.c
+++ b/drivers/usb/input/yealink.c
@@ -48,13 +48,11 @@
 
 #include <linux/config.h>
 #include <linux/kernel.h>
-#include <linux/input.h>
 #include <linux/init.h>
 #include <linux/slab.h>
 #include <linux/module.h>
 #include <linux/rwsem.h>
-#include <linux/usb.h>
-#include <linux/usb_input.h>
+#include <linux/usb/input.h>
 
 #include "map_to_7segment.h"
 #include "yealink.h"
diff --git a/drivers/usb/storage/onetouch.c b/drivers/usb/storage/onetouch.c
index 55ee2d36d585..026a587eb8dd 100644
--- a/drivers/usb/storage/onetouch.c
+++ b/drivers/usb/storage/onetouch.c
@@ -34,9 +34,8 @@
 #include <linux/init.h>
 #include <linux/slab.h>
 #include <linux/module.h>
-#include <linux/usb.h>
 #include <linux/usb_ch9.h>
-#include <linux/usb_input.h>
+#include <linux/usb/input.h>
 #include "usb.h"
 #include "onetouch.h"
 #include "debug.h"
diff --git a/include/linux/usb/input.h b/include/linux/usb/input.h
new file mode 100644
index 000000000000..716e0cc16043
--- /dev/null
+++ b/include/linux/usb/input.h
@@ -0,0 +1,25 @@
+#ifndef __USB_INPUT_H
+#define __USB_INPUT_H
+
+/*
+ * Copyright (C) 2005 Dmitry Torokhov
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ */
+
+#include <linux/usb.h>
+#include <linux/input.h>
+#include <asm/byteorder.h>
+
+static inline void
+usb_to_input_id(const struct usb_device *dev, struct input_id *id)
+{
+	id->bustype = BUS_USB;
+	id->vendor = le16_to_cpu(dev->descriptor.idVendor);
+	id->product = le16_to_cpu(dev->descriptor.idProduct);
+	id->version = le16_to_cpu(dev->descriptor.bcdDevice);
+}
+
+#endif
diff --git a/include/linux/usb_input.h b/include/linux/usb_input.h
deleted file mode 100644
index 716e0cc16043..000000000000
--- a/include/linux/usb_input.h
+++ /dev/null
@@ -1,25 +0,0 @@
-#ifndef __USB_INPUT_H
-#define __USB_INPUT_H
-
-/*
- * Copyright (C) 2005 Dmitry Torokhov
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published by
- * the Free Software Foundation.
- */
-
-#include <linux/usb.h>
-#include <linux/input.h>
-#include <asm/byteorder.h>
-
-static inline void
-usb_to_input_id(const struct usb_device *dev, struct input_id *id)
-{
-	id->bustype = BUS_USB;
-	id->vendor = le16_to_cpu(dev->descriptor.idVendor);
-	id->product = le16_to_cpu(dev->descriptor.idProduct);
-	id->version = le16_to_cpu(dev->descriptor.bcdDevice);
-}
-
-#endif
-- 
cgit v1.2.3


From 9bde7497e0b54178c317fac47a18be7f948dd471 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@suse.de>
Date: Wed, 14 Jun 2006 12:14:34 -0700
Subject: [PATCH] USB: make endpoints real struct devices

This will allow for us to give endpoints a major/minor to create a
"usbfs2-like" way to access endpoints directly from userspace in an
easier manner than the current usbfs provides us.

Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/usb/core/endpoint.c | 238 ++++++++++++++++++++++++++++----------------
 include/linux/usb.h         |   4 +-
 2 files changed, 153 insertions(+), 89 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/core/endpoint.c b/drivers/usb/core/endpoint.c
index 4c2fe8f723e5..247b5a4913a8 100644
--- a/drivers/usb/core/endpoint.c
+++ b/drivers/usb/core/endpoint.c
@@ -14,13 +14,14 @@
 #include "usb.h"
 
 /* endpoint stuff */
-struct ep_object {
+
+struct ep_device {
 	struct usb_endpoint_descriptor *desc;
 	struct usb_device *udev;
-	struct kobject kobj;
+	struct device dev;
 };
-#define to_ep_object(_kobj) \
-	container_of(_kobj, struct ep_object, kobj)
+#define to_ep_device(_dev) \
+	container_of(_dev, struct ep_device, dev)
 
 struct ep_attribute {
 	struct attribute attr;
@@ -30,40 +31,37 @@ struct ep_attribute {
 #define to_ep_attribute(_attr) \
 	container_of(_attr, struct ep_attribute, attr)
 
-#define EP_ATTR(_name)						\
-struct ep_attribute ep_##_name = {				\
-	.attr = {.name = #_name, .owner = THIS_MODULE,		\
-			.mode = S_IRUGO},			\
-	.show = show_ep_##_name}
-
 #define usb_ep_attr(field, format_string)			\
-static ssize_t show_ep_##field(struct usb_device *udev,		\
-		struct usb_endpoint_descriptor *desc, 		\
-		char *buf)					\
+static ssize_t show_ep_##field(struct device *dev,		\
+			       struct device_attribute *attr,	\
+			       char *buf)			\
 {								\
-	return sprintf(buf, format_string, desc->field);	\
+	struct ep_device *ep = to_ep_device(dev);		\
+	return sprintf(buf, format_string, ep->desc->field);	\
 }								\
-static EP_ATTR(field);
+static DEVICE_ATTR(field, S_IRUGO, show_ep_##field, NULL);
 
 usb_ep_attr(bLength, "%02x\n")
 usb_ep_attr(bEndpointAddress, "%02x\n")
 usb_ep_attr(bmAttributes, "%02x\n")
 usb_ep_attr(bInterval, "%02x\n")
 
-static ssize_t show_ep_wMaxPacketSize(struct usb_device *udev,
-		struct usb_endpoint_descriptor *desc, char *buf)
+static ssize_t show_ep_wMaxPacketSize(struct device *dev,
+				      struct device_attribute *attr, char *buf)
 {
+	struct ep_device *ep = to_ep_device(dev);
 	return sprintf(buf, "%04x\n",
-			le16_to_cpu(desc->wMaxPacketSize) & 0x07ff);
+			le16_to_cpu(ep->desc->wMaxPacketSize) & 0x07ff);
 }
-static EP_ATTR(wMaxPacketSize);
+static DEVICE_ATTR(wMaxPacketSize, S_IRUGO, show_ep_wMaxPacketSize, NULL);
 
-static ssize_t show_ep_type(struct usb_device *udev,
-		struct usb_endpoint_descriptor *desc, char *buf)
+static ssize_t show_ep_type(struct device *dev, struct device_attribute *attr,
+			    char *buf)
 {
+	struct ep_device *ep = to_ep_device(dev);
 	char *type = "unknown";
 
-	switch (desc->bmAttributes & USB_ENDPOINT_XFERTYPE_MASK) {
+	switch (ep->desc->bmAttributes & USB_ENDPOINT_XFERTYPE_MASK) {
 	case USB_ENDPOINT_XFER_CONTROL:
 		type = "Control";
 		break;
@@ -79,37 +77,38 @@ static ssize_t show_ep_type(struct usb_device *udev,
 	}
 	return sprintf(buf, "%s\n", type);
 }
-static EP_ATTR(type);
+static DEVICE_ATTR(type, S_IRUGO, show_ep_type, NULL);
 
-static ssize_t show_ep_interval(struct usb_device *udev,
-		struct usb_endpoint_descriptor *desc, char *buf)
+static ssize_t show_ep_interval(struct device *dev,
+				struct device_attribute *attr, char *buf)
 {
+	struct ep_device *ep = to_ep_device(dev);
 	char unit;
 	unsigned interval = 0;
 	unsigned in;
 
-	in = (desc->bEndpointAddress & USB_DIR_IN);
+	in = (ep->desc->bEndpointAddress & USB_DIR_IN);
 
-	switch (desc->bmAttributes & USB_ENDPOINT_XFERTYPE_MASK) {
+	switch (ep->desc->bmAttributes & USB_ENDPOINT_XFERTYPE_MASK) {
 	case USB_ENDPOINT_XFER_CONTROL:
-		if (udev->speed == USB_SPEED_HIGH) 	/* uframes per NAK */
-			interval = desc->bInterval;
+		if (ep->udev->speed == USB_SPEED_HIGH) 	/* uframes per NAK */
+			interval = ep->desc->bInterval;
 		break;
 	case USB_ENDPOINT_XFER_ISOC:
-		interval = 1 << (desc->bInterval - 1);
+		interval = 1 << (ep->desc->bInterval - 1);
 		break;
 	case USB_ENDPOINT_XFER_BULK:
-		if (udev->speed == USB_SPEED_HIGH && !in) /* uframes per NAK */
-			interval = desc->bInterval;
+		if (ep->udev->speed == USB_SPEED_HIGH && !in) /* uframes per NAK */
+			interval = ep->desc->bInterval;
 		break;
 	case USB_ENDPOINT_XFER_INT:
-		if (udev->speed == USB_SPEED_HIGH)
-			interval = 1 << (desc->bInterval - 1);
+		if (ep->udev->speed == USB_SPEED_HIGH)
+			interval = 1 << (ep->desc->bInterval - 1);
 		else
-			interval = desc->bInterval;
+			interval = ep->desc->bInterval;
 		break;
 	}
-	interval *= (udev->speed == USB_SPEED_HIGH) ? 125 : 1000;
+	interval *= (ep->udev->speed == USB_SPEED_HIGH) ? 125 : 1000;
 	if (interval % 1000)
 		unit = 'u';
 	else {
@@ -119,95 +118,158 @@ static ssize_t show_ep_interval(struct usb_device *udev,
 
 	return sprintf(buf, "%d%cs\n", interval, unit);
 }
-static EP_ATTR(interval);
+static DEVICE_ATTR(interval, S_IRUGO, show_ep_interval, NULL);
 
-static ssize_t show_ep_direction(struct usb_device *udev,
-		struct usb_endpoint_descriptor *desc, char *buf)
+static ssize_t show_ep_direction(struct device *dev,
+				 struct device_attribute *attr, char *buf)
 {
+	struct ep_device *ep = to_ep_device(dev);
 	char *direction;
 
-	if ((desc->bmAttributes & USB_ENDPOINT_XFERTYPE_MASK) ==
+	if ((ep->desc->bmAttributes & USB_ENDPOINT_XFERTYPE_MASK) ==
 			USB_ENDPOINT_XFER_CONTROL)
 		direction = "both";
-	else if (desc->bEndpointAddress & USB_DIR_IN)
+	else if (ep->desc->bEndpointAddress & USB_DIR_IN)
 		direction = "in";
 	else
 		direction = "out";
 	return sprintf(buf, "%s\n", direction);
 }
-static EP_ATTR(direction);
-
-static struct attribute *ep_attrs[] = {
-	&ep_bLength.attr,
-	&ep_bEndpointAddress.attr,
-	&ep_bmAttributes.attr,
-	&ep_bInterval.attr,
-	&ep_wMaxPacketSize.attr,
-	&ep_type.attr,
-	&ep_interval.attr,
-	&ep_direction.attr,
+static DEVICE_ATTR(direction, S_IRUGO, show_ep_direction, NULL);
+
+static struct attribute *ep_dev_attrs[] = {
+	&dev_attr_bLength.attr,
+	&dev_attr_bEndpointAddress.attr,
+	&dev_attr_bmAttributes.attr,
+	&dev_attr_bInterval.attr,
+	&dev_attr_wMaxPacketSize.attr,
+	&dev_attr_interval.attr,
+	&dev_attr_type.attr,
+	&dev_attr_direction.attr,
 	NULL,
 };
+static struct attribute_group ep_dev_attr_grp = {
+	.attrs = ep_dev_attrs,
+};
 
-static void ep_object_release(struct kobject *kobj)
+static struct endpoint_class {
+	struct kref kref;
+	struct class *class;
+} *ep_class;
+
+static int init_endpoint_class(void)
 {
-	kfree(to_ep_object(kobj));
+	int result = 0;
+
+	if (ep_class != NULL) {
+		kref_get(&ep_class->kref);
+		goto exit;
+	}
+
+	ep_class = kmalloc(sizeof(*ep_class), GFP_KERNEL);
+	if (!ep_class) {
+		result = -ENOMEM;
+		goto exit;
+	}
+
+	kref_init(&ep_class->kref);
+	ep_class->class = class_create(THIS_MODULE, "usb_endpoint");
+	if (IS_ERR(ep_class->class)) {
+		result = IS_ERR(ep_class->class);
+		kfree(ep_class);
+		ep_class = NULL;
+		goto exit;
+	}
+
+exit:
+	return result;
 }
 
-static ssize_t ep_object_show(struct kobject *kobj, struct attribute *attr,
-		char *buf)
+static void release_endpoint_class(struct kref *kref)
 {
-	struct ep_object *ep_obj = to_ep_object(kobj);
-	struct ep_attribute *ep_attr = to_ep_attribute(attr);
+	/* Ok, we cheat as we know we only have one ep_class */
+	class_destroy(ep_class->class);
+	kfree(ep_class);
+	ep_class = NULL;
+}
 
-	return (ep_attr->show)(ep_obj->udev, ep_obj->desc, buf);
+static void destroy_endpoint_class(void)
+{
+	if (ep_class)
+		kref_put(&ep_class->kref, release_endpoint_class);
 }
 
-static struct sysfs_ops ep_object_sysfs_ops = {
-	.show =			ep_object_show,
-};
+static void ep_device_release(struct device *dev)
+{
+	struct ep_device *ep_dev = to_ep_device(dev);
 
-static struct kobj_type ep_object_ktype = {
-	.release =		ep_object_release,
-	.sysfs_ops =		&ep_object_sysfs_ops,
-	.default_attrs =	ep_attrs,
-};
+	dev_dbg(dev, "%s called for %s\n", __FUNCTION__, dev->bus_id);
+	kfree(ep_dev);
+}
 
 void usb_create_ep_files(struct device *parent,
 			 struct usb_host_endpoint *endpoint,
 			 struct usb_device *udev)
 {
-	struct ep_object *ep_obj;
-	struct kobject *kobj;
+	char name[8];
+	struct ep_device *ep_dev;
+	int minor;
+	int retval;
+
+	retval = init_endpoint_class();
+	if (retval)
+		goto exit;
 
-	ep_obj = kzalloc(sizeof(struct ep_object), GFP_KERNEL);
-	if (!ep_obj)
-		return;
+	ep_dev = kzalloc(sizeof(*ep_dev), GFP_KERNEL);
+	if (!ep_dev) {
+		retval = -ENOMEM;
+		goto exit;
+	}
 
-	ep_obj->desc = &endpoint->desc;
-	ep_obj->udev = udev;
+	/* fun calculation to determine the minor of this endpoint */
+	minor = (((udev->bus->busnum - 1) * 128) * 16) + (udev->devnum - 1);
 
-	kobj = &ep_obj->kobj;
-	kobject_set_name(kobj, "ep_%02x", endpoint->desc.bEndpointAddress);
-	kobj->parent = &parent->kobj;
-	kobj->ktype = &ep_object_ktype;
+	ep_dev->desc = &endpoint->desc;
+	ep_dev->udev = udev;
+	ep_dev->dev.devt = MKDEV(442, minor);	// FIXME fake number...
+	ep_dev->dev.class = ep_class->class;
+	ep_dev->dev.parent = parent;
+	ep_dev->dev.release = ep_device_release;
+	snprintf(ep_dev->dev.bus_id, BUS_ID_SIZE, "usbdev%d.%d_ep%02x",
+		 udev->bus->busnum, udev->devnum,
+		 endpoint->desc.bEndpointAddress);
 
-	/* Don't use kobject_register, because it generates a hotplug event */
-	kobject_init(kobj);
-	if (kobject_add(kobj) == 0)
-		endpoint->kobj = kobj;
-	else
-		kobject_put(kobj);
+	retval = device_register(&ep_dev->dev);
+	if (retval)
+		goto error;
+	sysfs_create_group(&ep_dev->dev.kobj, &ep_dev_attr_grp);
+
+	endpoint->ep_dev = ep_dev;
+
+	/* create the symlink to the old-style "ep_XX" directory */
+	sprintf(name, "ep_%02x", endpoint->desc.bEndpointAddress);
+	sysfs_create_link(&parent->kobj, &endpoint->ep_dev->dev.kobj, name);
+
+exit:
+	return;
+error:
+	kfree(ep_dev);
+	return;
 }
 
 void usb_remove_ep_files(struct usb_host_endpoint *endpoint)
 {
 
-	if (endpoint->kobj) {
-		kobject_del(endpoint->kobj);
-		kobject_put(endpoint->kobj);
-		endpoint->kobj = NULL;
+	if (endpoint->ep_dev) {
+		char name[8];
+
+		sprintf(name, "ep_%02x", endpoint->desc.bEndpointAddress);
+		sysfs_remove_link(&endpoint->ep_dev->dev.parent->kobj, name);
+		sysfs_remove_group(&endpoint->ep_dev->dev.kobj, &ep_dev_attr_grp);
+		device_unregister(&endpoint->ep_dev->dev);
+		endpoint->ep_dev = NULL;
 	}
+	destroy_endpoint_class();
 }
 
 
diff --git a/include/linux/usb.h b/include/linux/usb.h
index 5ad30cefe7b2..46956a72de5d 100644
--- a/include/linux/usb.h
+++ b/include/linux/usb.h
@@ -40,6 +40,8 @@ struct usb_driver;
  * Devices may also have class-specific or vendor-specific descriptors.
  */
 
+struct ep_device;
+
 /**
  * struct usb_host_endpoint - host-side endpoint descriptor and queue
  * @desc: descriptor for this endpoint, wMaxPacketSize in native byteorder
@@ -57,7 +59,7 @@ struct usb_host_endpoint {
 	struct usb_endpoint_descriptor	desc;
 	struct list_head		urb_list;
 	void				*hcpriv;
-	struct kobject			*kobj;	/* For sysfs info */
+	struct ep_device 		*ep_dev;	/* For sysfs info */
 
 	unsigned char *extra;   /* Extra descriptors */
 	int extralen;
-- 
cgit v1.2.3


From c182274ffe1277f4e7c564719a696a37cacf74ea Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@suse.de>
Date: Mon, 19 Jun 2006 23:59:31 -0700
Subject: [PATCH] USB: move usb_device_class class devices to be real devices

This moves the usb class devices that control the usbfs nodes to show up
in the proper place in the larger device tree.

No userspace changes is needed, this is compatible due to the symlinks
generated by the driver core.

Cc: Kay Sievers <kay.sievers@vrfy.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/usb/core/devio.c | 20 ++++++++++----------
 include/linux/usb.h      |  2 +-
 2 files changed, 11 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/core/devio.c b/drivers/usb/core/devio.c
index 2eda52fc1ebc..3f8e06279c92 100644
--- a/drivers/usb/core/devio.c
+++ b/drivers/usb/core/devio.c
@@ -515,19 +515,19 @@ static int check_ctrlrecip(struct dev_state *ps, unsigned int requesttype, unsig
 
 static struct usb_device *usbdev_lookup_minor(int minor)
 {
-	struct class_device *class_dev;
-	struct usb_device *dev = NULL;
+	struct device *device;
+	struct usb_device *udev = NULL;
 
 	down(&usb_device_class->sem);
-	list_for_each_entry(class_dev, &usb_device_class->children, node) {
-		if (class_dev->devt == MKDEV(USB_DEVICE_MAJOR, minor)) {
-			dev = class_dev->class_data;
+	list_for_each_entry(device, &usb_device_class->devices, node) {
+		if (device->devt == MKDEV(USB_DEVICE_MAJOR, minor)) {
+			udev = device->platform_data;
 			break;
 		}
 	}
 	up(&usb_device_class->sem);
 
-	return dev;
+	return udev;
 };
 
 /*
@@ -1580,16 +1580,16 @@ static void usbdev_add(struct usb_device *dev)
 {
 	int minor = ((dev->bus->busnum-1) * 128) + (dev->devnum-1);
 
-	dev->class_dev = class_device_create(usb_device_class, NULL,
-				MKDEV(USB_DEVICE_MAJOR, minor), &dev->dev,
+	dev->usbfs_dev = device_create(usb_device_class, &dev->dev,
+				MKDEV(USB_DEVICE_MAJOR, minor),
 				"usbdev%d.%d", dev->bus->busnum, dev->devnum);
 
-	dev->class_dev->class_data = dev;
+	dev->usbfs_dev->platform_data = dev;
 }
 
 static void usbdev_remove(struct usb_device *dev)
 {
-	class_device_unregister(dev->class_dev);
+	device_unregister(dev->usbfs_dev);
 }
 
 static int usbdev_notify(struct notifier_block *self, unsigned long action,
diff --git a/include/linux/usb.h b/include/linux/usb.h
index 46956a72de5d..b69b6cfb0bd7 100644
--- a/include/linux/usb.h
+++ b/include/linux/usb.h
@@ -360,7 +360,7 @@ struct usb_device {
 	char *serial;			/* iSerialNumber string, if present */
 
 	struct list_head filelist;
-	struct class_device *class_dev;
+	struct device *usbfs_dev;
 	struct dentry *usbfs_dentry;	/* usbfs dentry entry for the device */
 
 	/*
-- 
cgit v1.2.3


From bd00949647ddcea47ce4ea8bb2cfcfc98ebf9f2a Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@suse.de>
Date: Tue, 20 Jun 2006 13:09:50 -0700
Subject: [PATCH] USB: convert usb class devices to real devices

Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/usb/core/file.c | 13 ++++++-------
 include/linux/usb.h     |  5 +++--
 2 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/core/file.c b/drivers/usb/core/file.c
index b263a54a13c0..70898716dd9a 100644
--- a/drivers/usb/core/file.c
+++ b/drivers/usb/core/file.c
@@ -158,14 +158,13 @@ int usb_register_dev(struct usb_interface *intf,
 		++temp;
 	else
 		temp = name;
-	intf->class_dev = class_device_create(usb_class, NULL,
-					      MKDEV(USB_MAJOR, minor),
-					      &intf->dev, "%s", temp);
-	if (IS_ERR(intf->class_dev)) {
+	intf->usb_dev = device_create(usb_class, &intf->dev,
+				      MKDEV(USB_MAJOR, minor), "%s", temp);
+	if (IS_ERR(intf->usb_dev)) {
 		spin_lock (&minor_lock);
 		usb_minors[intf->minor] = NULL;
 		spin_unlock (&minor_lock);
-		retval = PTR_ERR(intf->class_dev);
+		retval = PTR_ERR(intf->usb_dev);
 	}
 exit:
 	return retval;
@@ -206,8 +205,8 @@ void usb_deregister_dev(struct usb_interface *intf,
 	spin_unlock (&minor_lock);
 
 	snprintf(name, BUS_ID_SIZE, class_driver->name, intf->minor - minor_base);
-	class_device_destroy(usb_class, MKDEV(USB_MAJOR, intf->minor));
-	intf->class_dev = NULL;
+	device_destroy(usb_class, MKDEV(USB_MAJOR, intf->minor));
+	intf->usb_dev = NULL;
 	intf->minor = -1;
 }
 EXPORT_SYMBOL(usb_deregister_dev);
diff --git a/include/linux/usb.h b/include/linux/usb.h
index b69b6cfb0bd7..8dead32e7ebf 100644
--- a/include/linux/usb.h
+++ b/include/linux/usb.h
@@ -103,7 +103,8 @@ enum usb_interface_condition {
  * @condition: binding state of the interface: not bound, binding
  *	(in probe()), bound to a driver, or unbinding (in disconnect())
  * @dev: driver model's view of this device
- * @class_dev: driver model's class view of this device.
+ * @usb_dev: if an interface is bound to the USB major, this will point
+ *	to the sysfs representation for that device.
  *
  * USB device drivers attach to interfaces on a physical device.  Each
  * interface encapsulates a single high level function, such as feeding
@@ -143,7 +144,7 @@ struct usb_interface {
 					 * bound to */
 	enum usb_interface_condition condition;		/* state of binding */
 	struct device dev;		/* interface specific device info */
-	struct class_device *class_dev;
+	struct device *usb_dev;		/* pointer to the usb class's device, if any */
 };
 #define	to_usb_interface(d) container_of(d, struct usb_interface, dev)
 #define	interface_to_usbdev(intf) \
-- 
cgit v1.2.3


From 02e0c5d5c2e00374b6808a42f8eea4ea9baaa216 Mon Sep 17 00:00:00 2001
From: Rudolf Marek <r.marek@sh.cvut.cz>
Date: Thu, 23 Mar 2006 16:48:09 +0100
Subject: [PATCH] i2c-piix4: Add ATI IXP200/300/400 support

This patch adds the ATI IXP southbridges support to i2c-piix4,
as it turned out those chips are compatible with it.

Signed-off-by: Rudolf Marek <r.marek@sh.cvut.cz>
Signed-off-by: Jean Delvare <khali@linux-fr.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 Documentation/i2c/busses/i2c-piix4 | 2 ++
 drivers/i2c/busses/Kconfig         | 5 ++++-
 drivers/i2c/busses/i2c-piix4.c     | 6 ++++++
 include/linux/pci_ids.h            | 3 +++
 4 files changed, 15 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/Documentation/i2c/busses/i2c-piix4 b/Documentation/i2c/busses/i2c-piix4
index a1c8f581afed..6e6c905143a1 100644
--- a/Documentation/i2c/busses/i2c-piix4
+++ b/Documentation/i2c/busses/i2c-piix4
@@ -6,6 +6,8 @@ Supported adapters:
     Datasheet: Publicly available at the Intel website
   * ServerWorks OSB4, CSB5, CSB6 and HT-1000 southbridges
     Datasheet: Only available via NDA from ServerWorks
+  * ATI IXP southbridges IXP200, IXP300, IXP400
+    Datasheet: Not publicly available
   * Standard Microsystems (SMSC) SLC90E66 (Victory66) southbridge
     Datasheet: Publicly available at the SMSC website http://www.smsc.com
 
diff --git a/drivers/i2c/busses/Kconfig b/drivers/i2c/busses/Kconfig
index d6d44946a283..e3450d16d8a4 100644
--- a/drivers/i2c/busses/Kconfig
+++ b/drivers/i2c/busses/Kconfig
@@ -163,7 +163,7 @@ config I2C_PXA_SLAVE
 	  I2C bus.
 
 config I2C_PIIX4
-	tristate "Intel PIIX4"
+	tristate "Intel PIIX4 and compatible (ATI/Serverworks/Broadcom/SMSC)"
 	depends on I2C && PCI
 	help
 	  If you say yes to this option, support will be included for the Intel
@@ -172,6 +172,9 @@ config I2C_PIIX4
 	  of Broadcom):
 	    Intel PIIX4
 	    Intel 440MX
+	    ATI IXP200
+	    ATI IXP300
+	    ATI IXP400
 	    Serverworks OSB4
 	    Serverworks CSB5
 	    Serverworks CSB6
diff --git a/drivers/i2c/busses/i2c-piix4.c b/drivers/i2c/busses/i2c-piix4.c
index d9c7c00e71f9..5f06e81a2087 100644
--- a/drivers/i2c/busses/i2c-piix4.c
+++ b/drivers/i2c/busses/i2c-piix4.c
@@ -413,6 +413,12 @@ static struct i2c_adapter piix4_adapter = {
 static struct pci_device_id piix4_ids[] = {
 	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82371AB_3),
 	  .driver_data = 3 },
+	{ PCI_DEVICE(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_IXP200_SMBUS),
+	  .driver_data = 0 },
+	{ PCI_DEVICE(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_IXP300_SMBUS),
+	  .driver_data = 0 },
+	{ PCI_DEVICE(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_IXP400_SMBUS),
+	  .driver_data = 0 },
 	{ PCI_DEVICE(PCI_VENDOR_ID_SERVERWORKS, PCI_DEVICE_ID_SERVERWORKS_OSB4),
 	  .driver_data = 0 },
 	{ PCI_DEVICE(PCI_VENDOR_ID_SERVERWORKS, PCI_DEVICE_ID_SERVERWORKS_CSB5),
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index bcfe9d4f56ae..489af9d3ce1f 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -352,8 +352,11 @@
 #define PCI_DEVICE_ID_ATI_RS480         0x5950
 /* ATI IXP Chipset */
 #define PCI_DEVICE_ID_ATI_IXP200_IDE	0x4349
+#define PCI_DEVICE_ID_ATI_IXP200_SMBUS	0x4353
+#define PCI_DEVICE_ID_ATI_IXP300_SMBUS	0x4363
 #define PCI_DEVICE_ID_ATI_IXP300_IDE	0x4369
 #define PCI_DEVICE_ID_ATI_IXP300_SATA   0x436e
+#define PCI_DEVICE_ID_ATI_IXP400_SMBUS	0x4372
 #define PCI_DEVICE_ID_ATI_IXP400_IDE	0x4376
 #define PCI_DEVICE_ID_ATI_IXP400_SATA   0x4379
 #define PCI_DEVICE_ID_ATI_IXP400_SATA2	0x437a
-- 
cgit v1.2.3


From 5e9f4f2e5a02bb6908278a819952aa31fffefaa2 Mon Sep 17 00:00:00 2001
From: "Mark A. Greer" <mgreer@mvista.com>
Date: Tue, 25 Apr 2006 13:04:54 +0200
Subject: [PATCH] I2C: m41t00: Add support for the ST M41T81 and M41T85

This patch adds support for the ST m41t81 and m41t85 i2c rtc chips
to the existing m41t00 driver.

Since there is no way to reliably determine what type of rtc chip
is in use, the chip type is passed in via platform_data.  The i2c
address and square wave frequency are passed in via platform_data
as well.  To accommodate the use of platform_data, a new header
file include/linux/m41t00.h has been added.

The m41t81 and m41t85 chips halt the updating of their time registers
while they are being accessed.  They resume when a stop condition
exists on the i2c bus or when non-time related regs are accessed.
To make the best use of that facility and to make more efficient
use of the i2c bus, this patch replaces multiple i2c_smbus_xxx calls
with a single i2c_transfer call.

Signed-off-by: Mark A. Greer <mgreer@mvista.com>
Signed-off-by: Jean Delvare <khali@linux-fr.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/i2c/chips/m41t00.c | 313 ++++++++++++++++++++++++++++++++++-----------
 include/linux/m41t00.h     |  50 ++++++++
 2 files changed, 291 insertions(+), 72 deletions(-)
 create mode 100644 include/linux/m41t00.h

(limited to 'include/linux')

diff --git a/drivers/i2c/chips/m41t00.c b/drivers/i2c/chips/m41t00.c
index eb2be3e4416c..2dd0a34d9472 100644
--- a/drivers/i2c/chips/m41t00.c
+++ b/drivers/i2c/chips/m41t00.c
@@ -1,9 +1,9 @@
 /*
- * I2C client/driver for the ST M41T00 Real-Time Clock chip.
+ * I2C client/driver for the ST M41T00 family of i2c rtc chips.
  *
  * Author: Mark A. Greer <mgreer@mvista.com>
  *
- * 2005 (c) MontaVista Software, Inc. This file is licensed under
+ * 2005, 2006 (c) MontaVista Software, Inc. This file is licensed under
  * the terms of the GNU General Public License version 2. This program
  * is licensed "as is" without any warranty of any kind, whether express
  * or implied.
@@ -19,21 +19,17 @@
 #include <linux/i2c.h>
 #include <linux/rtc.h>
 #include <linux/bcd.h>
-#include <linux/mutex.h>
 #include <linux/workqueue.h>
-
+#include <linux/platform_device.h>
+#include <linux/m41t00.h>
 #include <asm/time.h>
 #include <asm/rtc.h>
 
-#define	M41T00_DRV_NAME		"m41t00"
-
-static DEFINE_MUTEX(m41t00_mutex);
-
 static struct i2c_driver m41t00_driver;
 static struct i2c_client *save_client;
 
 static unsigned short ignore[] = { I2C_CLIENT_END };
-static unsigned short normal_addr[] = { 0x68, I2C_CLIENT_END };
+static unsigned short normal_addr[] = { I2C_CLIENT_END, I2C_CLIENT_END };
 
 static struct i2c_client_address_data addr_data = {
 	.normal_i2c	= normal_addr,
@@ -41,34 +37,92 @@ static struct i2c_client_address_data addr_data = {
 	.ignore		= ignore,
 };
 
+struct m41t00_chip_info {
+	u8	type;
+	char	*name;
+	u8	read_limit;
+	u8	sec;		/* Offsets for chip regs */
+	u8	min;
+	u8	hour;
+	u8	day;
+	u8	mon;
+	u8	year;
+	u8	alarm_mon;
+	u8	alarm_hour;
+	u8	sqw;
+	u8	sqw_freq;
+};
+
+static struct m41t00_chip_info m41t00_chip_info_tbl[] = {
+	{
+		.type		= M41T00_TYPE_M41T00,
+		.name		= "m41t00",
+		.read_limit	= 5,
+		.sec		= 0,
+		.min		= 1,
+		.hour		= 2,
+		.day		= 4,
+		.mon		= 5,
+		.year		= 6,
+	},
+	{
+		.type		= M41T00_TYPE_M41T81,
+		.name		= "m41t81",
+		.read_limit	= 1,
+		.sec		= 1,
+		.min		= 2,
+		.hour		= 3,
+		.day		= 5,
+		.mon		= 6,
+		.year		= 7,
+		.alarm_mon	= 0xa,
+		.alarm_hour	= 0xc,
+		.sqw		= 0x13,
+	},
+	{
+		.type		= M41T00_TYPE_M41T85,
+		.name		= "m41t85",
+		.read_limit	= 1,
+		.sec		= 1,
+		.min		= 2,
+		.hour		= 3,
+		.day		= 5,
+		.mon		= 6,
+		.year		= 7,
+		.alarm_mon	= 0xa,
+		.alarm_hour	= 0xc,
+		.sqw		= 0x13,
+	},
+};
+static struct m41t00_chip_info *m41t00_chip;
+
 ulong
 m41t00_get_rtc_time(void)
 {
 	s32 sec, min, hour, day, mon, year;
 	s32 sec1, min1, hour1, day1, mon1, year1;
-	ulong limit = 10;
+	u8 reads = 0;
+	u8 buf[8], msgbuf[1] = { 0 }; /* offset into rtc's regs */
+	struct i2c_msg msgs[] = {
+		{
+			.addr	= save_client->addr,
+			.flags	= 0,
+			.len	= 1,
+			.buf	= msgbuf,
+		},
+		{
+			.addr	= save_client->addr,
+			.flags	= I2C_M_RD,
+			.len	= 8,
+			.buf	= buf,
+		},
+	};
 
 	sec = min = hour = day = mon = year = 0;
-	sec1 = min1 = hour1 = day1 = mon1 = year1 = 0;
 
-	mutex_lock(&m41t00_mutex);
 	do {
-		if (((sec = i2c_smbus_read_byte_data(save_client, 0)) >= 0)
-			&& ((min = i2c_smbus_read_byte_data(save_client, 1))
-				>= 0)
-			&& ((hour = i2c_smbus_read_byte_data(save_client, 2))
-				>= 0)
-			&& ((day = i2c_smbus_read_byte_data(save_client, 4))
-				>= 0)
-			&& ((mon = i2c_smbus_read_byte_data(save_client, 5))
-				>= 0)
-			&& ((year = i2c_smbus_read_byte_data(save_client, 6))
-				>= 0)
-			&& ((sec == sec1) && (min == min1) && (hour == hour1)
-				&& (day == day1) && (mon == mon1)
-				&& (year == year1)))
-
-				break;
+		if (i2c_transfer(save_client->adapter, msgs, 2) < 0)
+			goto read_err;
 
 		sec1 = sec;
 		min1 = min;
@@ -76,21 +130,21 @@ m41t00_get_rtc_time(void)
 		day1 = day;
 		mon1 = mon;
 		year1 = year;
-	} while (--limit > 0);
-	mutex_unlock(&m41t00_mutex);
-
-	if (limit == 0) {
-		dev_warn(&save_client->dev,
-			"m41t00: can't read rtc chip\n");
-		sec = min = hour = day = mon = year = 0;
-	}
 
-	sec &= 0x7f;
-	min &= 0x7f;
-	hour &= 0x3f;
-	day &= 0x3f;
-	mon &= 0x1f;
-	year &= 0xff;
+		sec = buf[m41t00_chip->sec] & 0x7f;
+		min = buf[m41t00_chip->min] & 0x7f;
+		hour = buf[m41t00_chip->hour] & 0x3f;
+		day = buf[m41t00_chip->day] & 0x3f;
+		mon = buf[m41t00_chip->mon] & 0x1f;
+		year = buf[m41t00_chip->year];
+	} while ((++reads < m41t00_chip->read_limit) && ((sec != sec1)
+			|| (min != min1) || (hour != hour1) || (day != day1)
+			|| (mon != mon1) || (year != year1)));
+
+	if ((m41t00_chip->read_limit > 1) && ((sec != sec1) || (min != min1)
+			|| (hour != hour1) || (day != day1) || (mon != mon1)
+			|| (year != year1)))
+		goto read_err;
 
 	sec = BCD2BIN(sec);
 	min = BCD2BIN(min);
@@ -104,40 +158,60 @@ m41t00_get_rtc_time(void)
 		year += 100;
 
 	return mktime(year, mon, day, hour, min, sec);
+
+read_err:
+	dev_err(&save_client->dev, "m41t00_get_rtc_time: Read error\n");
+	return 0;
 }
+EXPORT_SYMBOL_GPL(m41t00_get_rtc_time);
 
 static void
 m41t00_set(void *arg)
 {
 	struct rtc_time	tm;
-	ulong nowtime = *(ulong *)arg;
+	int nowtime = *(int *)arg;
+	s32 sec, min, hour, day, mon, year;
+	u8 wbuf[9], *buf = &wbuf[1], msgbuf[1] = { 0 };
+	struct i2c_msg msgs[] = {
+		{
+			.addr	= save_client->addr,
+			.flags	= 0,
+			.len	= 1,
+			.buf	= msgbuf,
+		},
+		{
+			.addr	= save_client->addr,
+			.flags	= I2C_M_RD,
+			.len	= 8,
+			.buf	= buf,
+		},
+	};
 
 	to_tm(nowtime, &tm);
 	tm.tm_year = (tm.tm_year - 1900) % 100;
 
-	tm.tm_sec = BIN2BCD(tm.tm_sec);
-	tm.tm_min = BIN2BCD(tm.tm_min);
-	tm.tm_hour = BIN2BCD(tm.tm_hour);
-	tm.tm_mon = BIN2BCD(tm.tm_mon);
-	tm.tm_mday = BIN2BCD(tm.tm_mday);
-	tm.tm_year = BIN2BCD(tm.tm_year);
-
-	mutex_lock(&m41t00_mutex);
-	if ((i2c_smbus_write_byte_data(save_client, 0, tm.tm_sec & 0x7f) < 0)
-		|| (i2c_smbus_write_byte_data(save_client, 1, tm.tm_min & 0x7f)
-			< 0)
-		|| (i2c_smbus_write_byte_data(save_client, 2, tm.tm_hour & 0x3f)
-			< 0)
-		|| (i2c_smbus_write_byte_data(save_client, 4, tm.tm_mday & 0x3f)
-			< 0)
-		|| (i2c_smbus_write_byte_data(save_client, 5, tm.tm_mon & 0x1f)
-			< 0)
-		|| (i2c_smbus_write_byte_data(save_client, 6, tm.tm_year & 0xff)
-			< 0))
-
-		dev_warn(&save_client->dev,"m41t00: can't write to rtc chip\n");
-
-	mutex_unlock(&m41t00_mutex);
+	sec = BIN2BCD(tm.tm_sec);
+	min = BIN2BCD(tm.tm_min);
+	hour = BIN2BCD(tm.tm_hour);
+	day = BIN2BCD(tm.tm_mday);
+	mon = BIN2BCD(tm.tm_mon);
+	year = BIN2BCD(tm.tm_year);
+
+	/* Read reg values into buf[0..7]/wbuf[1..8] */
+	if (i2c_transfer(save_client->adapter, msgs, 2) < 0) {
+		dev_err(&save_client->dev, "m41t00_set: Read error\n");
+		return;
+	}
+
+	wbuf[0] = 0; /* offset into rtc's regs */
+	buf[m41t00_chip->sec] = (buf[m41t00_chip->sec] & ~0x7f) | (sec & 0x7f);
+	buf[m41t00_chip->min] = (buf[m41t00_chip->min] & ~0x7f) | (min & 0x7f);
+	buf[m41t00_chip->hour] = (buf[m41t00_chip->hour] & ~0x3f) | (hour& 0x3f);
+	buf[m41t00_chip->day] = (buf[m41t00_chip->day] & ~0x3f) | (day & 0x3f);
+	buf[m41t00_chip->mon] = (buf[m41t00_chip->mon] & ~0x1f) | (mon & 0x1f);
+
+	if (i2c_master_send(save_client, wbuf, 9) < 0)
+		dev_err(&save_client->dev, "m41t00_set: Write error\n");
 }
 
 static ulong new_time;
@@ -156,6 +230,48 @@ m41t00_set_rtc_time(ulong nowtime)
 
 	return 0;
 }
+EXPORT_SYMBOL_GPL(m41t00_set_rtc_time);
+
+/*
+ *****************************************************************************
+ *
+ *	platform_data Driver Interface
+ *
+ *****************************************************************************
+ */
+static int __init
+m41t00_platform_probe(struct platform_device *pdev)
+{
+	struct m41t00_platform_data *pdata;
+	int i;
+
+	if (pdev && (pdata = pdev->dev.platform_data)) {
+		normal_addr[0] = pdata->i2c_addr;
+
+		for (i=0; i<ARRAY_SIZE(m41t00_chip_info_tbl); i++)
+			if (m41t00_chip_info_tbl[i].type == pdata->type) {
+				m41t00_chip = &m41t00_chip_info_tbl[i];
+				m41t00_chip->sqw_freq = pdata->sqw_freq;
+				return 0;
+			}
+	}
+	return -ENODEV;
+}
+
+static int __exit
+m41t00_platform_remove(struct platform_device *pdev)
+{
+	return 0;
+}
+
+static struct platform_driver m41t00_platform_driver = {
+	.probe  = m41t00_platform_probe,
+	.remove = m41t00_platform_remove,
+	.driver = {
+		.owner = THIS_MODULE,
+		.name  = M41T00_DRV_NAME,
+	},
+};
 
 /*
  *****************************************************************************
@@ -170,23 +286,71 @@ m41t00_probe(struct i2c_adapter *adap, int addr, int kind)
 	struct i2c_client *client;
 	int rc;
 
+	if (!i2c_check_functionality(adap, I2C_FUNC_I2C
+			| I2C_FUNC_SMBUS_BYTE_DATA))
+		return 0;
+
 	client = kzalloc(sizeof(struct i2c_client), GFP_KERNEL);
 	if (!client)
 		return -ENOMEM;
 
-	strlcpy(client->name, M41T00_DRV_NAME, I2C_NAME_SIZE);
+	strlcpy(client->name, m41t00_chip->name, I2C_NAME_SIZE);
 	client->addr = addr;
 	client->adapter = adap;
 	client->driver = &m41t00_driver;
 
-	if ((rc = i2c_attach_client(client)) != 0) {
-		kfree(client);
-		return rc;
+	if ((rc = i2c_attach_client(client)))
+		goto attach_err;
+
+	if (m41t00_chip->type != M41T00_TYPE_M41T00) {
+		/* If asked, disable SQW, set SQW frequency & re-enable */
+		if (m41t00_chip->sqw_freq)
+			if (((rc = i2c_smbus_read_byte_data(client,
+					m41t00_chip->alarm_mon)) < 0)
+			 || ((rc = i2c_smbus_write_byte_data(client,
+					m41t00_chip->alarm_mon, rc & ~0x40)) <0)
+			 || ((rc = i2c_smbus_write_byte_data(client,
+					m41t00_chip->sqw,
+					m41t00_chip->sqw_freq)) < 0)
+			 || ((rc = i2c_smbus_write_byte_data(client,
+					m41t00_chip->alarm_mon, rc | 0x40)) <0))
+				goto sqw_err;
+
+		/* Make sure HT (Halt Update) bit is cleared */
+		if ((rc = i2c_smbus_read_byte_data(client,
+				m41t00_chip->alarm_hour)) < 0)
+			goto ht_err;
+
+		if (rc & 0x40)
+			if ((rc = i2c_smbus_write_byte_data(client,
+					m41t00_chip->alarm_hour, rc & ~0x40))<0)
+				goto ht_err;
 	}
 
-	m41t00_wq = create_singlethread_workqueue("m41t00");
+	/* Make sure ST (stop) bit is cleared */
+	if ((rc = i2c_smbus_read_byte_data(client, m41t00_chip->sec)) < 0)
+		goto st_err;
+
+	if (rc & 0x80)
+		if ((rc = i2c_smbus_write_byte_data(client, m41t00_chip->sec,
+				rc & ~0x80)) < 0)
+			goto st_err;
+
+	m41t00_wq = create_singlethread_workqueue(m41t00_chip->name);
 	save_client = client;
 	return 0;
+
+st_err:
+	dev_err(&client->dev, "m41t00_probe: Can't clear ST bit\n");
+	goto attach_err;
+ht_err:
+	dev_err(&client->dev, "m41t00_probe: Can't clear HT bit\n");
+	goto attach_err;
+sqw_err:
+	dev_err(&client->dev, "m41t00_probe: Can't set SQW Frequency\n");
+attach_err:
+	kfree(client);
+	return rc;
 }
 
 static int
@@ -219,13 +383,18 @@ static struct i2c_driver m41t00_driver = {
 static int __init
 m41t00_init(void)
 {
-	return i2c_add_driver(&m41t00_driver);
+	int rc;
+
+	if (!(rc = platform_driver_register(&m41t00_platform_driver)))
+		rc = i2c_add_driver(&m41t00_driver);
+	return rc;
 }
 
 static void __exit
 m41t00_exit(void)
 {
 	i2c_del_driver(&m41t00_driver);
+	platform_driver_unregister(&m41t00_platform_driver);
 }
 
 module_init(m41t00_init);
diff --git a/include/linux/m41t00.h b/include/linux/m41t00.h
new file mode 100644
index 000000000000..b423360ca38e
--- /dev/null
+++ b/include/linux/m41t00.h
@@ -0,0 +1,50 @@
+/*
+ * Definitions for the ST M41T00 family of i2c rtc chips.
+ *
+ * Author: Mark A. Greer <mgreer@mvista.com>
+ *
+ * 2005, 2006 (c) MontaVista Software, Inc. This file is licensed under
+ * the terms of the GNU General Public License version 2. This program
+ * is licensed "as is" without any warranty of any kind, whether express
+ * or implied.
+ */
+
+#ifndef _M41T00_H
+#define _M41T00_H
+
+#define	M41T00_DRV_NAME		"m41t00"
+#define	M41T00_I2C_ADDR		0x68
+
+#define	M41T00_TYPE_M41T00	0
+#define	M41T00_TYPE_M41T81	81
+#define	M41T00_TYPE_M41T85	85
+
+struct m41t00_platform_data {
+	u8	type;
+	u8	i2c_addr;
+	u8	sqw_freq;
+};
+
+/* SQW output disabled, this is default value by power on */
+#define M41T00_SQW_DISABLE	(0)
+
+#define M41T00_SQW_32KHZ	(1<<4)		/* 32.768 KHz */
+#define M41T00_SQW_8KHZ		(2<<4)		/* 8.192 KHz */
+#define M41T00_SQW_4KHZ		(3<<4)		/* 4.096 KHz */
+#define M41T00_SQW_2KHZ		(4<<4)		/* 2.048 KHz */
+#define M41T00_SQW_1KHZ		(5<<4)		/* 1.024 KHz */
+#define M41T00_SQW_512HZ	(6<<4)		/* 512 Hz */
+#define M41T00_SQW_256HZ	(7<<4)		/* 256 Hz */
+#define M41T00_SQW_128HZ	(8<<4)		/* 128 Hz */
+#define M41T00_SQW_64HZ		(9<<4)		/* 64 Hz */
+#define M41T00_SQW_32HZ		(10<<4)		/* 32 Hz */
+#define M41T00_SQW_16HZ		(11<<4)		/* 16 Hz */
+#define M41T00_SQW_8HZ		(12<<4)		/* 8 Hz */
+#define M41T00_SQW_4HZ		(13<<4)		/* 4 Hz */
+#define M41T00_SQW_2HZ		(14<<4)		/* 2 Hz */
+#define M41T00_SQW_1HZ		(15<<4)		/* 1 Hz */
+
+extern ulong m41t00_get_rtc_time(void);
+extern int m41t00_set_rtc_time(ulong nowtime);
+
+#endif /* _M41T00_H */
-- 
cgit v1.2.3


From 5c7ae65899a4c5b05b6277f856018d1eeeb98907 Mon Sep 17 00:00:00 2001
From: Jean Delvare <khali@linux-fr.org>
Date: Tue, 25 Apr 2006 14:18:16 +0200
Subject: [PATCH] I2C: i2c-nforce2: Add support for the nForce4 MCP51 and MCP55

Add support for the new nForce4 MCP51 (also known as nForce 410 or
430) and nForce4 MCP55 to the i2c-nforce2 driver. Some code changes
were required because the base I/O address registers have changed in
these versions. Standard BARs are now being used, while the original
nForce2 chips used non-standard ones.

Signed-off-by: Jean Delvare <khali@linux-fr.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 Documentation/i2c/busses/i2c-nforce2 |  2 ++
 drivers/i2c/busses/i2c-nforce2.c     | 38 +++++++++++++++++++++++++-----------
 include/linux/pci_ids.h              |  2 ++
 3 files changed, 31 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/i2c/busses/i2c-nforce2 b/Documentation/i2c/busses/i2c-nforce2
index d751282d9b2a..cd49c428a3ab 100644
--- a/Documentation/i2c/busses/i2c-nforce2
+++ b/Documentation/i2c/busses/i2c-nforce2
@@ -7,6 +7,8 @@ Supported adapters:
   * nForce3 250Gb MCP          10de:00E4 
   * nForce4 MCP                10de:0052
   * nForce4 MCP-04             10de:0034
+  * nForce4 MCP51              10de:0264
+  * nForce4 MCP55              10de:0368
 
 Datasheet: not publically available, but seems to be similar to the
            AMD-8111 SMBus 2.0 adapter.
diff --git a/drivers/i2c/busses/i2c-nforce2.c b/drivers/i2c/busses/i2c-nforce2.c
index 2d80eb26f688..604b49e22df1 100644
--- a/drivers/i2c/busses/i2c-nforce2.c
+++ b/drivers/i2c/busses/i2c-nforce2.c
@@ -31,6 +31,8 @@
     nForce3 250Gb MCP		00E4
     nForce4 MCP			0052
     nForce4 MCP-04		0034
+    nForce4 MCP51		0264
+    nForce4 MCP55		0368
 
     This driver supports the 2 SMBuses that are included in the MCP of the
     nForce2/3/4 chipsets.
@@ -64,6 +66,7 @@ struct nforce2_smbus {
 
 /*
  * nVidia nForce2 SMBus control register definitions
+ * (Newer incarnations use standard BARs 4 and 5 instead)
  */
 #define NFORCE_PCI_SMB1	0x50
 #define NFORCE_PCI_SMB2	0x54
@@ -259,6 +262,8 @@ static struct pci_device_id nforce2_ids[] = {
 	{ PCI_DEVICE(PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_NFORCE3S_SMBUS) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_NFORCE4_SMBUS) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_NFORCE_MCP04_SMBUS) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_NFORCE_MCP51_SMBUS) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_NFORCE_MCP55_SMBUS) },
 	{ 0 }
 };
 
@@ -266,19 +271,29 @@ static struct pci_device_id nforce2_ids[] = {
 MODULE_DEVICE_TABLE (pci, nforce2_ids);
 
 
-static int __devinit nforce2_probe_smb (struct pci_dev *dev, int reg,
-	struct nforce2_smbus *smbus, char *name)
+static int __devinit nforce2_probe_smb (struct pci_dev *dev, int bar,
+	int alt_reg, struct nforce2_smbus *smbus, const char *name)
 {
-	u16 iobase;
 	int error;
 
-	if (pci_read_config_word(dev, reg, &iobase) != PCIBIOS_SUCCESSFUL) {
-		dev_err(&smbus->adapter.dev, "Error reading PCI config for %s\n", name);
-		return -1;
+	smbus->base = pci_resource_start(dev, bar);
+	if (smbus->base) {
+		smbus->size = pci_resource_len(dev, bar);
+	} else {
+		/* Older incarnations of the device used non-standard BARs */
+		u16 iobase;
+
+		if (pci_read_config_word(dev, alt_reg, &iobase)
+		    != PCIBIOS_SUCCESSFUL) {
+			dev_err(&dev->dev, "Error reading PCI config for %s\n",
+				name);
+			return -1;
+		}
+
+		smbus->base = iobase & PCI_BASE_ADDRESS_IO_MASK;
+		smbus->size = 8;
 	}
-	smbus->dev  = dev;
-	smbus->base = iobase & 0xfffc;
-	smbus->size = 8;
+	smbus->dev = dev;
 
 	if (!request_region(smbus->base, smbus->size, nforce2_driver.name)) {
 		dev_err(&smbus->adapter.dev, "Error requesting region %02x .. %02X for %s\n",
@@ -313,12 +328,13 @@ static int __devinit nforce2_probe(struct pci_dev *dev, const struct pci_device_
 	pci_set_drvdata(dev, smbuses);
 
 	/* SMBus adapter 1 */
-	res1 = nforce2_probe_smb (dev, NFORCE_PCI_SMB1, &smbuses[0], "SMB1");
+	res1 = nforce2_probe_smb(dev, 4, NFORCE_PCI_SMB1, &smbuses[0], "SMB1");
 	if (res1 < 0) {
 		dev_err(&dev->dev, "Error probing SMB1.\n");
 		smbuses[0].base = 0;	/* to have a check value */
 	}
-	res2 = nforce2_probe_smb (dev, NFORCE_PCI_SMB2, &smbuses[1], "SMB2");
+	/* SMBus adapter 2 */
+	res2 = nforce2_probe_smb(dev, 5, NFORCE_PCI_SMB2, &smbuses[1], "SMB2");
 	if (res2 < 0) {
 		dev_err(&dev->dev, "Error probing SMB2.\n");
 		smbuses[1].base = 0;	/* to have a check value */
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 489af9d3ce1f..d33436097e1d 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -1130,9 +1130,11 @@
 #define PCI_DEVICE_ID_NVIDIA_QUADRO4_900XGL	0x0258
 #define PCI_DEVICE_ID_NVIDIA_QUADRO4_750XGL	0x0259
 #define PCI_DEVICE_ID_NVIDIA_QUADRO4_700XGL	0x025B
+#define PCI_DEVICE_ID_NVIDIA_NFORCE_MCP51_SMBUS	0x0264
 #define PCI_DEVICE_ID_NVIDIA_NFORCE_MCP51_IDE	0x0265
 #define PCI_DEVICE_ID_NVIDIA_NFORCE_MCP51_SATA	0x0266
 #define PCI_DEVICE_ID_NVIDIA_NFORCE_MCP51_SATA2	0x0267
+#define PCI_DEVICE_ID_NVIDIA_NFORCE_MCP55_SMBUS	0x0368
 #define PCI_DEVICE_ID_NVIDIA_NFORCE_MCP55_IDE	0x036E
 #define PCI_DEVICE_ID_NVIDIA_NFORCE_MCP55_SATA	0x037E
 #define PCI_DEVICE_ID_NVIDIA_NFORCE_MCP55_SATA2	0x037F
-- 
cgit v1.2.3


From 18f98b1e3147afdb51e545cc6ff2b016c7d088a7 Mon Sep 17 00:00:00 2001
From: Peter Korsgaard <jacmet@sunsite.dk>
Date: Sun, 4 Jun 2006 20:01:08 +0200
Subject: [PATCH] i2c: New bus driver for the OpenCores I2C controller

The following patch adds support for the OpenCores I2C controller IP
core (See http://www.opencores.org/projects.cgi/web/i2c/overview).

Signed-off-by: Peter Korsgaard <jacmet@sunsite.dk>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Jean Delvare <khali@linux-fr.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 Documentation/i2c/busses/i2c-ocores |  51 ++++++
 drivers/i2c/busses/Kconfig          |  11 ++
 drivers/i2c/busses/Makefile         |   1 +
 drivers/i2c/busses/i2c-ocores.c     | 343 ++++++++++++++++++++++++++++++++++++
 include/linux/i2c-ocores.h          |  19 ++
 5 files changed, 425 insertions(+)
 create mode 100644 Documentation/i2c/busses/i2c-ocores
 create mode 100644 drivers/i2c/busses/i2c-ocores.c
 create mode 100644 include/linux/i2c-ocores.h

(limited to 'include/linux')

diff --git a/Documentation/i2c/busses/i2c-ocores b/Documentation/i2c/busses/i2c-ocores
new file mode 100644
index 000000000000..cfcebb10d14e
--- /dev/null
+++ b/Documentation/i2c/busses/i2c-ocores
@@ -0,0 +1,51 @@
+Kernel driver i2c-ocores
+
+Supported adapters:
+  * OpenCores.org I2C controller by Richard Herveille (see datasheet link)
+    Datasheet: http://www.opencores.org/projects.cgi/web/i2c/overview
+
+Author: Peter Korsgaard <jacmet@sunsite.dk>
+
+Description
+-----------
+
+i2c-ocores is an i2c bus driver for the OpenCores.org I2C controller
+IP core by Richard Herveille.
+
+Usage
+-----
+
+i2c-ocores uses the platform bus, so you need to provide a struct
+platform_device with the base address and interrupt number. The
+dev.platform_data of the device should also point to a struct
+ocores_i2c_platform_data (see linux/i2c-ocores.h) describing the
+distance between registers and the input clock speed.
+
+E.G. something like:
+
+static struct resource ocores_resources[] = {
+	[0] = {
+		.start	= MYI2C_BASEADDR,
+		.end	= MYI2C_BASEADDR + 8,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= MYI2C_IRQ,
+		.end	= MYI2C_IRQ,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct ocores_i2c_platform_data myi2c_data = {
+	.regstep	= 2,		/* two bytes between registers */
+	.clock_khz	= 50000,	/* input clock of 50MHz */
+};
+
+static struct platform_device myi2c = {
+	.name			= "ocores-i2c",
+	.dev = {
+		.platform_data	= &myi2c_data,
+	},
+	.num_resources		= ARRAY_SIZE(ocores_resources),
+	.resource		= ocores_resources,
+};
diff --git a/drivers/i2c/busses/Kconfig b/drivers/i2c/busses/Kconfig
index d25a8cbbec0a..f7af7e9bb7d9 100644
--- a/drivers/i2c/busses/Kconfig
+++ b/drivers/i2c/busses/Kconfig
@@ -276,6 +276,17 @@ config I2C_NFORCE2
 	  This driver can also be built as a module.  If so, the module
 	  will be called i2c-nforce2.
 
+config I2C_OCORES
+	tristate "OpenCores I2C Controller"
+	depends on I2C && EXPERIMENTAL
+	help
+	  If you say yes to this option, support will be included for the
+	  OpenCores I2C controller. For details see
+	  http://www.opencores.org/projects.cgi/web/i2c/overview
+
+	  This driver can also be built as a module.  If so, the module
+	  will be called i2c-ocores.
+
 config I2C_PARPORT
 	tristate "Parallel port adapter"
 	depends on I2C && PARPORT
diff --git a/drivers/i2c/busses/Makefile b/drivers/i2c/busses/Makefile
index b44831dff683..ac56df53155b 100644
--- a/drivers/i2c/busses/Makefile
+++ b/drivers/i2c/busses/Makefile
@@ -23,6 +23,7 @@ obj-$(CONFIG_I2C_POWERMAC)	+= i2c-powermac.o
 obj-$(CONFIG_I2C_MPC)		+= i2c-mpc.o
 obj-$(CONFIG_I2C_MV64XXX)	+= i2c-mv64xxx.o
 obj-$(CONFIG_I2C_NFORCE2)	+= i2c-nforce2.o
+obj-$(CONFIG_I2C_OCORES)	+= i2c-ocores.o
 obj-$(CONFIG_I2C_PARPORT)	+= i2c-parport.o
 obj-$(CONFIG_I2C_PARPORT_LIGHT)	+= i2c-parport-light.o
 obj-$(CONFIG_I2C_PCA_ISA)	+= i2c-pca-isa.o
diff --git a/drivers/i2c/busses/i2c-ocores.c b/drivers/i2c/busses/i2c-ocores.c
new file mode 100644
index 000000000000..d5c0610bfe6d
--- /dev/null
+++ b/drivers/i2c/busses/i2c-ocores.c
@@ -0,0 +1,343 @@
+/*
+ * i2c-ocores.c: I2C bus driver for OpenCores I2C controller
+ * (http://www.opencores.org/projects.cgi/web/i2c/overview).
+ *
+ * Peter Korsgaard <jacmet@sunsite.dk>
+ *
+ * This file is licensed under the terms of the GNU General Public License
+ * version 2.  This program is licensed "as is" without any warranty of any
+ * kind, whether express or implied.
+ */
+
+#include <linux/config.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/platform_device.h>
+#include <linux/i2c.h>
+#include <linux/interrupt.h>
+#include <linux/wait.h>
+#include <linux/i2c-ocores.h>
+#include <asm/io.h>
+
+struct ocores_i2c {
+	void __iomem *base;
+	int regstep;
+	wait_queue_head_t wait;
+	struct i2c_adapter adap;
+	struct i2c_msg *msg;
+	int pos;
+	int nmsgs;
+	int state; /* see STATE_ */
+};
+
+/* registers */
+#define OCI2C_PRELOW		0
+#define OCI2C_PREHIGH		1
+#define OCI2C_CONTROL		2
+#define OCI2C_DATA		3
+#define OCI2C_CMD		4
+#define OCI2C_STATUS		4
+
+#define OCI2C_CTRL_IEN		0x40
+#define OCI2C_CTRL_EN		0x80
+
+#define OCI2C_CMD_START		0x91
+#define OCI2C_CMD_STOP		0x41
+#define OCI2C_CMD_READ		0x21
+#define OCI2C_CMD_WRITE		0x11
+#define OCI2C_CMD_READ_ACK	0x21
+#define OCI2C_CMD_READ_NACK	0x29
+#define OCI2C_CMD_IACK		0x01
+
+#define OCI2C_STAT_IF		0x01
+#define OCI2C_STAT_TIP		0x02
+#define OCI2C_STAT_ARBLOST	0x20
+#define OCI2C_STAT_BUSY		0x40
+#define OCI2C_STAT_NACK		0x80
+
+#define STATE_DONE		0
+#define STATE_START		1
+#define STATE_WRITE		2
+#define STATE_READ		3
+#define STATE_ERROR		4
+
+static inline void oc_setreg(struct ocores_i2c *i2c, int reg, u8 value)
+{
+	iowrite8(value, i2c->base + reg * i2c->regstep);
+}
+
+static inline u8 oc_getreg(struct ocores_i2c *i2c, int reg)
+{
+	return ioread8(i2c->base + reg * i2c->regstep);
+}
+
+static void ocores_process(struct ocores_i2c *i2c)
+{
+	struct i2c_msg *msg = i2c->msg;
+	u8 stat = oc_getreg(i2c, OCI2C_STATUS);
+
+	if ((i2c->state == STATE_DONE) || (i2c->state == STATE_ERROR)) {
+		/* stop has been sent */
+		oc_setreg(i2c, OCI2C_CMD, OCI2C_CMD_IACK);
+		wake_up(&i2c->wait);
+		return;
+	}
+
+	/* error? */
+	if (stat & OCI2C_STAT_ARBLOST) {
+		i2c->state = STATE_ERROR;
+		oc_setreg(i2c, OCI2C_CMD, OCI2C_CMD_STOP);
+		return;
+	}
+
+	if ((i2c->state == STATE_START) || (i2c->state == STATE_WRITE)) {
+		i2c->state =
+			(msg->flags & I2C_M_RD) ? STATE_READ : STATE_WRITE;
+
+		if (stat & OCI2C_STAT_NACK) {
+			i2c->state = STATE_ERROR;
+			oc_setreg(i2c, OCI2C_CMD, OCI2C_CMD_STOP);
+			return;
+		}
+	} else
+		msg->buf[i2c->pos++] = oc_getreg(i2c, OCI2C_DATA);
+
+	/* end of msg? */
+	if (i2c->pos == msg->len) {
+		i2c->nmsgs--;
+		i2c->msg++;
+		i2c->pos = 0;
+		msg = i2c->msg;
+
+		if (i2c->nmsgs) {	/* end? */
+			/* send start? */
+			if (!(msg->flags & I2C_M_NOSTART)) {
+				u8 addr = (msg->addr << 1);
+
+				if (msg->flags & I2C_M_RD)
+					addr |= 1;
+
+				i2c->state = STATE_START;
+
+				oc_setreg(i2c, OCI2C_DATA, addr);
+				oc_setreg(i2c, OCI2C_CMD,  OCI2C_CMD_START);
+				return;
+			} else
+				i2c->state = (msg->flags & I2C_M_RD)
+					? STATE_READ : STATE_WRITE;
+		} else {
+			i2c->state = STATE_DONE;
+			oc_setreg(i2c, OCI2C_CMD, OCI2C_CMD_STOP);
+			return;
+		}
+	}
+
+	if (i2c->state == STATE_READ) {
+		oc_setreg(i2c, OCI2C_CMD, i2c->pos == (msg->len-1) ?
+			  OCI2C_CMD_READ_NACK : OCI2C_CMD_READ_ACK);
+	} else {
+		oc_setreg(i2c, OCI2C_DATA, msg->buf[i2c->pos++]);
+		oc_setreg(i2c, OCI2C_CMD, OCI2C_CMD_WRITE);
+	}
+}
+
+static irqreturn_t ocores_isr(int irq, void *dev_id, struct pt_regs *regs)
+{
+	struct ocores_i2c *i2c = dev_id;
+
+	ocores_process(i2c);
+
+	return IRQ_HANDLED;
+}
+
+static int ocores_xfer(struct i2c_adapter *adap, struct i2c_msg *msgs, int num)
+{
+	struct ocores_i2c *i2c = i2c_get_adapdata(adap);
+
+	i2c->msg = msgs;
+	i2c->pos = 0;
+	i2c->nmsgs = num;
+	i2c->state = STATE_START;
+
+	oc_setreg(i2c, OCI2C_DATA,
+			(i2c->msg->addr << 1) |
+			((i2c->msg->flags & I2C_M_RD) ? 1:0));
+
+	oc_setreg(i2c, OCI2C_CMD, OCI2C_CMD_START);
+
+	if (wait_event_timeout(i2c->wait, (i2c->state == STATE_ERROR) ||
+			       (i2c->state == STATE_DONE), HZ))
+		return (i2c->state == STATE_DONE) ? num : -EIO;
+	else
+		return -ETIMEDOUT;
+}
+
+static void ocores_init(struct ocores_i2c *i2c,
+			struct ocores_i2c_platform_data *pdata)
+{
+	int prescale;
+	u8 ctrl = oc_getreg(i2c, OCI2C_CONTROL);
+
+	/* make sure the device is disabled */
+	oc_setreg(i2c, OCI2C_CONTROL, ctrl & ~(OCI2C_CTRL_EN|OCI2C_CTRL_IEN));
+
+	prescale = (pdata->clock_khz / (5*100)) - 1;
+	oc_setreg(i2c, OCI2C_PRELOW, prescale & 0xff);
+	oc_setreg(i2c, OCI2C_PREHIGH, prescale >> 8);
+
+	/* Init the device */
+	oc_setreg(i2c, OCI2C_CMD, OCI2C_CMD_IACK);
+	oc_setreg(i2c, OCI2C_CONTROL, ctrl | OCI2C_CTRL_IEN | OCI2C_CTRL_EN);
+}
+
+
+static u32 ocores_func(struct i2c_adapter *adap)
+{
+	return I2C_FUNC_I2C | I2C_FUNC_SMBUS_EMUL;
+}
+
+static struct i2c_algorithm ocores_algorithm = {
+	.master_xfer	= ocores_xfer,
+	.functionality	= ocores_func,
+};
+
+static struct i2c_adapter ocores_adapter = {
+	.owner		= THIS_MODULE,
+	.name		= "i2c-ocores",
+	.class		= I2C_CLASS_HWMON,
+	.algo		= &ocores_algorithm,
+	.timeout	= 2,
+	.retries	= 1,
+};
+
+
+static int __devinit ocores_i2c_probe(struct platform_device *pdev)
+{
+	struct ocores_i2c *i2c;
+	struct ocores_i2c_platform_data *pdata;
+	struct resource *res, *res2;
+	int ret;
+
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	if (!res)
+		return -ENODEV;
+
+	res2 = platform_get_resource(pdev, IORESOURCE_IRQ, 0);
+	if (!res2)
+		return -ENODEV;
+
+	pdata = (struct ocores_i2c_platform_data*) pdev->dev.platform_data;
+	if (!pdata)
+		return -ENODEV;
+
+	i2c = kzalloc(sizeof(*i2c), GFP_KERNEL);
+	if (!i2c)
+		return -ENOMEM;
+
+	if (!request_mem_region(res->start, res->end - res->start + 1,
+				pdev->name)) {
+		dev_err(&pdev->dev, "Memory region busy\n");
+		ret = -EBUSY;
+		goto request_mem_failed;
+	}
+
+	i2c->base = ioremap(res->start, res->end - res->start + 1);
+	if (!i2c->base) {
+		dev_err(&pdev->dev, "Unable to map registers\n");
+		ret = -EIO;
+		goto map_failed;
+	}
+
+	i2c->regstep = pdata->regstep;
+	ocores_init(i2c, pdata);
+
+	init_waitqueue_head(&i2c->wait);
+	ret = request_irq(res2->start, ocores_isr, 0, pdev->name, i2c);
+	if (ret) {
+		dev_err(&pdev->dev, "Cannot claim IRQ\n");
+		goto request_irq_failed;
+	}
+
+	/* hook up driver to tree */
+	platform_set_drvdata(pdev, i2c);
+	i2c->adap = ocores_adapter;
+	i2c_set_adapdata(&i2c->adap, i2c);
+	i2c->adap.dev.parent = &pdev->dev;
+
+	/* add i2c adapter to i2c tree */
+	ret = i2c_add_adapter(&i2c->adap);
+	if (ret) {
+		dev_err(&pdev->dev, "Failed to add adapter\n");
+		goto add_adapter_failed;
+	}
+
+	return 0;
+
+add_adapter_failed:
+	free_irq(res2->start, i2c);
+request_irq_failed:
+	iounmap(i2c->base);
+map_failed:
+	release_mem_region(res->start, res->end - res->start + 1);
+request_mem_failed:
+	kfree(i2c);
+
+	return ret;
+}
+
+static int __devexit ocores_i2c_remove(struct platform_device* pdev)
+{
+	struct ocores_i2c *i2c = platform_get_drvdata(pdev);
+	struct resource *res;
+
+	/* disable i2c logic */
+	oc_setreg(i2c, OCI2C_CONTROL, oc_getreg(i2c, OCI2C_CONTROL)
+		  & ~(OCI2C_CTRL_EN|OCI2C_CTRL_IEN));
+
+	/* remove adapter & data */
+	i2c_del_adapter(&i2c->adap);
+	platform_set_drvdata(pdev, NULL);
+
+	res = platform_get_resource(pdev, IORESOURCE_IRQ, 0);
+	if (res)
+		free_irq(res->start, i2c);
+
+	iounmap(i2c->base);
+
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	if (res)
+		release_mem_region(res->start, res->end - res->start + 1);
+
+	kfree(i2c);
+
+	return 0;
+}
+
+static struct platform_driver ocores_i2c_driver = {
+	.probe  = ocores_i2c_probe,
+	.remove = __devexit_p(ocores_i2c_remove),
+	.driver = {
+		.owner = THIS_MODULE,
+		.name = "ocores-i2c",
+	},
+};
+
+static int __init ocores_i2c_init(void)
+{
+	return platform_driver_register(&ocores_i2c_driver);
+}
+
+static void __exit ocores_i2c_exit(void)
+{
+	platform_driver_unregister(&ocores_i2c_driver);
+}
+
+module_init(ocores_i2c_init);
+module_exit(ocores_i2c_exit);
+
+MODULE_AUTHOR("Peter Korsgaard <jacmet@sunsite.dk>");
+MODULE_DESCRIPTION("OpenCores I2C bus driver");
+MODULE_LICENSE("GPL");
diff --git a/include/linux/i2c-ocores.h b/include/linux/i2c-ocores.h
new file mode 100644
index 000000000000..8ed591b0887e
--- /dev/null
+++ b/include/linux/i2c-ocores.h
@@ -0,0 +1,19 @@
+/*
+ * i2c-ocores.h - definitions for the i2c-ocores interface
+ *
+ * Peter Korsgaard <jacmet@sunsite.dk>
+ *
+ * This file is licensed under the terms of the GNU General Public License
+ * version 2.  This program is licensed "as is" without any warranty of any
+ * kind, whether express or implied.
+ */
+
+#ifndef _LINUX_I2C_OCORES_H
+#define _LINUX_I2C_OCORES_H
+
+struct ocores_i2c_platform_data {
+	u32 regstep;   /* distance between registers */
+	u32 clock_khz; /* input clock in kHz */
+};
+
+#endif /* _LINUX_I2C_OCORES_H */
-- 
cgit v1.2.3


From 46f5ed753fac512f73069bd07455555b41a8a06e Mon Sep 17 00:00:00 2001
From: Krzysztof Halasa <khc@pm.waw.pl>
Date: Mon, 12 Jun 2006 21:42:20 +0200
Subject: [PATCH] i2c: Mark block write buffers as const

The attached patch marks i2c_smbus_write_block_data() and
i2c_smbus_write_i2c_block_data() buffers as const.

Signed-off-by: Krzysztof Halasa <khc@pm.waw.pl>
Signed-off-by: Jean Delvare <khali@linux-fr.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/i2c/i2c-core.c | 4 ++--
 include/linux/i2c.h    | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/i2c/i2c-core.c b/drivers/i2c/i2c-core.c
index 45e2cdf54736..a45155f799d4 100644
--- a/drivers/i2c/i2c-core.c
+++ b/drivers/i2c/i2c-core.c
@@ -916,7 +916,7 @@ s32 i2c_smbus_write_word_data(struct i2c_client *client, u8 command, u16 value)
 }
 
 s32 i2c_smbus_write_block_data(struct i2c_client *client, u8 command,
-			       u8 length, u8 *values)
+			       u8 length, const u8 *values)
 {
 	union i2c_smbus_data data;
 
@@ -944,7 +944,7 @@ s32 i2c_smbus_read_i2c_block_data(struct i2c_client *client, u8 command, u8 *val
 }
 
 s32 i2c_smbus_write_i2c_block_data(struct i2c_client *client, u8 command,
-				   u8 length, u8 *values)
+				   u8 length, const u8 *values)
 {
 	union i2c_smbus_data data;
 
diff --git a/include/linux/i2c.h b/include/linux/i2c.h
index 0510430e00db..526ddc8eecfb 100644
--- a/include/linux/i2c.h
+++ b/include/linux/i2c.h
@@ -97,13 +97,13 @@ extern s32 i2c_smbus_write_word_data(struct i2c_client * client,
                                      u8 command, u16 value);
 extern s32 i2c_smbus_write_block_data(struct i2c_client * client,
 				      u8 command, u8 length,
-				      u8 *values);
+				      const u8 *values);
 /* Returns the number of read bytes */
 extern s32 i2c_smbus_read_i2c_block_data(struct i2c_client * client,
 					 u8 command, u8 *values);
 extern s32 i2c_smbus_write_i2c_block_data(struct i2c_client * client,
 					  u8 command, u8 length,
-					  u8 *values);
+					  const u8 *values);
 
 /*
  * A driver is capable of handling one or more physical devices present on
-- 
cgit v1.2.3


From b6043fcab4b2b06b9fcde4c783ab253cdc2c1129 Mon Sep 17 00:00:00 2001
From: Evgeniy Polyakov <johnpol@2ka.mipt.ru>
Date: Thu, 23 Mar 2006 19:11:58 +0300
Subject: [PATCH] w1: Move w1-connector definitions into
 linux/include/connector.h

Signed-off-by: Evgeniy Polyakov <johnpol@2ka.mipt.ru>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/w1/w1_netlink.h   | 3 ---
 include/linux/connector.h | 5 ++++-
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/w1/w1_netlink.h b/drivers/w1/w1_netlink.h
index 5644221f9a44..56122b9e9294 100644
--- a/drivers/w1/w1_netlink.h
+++ b/drivers/w1/w1_netlink.h
@@ -27,9 +27,6 @@
 
 #include "w1.h"
 
-#define CN_W1_IDX	3
-#define CN_W1_VAL	1
-
 enum w1_netlink_message_types {
 	W1_SLAVE_ADD = 0,
 	W1_SLAVE_REMOVE,
diff --git a/include/linux/connector.h b/include/linux/connector.h
index ad1a22c1c42e..4c02119c6ab9 100644
--- a/include/linux/connector.h
+++ b/include/linux/connector.h
@@ -34,8 +34,11 @@
 #define CN_VAL_PROC			0x1
 #define CN_IDX_CIFS			0x2
 #define CN_VAL_CIFS                     0x1
+#define CN_W1_IDX			0x3	/* w1 communication */
+#define CN_W1_VAL			0x1
 
-#define CN_NETLINK_USERS		1
+
+#define CN_NETLINK_USERS		4
 
 /*
  * Maximum connector's message size.
-- 
cgit v1.2.3


From bb5427b5466782ba0bbf56a4ed752e08b65a5d08 Mon Sep 17 00:00:00 2001
From: Evgeniy Polyakov <johnpol@2ka.mipt.ru>
Date: Thu, 23 Mar 2006 19:11:58 +0300
Subject: [PATCH] w1: netlink: Mark netlink group 1 as unused.

netlink_w1 was moved to connector.

Signed-off-by: Evgeniy Polyakov <johnpol@2ka.mipt.ru>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 include/linux/netlink.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/netlink.h b/include/linux/netlink.h
index 87b8a5703ebc..855b44668caa 100644
--- a/include/linux/netlink.h
+++ b/include/linux/netlink.h
@@ -5,7 +5,7 @@
 #include <linux/types.h>
 
 #define NETLINK_ROUTE		0	/* Routing/device hook				*/
-#define NETLINK_W1		1	/* 1-wire subsystem				*/
+#define NETLINK_UNUSED		1	/* Unused number				*/
 #define NETLINK_USERSOCK	2	/* Reserved for user mode socket protocols 	*/
 #define NETLINK_FIREWALL	3	/* Firewalling hook				*/
 #define NETLINK_INET_DIAG	4	/* INET socket monitoring			*/
-- 
cgit v1.2.3


From d720024e94de4e8b7f10ee83c532926f3ad5d708 Mon Sep 17 00:00:00 2001
From: Michael LeMay <mdlemay@epoch.ncsc.mil>
Date: Thu, 22 Jun 2006 14:47:17 -0700
Subject: [PATCH] selinux: add hooks for key subsystem

Introduce SELinux hooks to support the access key retention subsystem
within the kernel.  Incorporate new flask headers from a modified version
of the SELinux reference policy, with support for the new security class
representing retained keys.  Extend the "key_alloc" security hook with a
task parameter representing the intended ownership context for the key
being allocated.  Attach security information to root's default keyrings
within the SELinux initialization routine.

Has passed David's testsuite.

Signed-off-by: Michael LeMay <mdlemay@epoch.ncsc.mil>
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: James Morris <jmorris@namei.org>
Acked-by: Chris Wright <chrisw@sous-sol.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/keys.txt                       | 29 +++++++++++++
 include/linux/key.h                          | 18 +++++---
 include/linux/security.h                     | 10 +++--
 kernel/user.c                                |  2 +-
 security/dummy.c                             |  2 +-
 security/keys/key.c                          |  8 ++--
 security/keys/keyring.c                      |  5 ++-
 security/keys/process_keys.c                 | 15 ++++---
 security/keys/request_key.c                  |  6 ++-
 security/keys/request_key_auth.c             |  2 +-
 security/selinux/hooks.c                     | 64 ++++++++++++++++++++++++++++
 security/selinux/include/av_perm_to_string.h |  6 +++
 security/selinux/include/av_permissions.h    |  8 ++++
 security/selinux/include/class_to_string.h   |  1 +
 security/selinux/include/flask.h             |  1 +
 security/selinux/include/objsec.h            |  5 +++
 16 files changed, 155 insertions(+), 27 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/keys.txt b/Documentation/keys.txt
index aaa01b0e3ee9..703020012708 100644
--- a/Documentation/keys.txt
+++ b/Documentation/keys.txt
@@ -19,6 +19,7 @@ This document has the following sections:
 	- Key overview
 	- Key service overview
 	- Key access permissions
+	- SELinux support
 	- New procfs files
 	- Userspace system call interface
 	- Kernel services
@@ -232,6 +233,34 @@ For changing the ownership, group ID or permissions mask, being the owner of
 the key or having the sysadmin capability is sufficient.
 
 
+===============
+SELINUX SUPPORT
+===============
+
+The security class "key" has been added to SELinux so that mandatory access
+controls can be applied to keys created within various contexts.  This support
+is preliminary, and is likely to change quite significantly in the near future.
+Currently, all of the basic permissions explained above are provided in SELinux
+as well; SE Linux is simply invoked after all basic permission checks have been
+performed.
+
+Each key is labeled with the same context as the task to which it belongs.
+Typically, this is the same task that was running when the key was created.
+The default keyrings are handled differently, but in a way that is very
+intuitive:
+
+ (*) The user and user session keyrings that are created when the user logs in
+     are currently labeled with the context of the login manager.
+
+ (*) The keyrings associated with new threads are each labeled with the context
+     of their associated thread, and both session and process keyrings are
+     handled similarly.
+
+Note, however, that the default keyrings associated with the root user are
+labeled with the default kernel context, since they are created early in the
+boot process, before root has a chance to log in.
+
+
 ================
 NEW PROCFS FILES
 ================
diff --git a/include/linux/key.h b/include/linux/key.h
index cbf464ad9589..8c275d12ef63 100644
--- a/include/linux/key.h
+++ b/include/linux/key.h
@@ -241,8 +241,9 @@ extern void unregister_key_type(struct key_type *ktype);
 
 extern struct key *key_alloc(struct key_type *type,
 			     const char *desc,
-			     uid_t uid, gid_t gid, key_perm_t perm,
-			     int not_in_quota);
+			     uid_t uid, gid_t gid,
+			     struct task_struct *ctx,
+			     key_perm_t perm, int not_in_quota);
 extern int key_payload_reserve(struct key *key, size_t datalen);
 extern int key_instantiate_and_link(struct key *key,
 				    const void *data,
@@ -292,7 +293,9 @@ extern int key_unlink(struct key *keyring,
 		      struct key *key);
 
 extern struct key *keyring_alloc(const char *description, uid_t uid, gid_t gid,
-				 int not_in_quota, struct key *dest);
+				 struct task_struct *ctx,
+				 int not_in_quota,
+				 struct key *dest);
 
 extern int keyring_clear(struct key *keyring);
 
@@ -313,7 +316,8 @@ extern void keyring_replace_payload(struct key *key, void *replacement);
  * the userspace interface
  */
 extern struct key root_user_keyring, root_session_keyring;
-extern int alloc_uid_keyring(struct user_struct *user);
+extern int alloc_uid_keyring(struct user_struct *user,
+			     struct task_struct *ctx);
 extern void switch_uid_keyring(struct user_struct *new_user);
 extern int copy_keys(unsigned long clone_flags, struct task_struct *tsk);
 extern int copy_thread_group_keys(struct task_struct *tsk);
@@ -342,7 +346,7 @@ extern void key_init(void);
 #define make_key_ref(k)			({ NULL; })
 #define key_ref_to_ptr(k)		({ NULL; })
 #define is_key_possessed(k)		0
-#define alloc_uid_keyring(u)		0
+#define alloc_uid_keyring(u,c)		0
 #define switch_uid_keyring(u)		do { } while(0)
 #define __install_session_keyring(t, k)	({ NULL; })
 #define copy_keys(f,t)			0
@@ -355,6 +359,10 @@ extern void key_init(void);
 #define key_fsgid_changed(t)		do { } while(0)
 #define key_init()			do { } while(0)
 
+/* Initial keyrings */
+extern struct key root_user_keyring;
+extern struct key root_session_keyring;
+
 #endif /* CONFIG_KEYS */
 #endif /* __KERNEL__ */
 #endif /* _LINUX_KEY_H */
diff --git a/include/linux/security.h b/include/linux/security.h
index 4dfb1b84a9b3..47722d355532 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -1313,7 +1313,7 @@ struct security_operations {
 
 	/* key management security hooks */
 #ifdef CONFIG_KEYS
-	int (*key_alloc)(struct key *key);
+	int (*key_alloc)(struct key *key, struct task_struct *tsk);
 	void (*key_free)(struct key *key);
 	int (*key_permission)(key_ref_t key_ref,
 			      struct task_struct *context,
@@ -3008,9 +3008,10 @@ static inline int security_xfrm_policy_lookup(struct xfrm_policy *xp, u32 sk_sid
 
 #ifdef CONFIG_KEYS
 #ifdef CONFIG_SECURITY
-static inline int security_key_alloc(struct key *key)
+static inline int security_key_alloc(struct key *key,
+				     struct task_struct *tsk)
 {
-	return security_ops->key_alloc(key);
+	return security_ops->key_alloc(key, tsk);
 }
 
 static inline void security_key_free(struct key *key)
@@ -3027,7 +3028,8 @@ static inline int security_key_permission(key_ref_t key_ref,
 
 #else
 
-static inline int security_key_alloc(struct key *key)
+static inline int security_key_alloc(struct key *key,
+				     struct task_struct *tsk)
 {
 	return 0;
 }
diff --git a/kernel/user.c b/kernel/user.c
index 4b1eb745afa1..6408c0424291 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -148,7 +148,7 @@ struct user_struct * alloc_uid(uid_t uid)
 		new->mq_bytes = 0;
 		new->locked_shm = 0;
 
-		if (alloc_uid_keyring(new) < 0) {
+		if (alloc_uid_keyring(new, current) < 0) {
 			kmem_cache_free(uid_cachep, new);
 			return NULL;
 		}
diff --git a/security/dummy.c b/security/dummy.c
index 64f6da0f422e..6de4a4a5eb13 100644
--- a/security/dummy.c
+++ b/security/dummy.c
@@ -860,7 +860,7 @@ static int dummy_setprocattr(struct task_struct *p, char *name, void *value, siz
 }
 
 #ifdef CONFIG_KEYS
-static inline int dummy_key_alloc(struct key *key)
+static inline int dummy_key_alloc(struct key *key, struct task_struct *ctx)
 {
 	return 0;
 }
diff --git a/security/keys/key.c b/security/keys/key.c
index 3fdc49c6a02c..14a15abb7735 100644
--- a/security/keys/key.c
+++ b/security/keys/key.c
@@ -247,8 +247,8 @@ static inline void key_alloc_serial(struct key *key)
  *   instantiate the key or discard it before returning
  */
 struct key *key_alloc(struct key_type *type, const char *desc,
-		      uid_t uid, gid_t gid, key_perm_t perm,
-		      int not_in_quota)
+		      uid_t uid, gid_t gid, struct task_struct *ctx,
+		      key_perm_t perm, int not_in_quota)
 {
 	struct key_user *user = NULL;
 	struct key *key;
@@ -318,7 +318,7 @@ struct key *key_alloc(struct key_type *type, const char *desc,
 #endif
 
 	/* let the security module know about the key */
-	ret = security_key_alloc(key);
+	ret = security_key_alloc(key, ctx);
 	if (ret < 0)
 		goto security_error;
 
@@ -822,7 +822,7 @@ key_ref_t key_create_or_update(key_ref_t keyring_ref,
 
 	/* allocate a new key */
 	key = key_alloc(ktype, description, current->fsuid, current->fsgid,
-			perm, not_in_quota);
+			current, perm, not_in_quota);
 	if (IS_ERR(key)) {
 		key_ref = ERR_PTR(PTR_ERR(key));
 		goto error_3;
diff --git a/security/keys/keyring.c b/security/keys/keyring.c
index bffa924c1f88..1357207fc9df 100644
--- a/security/keys/keyring.c
+++ b/security/keys/keyring.c
@@ -240,13 +240,14 @@ static long keyring_read(const struct key *keyring,
  * allocate a keyring and link into the destination keyring
  */
 struct key *keyring_alloc(const char *description, uid_t uid, gid_t gid,
-			  int not_in_quota, struct key *dest)
+			  struct task_struct *ctx, int not_in_quota,
+			  struct key *dest)
 {
 	struct key *keyring;
 	int ret;
 
 	keyring = key_alloc(&key_type_keyring, description,
-			    uid, gid,
+			    uid, gid, ctx,
 			    (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_ALL,
 			    not_in_quota);
 
diff --git a/security/keys/process_keys.c b/security/keys/process_keys.c
index 217a0bef3c82..a50a91332fe1 100644
--- a/security/keys/process_keys.c
+++ b/security/keys/process_keys.c
@@ -67,7 +67,8 @@ struct key root_session_keyring = {
 /*
  * allocate the keyrings to be associated with a UID
  */
-int alloc_uid_keyring(struct user_struct *user)
+int alloc_uid_keyring(struct user_struct *user,
+		      struct task_struct *ctx)
 {
 	struct key *uid_keyring, *session_keyring;
 	char buf[20];
@@ -76,7 +77,7 @@ int alloc_uid_keyring(struct user_struct *user)
 	/* concoct a default session keyring */
 	sprintf(buf, "_uid_ses.%u", user->uid);
 
-	session_keyring = keyring_alloc(buf, user->uid, (gid_t) -1, 0, NULL);
+	session_keyring = keyring_alloc(buf, user->uid, (gid_t) -1, ctx, 0, NULL);
 	if (IS_ERR(session_keyring)) {
 		ret = PTR_ERR(session_keyring);
 		goto error;
@@ -86,7 +87,7 @@ int alloc_uid_keyring(struct user_struct *user)
 	 * keyring */
 	sprintf(buf, "_uid.%u", user->uid);
 
-	uid_keyring = keyring_alloc(buf, user->uid, (gid_t) -1, 0,
+	uid_keyring = keyring_alloc(buf, user->uid, (gid_t) -1, ctx, 0,
 				    session_keyring);
 	if (IS_ERR(uid_keyring)) {
 		key_put(session_keyring);
@@ -143,7 +144,7 @@ int install_thread_keyring(struct task_struct *tsk)
 
 	sprintf(buf, "_tid.%u", tsk->pid);
 
-	keyring = keyring_alloc(buf, tsk->uid, tsk->gid, 1, NULL);
+	keyring = keyring_alloc(buf, tsk->uid, tsk->gid, tsk, 1, NULL);
 	if (IS_ERR(keyring)) {
 		ret = PTR_ERR(keyring);
 		goto error;
@@ -177,7 +178,7 @@ int install_process_keyring(struct task_struct *tsk)
 	if (!tsk->signal->process_keyring) {
 		sprintf(buf, "_pid.%u", tsk->tgid);
 
-		keyring = keyring_alloc(buf, tsk->uid, tsk->gid, 1, NULL);
+		keyring = keyring_alloc(buf, tsk->uid, tsk->gid, tsk, 1, NULL);
 		if (IS_ERR(keyring)) {
 			ret = PTR_ERR(keyring);
 			goto error;
@@ -217,7 +218,7 @@ static int install_session_keyring(struct task_struct *tsk,
 	if (!keyring) {
 		sprintf(buf, "_ses.%u", tsk->tgid);
 
-		keyring = keyring_alloc(buf, tsk->uid, tsk->gid, 1, NULL);
+		keyring = keyring_alloc(buf, tsk->uid, tsk->gid, tsk, 1, NULL);
 		if (IS_ERR(keyring))
 			return PTR_ERR(keyring);
 	}
@@ -717,7 +718,7 @@ long join_session_keyring(const char *name)
 	keyring = find_keyring_by_name(name, 0);
 	if (PTR_ERR(keyring) == -ENOKEY) {
 		/* not found - try and create a new one */
-		keyring = keyring_alloc(name, tsk->uid, tsk->gid, 0, NULL);
+		keyring = keyring_alloc(name, tsk->uid, tsk->gid, tsk, 0, NULL);
 		if (IS_ERR(keyring)) {
 			ret = PTR_ERR(keyring);
 			goto error2;
diff --git a/security/keys/request_key.c b/security/keys/request_key.c
index f030a0ccbb93..eab66a06ca53 100644
--- a/security/keys/request_key.c
+++ b/security/keys/request_key.c
@@ -48,7 +48,8 @@ static int call_sbin_request_key(struct key *key,
 	/* allocate a new session keyring */
 	sprintf(desc, "_req.%u", key->serial);
 
-	keyring = keyring_alloc(desc, current->fsuid, current->fsgid, 1, NULL);
+	keyring = keyring_alloc(desc, current->fsuid, current->fsgid,
+				current, 1, NULL);
 	if (IS_ERR(keyring)) {
 		ret = PTR_ERR(keyring);
 		goto error_alloc;
@@ -137,7 +138,8 @@ static struct key *__request_key_construction(struct key_type *type,
 
 	/* create a key and add it to the queue */
 	key = key_alloc(type, description,
-			current->fsuid, current->fsgid, KEY_POS_ALL, 0);
+			current->fsuid, current->fsgid,
+			current, KEY_POS_ALL, 0);
 	if (IS_ERR(key))
 		goto alloc_failed;
 
diff --git a/security/keys/request_key_auth.c b/security/keys/request_key_auth.c
index cce6ba6b0323..0ecc2e8d2bd0 100644
--- a/security/keys/request_key_auth.c
+++ b/security/keys/request_key_auth.c
@@ -148,7 +148,7 @@ struct key *request_key_auth_new(struct key *target, const char *callout_info)
 	sprintf(desc, "%x", target->serial);
 
 	authkey = key_alloc(&key_type_request_key_auth, desc,
-			    current->fsuid, current->fsgid,
+			    current->fsuid, current->fsgid, current,
 			    KEY_POS_VIEW | KEY_POS_READ | KEY_POS_SEARCH |
 			    KEY_USR_VIEW, 1);
 	if (IS_ERR(authkey)) {
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 54adc9d31e92..524915dfda64 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -4252,6 +4252,57 @@ static int selinux_setprocattr(struct task_struct *p,
 	return size;
 }
 
+#ifdef CONFIG_KEYS
+
+static int selinux_key_alloc(struct key *k, struct task_struct *tsk)
+{
+	struct task_security_struct *tsec = tsk->security;
+	struct key_security_struct *ksec;
+
+	ksec = kzalloc(sizeof(struct key_security_struct), GFP_KERNEL);
+	if (!ksec)
+		return -ENOMEM;
+
+	ksec->obj = k;
+	ksec->sid = tsec->sid;
+	k->security = ksec;
+
+	return 0;
+}
+
+static void selinux_key_free(struct key *k)
+{
+	struct key_security_struct *ksec = k->security;
+
+	k->security = NULL;
+	kfree(ksec);
+}
+
+static int selinux_key_permission(key_ref_t key_ref,
+			    struct task_struct *ctx,
+			    key_perm_t perm)
+{
+	struct key *key;
+	struct task_security_struct *tsec;
+	struct key_security_struct *ksec;
+
+	key = key_ref_to_ptr(key_ref);
+
+	tsec = ctx->security;
+	ksec = key->security;
+
+	/* if no specific permissions are requested, we skip the
+	   permission check. No serious, additional covert channels
+	   appear to be created. */
+	if (perm == 0)
+		return 0;
+
+	return avc_has_perm(tsec->sid, ksec->sid,
+			    SECCLASS_KEY, perm, NULL);
+}
+
+#endif
+
 static struct security_operations selinux_ops = {
 	.ptrace =			selinux_ptrace,
 	.capget =			selinux_capget,
@@ -4406,6 +4457,12 @@ static struct security_operations selinux_ops = {
 	.xfrm_state_delete_security =	selinux_xfrm_state_delete,
 	.xfrm_policy_lookup = 		selinux_xfrm_policy_lookup,
 #endif
+
+#ifdef CONFIG_KEYS
+	.key_alloc =                    selinux_key_alloc,
+	.key_free =                     selinux_key_free,
+	.key_permission =               selinux_key_permission,
+#endif
 };
 
 static __init int selinux_init(void)
@@ -4441,6 +4498,13 @@ static __init int selinux_init(void)
 	} else {
 		printk(KERN_INFO "SELinux:  Starting in permissive mode\n");
 	}
+
+#ifdef CONFIG_KEYS
+	/* Add security information to initial keyrings */
+	security_key_alloc(&root_user_keyring, current);
+	security_key_alloc(&root_session_keyring, current);
+#endif
+
 	return 0;
 }
 
diff --git a/security/selinux/include/av_perm_to_string.h b/security/selinux/include/av_perm_to_string.h
index 70ee65a58817..bc020bde6c86 100644
--- a/security/selinux/include/av_perm_to_string.h
+++ b/security/selinux/include/av_perm_to_string.h
@@ -242,3 +242,9 @@
    S_(SECCLASS_PACKET, PACKET__SEND, "send")
    S_(SECCLASS_PACKET, PACKET__RECV, "recv")
    S_(SECCLASS_PACKET, PACKET__RELABELTO, "relabelto")
+   S_(SECCLASS_KEY, KEY__VIEW, "view")
+   S_(SECCLASS_KEY, KEY__READ, "read")
+   S_(SECCLASS_KEY, KEY__WRITE, "write")
+   S_(SECCLASS_KEY, KEY__SEARCH, "search")
+   S_(SECCLASS_KEY, KEY__LINK, "link")
+   S_(SECCLASS_KEY, KEY__SETATTR, "setattr")
diff --git a/security/selinux/include/av_permissions.h b/security/selinux/include/av_permissions.h
index 1d9cf3d306bc..1205227a3a33 100644
--- a/security/selinux/include/av_permissions.h
+++ b/security/selinux/include/av_permissions.h
@@ -959,3 +959,11 @@
 #define PACKET__SEND                              0x00000001UL
 #define PACKET__RECV                              0x00000002UL
 #define PACKET__RELABELTO                         0x00000004UL
+
+#define KEY__VIEW                                 0x00000001UL
+#define KEY__READ                                 0x00000002UL
+#define KEY__WRITE                                0x00000004UL
+#define KEY__SEARCH                               0x00000008UL
+#define KEY__LINK                                 0x00000010UL
+#define KEY__SETATTR                              0x00000020UL
+
diff --git a/security/selinux/include/class_to_string.h b/security/selinux/include/class_to_string.h
index 3aec75fee4f7..24303b61309f 100644
--- a/security/selinux/include/class_to_string.h
+++ b/security/selinux/include/class_to_string.h
@@ -60,3 +60,4 @@
     S_("netlink_kobject_uevent_socket")
     S_("appletalk_socket")
     S_("packet")
+    S_("key")
diff --git a/security/selinux/include/flask.h b/security/selinux/include/flask.h
index a0eb9e281d18..95887aed2a68 100644
--- a/security/selinux/include/flask.h
+++ b/security/selinux/include/flask.h
@@ -62,6 +62,7 @@
 #define SECCLASS_NETLINK_KOBJECT_UEVENT_SOCKET           55
 #define SECCLASS_APPLETALK_SOCKET                        56
 #define SECCLASS_PACKET                                  57
+#define SECCLASS_KEY                                     58
 
 /*
  * Security identifier indices for initial entities
diff --git a/security/selinux/include/objsec.h b/security/selinux/include/objsec.h
index 54c030778882..8f5547ad1856 100644
--- a/security/selinux/include/objsec.h
+++ b/security/selinux/include/objsec.h
@@ -99,6 +99,11 @@ struct sk_security_struct {
 	u32 peer_sid;			/* SID of peer */
 };
 
+struct key_security_struct {
+	struct key *obj; /* back pointer */
+	u32 sid;         /* SID of key */
+};
+
 extern unsigned int selinux_checkreqprot;
 
 #endif /* _SELINUX_OBJSEC_H_ */
-- 
cgit v1.2.3


From 04c567d9313e4927b9835361d8ac0318ce65af6b Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 22 Jun 2006 14:47:18 -0700
Subject: [PATCH] Keys: Fix race between two instantiators of a key

Add a revocation notification method to the key type and calls it whilst
the key's semaphore is still write-locked after setting the revocation
flag.

The patch then uses this to maintain a reference on the task_struct of the
process that calls request_key() for as long as the authorisation key
remains unrevoked.

This fixes a potential race between two processes both of which have
assumed the authority to instantiate a key (one may have forked the other
for example).  The problem is that there's no locking around the check for
revocation of the auth key and the use of the task_struct it points to, nor
does the auth key keep a reference on the task_struct.

Access to the "context" pointer in the auth key must thenceforth be done
with the auth key semaphore held.  The revocation method is called with the
target key semaphore held write-locked and the search of the context
process's keyrings is done with the auth key semaphore read-locked.

The check for the revocation state of the auth key just prior to searching
it is done after the auth key is read-locked for the search.  This ensures
that the auth key can't be revoked between the check and the search.

The revocation notification method is added so that the context task_struct
can be released as soon as instantiation happens rather than waiting for
the auth key to be destroyed, thus avoiding the unnecessary pinning of the
requesting process.

Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/keys.txt           | 10 +++++++++
 include/linux/key.h              |  5 +++++
 security/keys/key.c              |  4 ++++
 security/keys/process_keys.c     | 42 +++++++++++++++++++++++--------------
 security/keys/request_key_auth.c | 45 +++++++++++++++++++++++++++++++++++++++-
 5 files changed, 89 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/keys.txt b/Documentation/keys.txt
index 703020012708..3bbe157b45e4 100644
--- a/Documentation/keys.txt
+++ b/Documentation/keys.txt
@@ -964,6 +964,16 @@ The structure has a number of fields, some of which are mandatory:
      It is not safe to sleep in this method; the caller may hold spinlocks.
 
 
+ (*) void (*revoke)(struct key *key);
+
+     This method is optional.  It is called to discard part of the payload
+     data upon a key being revoked.  The caller will have the key semaphore
+     write-locked.
+
+     It is safe to sleep in this method, though care should be taken to avoid
+     a deadlock against the key semaphore.
+
+
  (*) void (*destroy)(struct key *key);
 
      This method is optional. It is called to discard the payload data on a key
diff --git a/include/linux/key.h b/include/linux/key.h
index 8c275d12ef63..e81ebf910d0b 100644
--- a/include/linux/key.h
+++ b/include/linux/key.h
@@ -205,6 +205,11 @@ struct key_type {
 	/* match a key against a description */
 	int (*match)(const struct key *key, const void *desc);
 
+	/* clear some of the data from a key on revokation (optional)
+	 * - the key's semaphore will be write-locked by the caller
+	 */
+	void (*revoke)(struct key *key);
+
 	/* clear the data from a key (optional) */
 	void (*destroy)(struct key *key);
 
diff --git a/security/keys/key.c b/security/keys/key.c
index 14a15abb7735..51f851557389 100644
--- a/security/keys/key.c
+++ b/security/keys/key.c
@@ -907,6 +907,10 @@ void key_revoke(struct key *key)
 	 * it */
 	down_write(&key->sem);
 	set_bit(KEY_FLAG_REVOKED, &key->flags);
+
+	if (key->type->revoke)
+		key->type->revoke(key);
+
 	up_write(&key->sem);
 
 } /* end key_revoke() */
diff --git a/security/keys/process_keys.c b/security/keys/process_keys.c
index a50a91332fe1..4d9825f9962c 100644
--- a/security/keys/process_keys.c
+++ b/security/keys/process_keys.c
@@ -391,6 +391,8 @@ key_ref_t search_process_keyrings(struct key_type *type,
 	struct request_key_auth *rka;
 	key_ref_t key_ref, ret, err;
 
+	might_sleep();
+
 	/* we want to return -EAGAIN or -ENOKEY if any of the keyrings were
 	 * searchable, but we failed to find a key or we found a negative key;
 	 * otherwise we want to return a sample error (probably -EACCES) if
@@ -496,27 +498,35 @@ key_ref_t search_process_keyrings(struct key_type *type,
 	 */
 	if (context->request_key_auth &&
 	    context == current &&
-	    type != &key_type_request_key_auth &&
-	    key_validate(context->request_key_auth) == 0
+	    type != &key_type_request_key_auth
 	    ) {
-		rka = context->request_key_auth->payload.data;
+		/* defend against the auth key being revoked */
+		down_read(&context->request_key_auth->sem);
 
-		key_ref = search_process_keyrings(type, description, match,
-						  rka->context);
+		if (key_validate(context->request_key_auth) == 0) {
+			rka = context->request_key_auth->payload.data;
 
-		if (!IS_ERR(key_ref))
-			goto found;
+			key_ref = search_process_keyrings(type, description,
+							  match, rka->context);
 
-		switch (PTR_ERR(key_ref)) {
-		case -EAGAIN: /* no key */
-			if (ret)
+			up_read(&context->request_key_auth->sem);
+
+			if (!IS_ERR(key_ref))
+				goto found;
+
+			switch (PTR_ERR(key_ref)) {
+			case -EAGAIN: /* no key */
+				if (ret)
+					break;
+			case -ENOKEY: /* negative key */
+				ret = key_ref;
 				break;
-		case -ENOKEY: /* negative key */
-			ret = key_ref;
-			break;
-		default:
-			err = key_ref;
-			break;
+			default:
+				err = key_ref;
+				break;
+			}
+		} else {
+			up_read(&context->request_key_auth->sem);
 		}
 	}
 
diff --git a/security/keys/request_key_auth.c b/security/keys/request_key_auth.c
index 0ecc2e8d2bd0..cb9817ced3fd 100644
--- a/security/keys/request_key_auth.c
+++ b/security/keys/request_key_auth.c
@@ -20,6 +20,7 @@
 
 static int request_key_auth_instantiate(struct key *, const void *, size_t);
 static void request_key_auth_describe(const struct key *, struct seq_file *);
+static void request_key_auth_revoke(struct key *);
 static void request_key_auth_destroy(struct key *);
 static long request_key_auth_read(const struct key *, char __user *, size_t);
 
@@ -31,6 +32,7 @@ struct key_type key_type_request_key_auth = {
 	.def_datalen	= sizeof(struct request_key_auth),
 	.instantiate	= request_key_auth_instantiate,
 	.describe	= request_key_auth_describe,
+	.revoke		= request_key_auth_revoke,
 	.destroy	= request_key_auth_destroy,
 	.read		= request_key_auth_read,
 };
@@ -91,6 +93,24 @@ static long request_key_auth_read(const struct key *key,
 
 } /* end request_key_auth_read() */
 
+/*****************************************************************************/
+/*
+ * handle revocation of an authorisation token key
+ * - called with the key sem write-locked
+ */
+static void request_key_auth_revoke(struct key *key)
+{
+	struct request_key_auth *rka = key->payload.data;
+
+	kenter("{%d}", key->serial);
+
+	if (rka->context) {
+		put_task_struct(rka->context);
+		rka->context = NULL;
+	}
+
+} /* end request_key_auth_revoke() */
+
 /*****************************************************************************/
 /*
  * destroy an instantiation authorisation token key
@@ -101,6 +121,11 @@ static void request_key_auth_destroy(struct key *key)
 
 	kenter("{%d}", key->serial);
 
+	if (rka->context) {
+		put_task_struct(rka->context);
+		rka->context = NULL;
+	}
+
 	key_put(rka->target_key);
 	kfree(rka);
 
@@ -131,14 +156,26 @@ struct key *request_key_auth_new(struct key *target, const char *callout_info)
 	 * another process */
 	if (current->request_key_auth) {
 		/* it is - use that instantiation context here too */
+		down_read(&current->request_key_auth->sem);
+
+		/* if the auth key has been revoked, then the key we're
+		 * servicing is already instantiated */
+		if (test_bit(KEY_FLAG_REVOKED,
+			     &current->request_key_auth->flags))
+			goto auth_key_revoked;
+
 		irka = current->request_key_auth->payload.data;
 		rka->context = irka->context;
 		rka->pid = irka->pid;
+		get_task_struct(rka->context);
+
+		up_read(&current->request_key_auth->sem);
 	}
 	else {
 		/* it isn't - use this process as the context */
 		rka->context = current;
 		rka->pid = current->pid;
+		get_task_struct(rka->context);
 	}
 
 	rka->target_key = key_get(target);
@@ -161,9 +198,15 @@ struct key *request_key_auth_new(struct key *target, const char *callout_info)
 	if (ret < 0)
 		goto error_inst;
 
-	kleave(" = {%d})", authkey->serial);
+	kleave(" = {%d}", authkey->serial);
 	return authkey;
 
+auth_key_revoked:
+	up_read(&current->request_key_auth->sem);
+	kfree(rka);
+	kleave("= -EKEYREVOKED");
+	return ERR_PTR(-EKEYREVOKED);
+
 error_inst:
 	key_revoke(authkey);
 	key_put(authkey);
-- 
cgit v1.2.3


From 0e5b3781591cc954037c08ef78edf7f1192d38c5 Mon Sep 17 00:00:00 2001
From: Brice Goglin <brice@myri.com>
Date: Thu, 22 Jun 2006 14:47:20 -0700
Subject: [PATCH] PCI: Add PCI_CAP_ID_VNDR

Add the vendor-specific extended capability PCI_CAP_ID_VNDR.  It is required
by the Myri-10G Ethernet driver.

Signed-off-by: Brice Goglin <brice@myri.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
Cc: Jeff Garzik <jeff@garzik.org>
Cc: "David S. Miller" <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/pci_regs.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/pci_regs.h b/include/linux/pci_regs.h
index d27a78b71297..6bce4a240364 100644
--- a/include/linux/pci_regs.h
+++ b/include/linux/pci_regs.h
@@ -197,6 +197,7 @@
 #define  PCI_CAP_ID_CHSWP	0x06	/* CompactPCI HotSwap */
 #define  PCI_CAP_ID_PCIX	0x07	/* PCI-X */
 #define  PCI_CAP_ID_HT_IRQCONF	0x08	/* HyperTransport IRQ Configuration */
+#define  PCI_CAP_ID_VNDR	0x09	/* Vendor specific capability */
 #define  PCI_CAP_ID_SHPC 	0x0C	/* PCI Standard Hot-Plug Controller */
 #define  PCI_CAP_ID_EXP 	0x10	/* PCI Express */
 #define  PCI_CAP_ID_MSIX	0x11	/* MSI-X */
-- 
cgit v1.2.3


From c89681ed7d0e4a61d35bdc12c06c6733b718b2cb Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <miklos@szeredi.hu>
Date: Thu, 22 Jun 2006 14:47:22 -0700
Subject: [PATCH] remove steal_locks()

This patch removes the steal_locks() function.

steal_locks() doesn't work correctly with any filesystem that does it's own
lock management, including NFS, CIFS, etc.

In addition it has weird semantics on local filesystems in case tasks
sharing file-descriptor tables are doing POSIX locking operations in
parallel to execve().

The steal_locks() function has an effect on applications doing:

clone(CLONE_FILES)
  /* in child */
  lock
  execve
  lock

POSIX locks acquired before execve (by "child", "parent" or any further
task sharing files_struct) will after the execve be owned exclusively by
"child".

According to Chris Wright some LSB/LTP kind of suite triggers without the
stealing behavior, but there's no known real-world application that would
also fail.

Apps using NPTL are not affected, since all other threads are killed before
execve.

Apps using LinuxThreads are only affected if they

  - have multiple threads during exec (LinuxThreads doesn't kill other
    threads, the app may do it with pthread_kill_other_threads_np())
  - rely on POSIX locks being inherited across exec

Both conditions are documented, but not their interaction.

Apps using clone() natively are affected if they

  - use clone(CLONE_FILES)
  - rely on POSIX locks being inherited across exec

The above scenarios are unlikely, but possible.

If the patch is vetoed, there's a plan B, that involves mostly keeping the
weird stealing semantics, but changing the way lock ownership is handled so
that network and local filesystems work consistently.

That would add more complexity though, so this solution seems to be
preferred by most people.

Signed-off-by: Miklos Szeredi <miklos@szeredi.hu>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Cc: Matthew Wilcox <willy@debian.org>
Cc: Chris Wright <chrisw@sous-sol.org>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Steven French <sfrench@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/binfmt_elf.c    |  1 -
 fs/binfmt_misc.c   |  1 -
 fs/exec.c          |  1 -
 fs/locks.c         | 57 ------------------------------------------------------
 include/linux/fs.h |  1 -
 5 files changed, 61 deletions(-)

(limited to 'include/linux')

diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 537893a16014..8a04216e8b4d 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -759,7 +759,6 @@ static int load_elf_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 
 	/* Discard our unneeded old files struct */
 	if (files) {
-		steal_locks(files);
 		put_files_struct(files);
 		files = NULL;
 	}
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index d73d75591a39..599f36fd0f67 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -203,7 +203,6 @@ static int load_misc_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 		goto _error;
 
 	if (files) {
-		steal_locks(files);
 		put_files_struct(files);
 		files = NULL;
 	}
diff --git a/fs/exec.c b/fs/exec.c
index d07858c0b7c4..0b88bf646143 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -866,7 +866,6 @@ int flush_old_exec(struct linux_binprm * bprm)
 	bprm->mm = NULL;		/* We're using it now */
 
 	/* This is the point of no return */
-	steal_locks(files);
 	put_files_struct(files);
 
 	current->sas_ss_sp = current->sas_ss_size = 0;
diff --git a/fs/locks.c b/fs/locks.c
index ab61a8b54829..69435c68c1ed 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -2206,63 +2206,6 @@ int lock_may_write(struct inode *inode, loff_t start, unsigned long len)
 
 EXPORT_SYMBOL(lock_may_write);
 
-static inline void __steal_locks(struct file *file, fl_owner_t from)
-{
-	struct inode *inode = file->f_dentry->d_inode;
-	struct file_lock *fl = inode->i_flock;
-
-	while (fl) {
-		if (fl->fl_file == file && fl->fl_owner == from)
-			fl->fl_owner = current->files;
-		fl = fl->fl_next;
-	}
-}
-
-/* When getting ready for executing a binary, we make sure that current
- * has a files_struct on its own. Before dropping the old files_struct,
- * we take over ownership of all locks for all file descriptors we own.
- * Note that we may accidentally steal a lock for a file that a sibling
- * has created since the unshare_files() call.
- */
-void steal_locks(fl_owner_t from)
-{
-	struct files_struct *files = current->files;
-	int i, j;
-	struct fdtable *fdt;
-
-	if (from == files)
-		return;
-
-	lock_kernel();
-	j = 0;
-
-	/*
-	 * We are not taking a ref to the file structures, so
-	 * we need to acquire ->file_lock.
-	 */
-	spin_lock(&files->file_lock);
-	fdt = files_fdtable(files);
-	for (;;) {
-		unsigned long set;
-		i = j * __NFDBITS;
-		if (i >= fdt->max_fdset || i >= fdt->max_fds)
-			break;
-		set = fdt->open_fds->fds_bits[j++];
-		while (set) {
-			if (set & 1) {
-				struct file *file = fdt->fd[i];
-				if (file)
-					__steal_locks(file, from);
-			}
-			i++;
-			set >>= 1;
-		}
-	}
-	spin_unlock(&files->file_lock);
-	unlock_kernel();
-}
-EXPORT_SYMBOL(steal_locks);
-
 static int __init filelock_init(void)
 {
 	filelock_cache = kmem_cache_create("file_lock_cache",
diff --git a/include/linux/fs.h b/include/linux/fs.h
index ecc8c2c3d8ca..73c7d6f04b31 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -782,7 +782,6 @@ extern int setlease(struct file *, long, struct file_lock **);
 extern int lease_modify(struct file_lock **, int);
 extern int lock_may_read(struct inode *, loff_t start, unsigned long count);
 extern int lock_may_write(struct inode *, loff_t start, unsigned long count);
-extern void steal_locks(fl_owner_t from);
 
 struct fasync_struct {
 	int	magic;
-- 
cgit v1.2.3


From 0feae5c47aabdde59cbbec32d150e17102de37f0 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 22 Jun 2006 14:47:28 -0700
Subject: [PATCH] Fix dcache race during umount

The race is that the shrink_dcache_memory shrinker could get called while a
filesystem is being unmounted, and could try to prune a dentry belonging to
that filesystem.

If it does, then it will call in to iput on the inode while the dentry is
no longer able to be found by the umounting process.  If iput takes a
while, generic_shutdown_super could get all the way though
shrink_dcache_parent and shrink_dcache_anon and invalidate_inodes without
ever waiting on this particular inode.

Eventually the superblock gets freed anyway and if the iput tried to touch
it (which some filesystems certainly do), it will lose.  The promised
"Self-destruct in 5 seconds" doesn't lead to a nice day.

The race is closed by holding s_umount while calling prune_one_dentry on
someone else's dentry.  As a down_read_trylock is used,
shrink_dcache_memory will no longer try to prune the dentry of a filesystem
that is being unmounted, and unmount will not be able to start until any
such active prune_one_dentry completes.

This requires that prune_dcache *knows* which filesystem (if any) it is
doing the prune on behalf of so that it can be careful of other
filesystems.  shrink_dcache_memory isn't called it on behalf of any
filesystem, and so is careful of everything.

shrink_dcache_anon is now passed a super_block rather than the s_anon list
out of the superblock, so it can get the s_anon list itself, and can pass
the superblock down to prune_dcache.

If prune_dcache finds a dentry that it cannot free, it leaves it where it
is (at the tail of the list) and exits, on the assumption that some other
thread will be removing that dentry soon.  To try to make sure that some
work gets done, a limited number of dnetries which are untouchable are
skipped over while choosing the dentry to work on.

I believe this race was first found by Kirill Korotaev.

Cc: Jan Blunck <jblunck@suse.de>
Acked-by: Kirill Korotaev <dev@openvz.org>
Cc: Olaf Hering <olh@suse.de>
Acked-by: Balbir Singh <balbir@in.ibm.com>
Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Balbir Singh <balbir@in.ibm.com>
Acked-by: David Howells <dhowells@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/dcache.c            | 66 +++++++++++++++++++++++++++++++++++++++++++++-----
 fs/super.c             |  2 +-
 include/linux/dcache.h |  2 +-
 3 files changed, 62 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/fs/dcache.c b/fs/dcache.c
index 940d188e5d14..385f5dbc4b0c 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -382,6 +382,8 @@ static inline void prune_one_dentry(struct dentry * dentry)
 /**
  * prune_dcache - shrink the dcache
  * @count: number of entries to try and free
+ * @sb: if given, ignore dentries for other superblocks
+ *         which are being unmounted.
  *
  * Shrink the dcache. This is done when we need
  * more memory, or simply when we need to unmount
@@ -392,16 +394,29 @@ static inline void prune_one_dentry(struct dentry * dentry)
  * all the dentries are in use.
  */
  
-static void prune_dcache(int count)
+static void prune_dcache(int count, struct super_block *sb)
 {
 	spin_lock(&dcache_lock);
 	for (; count ; count--) {
 		struct dentry *dentry;
 		struct list_head *tmp;
+		struct rw_semaphore *s_umount;
 
 		cond_resched_lock(&dcache_lock);
 
 		tmp = dentry_unused.prev;
+		if (unlikely(sb)) {
+			/* Try to find a dentry for this sb, but don't try
+			 * too hard, if they aren't near the tail they will
+			 * be moved down again soon
+			 */
+			int skip = count;
+			while (skip && tmp != &dentry_unused &&
+			    list_entry(tmp, struct dentry, d_lru)->d_sb != sb) {
+				skip--;
+				tmp = tmp->prev;
+			}
+		}
 		if (tmp == &dentry_unused)
 			break;
 		list_del_init(tmp);
@@ -427,7 +442,45 @@ static void prune_dcache(int count)
  			spin_unlock(&dentry->d_lock);
 			continue;
 		}
-		prune_one_dentry(dentry);
+		/*
+		 * If the dentry is not DCACHED_REFERENCED, it is time
+		 * to remove it from the dcache, provided the super block is
+		 * NULL (which means we are trying to reclaim memory)
+		 * or this dentry belongs to the same super block that
+		 * we want to shrink.
+		 */
+		/*
+		 * If this dentry is for "my" filesystem, then I can prune it
+		 * without taking the s_umount lock (I already hold it).
+		 */
+		if (sb && dentry->d_sb == sb) {
+			prune_one_dentry(dentry);
+			continue;
+		}
+		/*
+		 * ...otherwise we need to be sure this filesystem isn't being
+		 * unmounted, otherwise we could race with
+		 * generic_shutdown_super(), and end up holding a reference to
+		 * an inode while the filesystem is unmounted.
+		 * So we try to get s_umount, and make sure s_root isn't NULL.
+		 * (Take a local copy of s_umount to avoid a use-after-free of
+		 * `dentry').
+		 */
+		s_umount = &dentry->d_sb->s_umount;
+		if (down_read_trylock(s_umount)) {
+			if (dentry->d_sb->s_root != NULL) {
+				prune_one_dentry(dentry);
+				up_read(s_umount);
+				continue;
+			}
+			up_read(s_umount);
+		}
+		spin_unlock(&dentry->d_lock);
+		/* Cannot remove the first dentry, and it isn't appropriate
+		 * to move it to the head of the list, so give up, and try
+		 * later
+		 */
+		break;
 	}
 	spin_unlock(&dcache_lock);
 }
@@ -630,7 +683,7 @@ void shrink_dcache_parent(struct dentry * parent)
 	int found;
 
 	while ((found = select_parent(parent)) != 0)
-		prune_dcache(found);
+		prune_dcache(found, parent->d_sb);
 }
 
 /**
@@ -643,9 +696,10 @@ void shrink_dcache_parent(struct dentry * parent)
  * done under dcache_lock.
  *
  */
-void shrink_dcache_anon(struct hlist_head *head)
+void shrink_dcache_anon(struct super_block *sb)
 {
 	struct hlist_node *lp;
+	struct hlist_head *head = &sb->s_anon;
 	int found;
 	do {
 		found = 0;
@@ -668,7 +722,7 @@ void shrink_dcache_anon(struct hlist_head *head)
 			}
 		}
 		spin_unlock(&dcache_lock);
-		prune_dcache(found);
+		prune_dcache(found, sb);
 	} while(found);
 }
 
@@ -689,7 +743,7 @@ static int shrink_dcache_memory(int nr, gfp_t gfp_mask)
 	if (nr) {
 		if (!(gfp_mask & __GFP_FS))
 			return -1;
-		prune_dcache(nr);
+		prune_dcache(nr, NULL);
 	}
 	return (dentry_stat.nr_unused / 100) * sysctl_vfs_cache_pressure;
 }
diff --git a/fs/super.c b/fs/super.c
index a66f66bb8049..9d5c2add7228 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -231,7 +231,7 @@ void generic_shutdown_super(struct super_block *sb)
 	if (root) {
 		sb->s_root = NULL;
 		shrink_dcache_parent(root);
-		shrink_dcache_anon(&sb->s_anon);
+		shrink_dcache_anon(sb);
 		dput(root);
 		fsync_super(sb);
 		lock_super(sb);
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 836325ee0931..46d0e079735d 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -217,7 +217,7 @@ extern struct dentry * d_alloc_anon(struct inode *);
 extern struct dentry * d_splice_alias(struct inode *, struct dentry *);
 extern void shrink_dcache_sb(struct super_block *);
 extern void shrink_dcache_parent(struct dentry *);
-extern void shrink_dcache_anon(struct hlist_head *);
+extern void shrink_dcache_anon(struct super_block *);
 extern int d_invalidate(struct dentry *);
 
 /* only used at mount-time */
-- 
cgit v1.2.3


From 4f3865fb57a04db7cca068fed1c15badc064a302 Mon Sep 17 00:00:00 2001
From: Richard Purdie <rpurdie@rpsys.net>
Date: Thu, 22 Jun 2006 14:47:34 -0700
Subject: [PATCH] zlib_inflate: Upgrade library code to a recent version

Upgrade the zlib_inflate implementation in the kernel from a patched
version 1.1.3/4 to a patched 1.2.3.

The code in the kernel is about seven years old and I noticed that the
external zlib library's inflate performance was significantly faster (~50%)
than the code in the kernel on ARM (and faster again on x86_32).

For comparison the newer deflate code is 20% slower on ARM and 50% slower
on x86_32 but gives an approx 1% compression ratio improvement.  I don't
consider this to be an improvement for kernel use so have no plans to
change the zlib_deflate code.

Various changes have been made to the zlib code in the kernel, the most
significant being the extra functions/flush option used by ppp_deflate.
This update reimplements the features PPP needs to ensure it continues to
work.

This code has been tested on ARM under both JFFS2 (with zlib compression
enabled) and ppp_deflate and on x86_32.  JFFS2 sees an approx.  10% real
world file read speed improvement.

This patch also removes ZLIB_VERSION as it no longer has a correct value.
We don't need version checks anyway as the kernel's module handling will
take care of that for us.  This removal is also more in keeping with the
zlib author's wishes (http://www.zlib.net/zlib_faq.html#faq24) and I've
added something to the zlib.h header to note its a modified version.

Signed-off-by: Richard Purdie <rpurdie@rpsys.net>
Acked-by: Joern Engel <joern@wh.fh-wedel.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/powerpc/boot/Makefile      |    4 +-
 arch/ppc/boot/lib/Makefile      |    2 +-
 arch/xtensa/boot/lib/Makefile   |    2 +-
 include/linux/zconf.h           |   12 +
 include/linux/zlib.h            |  209 +++++---
 include/linux/zutil.h           |   12 -
 lib/zlib_deflate/deflate.c      |   25 +-
 lib/zlib_deflate/deflate_syms.c |    3 +-
 lib/zlib_inflate/Makefile       |    4 +-
 lib/zlib_inflate/infblock.c     |  365 -------------
 lib/zlib_inflate/infblock.h     |   48 --
 lib/zlib_inflate/infcodes.c     |  202 --------
 lib/zlib_inflate/infcodes.h     |   33 --
 lib/zlib_inflate/inffast.c      |  462 +++++++++++------
 lib/zlib_inflate/inffast.h      |   12 +-
 lib/zlib_inflate/inffixed.h     |   94 ++++
 lib/zlib_inflate/inflate.c      | 1086 +++++++++++++++++++++++++++++++--------
 lib/zlib_inflate/inflate.h      |  107 ++++
 lib/zlib_inflate/inflate_syms.c |    3 +-
 lib/zlib_inflate/inflate_sync.c |  152 ------
 lib/zlib_inflate/inftrees.c     |  683 +++++++++++-------------
 lib/zlib_inflate/inftrees.h     |   99 ++--
 lib/zlib_inflate/infutil.c      |   88 ----
 lib/zlib_inflate/infutil.h      |  176 +------
 24 files changed, 1877 insertions(+), 2006 deletions(-)
 delete mode 100644 lib/zlib_inflate/infblock.c
 delete mode 100644 lib/zlib_inflate/infblock.h
 delete mode 100644 lib/zlib_inflate/infcodes.c
 delete mode 100644 lib/zlib_inflate/infcodes.h
 create mode 100644 lib/zlib_inflate/inffixed.h
 create mode 100644 lib/zlib_inflate/inflate.h
 delete mode 100644 lib/zlib_inflate/inflate_sync.c
 delete mode 100644 lib/zlib_inflate/infutil.c

(limited to 'include/linux')

diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile
index 840ae595a617..d961bfeed05f 100644
--- a/arch/powerpc/boot/Makefile
+++ b/arch/powerpc/boot/Makefile
@@ -29,8 +29,8 @@ OBJCOPYFLAGS    := contents,alloc,load,readonly,data
 OBJCOPY_COFF_ARGS := -O aixcoff-rs6000 --set-start 0x500000
 OBJCOPY_MIB_ARGS  := -O aixcoff-rs6000 -R .stab -R .stabstr -R .comment
 
-zlib       := infblock.c infcodes.c inffast.c inflate.c inftrees.c infutil.c
-zlibheader := infblock.h infcodes.h inffast.h inftrees.h infutil.h
+zlib       := inffast.c inflate.c inftrees.c
+zlibheader := inffast.h inffixed.h inflate.h inftrees.h infutil.h
 zliblinuxheader := zlib.h zconf.h zutil.h
 
 $(addprefix $(obj)/,$(zlib) main.o): $(addprefix $(obj)/,$(zliblinuxheader)) $(addprefix $(obj)/,$(zlibheader))
diff --git a/arch/ppc/boot/lib/Makefile b/arch/ppc/boot/lib/Makefile
index 80c84d562fa4..2f995f712ec5 100644
--- a/arch/ppc/boot/lib/Makefile
+++ b/arch/ppc/boot/lib/Makefile
@@ -5,7 +5,7 @@
 CFLAGS_kbd.o	:= -Idrivers/char
 CFLAGS_vreset.o := -Iarch/ppc/boot/include
 
-zlib  := infblock.c infcodes.c inffast.c inflate.c inftrees.c infutil.c
+zlib  := inffast.c inflate.c inftrees.c
 	 
 lib-y += $(zlib:.c=.o) div64.o
 lib-$(CONFIG_VGA_CONSOLE) += vreset.o kbd.o
diff --git a/arch/xtensa/boot/lib/Makefile b/arch/xtensa/boot/lib/Makefile
index 9e73bb8aeb7a..d3d2aa2d883a 100644
--- a/arch/xtensa/boot/lib/Makefile
+++ b/arch/xtensa/boot/lib/Makefile
@@ -2,7 +2,7 @@
 # Makefile for some libs needed by zImage.
 #
 
-zlib	:= infblock.c infcodes.c inffast.c inflate.c inftrees.c infutil.c
+zlib	:= inffast.c inflate.c inftrees.c
 
 lib-y	+= $(zlib:.c=.o) zmem.o
 
diff --git a/include/linux/zconf.h b/include/linux/zconf.h
index f1cfd66b9554..0beb75e38caa 100644
--- a/include/linux/zconf.h
+++ b/include/linux/zconf.h
@@ -33,6 +33,18 @@
  */
 #ifndef MAX_WBITS
 #  define MAX_WBITS   15 /* 32K LZ77 window */
+#endif
+
+/* default windowBits for decompression. MAX_WBITS is for compression only */
+#ifndef DEF_WBITS
+#  define DEF_WBITS MAX_WBITS
+#endif
+
+/* default memLevel */
+#if MAX_MEM_LEVEL >= 8
+#  define DEF_MEM_LEVEL 8
+#else
+#  define DEF_MEM_LEVEL  MAX_MEM_LEVEL
 #endif
 
                         /* Type declarations */
diff --git a/include/linux/zlib.h b/include/linux/zlib.h
index 4fa32f0d4df8..9e3192a7dc6f 100644
--- a/include/linux/zlib.h
+++ b/include/linux/zlib.h
@@ -1,7 +1,6 @@
 /* zlib.h -- interface of the 'zlib' general purpose compression library
-  version 1.1.3, July 9th, 1998
 
-  Copyright (C) 1995-1998 Jean-loup Gailly and Mark Adler
+  Copyright (C) 1995-2005 Jean-loup Gailly and Mark Adler
 
   This software is provided 'as-is', without any express or implied
   warranty.  In no event will the authors be held liable for any damages
@@ -24,7 +23,7 @@
 
 
   The data format used by the zlib library is described by RFCs (Request for
-  Comments) 1950 to 1952 in the files ftp://ds.internic.net/rfc/rfc1950.txt
+  Comments) 1950 to 1952 in the files http://www.ietf.org/rfc/rfc1950.txt
   (zlib format), rfc1951.txt (deflate format) and rfc1952.txt (gzip format).
 */
 
@@ -33,7 +32,22 @@
 
 #include <linux/zconf.h>
 
-#define ZLIB_VERSION "1.1.3"
+/* zlib deflate based on ZLIB_VERSION "1.1.3" */
+/* zlib inflate based on ZLIB_VERSION "1.2.3" */
+
+/*
+  This is a modified version of zlib for use inside the Linux kernel.
+  The main changes are to perform all memory allocation in advance.
+
+  Inflation Changes:
+    * Z_PACKET_FLUSH is added and used by ppp_deflate. Before returning
+      this checks there is no more input data available and the next data
+      is a STORED block. It also resets the mode to be read for the next
+      data, all as per PPP requirements.
+    * Addition of zlib_inflateIncomp which copies incompressible data into
+      the history window and adjusts the accoutning without calling
+      zlib_inflate itself to inflate the data.
+*/
 
 /* 
      The 'zlib' compression library provides in-memory compression and
@@ -48,9 +62,18 @@
   application must provide more input and/or consume the output
   (providing more output space) before each call.
 
+     The compressed data format used by default by the in-memory functions is
+  the zlib format, which is a zlib wrapper documented in RFC 1950, wrapped
+  around a deflate stream, which is itself documented in RFC 1951.
+
      The library also supports reading and writing files in gzip (.gz) format
   with an interface similar to that of stdio.
 
+     The zlib format was designed to be compact and fast for use in memory
+  and on communications channels.  The gzip format was designed for single-
+  file compression on file systems, has a larger header than zlib to maintain
+  directory information, and uses a different, slower check method than zlib.
+
      The library does not install any signal handler. The decoder checks
   the consistency of the compressed data, so the library should never
   crash even in case of corrupted input.
@@ -119,7 +142,8 @@ typedef z_stream *z_streamp;
 #define Z_SYNC_FLUSH    3
 #define Z_FULL_FLUSH    4
 #define Z_FINISH        5
-/* Allowed flush values; see deflate() below for details */
+#define Z_BLOCK         6 /* Only for inflate at present */
+/* Allowed flush values; see deflate() and inflate() below for details */
 
 #define Z_OK            0
 #define Z_STREAM_END    1
@@ -155,13 +179,6 @@ typedef z_stream *z_streamp;
 
                         /* basic functions */
 
-extern const char * zlib_zlibVersion (void);
-/* The application can compare zlibVersion and ZLIB_VERSION for consistency.
-   If the first character differs, the library code actually used is
-   not compatible with the zlib.h header file used by the application.
-   This check is automatically made by deflateInit and inflateInit.
- */
-
 extern int zlib_deflate_workspacesize (void);
 /*
    Returns the number of bytes that needs to be allocated for a per-
@@ -315,9 +332,9 @@ extern int zlib_inflateInit (z_streamp strm);
 extern int zlib_inflate (z_streamp strm, int flush);
 /*
     inflate decompresses as much data as possible, and stops when the input
-  buffer becomes empty or the output buffer becomes full. It may some
-  introduce some output latency (reading input without producing any output)
-  except when forced to flush.
+  buffer becomes empty or the output buffer becomes full. It may introduce
+  some output latency (reading input without producing any output) except when
+  forced to flush.
 
   The detailed semantics are as follows. inflate performs one or both of the
   following actions:
@@ -341,11 +358,26 @@ extern int zlib_inflate (z_streamp strm, int flush);
   must be called again after making room in the output buffer because there
   might be more output pending.
 
-    If the parameter flush is set to Z_SYNC_FLUSH, inflate flushes as much
-  output as possible to the output buffer. The flushing behavior of inflate is
-  not specified for values of the flush parameter other than Z_SYNC_FLUSH
-  and Z_FINISH, but the current implementation actually flushes as much output
-  as possible anyway.
+    The flush parameter of inflate() can be Z_NO_FLUSH, Z_SYNC_FLUSH,
+  Z_FINISH, or Z_BLOCK. Z_SYNC_FLUSH requests that inflate() flush as much
+  output as possible to the output buffer. Z_BLOCK requests that inflate() stop
+  if and when it gets to the next deflate block boundary. When decoding the
+  zlib or gzip format, this will cause inflate() to return immediately after
+  the header and before the first block. When doing a raw inflate, inflate()
+  will go ahead and process the first block, and will return when it gets to
+  the end of that block, or when it runs out of data.
+
+    The Z_BLOCK option assists in appending to or combining deflate streams.
+  Also to assist in this, on return inflate() will set strm->data_type to the
+  number of unused bits in the last byte taken from strm->next_in, plus 64
+  if inflate() is currently decoding the last block in the deflate stream,
+  plus 128 if inflate() returned immediately after decoding an end-of-block
+  code or decoding the complete header up to just before the first byte of the
+  deflate stream. The end-of-block will not be indicated until all of the
+  uncompressed data from that block has been written to strm->next_out.  The
+  number of unused bits may in general be greater than seven, except when
+  bit 7 of data_type is set, in which case the number of unused bits will be
+  less than eight.
 
     inflate() should normally be called until it returns Z_STREAM_END or an
   error. However if all decompression is to be performed in a single step
@@ -355,29 +387,44 @@ extern int zlib_inflate (z_streamp strm, int flush);
   uncompressed data. (The size of the uncompressed data may have been saved
   by the compressor for this purpose.) The next operation on this stream must
   be inflateEnd to deallocate the decompression state. The use of Z_FINISH
-  is never required, but can be used to inform inflate that a faster routine
+  is never required, but can be used to inform inflate that a faster approach
   may be used for the single inflate() call.
 
-     If a preset dictionary is needed at this point (see inflateSetDictionary
-  below), inflate sets strm-adler to the adler32 checksum of the
-  dictionary chosen by the compressor and returns Z_NEED_DICT; otherwise 
-  it sets strm->adler to the adler32 checksum of all output produced
-  so far (that is, total_out bytes) and returns Z_OK, Z_STREAM_END or
-  an error code as described below. At the end of the stream, inflate()
-  checks that its computed adler32 checksum is equal to that saved by the
-  compressor and returns Z_STREAM_END only if the checksum is correct.
+     In this implementation, inflate() always flushes as much output as
+  possible to the output buffer, and always uses the faster approach on the
+  first call. So the only effect of the flush parameter in this implementation
+  is on the return value of inflate(), as noted below, or when it returns early
+  because Z_BLOCK is used.
+
+     If a preset dictionary is needed after this call (see inflateSetDictionary
+  below), inflate sets strm->adler to the adler32 checksum of the dictionary
+  chosen by the compressor and returns Z_NEED_DICT; otherwise it sets
+  strm->adler to the adler32 checksum of all output produced so far (that is,
+  total_out bytes) and returns Z_OK, Z_STREAM_END or an error code as described
+  below. At the end of the stream, inflate() checks that its computed adler32
+  checksum is equal to that saved by the compressor and returns Z_STREAM_END
+  only if the checksum is correct.
+
+    inflate() will decompress and check either zlib-wrapped or gzip-wrapped
+  deflate data.  The header type is detected automatically.  Any information
+  contained in the gzip header is not retained, so applications that need that
+  information should instead use raw inflate, see inflateInit2() below, or
+  inflateBack() and perform their own processing of the gzip header and
+  trailer.
 
     inflate() returns Z_OK if some progress has been made (more input processed
   or more output produced), Z_STREAM_END if the end of the compressed data has
   been reached and all uncompressed output has been produced, Z_NEED_DICT if a
   preset dictionary is needed at this point, Z_DATA_ERROR if the input data was
-  corrupted (input stream not conforming to the zlib format or incorrect
-  adler32 checksum), Z_STREAM_ERROR if the stream structure was inconsistent
-  (for example if next_in or next_out was NULL), Z_MEM_ERROR if there was not
-  enough memory, Z_BUF_ERROR if no progress is possible or if there was not
-  enough room in the output buffer when Z_FINISH is used. In the Z_DATA_ERROR
-  case, the application may then call inflateSync to look for a good
-  compression block.
+  corrupted (input stream not conforming to the zlib format or incorrect check
+  value), Z_STREAM_ERROR if the stream structure was inconsistent (for example
+  if next_in or next_out was NULL), Z_MEM_ERROR if there was not enough memory,
+  Z_BUF_ERROR if no progress is possible or if there was not enough room in the
+  output buffer when Z_FINISH is used. Note that Z_BUF_ERROR is not fatal, and
+  inflate() can be called again with more input and more output space to
+  continue decompressing. If Z_DATA_ERROR is returned, the application may then
+  call inflateSync() to look for a good compression block if a partial recovery
+  of the data is desired.
 */
 
 
@@ -547,16 +594,36 @@ extern int inflateInit2 (z_streamp strm, int  windowBits);
      The windowBits parameter is the base two logarithm of the maximum window
    size (the size of the history buffer).  It should be in the range 8..15 for
    this version of the library. The default value is 15 if inflateInit is used
-   instead. If a compressed stream with a larger window size is given as
-   input, inflate() will return with the error code Z_DATA_ERROR instead of
-   trying to allocate a larger window.
-
-      inflateInit2 returns Z_OK if success, Z_MEM_ERROR if there was not enough
-   memory, Z_STREAM_ERROR if a parameter is invalid (such as a negative
-   memLevel). msg is set to null if there is no error message.  inflateInit2
-   does not perform any decompression apart from reading the zlib header if
-   present: this will be done by inflate(). (So next_in and avail_in may be
-   modified, but next_out and avail_out are unchanged.)
+   instead. windowBits must be greater than or equal to the windowBits value
+   provided to deflateInit2() while compressing, or it must be equal to 15 if
+   deflateInit2() was not used. If a compressed stream with a larger window
+   size is given as input, inflate() will return with the error code
+   Z_DATA_ERROR instead of trying to allocate a larger window.
+
+     windowBits can also be -8..-15 for raw inflate. In this case, -windowBits
+   determines the window size. inflate() will then process raw deflate data,
+   not looking for a zlib or gzip header, not generating a check value, and not
+   looking for any check values for comparison at the end of the stream. This
+   is for use with other formats that use the deflate compressed data format
+   such as zip.  Those formats provide their own check values. If a custom
+   format is developed using the raw deflate format for compressed data, it is
+   recommended that a check value such as an adler32 or a crc32 be applied to
+   the uncompressed data as is done in the zlib, gzip, and zip formats.  For
+   most applications, the zlib format should be used as is. Note that comments
+   above on the use in deflateInit2() applies to the magnitude of windowBits.
+
+     windowBits can also be greater than 15 for optional gzip decoding. Add
+   32 to windowBits to enable zlib and gzip decoding with automatic header
+   detection, or add 16 to decode only the gzip format (the zlib format will
+   return a Z_DATA_ERROR).  If a gzip stream is being decoded, strm->adler is
+   a crc32 instead of an adler32.
+
+     inflateInit2 returns Z_OK if success, Z_MEM_ERROR if there was not enough
+   memory, Z_STREAM_ERROR if a parameter is invalid (such as a null strm). msg
+   is set to null if there is no error message.  inflateInit2 does not perform
+   any decompression apart from reading the zlib header if present: this will
+   be done by inflate(). (So next_in and avail_in may be modified, but next_out
+   and avail_out are unchanged.)
 */
 
 extern int zlib_inflateSetDictionary (z_streamp strm,
@@ -564,16 +631,19 @@ extern int zlib_inflateSetDictionary (z_streamp strm,
 						     uInt  dictLength);
 /*
      Initializes the decompression dictionary from the given uncompressed byte
-   sequence. This function must be called immediately after a call of inflate
-   if this call returned Z_NEED_DICT. The dictionary chosen by the compressor
-   can be determined from the Adler32 value returned by this call of
-   inflate. The compressor and decompressor must use exactly the same
-   dictionary (see deflateSetDictionary).
+   sequence. This function must be called immediately after a call of inflate,
+   if that call returned Z_NEED_DICT. The dictionary chosen by the compressor
+   can be determined from the adler32 value returned by that call of inflate.
+   The compressor and decompressor must use exactly the same dictionary (see
+   deflateSetDictionary).  For raw inflate, this function can be called
+   immediately after inflateInit2() or inflateReset() and before any call of
+   inflate() to set the dictionary.  The application must insure that the
+   dictionary that was used for compression is provided.
 
      inflateSetDictionary returns Z_OK if success, Z_STREAM_ERROR if a
    parameter is invalid (such as NULL dictionary) or the stream state is
    inconsistent, Z_DATA_ERROR if the given dictionary doesn't match the
-   expected one (incorrect Adler32 value). inflateSetDictionary does not
+   expected one (incorrect adler32 value). inflateSetDictionary does not
    perform any decompression: this will be done by subsequent calls of
    inflate().
 */
@@ -614,40 +684,19 @@ extern int zlib_inflateIncomp (z_stream *strm);
    containing the data at next_in (except that the data is not output).
 */
 
-                        /* various hacks, don't look :) */
-
-/* deflateInit and inflateInit are macros to allow checking the zlib version
- * and the compiler's view of z_stream:
- */
-extern int zlib_deflateInit_ (z_streamp strm, int level,
-                                     const char *version, int stream_size);
-extern int zlib_inflateInit_ (z_streamp strm,
-                                     const char *version, int stream_size);
-extern int zlib_deflateInit2_ (z_streamp strm, int  level, int  method,
-                                      int windowBits, int memLevel,
-                                      int strategy, const char *version,
-                                      int stream_size);
-extern int zlib_inflateInit2_ (z_streamp strm, int  windowBits,
-                                      const char *version, int stream_size);
 #define zlib_deflateInit(strm, level) \
-        zlib_deflateInit_((strm), (level), ZLIB_VERSION, sizeof(z_stream))
+	zlib_deflateInit2((strm), (level), Z_DEFLATED, MAX_WBITS, \
+			      DEF_MEM_LEVEL, Z_DEFAULT_STRATEGY)
 #define zlib_inflateInit(strm) \
-        zlib_inflateInit_((strm), ZLIB_VERSION, sizeof(z_stream))
-#define zlib_deflateInit2(strm, level, method, windowBits, memLevel, strategy) \
-        zlib_deflateInit2_((strm),(level),(method),(windowBits),(memLevel),\
-                      (strategy), ZLIB_VERSION, sizeof(z_stream))
-#define zlib_inflateInit2(strm, windowBits) \
-        zlib_inflateInit2_((strm), (windowBits), ZLIB_VERSION, sizeof(z_stream))
+	zlib_inflateInit2((strm), DEF_WBITS)
 
+extern int zlib_deflateInit2(z_streamp strm, int  level, int  method,
+                                      int windowBits, int memLevel,
+                                      int strategy);
+extern int zlib_inflateInit2(z_streamp strm, int  windowBits);
 
 #if !defined(_Z_UTIL_H) && !defined(NO_DUMMY_DECL)
     struct internal_state {int dummy;}; /* hack for buggy compilers */
 #endif
 
-extern const char  * zlib_zError           (int err);
-#if 0
-extern int           zlib_inflateSyncPoint (z_streamp z);
-#endif
-extern const uLong * zlib_get_crc_table    (void);
-
 #endif /* _ZLIB_H */
diff --git a/include/linux/zutil.h b/include/linux/zutil.h
index ee0c59cf2136..6adfa9a6ffe9 100644
--- a/include/linux/zutil.h
+++ b/include/linux/zutil.h
@@ -23,18 +23,6 @@ typedef unsigned long  ulg;
 
         /* common constants */
 
-#ifndef DEF_WBITS
-#  define DEF_WBITS MAX_WBITS
-#endif
-/* default windowBits for decompression. MAX_WBITS is for compression only */
-
-#if MAX_MEM_LEVEL >= 8
-#  define DEF_MEM_LEVEL 8
-#else
-#  define DEF_MEM_LEVEL  MAX_MEM_LEVEL
-#endif
-/* default memLevel */
-
 #define STORED_BLOCK 0
 #define STATIC_TREES 1
 #define DYN_TREES    2
diff --git a/lib/zlib_deflate/deflate.c b/lib/zlib_deflate/deflate.c
index 1653dd9bb01a..c3e4a2baf835 100644
--- a/lib/zlib_deflate/deflate.c
+++ b/lib/zlib_deflate/deflate.c
@@ -164,34 +164,17 @@ static const config configuration_table[10] = {
     memset((char *)s->head, 0, (unsigned)(s->hash_size-1)*sizeof(*s->head));
 
 /* ========================================================================= */
-int zlib_deflateInit_(
-	z_streamp strm,
-	int level,
-	const char *version,
-	int stream_size
-)
-{
-    return zlib_deflateInit2_(strm, level, Z_DEFLATED, MAX_WBITS,
-			      DEF_MEM_LEVEL,
-			      Z_DEFAULT_STRATEGY, version, stream_size);
-    /* To do: ignore strm->next_in if we use it as window */
-}
-
-/* ========================================================================= */
-int zlib_deflateInit2_(
+int zlib_deflateInit2(
 	z_streamp strm,
 	int  level,
 	int  method,
 	int  windowBits,
 	int  memLevel,
-	int  strategy,
-	const char *version,
-	int stream_size
+	int  strategy
 )
 {
     deflate_state *s;
     int noheader = 0;
-    static char* my_version = ZLIB_VERSION;
     deflate_workspace *mem;
 
     ush *overlay;
@@ -199,10 +182,6 @@ int zlib_deflateInit2_(
      * output size for (length,distance) codes is <= 24 bits.
      */
 
-    if (version == NULL || version[0] != my_version[0] ||
-        stream_size != sizeof(z_stream)) {
-	return Z_VERSION_ERROR;
-    }
     if (strm == NULL) return Z_STREAM_ERROR;
 
     strm->msg = NULL;
diff --git a/lib/zlib_deflate/deflate_syms.c b/lib/zlib_deflate/deflate_syms.c
index 767b573d1ef6..ccfe25f3920f 100644
--- a/lib/zlib_deflate/deflate_syms.c
+++ b/lib/zlib_deflate/deflate_syms.c
@@ -12,8 +12,7 @@
 
 EXPORT_SYMBOL(zlib_deflate_workspacesize);
 EXPORT_SYMBOL(zlib_deflate);
-EXPORT_SYMBOL(zlib_deflateInit_);
-EXPORT_SYMBOL(zlib_deflateInit2_);
+EXPORT_SYMBOL(zlib_deflateInit2);
 EXPORT_SYMBOL(zlib_deflateEnd);
 EXPORT_SYMBOL(zlib_deflateReset);
 MODULE_LICENSE("GPL");
diff --git a/lib/zlib_inflate/Makefile b/lib/zlib_inflate/Makefile
index 221c139e0df1..bf065482fa67 100644
--- a/lib/zlib_inflate/Makefile
+++ b/lib/zlib_inflate/Makefile
@@ -15,5 +15,5 @@
 
 obj-$(CONFIG_ZLIB_INFLATE) += zlib_inflate.o
 
-zlib_inflate-objs := infblock.o infcodes.o inffast.o inflate.o \
-		     inflate_sync.o inftrees.o infutil.o inflate_syms.o
+zlib_inflate-objs := inffast.o inflate.o \
+		     inftrees.o inflate_syms.o
diff --git a/lib/zlib_inflate/infblock.c b/lib/zlib_inflate/infblock.c
deleted file mode 100644
index c16cdeff51aa..000000000000
--- a/lib/zlib_inflate/infblock.c
+++ /dev/null
@@ -1,365 +0,0 @@
-/* infblock.c -- interpret and process block types to last block
- * Copyright (C) 1995-1998 Mark Adler
- * For conditions of distribution and use, see copyright notice in zlib.h 
- */
-
-#include <linux/zutil.h>
-#include "infblock.h"
-#include "inftrees.h"
-#include "infcodes.h"
-#include "infutil.h"
-
-struct inflate_codes_state;
-
-/* simplify the use of the inflate_huft type with some defines */
-#define exop word.what.Exop
-#define bits word.what.Bits
-
-/* Table for deflate from PKZIP's appnote.txt. */
-static const uInt border[] = { /* Order of the bit length code lengths */
-        16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15};
-
-/*
-   Notes beyond the 1.93a appnote.txt:
-
-   1. Distance pointers never point before the beginning of the output
-      stream.
-   2. Distance pointers can point back across blocks, up to 32k away.
-   3. There is an implied maximum of 7 bits for the bit length table and
-      15 bits for the actual data.
-   4. If only one code exists, then it is encoded using one bit.  (Zero
-      would be more efficient, but perhaps a little confusing.)  If two
-      codes exist, they are coded using one bit each (0 and 1).
-   5. There is no way of sending zero distance codes--a dummy must be
-      sent if there are none.  (History: a pre 2.0 version of PKZIP would
-      store blocks with no distance codes, but this was discovered to be
-      too harsh a criterion.)  Valid only for 1.93a.  2.04c does allow
-      zero distance codes, which is sent as one code of zero bits in
-      length.
-   6. There are up to 286 literal/length codes.  Code 256 represents the
-      end-of-block.  Note however that the static length tree defines
-      288 codes just to fill out the Huffman codes.  Codes 286 and 287
-      cannot be used though, since there is no length base or extra bits
-      defined for them.  Similarily, there are up to 30 distance codes.
-      However, static trees define 32 codes (all 5 bits) to fill out the
-      Huffman codes, but the last two had better not show up in the data.
-   7. Unzip can check dynamic Huffman blocks for complete code sets.
-      The exception is that a single code would not be complete (see #4).
-   8. The five bits following the block type is really the number of
-      literal codes sent minus 257.
-   9. Length codes 8,16,16 are interpreted as 13 length codes of 8 bits
-      (1+6+6).  Therefore, to output three times the length, you output
-      three codes (1+1+1), whereas to output four times the same length,
-      you only need two codes (1+3).  Hmm.
-  10. In the tree reconstruction algorithm, Code = Code + Increment
-      only if BitLength(i) is not zero.  (Pretty obvious.)
-  11. Correction: 4 Bits: # of Bit Length codes - 4     (4 - 19)
-  12. Note: length code 284 can represent 227-258, but length code 285
-      really is 258.  The last length deserves its own, short code
-      since it gets used a lot in very redundant files.  The length
-      258 is special since 258 - 3 (the min match length) is 255.
-  13. The literal/length and distance code bit lengths are read as a
-      single stream of lengths.  It is possible (and advantageous) for
-      a repeat code (16, 17, or 18) to go across the boundary between
-      the two sets of lengths.
- */
-
-
-void zlib_inflate_blocks_reset(
-	inflate_blocks_statef *s,
-	z_streamp z,
-	uLong *c
-)
-{
-  if (c != NULL)
-    *c = s->check;
-  if (s->mode == CODES)
-    zlib_inflate_codes_free(s->sub.decode.codes, z);
-  s->mode = TYPE;
-  s->bitk = 0;
-  s->bitb = 0;
-  s->read = s->write = s->window;
-  if (s->checkfn != NULL)
-    z->adler = s->check = (*s->checkfn)(0L, NULL, 0);
-}
-
-inflate_blocks_statef *zlib_inflate_blocks_new(
-	z_streamp z,
-	check_func c,
-	uInt w
-)
-{
-  inflate_blocks_statef *s;
-
-  s = &WS(z)->working_blocks_state;
-  s->hufts = WS(z)->working_hufts;
-  s->window = WS(z)->working_window;
-  s->end = s->window + w;
-  s->checkfn = c;
-  s->mode = TYPE;
-  zlib_inflate_blocks_reset(s, z, NULL);
-  return s;
-}
-
-
-int zlib_inflate_blocks(
-	inflate_blocks_statef *s,
-	z_streamp z,
-	int r
-)
-{
-  uInt t;               /* temporary storage */
-  uLong b;              /* bit buffer */
-  uInt k;               /* bits in bit buffer */
-  Byte *p;              /* input data pointer */
-  uInt n;               /* bytes available there */
-  Byte *q;              /* output window write pointer */
-  uInt m;               /* bytes to end of window or read pointer */
-
-  /* copy input/output information to locals (UPDATE macro restores) */
-  LOAD
-
-  /* process input based on current state */
-  while (1) switch (s->mode)
-  {
-    case TYPE:
-      NEEDBITS(3)
-      t = (uInt)b & 7;
-      s->last = t & 1;
-      switch (t >> 1)
-      {
-        case 0:                         /* stored */
-          DUMPBITS(3)
-          t = k & 7;                    /* go to byte boundary */
-          DUMPBITS(t)
-          s->mode = LENS;               /* get length of stored block */
-          break;
-        case 1:                         /* fixed */
-          {
-            uInt bl, bd;
-            inflate_huft *tl, *td;
-
-            zlib_inflate_trees_fixed(&bl, &bd, &tl, &td, s->hufts, z);
-            s->sub.decode.codes = zlib_inflate_codes_new(bl, bd, tl, td, z);
-            if (s->sub.decode.codes == NULL)
-            {
-              r = Z_MEM_ERROR;
-              LEAVE
-            }
-          }
-          DUMPBITS(3)
-          s->mode = CODES;
-          break;
-        case 2:                         /* dynamic */
-          DUMPBITS(3)
-          s->mode = TABLE;
-          break;
-        case 3:                         /* illegal */
-          DUMPBITS(3)
-          s->mode = B_BAD;
-          z->msg = (char*)"invalid block type";
-          r = Z_DATA_ERROR;
-          LEAVE
-      }
-      break;
-    case LENS:
-      NEEDBITS(32)
-      if ((((~b) >> 16) & 0xffff) != (b & 0xffff))
-      {
-        s->mode = B_BAD;
-        z->msg = (char*)"invalid stored block lengths";
-        r = Z_DATA_ERROR;
-        LEAVE
-      }
-      s->sub.left = (uInt)b & 0xffff;
-      b = k = 0;                      /* dump bits */
-      s->mode = s->sub.left ? STORED : (s->last ? DRY : TYPE);
-      break;
-    case STORED:
-      if (n == 0)
-        LEAVE
-      NEEDOUT
-      t = s->sub.left;
-      if (t > n) t = n;
-      if (t > m) t = m;
-      memcpy(q, p, t);
-      p += t;  n -= t;
-      q += t;  m -= t;
-      if ((s->sub.left -= t) != 0)
-        break;
-      s->mode = s->last ? DRY : TYPE;
-      break;
-    case TABLE:
-      NEEDBITS(14)
-      s->sub.trees.table = t = (uInt)b & 0x3fff;
-#ifndef PKZIP_BUG_WORKAROUND
-      if ((t & 0x1f) > 29 || ((t >> 5) & 0x1f) > 29)
-      {
-        s->mode = B_BAD;
-        z->msg = (char*)"too many length or distance symbols";
-        r = Z_DATA_ERROR;
-        LEAVE
-      }
-#endif
-      {
-      	s->sub.trees.blens = WS(z)->working_blens;
-      }
-      DUMPBITS(14)
-      s->sub.trees.index = 0;
-      s->mode = BTREE;
-    case BTREE:
-      while (s->sub.trees.index < 4 + (s->sub.trees.table >> 10))
-      {
-        NEEDBITS(3)
-        s->sub.trees.blens[border[s->sub.trees.index++]] = (uInt)b & 7;
-        DUMPBITS(3)
-      }
-      while (s->sub.trees.index < 19)
-        s->sub.trees.blens[border[s->sub.trees.index++]] = 0;
-      s->sub.trees.bb = 7;
-      t = zlib_inflate_trees_bits(s->sub.trees.blens, &s->sub.trees.bb,
-				  &s->sub.trees.tb, s->hufts, z);
-      if (t != Z_OK)
-      {
-        r = t;
-        if (r == Z_DATA_ERROR)
-          s->mode = B_BAD;
-        LEAVE
-      }
-      s->sub.trees.index = 0;
-      s->mode = DTREE;
-    case DTREE:
-      while (t = s->sub.trees.table,
-             s->sub.trees.index < 258 + (t & 0x1f) + ((t >> 5) & 0x1f))
-      {
-        inflate_huft *h;
-        uInt i, j, c;
-
-        t = s->sub.trees.bb;
-        NEEDBITS(t)
-        h = s->sub.trees.tb + ((uInt)b & zlib_inflate_mask[t]);
-        t = h->bits;
-        c = h->base;
-        if (c < 16)
-        {
-          DUMPBITS(t)
-          s->sub.trees.blens[s->sub.trees.index++] = c;
-        }
-        else /* c == 16..18 */
-        {
-          i = c == 18 ? 7 : c - 14;
-          j = c == 18 ? 11 : 3;
-          NEEDBITS(t + i)
-          DUMPBITS(t)
-          j += (uInt)b & zlib_inflate_mask[i];
-          DUMPBITS(i)
-          i = s->sub.trees.index;
-          t = s->sub.trees.table;
-          if (i + j > 258 + (t & 0x1f) + ((t >> 5) & 0x1f) ||
-              (c == 16 && i < 1))
-          {
-            s->mode = B_BAD;
-            z->msg = (char*)"invalid bit length repeat";
-            r = Z_DATA_ERROR;
-            LEAVE
-          }
-          c = c == 16 ? s->sub.trees.blens[i - 1] : 0;
-          do {
-            s->sub.trees.blens[i++] = c;
-          } while (--j);
-          s->sub.trees.index = i;
-        }
-      }
-      s->sub.trees.tb = NULL;
-      {
-        uInt bl, bd;
-        inflate_huft *tl, *td;
-        inflate_codes_statef *c;
-
-        bl = 9;         /* must be <= 9 for lookahead assumptions */
-        bd = 6;         /* must be <= 9 for lookahead assumptions */
-        t = s->sub.trees.table;
-        t = zlib_inflate_trees_dynamic(257 + (t & 0x1f), 1 + ((t >> 5) & 0x1f),
-				       s->sub.trees.blens, &bl, &bd, &tl, &td,
-				       s->hufts, z);
-        if (t != Z_OK)
-        {
-          if (t == (uInt)Z_DATA_ERROR)
-            s->mode = B_BAD;
-          r = t;
-          LEAVE
-        }
-        if ((c = zlib_inflate_codes_new(bl, bd, tl, td, z)) == NULL)
-        {
-          r = Z_MEM_ERROR;
-          LEAVE
-        }
-        s->sub.decode.codes = c;
-      }
-      s->mode = CODES;
-    case CODES:
-      UPDATE
-      if ((r = zlib_inflate_codes(s, z, r)) != Z_STREAM_END)
-        return zlib_inflate_flush(s, z, r);
-      r = Z_OK;
-      zlib_inflate_codes_free(s->sub.decode.codes, z);
-      LOAD
-      if (!s->last)
-      {
-        s->mode = TYPE;
-        break;
-      }
-      s->mode = DRY;
-    case DRY:
-      FLUSH
-      if (s->read != s->write)
-        LEAVE
-      s->mode = B_DONE;
-    case B_DONE:
-      r = Z_STREAM_END;
-      LEAVE
-    case B_BAD:
-      r = Z_DATA_ERROR;
-      LEAVE
-    default:
-      r = Z_STREAM_ERROR;
-      LEAVE
-  }
-}
-
-
-int zlib_inflate_blocks_free(
-	inflate_blocks_statef *s,
-	z_streamp z
-)
-{
-  zlib_inflate_blocks_reset(s, z, NULL);
-  return Z_OK;
-}
-
-
-#if 0
-void zlib_inflate_set_dictionary(
-	inflate_blocks_statef *s,
-	const Byte *d,
-	uInt  n
-)
-{
-  memcpy(s->window, d, n);
-  s->read = s->write = s->window + n;
-}
-#endif  /*  0  */
-
-
-/* Returns true if inflate is currently at the end of a block generated
- * by Z_SYNC_FLUSH or Z_FULL_FLUSH. 
- * IN assertion: s != NULL
- */
-#if 0
-int zlib_inflate_blocks_sync_point(
-	inflate_blocks_statef *s
-)
-{
-  return s->mode == LENS;
-}
-#endif  /*  0  */
diff --git a/lib/zlib_inflate/infblock.h b/lib/zlib_inflate/infblock.h
deleted file mode 100644
index ceee60b5107c..000000000000
--- a/lib/zlib_inflate/infblock.h
+++ /dev/null
@@ -1,48 +0,0 @@
-/* infblock.h -- header to use infblock.c
- * Copyright (C) 1995-1998 Mark Adler
- * For conditions of distribution and use, see copyright notice in zlib.h 
- */
-
-/* WARNING: this file should *not* be used by applications. It is
-   part of the implementation of the compression library and is
-   subject to change. Applications should only use zlib.h.
- */
-
-#ifndef _INFBLOCK_H
-#define _INFBLOCK_H
-
-struct inflate_blocks_state;
-typedef struct inflate_blocks_state inflate_blocks_statef;
-
-extern inflate_blocks_statef * zlib_inflate_blocks_new (
-    z_streamp z,
-    check_func c,              /* check function */
-    uInt w);                   /* window size */
-
-extern int zlib_inflate_blocks (
-    inflate_blocks_statef *,
-    z_streamp ,
-    int);                      /* initial return code */
-
-extern void zlib_inflate_blocks_reset (
-    inflate_blocks_statef *,
-    z_streamp ,
-    uLong *);                  /* check value on output */
-
-extern int zlib_inflate_blocks_free (
-    inflate_blocks_statef *,
-    z_streamp);
-
-#if 0
-extern void zlib_inflate_set_dictionary (
-    inflate_blocks_statef *s,
-    const Byte *d,  /* dictionary */
-    uInt  n);       /* dictionary length */
-#endif  /*  0  */
-
-#if 0
-extern int zlib_inflate_blocks_sync_point (
-    inflate_blocks_statef *s);
-#endif  /*  0  */
-
-#endif /* _INFBLOCK_H */
diff --git a/lib/zlib_inflate/infcodes.c b/lib/zlib_inflate/infcodes.c
deleted file mode 100644
index 07cd7591cbb7..000000000000
--- a/lib/zlib_inflate/infcodes.c
+++ /dev/null
@@ -1,202 +0,0 @@
-/* infcodes.c -- process literals and length/distance pairs
- * Copyright (C) 1995-1998 Mark Adler
- * For conditions of distribution and use, see copyright notice in zlib.h 
- */
-
-#include <linux/zutil.h>
-#include "inftrees.h"
-#include "infblock.h"
-#include "infcodes.h"
-#include "infutil.h"
-#include "inffast.h"
-
-/* simplify the use of the inflate_huft type with some defines */
-#define exop word.what.Exop
-#define bits word.what.Bits
-
-inflate_codes_statef *zlib_inflate_codes_new(
-	uInt bl,
-	uInt bd,
-	inflate_huft *tl,
-	inflate_huft *td, /* need separate declaration for Borland C++ */
-	z_streamp z
-)
-{
-  inflate_codes_statef *c;
-
-  c = &WS(z)->working_state;
-  {
-    c->mode = START;
-    c->lbits = (Byte)bl;
-    c->dbits = (Byte)bd;
-    c->ltree = tl;
-    c->dtree = td;
-  }
-  return c;
-}
-
-
-int zlib_inflate_codes(
-	inflate_blocks_statef *s,
-	z_streamp z,
-	int r
-)
-{
-  uInt j;               /* temporary storage */
-  inflate_huft *t;      /* temporary pointer */
-  uInt e;               /* extra bits or operation */
-  uLong b;              /* bit buffer */
-  uInt k;               /* bits in bit buffer */
-  Byte *p;              /* input data pointer */
-  uInt n;               /* bytes available there */
-  Byte *q;              /* output window write pointer */
-  uInt m;               /* bytes to end of window or read pointer */
-  Byte *f;              /* pointer to copy strings from */
-  inflate_codes_statef *c = s->sub.decode.codes;  /* codes state */
-
-  /* copy input/output information to locals (UPDATE macro restores) */
-  LOAD
-
-  /* process input and output based on current state */
-  while (1) switch (c->mode)
-  {             /* waiting for "i:"=input, "o:"=output, "x:"=nothing */
-    case START:         /* x: set up for LEN */
-#ifndef SLOW
-      if (m >= 258 && n >= 10)
-      {
-        UPDATE
-        r = zlib_inflate_fast(c->lbits, c->dbits, c->ltree, c->dtree, s, z);
-        LOAD
-        if (r != Z_OK)
-        {
-          c->mode = r == Z_STREAM_END ? WASH : BADCODE;
-          break;
-        }
-      }
-#endif /* !SLOW */
-      c->sub.code.need = c->lbits;
-      c->sub.code.tree = c->ltree;
-      c->mode = LEN;
-    case LEN:           /* i: get length/literal/eob next */
-      j = c->sub.code.need;
-      NEEDBITS(j)
-      t = c->sub.code.tree + ((uInt)b & zlib_inflate_mask[j]);
-      DUMPBITS(t->bits)
-      e = (uInt)(t->exop);
-      if (e == 0)               /* literal */
-      {
-        c->sub.lit = t->base;
-        c->mode = LIT;
-        break;
-      }
-      if (e & 16)               /* length */
-      {
-        c->sub.copy.get = e & 15;
-        c->len = t->base;
-        c->mode = LENEXT;
-        break;
-      }
-      if ((e & 64) == 0)        /* next table */
-      {
-        c->sub.code.need = e;
-        c->sub.code.tree = t + t->base;
-        break;
-      }
-      if (e & 32)               /* end of block */
-      {
-        c->mode = WASH;
-        break;
-      }
-      c->mode = BADCODE;        /* invalid code */
-      z->msg = (char*)"invalid literal/length code";
-      r = Z_DATA_ERROR;
-      LEAVE
-    case LENEXT:        /* i: getting length extra (have base) */
-      j = c->sub.copy.get;
-      NEEDBITS(j)
-      c->len += (uInt)b & zlib_inflate_mask[j];
-      DUMPBITS(j)
-      c->sub.code.need = c->dbits;
-      c->sub.code.tree = c->dtree;
-      c->mode = DIST;
-    case DIST:          /* i: get distance next */
-      j = c->sub.code.need;
-      NEEDBITS(j)
-      t = c->sub.code.tree + ((uInt)b & zlib_inflate_mask[j]);
-      DUMPBITS(t->bits)
-      e = (uInt)(t->exop);
-      if (e & 16)               /* distance */
-      {
-        c->sub.copy.get = e & 15;
-        c->sub.copy.dist = t->base;
-        c->mode = DISTEXT;
-        break;
-      }
-      if ((e & 64) == 0)        /* next table */
-      {
-        c->sub.code.need = e;
-        c->sub.code.tree = t + t->base;
-        break;
-      }
-      c->mode = BADCODE;        /* invalid code */
-      z->msg = (char*)"invalid distance code";
-      r = Z_DATA_ERROR;
-      LEAVE
-    case DISTEXT:       /* i: getting distance extra */
-      j = c->sub.copy.get;
-      NEEDBITS(j)
-      c->sub.copy.dist += (uInt)b & zlib_inflate_mask[j];
-      DUMPBITS(j)
-      c->mode = COPY;
-    case COPY:          /* o: copying bytes in window, waiting for space */
-      f = q - c->sub.copy.dist;
-      while (f < s->window)             /* modulo window size-"while" instead */
-        f += s->end - s->window;        /* of "if" handles invalid distances */
-      while (c->len)
-      {
-        NEEDOUT
-        OUTBYTE(*f++)
-        if (f == s->end)
-          f = s->window;
-        c->len--;
-      }
-      c->mode = START;
-      break;
-    case LIT:           /* o: got literal, waiting for output space */
-      NEEDOUT
-      OUTBYTE(c->sub.lit)
-      c->mode = START;
-      break;
-    case WASH:          /* o: got eob, possibly more output */
-      if (k > 7)        /* return unused byte, if any */
-      {
-        k -= 8;
-        n++;
-        p--;            /* can always return one */
-      }
-      FLUSH
-      if (s->read != s->write)
-        LEAVE
-      c->mode = END;
-    case END:
-      r = Z_STREAM_END;
-      LEAVE
-    case BADCODE:       /* x: got error */
-      r = Z_DATA_ERROR;
-      LEAVE
-    default:
-      r = Z_STREAM_ERROR;
-      LEAVE
-  }
-#ifdef NEED_DUMMY_RETURN
-  return Z_STREAM_ERROR;  /* Some dumb compilers complain without this */
-#endif
-}
-
-
-void zlib_inflate_codes_free(
-	inflate_codes_statef *c,
-	z_streamp z
-)
-{
-}
diff --git a/lib/zlib_inflate/infcodes.h b/lib/zlib_inflate/infcodes.h
deleted file mode 100644
index 5cff417523b0..000000000000
--- a/lib/zlib_inflate/infcodes.h
+++ /dev/null
@@ -1,33 +0,0 @@
-/* infcodes.h -- header to use infcodes.c
- * Copyright (C) 1995-1998 Mark Adler
- * For conditions of distribution and use, see copyright notice in zlib.h 
- */
-
-/* WARNING: this file should *not* be used by applications. It is
-   part of the implementation of the compression library and is
-   subject to change. Applications should only use zlib.h.
- */
-
-#ifndef _INFCODES_H
-#define _INFCODES_H
-
-#include "infblock.h"
-
-struct inflate_codes_state;
-typedef struct inflate_codes_state inflate_codes_statef;
-
-extern inflate_codes_statef *zlib_inflate_codes_new (
-    uInt, uInt,
-    inflate_huft *, inflate_huft *,
-    z_streamp );
-
-extern int zlib_inflate_codes (
-    inflate_blocks_statef *,
-    z_streamp ,
-    int);
-
-extern void zlib_inflate_codes_free (
-    inflate_codes_statef *,
-    z_streamp );
-
-#endif /* _INFCODES_H */
diff --git a/lib/zlib_inflate/inffast.c b/lib/zlib_inflate/inffast.c
index 0bd7623fc85a..02a16eacb72d 100644
--- a/lib/zlib_inflate/inffast.c
+++ b/lib/zlib_inflate/inffast.c
@@ -1,176 +1,312 @@
-/* inffast.c -- process literals and length/distance pairs fast
- * Copyright (C) 1995-1998 Mark Adler
- * For conditions of distribution and use, see copyright notice in zlib.h 
+/* inffast.c -- fast decoding
+ * Copyright (C) 1995-2004 Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
  */
 
 #include <linux/zutil.h>
 #include "inftrees.h"
-#include "infblock.h"
-#include "infcodes.h"
-#include "infutil.h"
+#include "inflate.h"
 #include "inffast.h"
 
-struct inflate_codes_state;
-
-/* simplify the use of the inflate_huft type with some defines */
-#define exop word.what.Exop
-#define bits word.what.Bits
-
-/* macros for bit input with no checking and for returning unused bytes */
-#define GRABBITS(j) {while(k<(j)){b|=((uLong)NEXTBYTE)<<k;k+=8;}}
-#define UNGRAB {c=z->avail_in-n;c=(k>>3)<c?k>>3:c;n+=c;p-=c;k-=c<<3;}
-
-/* Called with number of bytes left to write in window at least 258
-   (the maximum string length) and number of input bytes available
-   at least ten.  The ten bytes are six bytes for the longest length/
-   distance pair plus four bytes for overloading the bit buffer. */
-
-int zlib_inflate_fast(
-	uInt bl,
-	uInt bd,
-	inflate_huft *tl,
-	inflate_huft *td, /* need separate declaration for Borland C++ */
-	inflate_blocks_statef *s,
-	z_streamp z
-)
+#ifndef ASMINF
+
+/* Allow machine dependent optimization for post-increment or pre-increment.
+   Based on testing to date,
+   Pre-increment preferred for:
+   - PowerPC G3 (Adler)
+   - MIPS R5000 (Randers-Pehrson)
+   Post-increment preferred for:
+   - none
+   No measurable difference:
+   - Pentium III (Anderson)
+   - M68060 (Nikl)
+ */
+#ifdef POSTINC
+#  define OFF 0
+#  define PUP(a) *(a)++
+#else
+#  define OFF 1
+#  define PUP(a) *++(a)
+#endif
+
+/*
+   Decode literal, length, and distance codes and write out the resulting
+   literal and match bytes until either not enough input or output is
+   available, an end-of-block is encountered, or a data error is encountered.
+   When large enough input and output buffers are supplied to inflate(), for
+   example, a 16K input buffer and a 64K output buffer, more than 95% of the
+   inflate execution time is spent in this routine.
+
+   Entry assumptions:
+
+        state->mode == LEN
+        strm->avail_in >= 6
+        strm->avail_out >= 258
+        start >= strm->avail_out
+        state->bits < 8
+
+   On return, state->mode is one of:
+
+        LEN -- ran out of enough output space or enough available input
+        TYPE -- reached end of block code, inflate() to interpret next block
+        BAD -- error in block data
+
+   Notes:
+
+    - The maximum input bits used by a length/distance pair is 15 bits for the
+      length code, 5 bits for the length extra, 15 bits for the distance code,
+      and 13 bits for the distance extra.  This totals 48 bits, or six bytes.
+      Therefore if strm->avail_in >= 6, then there is enough input to avoid
+      checking for available input while decoding.
+
+    - The maximum bytes that a single length/distance pair can output is 258
+      bytes, which is the maximum length that can be coded.  inflate_fast()
+      requires strm->avail_out >= 258 for each loop to avoid checking for
+      output space.
+ */
+void inflate_fast(strm, start)
+z_streamp strm;
+unsigned start;         /* inflate()'s starting value for strm->avail_out */
 {
-  inflate_huft *t;      /* temporary pointer */
-  uInt e;               /* extra bits or operation */
-  uLong b;              /* bit buffer */
-  uInt k;               /* bits in bit buffer */
-  Byte *p;              /* input data pointer */
-  uInt n;               /* bytes available there */
-  Byte *q;              /* output window write pointer */
-  uInt m;               /* bytes to end of window or read pointer */
-  uInt ml;              /* mask for literal/length tree */
-  uInt md;              /* mask for distance tree */
-  uInt c;               /* bytes to copy */
-  uInt d;               /* distance back to copy from */
-  Byte *r;              /* copy source pointer */
-
-  /* load input, output, bit values */
-  LOAD
-
-  /* initialize masks */
-  ml = zlib_inflate_mask[bl];
-  md = zlib_inflate_mask[bd];
-
-  /* do until not enough input or output space for fast loop */
-  do {                          /* assume called with m >= 258 && n >= 10 */
-    /* get literal/length code */
-    GRABBITS(20)                /* max bits for literal/length code */
-    if ((e = (t = tl + ((uInt)b & ml))->exop) == 0)
-    {
-      DUMPBITS(t->bits)
-      *q++ = (Byte)t->base;
-      m--;
-      continue;
-    }
+    struct inflate_state *state;
+    unsigned char *in;      /* local strm->next_in */
+    unsigned char *last;    /* while in < last, enough input available */
+    unsigned char *out;     /* local strm->next_out */
+    unsigned char *beg;     /* inflate()'s initial strm->next_out */
+    unsigned char *end;     /* while out < end, enough space available */
+#ifdef INFLATE_STRICT
+    unsigned dmax;              /* maximum distance from zlib header */
+#endif
+    unsigned wsize;             /* window size or zero if not using window */
+    unsigned whave;             /* valid bytes in the window */
+    unsigned write;             /* window write index */
+    unsigned char *window;  /* allocated sliding window, if wsize != 0 */
+    unsigned long hold;         /* local strm->hold */
+    unsigned bits;              /* local strm->bits */
+    code const *lcode;      /* local strm->lencode */
+    code const *dcode;      /* local strm->distcode */
+    unsigned lmask;             /* mask for first level of length codes */
+    unsigned dmask;             /* mask for first level of distance codes */
+    code this;                  /* retrieved table entry */
+    unsigned op;                /* code bits, operation, extra bits, or */
+                                /*  window position, window bytes to copy */
+    unsigned len;               /* match length, unused bytes */
+    unsigned dist;              /* match distance */
+    unsigned char *from;    /* where to copy match from */
+
+    /* copy state to local variables */
+    state = (struct inflate_state *)strm->state;
+    in = strm->next_in - OFF;
+    last = in + (strm->avail_in - 5);
+    out = strm->next_out - OFF;
+    beg = out - (start - strm->avail_out);
+    end = out + (strm->avail_out - 257);
+#ifdef INFLATE_STRICT
+    dmax = state->dmax;
+#endif
+    wsize = state->wsize;
+    whave = state->whave;
+    write = state->write;
+    window = state->window;
+    hold = state->hold;
+    bits = state->bits;
+    lcode = state->lencode;
+    dcode = state->distcode;
+    lmask = (1U << state->lenbits) - 1;
+    dmask = (1U << state->distbits) - 1;
+
+    /* decode literals and length/distances until end-of-block or not enough
+       input data or output space */
     do {
-      DUMPBITS(t->bits)
-      if (e & 16)
-      {
-        /* get extra bits for length */
-        e &= 15;
-        c = t->base + ((uInt)b & zlib_inflate_mask[e]);
-        DUMPBITS(e)
-
-        /* decode distance base of block to copy */
-        GRABBITS(15);           /* max bits for distance code */
-        e = (t = td + ((uInt)b & md))->exop;
-        do {
-          DUMPBITS(t->bits)
-          if (e & 16)
-          {
-            /* get extra bits to add to distance base */
-            e &= 15;
-            GRABBITS(e)         /* get extra bits (up to 13) */
-            d = t->base + ((uInt)b & zlib_inflate_mask[e]);
-            DUMPBITS(e)
-
-            /* do the copy */
-            m -= c;
-            r = q - d;
-            if (r < s->window)                  /* wrap if needed */
-            {
-              do {
-                r += s->end - s->window;        /* force pointer in window */
-              } while (r < s->window);          /* covers invalid distances */
-              e = s->end - r;
-              if (c > e)
-              {
-                c -= e;                         /* wrapped copy */
-                do {
-                    *q++ = *r++;
-                } while (--e);
-                r = s->window;
-                do {
-                    *q++ = *r++;
-                } while (--c);
-              }
-              else                              /* normal copy */
-              {
-                *q++ = *r++;  c--;
-                *q++ = *r++;  c--;
-                do {
-                    *q++ = *r++;
-                } while (--c);
-              }
+        if (bits < 15) {
+            hold += (unsigned long)(PUP(in)) << bits;
+            bits += 8;
+            hold += (unsigned long)(PUP(in)) << bits;
+            bits += 8;
+        }
+        this = lcode[hold & lmask];
+      dolen:
+        op = (unsigned)(this.bits);
+        hold >>= op;
+        bits -= op;
+        op = (unsigned)(this.op);
+        if (op == 0) {                          /* literal */
+            PUP(out) = (unsigned char)(this.val);
+        }
+        else if (op & 16) {                     /* length base */
+            len = (unsigned)(this.val);
+            op &= 15;                           /* number of extra bits */
+            if (op) {
+                if (bits < op) {
+                    hold += (unsigned long)(PUP(in)) << bits;
+                    bits += 8;
+                }
+                len += (unsigned)hold & ((1U << op) - 1);
+                hold >>= op;
+                bits -= op;
+            }
+            if (bits < 15) {
+                hold += (unsigned long)(PUP(in)) << bits;
+                bits += 8;
+                hold += (unsigned long)(PUP(in)) << bits;
+                bits += 8;
+            }
+            this = dcode[hold & dmask];
+          dodist:
+            op = (unsigned)(this.bits);
+            hold >>= op;
+            bits -= op;
+            op = (unsigned)(this.op);
+            if (op & 16) {                      /* distance base */
+                dist = (unsigned)(this.val);
+                op &= 15;                       /* number of extra bits */
+                if (bits < op) {
+                    hold += (unsigned long)(PUP(in)) << bits;
+                    bits += 8;
+                    if (bits < op) {
+                        hold += (unsigned long)(PUP(in)) << bits;
+                        bits += 8;
+                    }
+                }
+                dist += (unsigned)hold & ((1U << op) - 1);
+#ifdef INFLATE_STRICT
+                if (dist > dmax) {
+                    strm->msg = (char *)"invalid distance too far back";
+                    state->mode = BAD;
+                    break;
+                }
+#endif
+                hold >>= op;
+                bits -= op;
+                op = (unsigned)(out - beg);     /* max distance in output */
+                if (dist > op) {                /* see if copy from window */
+                    op = dist - op;             /* distance back in window */
+                    if (op > whave) {
+                        strm->msg = (char *)"invalid distance too far back";
+                        state->mode = BAD;
+                        break;
+                    }
+                    from = window - OFF;
+                    if (write == 0) {           /* very common case */
+                        from += wsize - op;
+                        if (op < len) {         /* some from window */
+                            len -= op;
+                            do {
+                                PUP(out) = PUP(from);
+                            } while (--op);
+                            from = out - dist;  /* rest from output */
+                        }
+                    }
+                    else if (write < op) {      /* wrap around window */
+                        from += wsize + write - op;
+                        op -= write;
+                        if (op < len) {         /* some from end of window */
+                            len -= op;
+                            do {
+                                PUP(out) = PUP(from);
+                            } while (--op);
+                            from = window - OFF;
+                            if (write < len) {  /* some from start of window */
+                                op = write;
+                                len -= op;
+                                do {
+                                    PUP(out) = PUP(from);
+                                } while (--op);
+                                from = out - dist;      /* rest from output */
+                            }
+                        }
+                    }
+                    else {                      /* contiguous in window */
+                        from += write - op;
+                        if (op < len) {         /* some from window */
+                            len -= op;
+                            do {
+                                PUP(out) = PUP(from);
+                            } while (--op);
+                            from = out - dist;  /* rest from output */
+                        }
+                    }
+                    while (len > 2) {
+                        PUP(out) = PUP(from);
+                        PUP(out) = PUP(from);
+                        PUP(out) = PUP(from);
+                        len -= 3;
+                    }
+                    if (len) {
+                        PUP(out) = PUP(from);
+                        if (len > 1)
+                            PUP(out) = PUP(from);
+                    }
+                }
+                else {
+                    from = out - dist;          /* copy direct from output */
+                    do {                        /* minimum length is three */
+                        PUP(out) = PUP(from);
+                        PUP(out) = PUP(from);
+                        PUP(out) = PUP(from);
+                        len -= 3;
+                    } while (len > 2);
+                    if (len) {
+                        PUP(out) = PUP(from);
+                        if (len > 1)
+                            PUP(out) = PUP(from);
+                    }
+                }
+            }
+            else if ((op & 64) == 0) {          /* 2nd level distance code */
+                this = dcode[this.val + (hold & ((1U << op) - 1))];
+                goto dodist;
             }
-            else                                /* normal copy */
-            {
-              *q++ = *r++;  c--;
-              *q++ = *r++;  c--;
-              do {
-                *q++ = *r++;
-              } while (--c);
+            else {
+                strm->msg = (char *)"invalid distance code";
+                state->mode = BAD;
+                break;
             }
+        }
+        else if ((op & 64) == 0) {              /* 2nd level length code */
+            this = lcode[this.val + (hold & ((1U << op) - 1))];
+            goto dolen;
+        }
+        else if (op & 32) {                     /* end-of-block */
+            state->mode = TYPE;
             break;
-          }
-          else if ((e & 64) == 0)
-          {
-            t += t->base;
-            e = (t += ((uInt)b & zlib_inflate_mask[e]))->exop;
-          }
-          else
-          {
-            z->msg = (char*)"invalid distance code";
-            UNGRAB
-            UPDATE
-            return Z_DATA_ERROR;
-          }
-        } while (1);
-        break;
-      }
-      if ((e & 64) == 0)
-      {
-        t += t->base;
-        if ((e = (t += ((uInt)b & zlib_inflate_mask[e]))->exop) == 0)
-        {
-          DUMPBITS(t->bits)
-          *q++ = (Byte)t->base;
-          m--;
-          break;
         }
-      }
-      else if (e & 32)
-      {
-        UNGRAB
-        UPDATE
-        return Z_STREAM_END;
-      }
-      else
-      {
-        z->msg = (char*)"invalid literal/length code";
-        UNGRAB
-        UPDATE
-        return Z_DATA_ERROR;
-      }
-    } while (1);
-  } while (m >= 258 && n >= 10);
-
-  /* not enough input or output--restore pointers and return */
-  UNGRAB
-  UPDATE
-  return Z_OK;
+        else {
+            strm->msg = (char *)"invalid literal/length code";
+            state->mode = BAD;
+            break;
+        }
+    } while (in < last && out < end);
+
+    /* return unused bytes (on entry, bits < 8, so in won't go too far back) */
+    len = bits >> 3;
+    in -= len;
+    bits -= len << 3;
+    hold &= (1U << bits) - 1;
+
+    /* update state and return */
+    strm->next_in = in + OFF;
+    strm->next_out = out + OFF;
+    strm->avail_in = (unsigned)(in < last ? 5 + (last - in) : 5 - (in - last));
+    strm->avail_out = (unsigned)(out < end ?
+                                 257 + (end - out) : 257 - (out - end));
+    state->hold = hold;
+    state->bits = bits;
+    return;
 }
+
+/*
+   inflate_fast() speedups that turned out slower (on a PowerPC G3 750CXe):
+   - Using bit fields for code structure
+   - Different op definition to avoid & for extra bits (do & for table bits)
+   - Three separate decoding do-loops for direct, window, and write == 0
+   - Special case for distance > 1 copies to do overlapped load and store copy
+   - Explicit branch predictions (based on measured branch probabilities)
+   - Deferring match copy and interspersed it with decoding subsequent codes
+   - Swapping literal/length else
+   - Swapping window/direct else
+   - Larger unrolled copy loops (three is about right)
+   - Moving len -= 3 statement into middle of loop
+ */
+
+#endif /* !ASMINF */
diff --git a/lib/zlib_inflate/inffast.h b/lib/zlib_inflate/inffast.h
index fc720f0fa7f5..40315d9fddc4 100644
--- a/lib/zlib_inflate/inffast.h
+++ b/lib/zlib_inflate/inffast.h
@@ -1,6 +1,6 @@
 /* inffast.h -- header to use inffast.c
- * Copyright (C) 1995-1998 Mark Adler
- * For conditions of distribution and use, see copyright notice in zlib.h 
+ * Copyright (C) 1995-2003 Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
  */
 
 /* WARNING: this file should *not* be used by applications. It is
@@ -8,10 +8,4 @@
    subject to change. Applications should only use zlib.h.
  */
 
-extern int zlib_inflate_fast (
-    uInt,
-    uInt,
-    inflate_huft *,
-    inflate_huft *,
-    inflate_blocks_statef *,
-    z_streamp );
+void inflate_fast (z_streamp strm, unsigned start);
diff --git a/lib/zlib_inflate/inffixed.h b/lib/zlib_inflate/inffixed.h
new file mode 100644
index 000000000000..75ed4b5978de
--- /dev/null
+++ b/lib/zlib_inflate/inffixed.h
@@ -0,0 +1,94 @@
+    /* inffixed.h -- table for decoding fixed codes
+     * Generated automatically by makefixed().
+     */
+
+    /* WARNING: this file should *not* be used by applications. It
+       is part of the implementation of the compression library and
+       is subject to change. Applications should only use zlib.h.
+     */
+
+    static const code lenfix[512] = {
+        {96,7,0},{0,8,80},{0,8,16},{20,8,115},{18,7,31},{0,8,112},{0,8,48},
+        {0,9,192},{16,7,10},{0,8,96},{0,8,32},{0,9,160},{0,8,0},{0,8,128},
+        {0,8,64},{0,9,224},{16,7,6},{0,8,88},{0,8,24},{0,9,144},{19,7,59},
+        {0,8,120},{0,8,56},{0,9,208},{17,7,17},{0,8,104},{0,8,40},{0,9,176},
+        {0,8,8},{0,8,136},{0,8,72},{0,9,240},{16,7,4},{0,8,84},{0,8,20},
+        {21,8,227},{19,7,43},{0,8,116},{0,8,52},{0,9,200},{17,7,13},{0,8,100},
+        {0,8,36},{0,9,168},{0,8,4},{0,8,132},{0,8,68},{0,9,232},{16,7,8},
+        {0,8,92},{0,8,28},{0,9,152},{20,7,83},{0,8,124},{0,8,60},{0,9,216},
+        {18,7,23},{0,8,108},{0,8,44},{0,9,184},{0,8,12},{0,8,140},{0,8,76},
+        {0,9,248},{16,7,3},{0,8,82},{0,8,18},{21,8,163},{19,7,35},{0,8,114},
+        {0,8,50},{0,9,196},{17,7,11},{0,8,98},{0,8,34},{0,9,164},{0,8,2},
+        {0,8,130},{0,8,66},{0,9,228},{16,7,7},{0,8,90},{0,8,26},{0,9,148},
+        {20,7,67},{0,8,122},{0,8,58},{0,9,212},{18,7,19},{0,8,106},{0,8,42},
+        {0,9,180},{0,8,10},{0,8,138},{0,8,74},{0,9,244},{16,7,5},{0,8,86},
+        {0,8,22},{64,8,0},{19,7,51},{0,8,118},{0,8,54},{0,9,204},{17,7,15},
+        {0,8,102},{0,8,38},{0,9,172},{0,8,6},{0,8,134},{0,8,70},{0,9,236},
+        {16,7,9},{0,8,94},{0,8,30},{0,9,156},{20,7,99},{0,8,126},{0,8,62},
+        {0,9,220},{18,7,27},{0,8,110},{0,8,46},{0,9,188},{0,8,14},{0,8,142},
+        {0,8,78},{0,9,252},{96,7,0},{0,8,81},{0,8,17},{21,8,131},{18,7,31},
+        {0,8,113},{0,8,49},{0,9,194},{16,7,10},{0,8,97},{0,8,33},{0,9,162},
+        {0,8,1},{0,8,129},{0,8,65},{0,9,226},{16,7,6},{0,8,89},{0,8,25},
+        {0,9,146},{19,7,59},{0,8,121},{0,8,57},{0,9,210},{17,7,17},{0,8,105},
+        {0,8,41},{0,9,178},{0,8,9},{0,8,137},{0,8,73},{0,9,242},{16,7,4},
+        {0,8,85},{0,8,21},{16,8,258},{19,7,43},{0,8,117},{0,8,53},{0,9,202},
+        {17,7,13},{0,8,101},{0,8,37},{0,9,170},{0,8,5},{0,8,133},{0,8,69},
+        {0,9,234},{16,7,8},{0,8,93},{0,8,29},{0,9,154},{20,7,83},{0,8,125},
+        {0,8,61},{0,9,218},{18,7,23},{0,8,109},{0,8,45},{0,9,186},{0,8,13},
+        {0,8,141},{0,8,77},{0,9,250},{16,7,3},{0,8,83},{0,8,19},{21,8,195},
+        {19,7,35},{0,8,115},{0,8,51},{0,9,198},{17,7,11},{0,8,99},{0,8,35},
+        {0,9,166},{0,8,3},{0,8,131},{0,8,67},{0,9,230},{16,7,7},{0,8,91},
+        {0,8,27},{0,9,150},{20,7,67},{0,8,123},{0,8,59},{0,9,214},{18,7,19},
+        {0,8,107},{0,8,43},{0,9,182},{0,8,11},{0,8,139},{0,8,75},{0,9,246},
+        {16,7,5},{0,8,87},{0,8,23},{64,8,0},{19,7,51},{0,8,119},{0,8,55},
+        {0,9,206},{17,7,15},{0,8,103},{0,8,39},{0,9,174},{0,8,7},{0,8,135},
+        {0,8,71},{0,9,238},{16,7,9},{0,8,95},{0,8,31},{0,9,158},{20,7,99},
+        {0,8,127},{0,8,63},{0,9,222},{18,7,27},{0,8,111},{0,8,47},{0,9,190},
+        {0,8,15},{0,8,143},{0,8,79},{0,9,254},{96,7,0},{0,8,80},{0,8,16},
+        {20,8,115},{18,7,31},{0,8,112},{0,8,48},{0,9,193},{16,7,10},{0,8,96},
+        {0,8,32},{0,9,161},{0,8,0},{0,8,128},{0,8,64},{0,9,225},{16,7,6},
+        {0,8,88},{0,8,24},{0,9,145},{19,7,59},{0,8,120},{0,8,56},{0,9,209},
+        {17,7,17},{0,8,104},{0,8,40},{0,9,177},{0,8,8},{0,8,136},{0,8,72},
+        {0,9,241},{16,7,4},{0,8,84},{0,8,20},{21,8,227},{19,7,43},{0,8,116},
+        {0,8,52},{0,9,201},{17,7,13},{0,8,100},{0,8,36},{0,9,169},{0,8,4},
+        {0,8,132},{0,8,68},{0,9,233},{16,7,8},{0,8,92},{0,8,28},{0,9,153},
+        {20,7,83},{0,8,124},{0,8,60},{0,9,217},{18,7,23},{0,8,108},{0,8,44},
+        {0,9,185},{0,8,12},{0,8,140},{0,8,76},{0,9,249},{16,7,3},{0,8,82},
+        {0,8,18},{21,8,163},{19,7,35},{0,8,114},{0,8,50},{0,9,197},{17,7,11},
+        {0,8,98},{0,8,34},{0,9,165},{0,8,2},{0,8,130},{0,8,66},{0,9,229},
+        {16,7,7},{0,8,90},{0,8,26},{0,9,149},{20,7,67},{0,8,122},{0,8,58},
+        {0,9,213},{18,7,19},{0,8,106},{0,8,42},{0,9,181},{0,8,10},{0,8,138},
+        {0,8,74},{0,9,245},{16,7,5},{0,8,86},{0,8,22},{64,8,0},{19,7,51},
+        {0,8,118},{0,8,54},{0,9,205},{17,7,15},{0,8,102},{0,8,38},{0,9,173},
+        {0,8,6},{0,8,134},{0,8,70},{0,9,237},{16,7,9},{0,8,94},{0,8,30},
+        {0,9,157},{20,7,99},{0,8,126},{0,8,62},{0,9,221},{18,7,27},{0,8,110},
+        {0,8,46},{0,9,189},{0,8,14},{0,8,142},{0,8,78},{0,9,253},{96,7,0},
+        {0,8,81},{0,8,17},{21,8,131},{18,7,31},{0,8,113},{0,8,49},{0,9,195},
+        {16,7,10},{0,8,97},{0,8,33},{0,9,163},{0,8,1},{0,8,129},{0,8,65},
+        {0,9,227},{16,7,6},{0,8,89},{0,8,25},{0,9,147},{19,7,59},{0,8,121},
+        {0,8,57},{0,9,211},{17,7,17},{0,8,105},{0,8,41},{0,9,179},{0,8,9},
+        {0,8,137},{0,8,73},{0,9,243},{16,7,4},{0,8,85},{0,8,21},{16,8,258},
+        {19,7,43},{0,8,117},{0,8,53},{0,9,203},{17,7,13},{0,8,101},{0,8,37},
+        {0,9,171},{0,8,5},{0,8,133},{0,8,69},{0,9,235},{16,7,8},{0,8,93},
+        {0,8,29},{0,9,155},{20,7,83},{0,8,125},{0,8,61},{0,9,219},{18,7,23},
+        {0,8,109},{0,8,45},{0,9,187},{0,8,13},{0,8,141},{0,8,77},{0,9,251},
+        {16,7,3},{0,8,83},{0,8,19},{21,8,195},{19,7,35},{0,8,115},{0,8,51},
+        {0,9,199},{17,7,11},{0,8,99},{0,8,35},{0,9,167},{0,8,3},{0,8,131},
+        {0,8,67},{0,9,231},{16,7,7},{0,8,91},{0,8,27},{0,9,151},{20,7,67},
+        {0,8,123},{0,8,59},{0,9,215},{18,7,19},{0,8,107},{0,8,43},{0,9,183},
+        {0,8,11},{0,8,139},{0,8,75},{0,9,247},{16,7,5},{0,8,87},{0,8,23},
+        {64,8,0},{19,7,51},{0,8,119},{0,8,55},{0,9,207},{17,7,15},{0,8,103},
+        {0,8,39},{0,9,175},{0,8,7},{0,8,135},{0,8,71},{0,9,239},{16,7,9},
+        {0,8,95},{0,8,31},{0,9,159},{20,7,99},{0,8,127},{0,8,63},{0,9,223},
+        {18,7,27},{0,8,111},{0,8,47},{0,9,191},{0,8,15},{0,8,143},{0,8,79},
+        {0,9,255}
+    };
+
+    static const code distfix[32] = {
+        {16,5,1},{23,5,257},{19,5,17},{27,5,4097},{17,5,5},{25,5,1025},
+        {21,5,65},{29,5,16385},{16,5,3},{24,5,513},{20,5,33},{28,5,8193},
+        {18,5,9},{26,5,2049},{22,5,129},{64,5,0},{16,5,2},{23,5,385},
+        {19,5,25},{27,5,6145},{17,5,7},{25,5,1537},{21,5,97},{29,5,24577},
+        {16,5,4},{24,5,769},{20,5,49},{28,5,12289},{18,5,13},{26,5,3073},
+        {22,5,193},{64,5,0}
+    };
diff --git a/lib/zlib_inflate/inflate.c b/lib/zlib_inflate/inflate.c
index 31b9e9054bf7..7f922dccf1a5 100644
--- a/lib/zlib_inflate/inflate.c
+++ b/lib/zlib_inflate/inflate.c
@@ -1,89 +1,148 @@
-/* inflate.c -- zlib interface to inflate modules
- * Copyright (C) 1995-1998 Mark Adler
- * For conditions of distribution and use, see copyright notice in zlib.h 
+/* inflate.c -- zlib decompression
+ * Copyright (C) 1995-2005 Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ *
+ * Based on zlib 1.2.3 but modified for the Linux Kernel by
+ * Richard Purdie <richard@openedhand.com>
+ *
+ * Changes mainly for static instead of dynamic memory allocation
+ *
  */
 
 #include <linux/zutil.h>
-#include "infblock.h"
+#include "inftrees.h"
+#include "inflate.h"
+#include "inffast.h"
 #include "infutil.h"
 
 int zlib_inflate_workspacesize(void)
 {
-  return sizeof(struct inflate_workspace);
+    return sizeof(struct inflate_workspace);
 }
 
+int zlib_inflateReset(z_streamp strm)
+{
+    struct inflate_state *state;
+
+    if (strm == NULL || strm->state == NULL) return Z_STREAM_ERROR;
+    state = (struct inflate_state *)strm->state;
+    strm->total_in = strm->total_out = state->total = 0;
+    strm->msg = NULL;
+    strm->adler = 1;        /* to support ill-conceived Java test suite */
+    state->mode = HEAD;
+    state->last = 0;
+    state->havedict = 0;
+    state->dmax = 32768U;
+    state->hold = 0;
+    state->bits = 0;
+    state->lencode = state->distcode = state->next = state->codes;
 
-int zlib_inflateReset(
-	z_streamp z
-)
+    /* Initialise Window */
+    state->wsize = 1U << state->wbits;
+    state->write = 0;
+    state->whave = 0;
+
+    return Z_OK;
+}
+
+#if 0
+int zlib_inflatePrime(z_streamp strm, int bits, int value)
 {
-  if (z == NULL || z->state == NULL || z->workspace == NULL)
-    return Z_STREAM_ERROR;
-  z->total_in = z->total_out = 0;
-  z->msg = NULL;
-  z->state->mode = z->state->nowrap ? BLOCKS : METHOD;
-  zlib_inflate_blocks_reset(z->state->blocks, z, NULL);
-  return Z_OK;
+    struct inflate_state *state;
+
+    if (strm == NULL || strm->state == NULL) return Z_STREAM_ERROR;
+    state = (struct inflate_state *)strm->state;
+    if (bits > 16 || state->bits + bits > 32) return Z_STREAM_ERROR;
+    value &= (1L << bits) - 1;
+    state->hold += value << state->bits;
+    state->bits += bits;
+    return Z_OK;
 }
+#endif
+
+int zlib_inflateInit2(z_streamp strm, int windowBits)
+{
+    struct inflate_state *state;
+
+    if (strm == NULL) return Z_STREAM_ERROR;
+    strm->msg = NULL;                 /* in case we return an error */
+
+    state = &WS(strm)->inflate_state;
+    strm->state = (struct internal_state *)state;
+
+    if (windowBits < 0) {
+        state->wrap = 0;
+        windowBits = -windowBits;
+    }
+    else {
+        state->wrap = (windowBits >> 4) + 1;
+    }
+    if (windowBits < 8 || windowBits > 15) {
+        return Z_STREAM_ERROR;
+    }
+    state->wbits = (unsigned)windowBits;
+    state->window = &WS(strm)->working_window[0];
 
+    return zlib_inflateReset(strm);
+}
 
-int zlib_inflateEnd(
-	z_streamp z
-)
+/*
+   Return state with length and distance decoding tables and index sizes set to
+   fixed code decoding.  This returns fixed tables from inffixed.h.
+ */
+static void zlib_fixedtables(struct inflate_state *state)
 {
-  if (z == NULL || z->state == NULL || z->workspace == NULL)
-    return Z_STREAM_ERROR;
-  if (z->state->blocks != NULL)
-    zlib_inflate_blocks_free(z->state->blocks, z);
-  z->state = NULL;
-  return Z_OK;
+#   include "inffixed.h"
+    state->lencode = lenfix;
+    state->lenbits = 9;
+    state->distcode = distfix;
+    state->distbits = 5;
 }
 
 
-int zlib_inflateInit2_(
-	z_streamp z,
-	int w,
-	const char *version,
-	int stream_size
-)
+/*
+   Update the window with the last wsize (normally 32K) bytes written before
+   returning. This is only called when a window is already in use, or when
+   output has been written during this inflate call, but the end of the deflate
+   stream has not been reached yet. It is also called to window dictionary data
+   when a dictionary is loaded.
+
+   Providing output buffers larger than 32K to inflate() should provide a speed
+   advantage, since only the last 32K of output is copied to the sliding window
+   upon return from inflate(), and since all distances after the first 32K of
+   output will fall in the output data, making match copies simpler and faster.
+   The advantage may be dependent on the size of the processor's data caches.
+ */
+static void zlib_updatewindow(z_streamp strm, unsigned out)
 {
-  if (version == NULL || version[0] != ZLIB_VERSION[0] ||
-      stream_size != sizeof(z_stream) || z->workspace == NULL)
-      return Z_VERSION_ERROR;
-
-  /* initialize state */
-  z->msg = NULL;
-  z->state = &WS(z)->internal_state;
-  z->state->blocks = NULL;
-
-  /* handle undocumented nowrap option (no zlib header or check) */
-  z->state->nowrap = 0;
-  if (w < 0)
-  {
-    w = - w;
-    z->state->nowrap = 1;
-  }
-
-  /* set window size */
-  if (w < 8 || w > 15)
-  {
-    zlib_inflateEnd(z);
-    return Z_STREAM_ERROR;
-  }
-  z->state->wbits = (uInt)w;
-
-  /* create inflate_blocks state */
-  if ((z->state->blocks =
-      zlib_inflate_blocks_new(z, z->state->nowrap ? NULL : zlib_adler32, (uInt)1 << w))
-      == NULL)
-  {
-    zlib_inflateEnd(z);
-    return Z_MEM_ERROR;
-  }
-
-  /* reset state */
-  zlib_inflateReset(z);
-  return Z_OK;
+    struct inflate_state *state;
+    unsigned copy, dist;
+
+    state = (struct inflate_state *)strm->state;
+
+    /* copy state->wsize or less output bytes into the circular window */
+    copy = out - strm->avail_out;
+    if (copy >= state->wsize) {
+        memcpy(state->window, strm->next_out - state->wsize, state->wsize);
+        state->write = 0;
+        state->whave = state->wsize;
+    }
+    else {
+        dist = state->wsize - state->write;
+        if (dist > copy) dist = copy;
+        memcpy(state->window + state->write, strm->next_out - copy, dist);
+        copy -= dist;
+        if (copy) {
+            memcpy(state->window, strm->next_out - copy, copy);
+            state->write = copy;
+            state->whave = state->wsize;
+        }
+        else {
+            state->write += dist;
+            if (state->write == state->wsize) state->write = 0;
+            if (state->whave < state->wsize) state->whave += dist;
+        }
+    }
 }
 
 
@@ -91,157 +150,764 @@ int zlib_inflateInit2_(
  * At the end of a Deflate-compressed PPP packet, we expect to have seen
  * a `stored' block type value but not the (zero) length bytes.
  */
-static int zlib_inflate_packet_flush(inflate_blocks_statef *s)
+/*
+   Returns true if inflate is currently at the end of a block generated by
+   Z_SYNC_FLUSH or Z_FULL_FLUSH. This function is used by one PPP
+   implementation to provide an additional safety check. PPP uses
+   Z_SYNC_FLUSH but removes the length bytes of the resulting empty stored
+   block. When decompressing, PPP checks that at the end of input packet,
+   inflate is waiting for these length bytes.
+ */
+static int zlib_inflateSyncPacket(z_streamp strm)
 {
-    if (s->mode != LENS)
-	return Z_DATA_ERROR;
-    s->mode = TYPE;
+    struct inflate_state *state;
+
+    if (strm == NULL || strm->state == NULL) return Z_STREAM_ERROR;
+    state = (struct inflate_state *)strm->state;
+
+    if (state->mode == STORED && state->bits == 0) {
+	state->mode = TYPE;
+        return Z_OK;
+    }
+    return Z_DATA_ERROR;
+}
+
+/* Macros for inflate(): */
+
+/* check function to use adler32() for zlib or crc32() for gzip */
+#define UPDATE(check, buf, len) zlib_adler32(check, buf, len)
+
+/* Load registers with state in inflate() for speed */
+#define LOAD() \
+    do { \
+        put = strm->next_out; \
+        left = strm->avail_out; \
+        next = strm->next_in; \
+        have = strm->avail_in; \
+        hold = state->hold; \
+        bits = state->bits; \
+    } while (0)
+
+/* Restore state from registers in inflate() */
+#define RESTORE() \
+    do { \
+        strm->next_out = put; \
+        strm->avail_out = left; \
+        strm->next_in = next; \
+        strm->avail_in = have; \
+        state->hold = hold; \
+        state->bits = bits; \
+    } while (0)
+
+/* Clear the input bit accumulator */
+#define INITBITS() \
+    do { \
+        hold = 0; \
+        bits = 0; \
+    } while (0)
+
+/* Get a byte of input into the bit accumulator, or return from inflate()
+   if there is no input available. */
+#define PULLBYTE() \
+    do { \
+        if (have == 0) goto inf_leave; \
+        have--; \
+        hold += (unsigned long)(*next++) << bits; \
+        bits += 8; \
+    } while (0)
+
+/* Assure that there are at least n bits in the bit accumulator.  If there is
+   not enough available input to do that, then return from inflate(). */
+#define NEEDBITS(n) \
+    do { \
+        while (bits < (unsigned)(n)) \
+            PULLBYTE(); \
+    } while (0)
+
+/* Return the low n bits of the bit accumulator (n < 16) */
+#define BITS(n) \
+    ((unsigned)hold & ((1U << (n)) - 1))
+
+/* Remove n bits from the bit accumulator */
+#define DROPBITS(n) \
+    do { \
+        hold >>= (n); \
+        bits -= (unsigned)(n); \
+    } while (0)
+
+/* Remove zero to seven bits as needed to go to a byte boundary */
+#define BYTEBITS() \
+    do { \
+        hold >>= bits & 7; \
+        bits -= bits & 7; \
+    } while (0)
+
+/* Reverse the bytes in a 32-bit value */
+#define REVERSE(q) \
+    ((((q) >> 24) & 0xff) + (((q) >> 8) & 0xff00) + \
+     (((q) & 0xff00) << 8) + (((q) & 0xff) << 24))
+
+/*
+   inflate() uses a state machine to process as much input data and generate as
+   much output data as possible before returning.  The state machine is
+   structured roughly as follows:
+
+    for (;;) switch (state) {
+    ...
+    case STATEn:
+        if (not enough input data or output space to make progress)
+            return;
+        ... make progress ...
+        state = STATEm;
+        break;
+    ...
+    }
+
+   so when inflate() is called again, the same case is attempted again, and
+   if the appropriate resources are provided, the machine proceeds to the
+   next state.  The NEEDBITS() macro is usually the way the state evaluates
+   whether it can proceed or should return.  NEEDBITS() does the return if
+   the requested bits are not available.  The typical use of the BITS macros
+   is:
+
+        NEEDBITS(n);
+        ... do something with BITS(n) ...
+        DROPBITS(n);
+
+   where NEEDBITS(n) either returns from inflate() if there isn't enough
+   input left to load n bits into the accumulator, or it continues.  BITS(n)
+   gives the low n bits in the accumulator.  When done, DROPBITS(n) drops
+   the low n bits off the accumulator.  INITBITS() clears the accumulator
+   and sets the number of available bits to zero.  BYTEBITS() discards just
+   enough bits to put the accumulator on a byte boundary.  After BYTEBITS()
+   and a NEEDBITS(8), then BITS(8) would return the next byte in the stream.
+
+   NEEDBITS(n) uses PULLBYTE() to get an available byte of input, or to return
+   if there is no input available.  The decoding of variable length codes uses
+   PULLBYTE() directly in order to pull just enough bytes to decode the next
+   code, and no more.
+
+   Some states loop until they get enough input, making sure that enough
+   state information is maintained to continue the loop where it left off
+   if NEEDBITS() returns in the loop.  For example, want, need, and keep
+   would all have to actually be part of the saved state in case NEEDBITS()
+   returns:
+
+    case STATEw:
+        while (want < need) {
+            NEEDBITS(n);
+            keep[want++] = BITS(n);
+            DROPBITS(n);
+        }
+        state = STATEx;
+    case STATEx:
+
+   As shown above, if the next state is also the next case, then the break
+   is omitted.
+
+   A state may also return if there is not enough output space available to
+   complete that state.  Those states are copying stored data, writing a
+   literal byte, and copying a matching string.
+
+   When returning, a "goto inf_leave" is used to update the total counters,
+   update the check value, and determine whether any progress has been made
+   during that inflate() call in order to return the proper return code.
+   Progress is defined as a change in either strm->avail_in or strm->avail_out.
+   When there is a window, goto inf_leave will update the window with the last
+   output written.  If a goto inf_leave occurs in the middle of decompression
+   and there is no window currently, goto inf_leave will create one and copy
+   output to the window for the next call of inflate().
+
+   In this implementation, the flush parameter of inflate() only affects the
+   return code (per zlib.h).  inflate() always writes as much as possible to
+   strm->next_out, given the space available and the provided input--the effect
+   documented in zlib.h of Z_SYNC_FLUSH.  Furthermore, inflate() always defers
+   the allocation of and copying into a sliding window until necessary, which
+   provides the effect documented in zlib.h for Z_FINISH when the entire input
+   stream available.  So the only thing the flush parameter actually does is:
+   when flush is set to Z_FINISH, inflate() cannot return Z_OK.  Instead it
+   will return Z_BUF_ERROR if it has not reached the end of the stream.
+ */
+
+int zlib_inflate(z_streamp strm, int flush)
+{
+    struct inflate_state *state;
+    unsigned char *next;    /* next input */
+    unsigned char *put;     /* next output */
+    unsigned have, left;        /* available input and output */
+    unsigned long hold;         /* bit buffer */
+    unsigned bits;              /* bits in bit buffer */
+    unsigned in, out;           /* save starting available input and output */
+    unsigned copy;              /* number of stored or match bytes to copy */
+    unsigned char *from;    /* where to copy match bytes from */
+    code this;                  /* current decoding table entry */
+    code last;                  /* parent table entry */
+    unsigned len;               /* length to copy for repeats, bits to drop */
+    int ret;                    /* return code */
+    static const unsigned short order[19] = /* permutation of code lengths */
+        {16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15};
+
+    if (strm == NULL || strm->state == NULL || strm->next_out == NULL ||
+        (strm->next_in == NULL && strm->avail_in != 0))
+        return Z_STREAM_ERROR;
+
+    state = (struct inflate_state *)strm->state;
+
+    if (state->mode == TYPE) state->mode = TYPEDO;      /* skip check */
+    LOAD();
+    in = have;
+    out = left;
+    ret = Z_OK;
+    for (;;)
+        switch (state->mode) {
+        case HEAD:
+            if (state->wrap == 0) {
+                state->mode = TYPEDO;
+                break;
+            }
+            NEEDBITS(16);
+            if (
+                ((BITS(8) << 8) + (hold >> 8)) % 31) {
+                strm->msg = (char *)"incorrect header check";
+                state->mode = BAD;
+                break;
+            }
+            if (BITS(4) != Z_DEFLATED) {
+                strm->msg = (char *)"unknown compression method";
+                state->mode = BAD;
+                break;
+            }
+            DROPBITS(4);
+            len = BITS(4) + 8;
+            if (len > state->wbits) {
+                strm->msg = (char *)"invalid window size";
+                state->mode = BAD;
+                break;
+            }
+            state->dmax = 1U << len;
+            strm->adler = state->check = zlib_adler32(0L, NULL, 0);
+            state->mode = hold & 0x200 ? DICTID : TYPE;
+            INITBITS();
+            break;
+        case DICTID:
+            NEEDBITS(32);
+            strm->adler = state->check = REVERSE(hold);
+            INITBITS();
+            state->mode = DICT;
+        case DICT:
+            if (state->havedict == 0) {
+                RESTORE();
+                return Z_NEED_DICT;
+            }
+            strm->adler = state->check = zlib_adler32(0L, NULL, 0);
+            state->mode = TYPE;
+        case TYPE:
+            if (flush == Z_BLOCK) goto inf_leave;
+        case TYPEDO:
+            if (state->last) {
+                BYTEBITS();
+                state->mode = CHECK;
+                break;
+            }
+            NEEDBITS(3);
+            state->last = BITS(1);
+            DROPBITS(1);
+            switch (BITS(2)) {
+            case 0:                             /* stored block */
+                state->mode = STORED;
+                break;
+            case 1:                             /* fixed block */
+                zlib_fixedtables(state);
+                state->mode = LEN;              /* decode codes */
+                break;
+            case 2:                             /* dynamic block */
+                state->mode = TABLE;
+                break;
+            case 3:
+                strm->msg = (char *)"invalid block type";
+                state->mode = BAD;
+            }
+            DROPBITS(2);
+            break;
+        case STORED:
+            BYTEBITS();                         /* go to byte boundary */
+            NEEDBITS(32);
+            if ((hold & 0xffff) != ((hold >> 16) ^ 0xffff)) {
+                strm->msg = (char *)"invalid stored block lengths";
+                state->mode = BAD;
+                break;
+            }
+            state->length = (unsigned)hold & 0xffff;
+            INITBITS();
+            state->mode = COPY;
+        case COPY:
+            copy = state->length;
+            if (copy) {
+                if (copy > have) copy = have;
+                if (copy > left) copy = left;
+                if (copy == 0) goto inf_leave;
+                memcpy(put, next, copy);
+                have -= copy;
+                next += copy;
+                left -= copy;
+                put += copy;
+                state->length -= copy;
+                break;
+            }
+            state->mode = TYPE;
+            break;
+        case TABLE:
+            NEEDBITS(14);
+            state->nlen = BITS(5) + 257;
+            DROPBITS(5);
+            state->ndist = BITS(5) + 1;
+            DROPBITS(5);
+            state->ncode = BITS(4) + 4;
+            DROPBITS(4);
+#ifndef PKZIP_BUG_WORKAROUND
+            if (state->nlen > 286 || state->ndist > 30) {
+                strm->msg = (char *)"too many length or distance symbols";
+                state->mode = BAD;
+                break;
+            }
+#endif
+            state->have = 0;
+            state->mode = LENLENS;
+        case LENLENS:
+            while (state->have < state->ncode) {
+                NEEDBITS(3);
+                state->lens[order[state->have++]] = (unsigned short)BITS(3);
+                DROPBITS(3);
+            }
+            while (state->have < 19)
+                state->lens[order[state->have++]] = 0;
+            state->next = state->codes;
+            state->lencode = (code const *)(state->next);
+            state->lenbits = 7;
+            ret = zlib_inflate_table(CODES, state->lens, 19, &(state->next),
+                                &(state->lenbits), state->work);
+            if (ret) {
+                strm->msg = (char *)"invalid code lengths set";
+                state->mode = BAD;
+                break;
+            }
+            state->have = 0;
+            state->mode = CODELENS;
+        case CODELENS:
+            while (state->have < state->nlen + state->ndist) {
+                for (;;) {
+                    this = state->lencode[BITS(state->lenbits)];
+                    if ((unsigned)(this.bits) <= bits) break;
+                    PULLBYTE();
+                }
+                if (this.val < 16) {
+                    NEEDBITS(this.bits);
+                    DROPBITS(this.bits);
+                    state->lens[state->have++] = this.val;
+                }
+                else {
+                    if (this.val == 16) {
+                        NEEDBITS(this.bits + 2);
+                        DROPBITS(this.bits);
+                        if (state->have == 0) {
+                            strm->msg = (char *)"invalid bit length repeat";
+                            state->mode = BAD;
+                            break;
+                        }
+                        len = state->lens[state->have - 1];
+                        copy = 3 + BITS(2);
+                        DROPBITS(2);
+                    }
+                    else if (this.val == 17) {
+                        NEEDBITS(this.bits + 3);
+                        DROPBITS(this.bits);
+                        len = 0;
+                        copy = 3 + BITS(3);
+                        DROPBITS(3);
+                    }
+                    else {
+                        NEEDBITS(this.bits + 7);
+                        DROPBITS(this.bits);
+                        len = 0;
+                        copy = 11 + BITS(7);
+                        DROPBITS(7);
+                    }
+                    if (state->have + copy > state->nlen + state->ndist) {
+                        strm->msg = (char *)"invalid bit length repeat";
+                        state->mode = BAD;
+                        break;
+                    }
+                    while (copy--)
+                        state->lens[state->have++] = (unsigned short)len;
+                }
+            }
+
+            /* handle error breaks in while */
+            if (state->mode == BAD) break;
+
+            /* build code tables */
+            state->next = state->codes;
+            state->lencode = (code const *)(state->next);
+            state->lenbits = 9;
+            ret = zlib_inflate_table(LENS, state->lens, state->nlen, &(state->next),
+                                &(state->lenbits), state->work);
+            if (ret) {
+                strm->msg = (char *)"invalid literal/lengths set";
+                state->mode = BAD;
+                break;
+            }
+            state->distcode = (code const *)(state->next);
+            state->distbits = 6;
+            ret = zlib_inflate_table(DISTS, state->lens + state->nlen, state->ndist,
+                            &(state->next), &(state->distbits), state->work);
+            if (ret) {
+                strm->msg = (char *)"invalid distances set";
+                state->mode = BAD;
+                break;
+            }
+            state->mode = LEN;
+        case LEN:
+            if (have >= 6 && left >= 258) {
+                RESTORE();
+                inflate_fast(strm, out);
+                LOAD();
+                break;
+            }
+            for (;;) {
+                this = state->lencode[BITS(state->lenbits)];
+                if ((unsigned)(this.bits) <= bits) break;
+                PULLBYTE();
+            }
+            if (this.op && (this.op & 0xf0) == 0) {
+                last = this;
+                for (;;) {
+                    this = state->lencode[last.val +
+                            (BITS(last.bits + last.op) >> last.bits)];
+                    if ((unsigned)(last.bits + this.bits) <= bits) break;
+                    PULLBYTE();
+                }
+                DROPBITS(last.bits);
+            }
+            DROPBITS(this.bits);
+            state->length = (unsigned)this.val;
+            if ((int)(this.op) == 0) {
+                state->mode = LIT;
+                break;
+            }
+            if (this.op & 32) {
+                state->mode = TYPE;
+                break;
+            }
+            if (this.op & 64) {
+                strm->msg = (char *)"invalid literal/length code";
+                state->mode = BAD;
+                break;
+            }
+            state->extra = (unsigned)(this.op) & 15;
+            state->mode = LENEXT;
+        case LENEXT:
+            if (state->extra) {
+                NEEDBITS(state->extra);
+                state->length += BITS(state->extra);
+                DROPBITS(state->extra);
+            }
+            state->mode = DIST;
+        case DIST:
+            for (;;) {
+                this = state->distcode[BITS(state->distbits)];
+                if ((unsigned)(this.bits) <= bits) break;
+                PULLBYTE();
+            }
+            if ((this.op & 0xf0) == 0) {
+                last = this;
+                for (;;) {
+                    this = state->distcode[last.val +
+                            (BITS(last.bits + last.op) >> last.bits)];
+                    if ((unsigned)(last.bits + this.bits) <= bits) break;
+                    PULLBYTE();
+                }
+                DROPBITS(last.bits);
+            }
+            DROPBITS(this.bits);
+            if (this.op & 64) {
+                strm->msg = (char *)"invalid distance code";
+                state->mode = BAD;
+                break;
+            }
+            state->offset = (unsigned)this.val;
+            state->extra = (unsigned)(this.op) & 15;
+            state->mode = DISTEXT;
+        case DISTEXT:
+            if (state->extra) {
+                NEEDBITS(state->extra);
+                state->offset += BITS(state->extra);
+                DROPBITS(state->extra);
+            }
+#ifdef INFLATE_STRICT
+            if (state->offset > state->dmax) {
+                strm->msg = (char *)"invalid distance too far back";
+                state->mode = BAD;
+                break;
+            }
+#endif
+            if (state->offset > state->whave + out - left) {
+                strm->msg = (char *)"invalid distance too far back";
+                state->mode = BAD;
+                break;
+            }
+            state->mode = MATCH;
+        case MATCH:
+            if (left == 0) goto inf_leave;
+            copy = out - left;
+            if (state->offset > copy) {         /* copy from window */
+                copy = state->offset - copy;
+                if (copy > state->write) {
+                    copy -= state->write;
+                    from = state->window + (state->wsize - copy);
+                }
+                else
+                    from = state->window + (state->write - copy);
+                if (copy > state->length) copy = state->length;
+            }
+            else {                              /* copy from output */
+                from = put - state->offset;
+                copy = state->length;
+            }
+            if (copy > left) copy = left;
+            left -= copy;
+            state->length -= copy;
+            do {
+                *put++ = *from++;
+            } while (--copy);
+            if (state->length == 0) state->mode = LEN;
+            break;
+        case LIT:
+            if (left == 0) goto inf_leave;
+            *put++ = (unsigned char)(state->length);
+            left--;
+            state->mode = LEN;
+            break;
+        case CHECK:
+            if (state->wrap) {
+                NEEDBITS(32);
+                out -= left;
+                strm->total_out += out;
+                state->total += out;
+                if (out)
+                    strm->adler = state->check =
+                        UPDATE(state->check, put - out, out);
+                out = left;
+                if ((
+                     REVERSE(hold)) != state->check) {
+                    strm->msg = (char *)"incorrect data check";
+                    state->mode = BAD;
+                    break;
+                }
+                INITBITS();
+            }
+            state->mode = DONE;
+        case DONE:
+            ret = Z_STREAM_END;
+            goto inf_leave;
+        case BAD:
+            ret = Z_DATA_ERROR;
+            goto inf_leave;
+        case MEM:
+            return Z_MEM_ERROR;
+        case SYNC:
+        default:
+            return Z_STREAM_ERROR;
+        }
+
+    /*
+       Return from inflate(), updating the total counts and the check value.
+       If there was no progress during the inflate() call, return a buffer
+       error.  Call zlib_updatewindow() to create and/or update the window state.
+     */
+  inf_leave:
+    RESTORE();
+    if (state->wsize || (state->mode < CHECK && out != strm->avail_out))
+        zlib_updatewindow(strm, out);
+
+    in -= strm->avail_in;
+    out -= strm->avail_out;
+    strm->total_in += in;
+    strm->total_out += out;
+    state->total += out;
+    if (state->wrap && out)
+        strm->adler = state->check =
+            UPDATE(state->check, strm->next_out - out, out);
+
+    strm->data_type = state->bits + (state->last ? 64 : 0) +
+                      (state->mode == TYPE ? 128 : 0);
+    if (((in == 0 && out == 0) || flush == Z_FINISH) && ret == Z_OK)
+        ret = Z_BUF_ERROR;
+
+    if (flush == Z_PACKET_FLUSH && ret == Z_OK &&
+            (strm->avail_out != 0 || strm->avail_in == 0))
+		return zlib_inflateSyncPacket(strm);
+    return ret;
+}
+
+int zlib_inflateEnd(z_streamp strm)
+{
+    if (strm == NULL || strm->state == NULL)
+        return Z_STREAM_ERROR;
     return Z_OK;
 }
 
+#if 0
+int zlib_inflateSetDictionary(z_streamp strm, const Byte *dictionary,
+        uInt dictLength)
+{
+    struct inflate_state *state;
+    unsigned long id;
+
+    /* check state */
+    if (strm == NULL || strm->state == NULL) return Z_STREAM_ERROR;
+    state = (struct inflate_state *)strm->state;
+    if (state->wrap != 0 && state->mode != DICT)
+        return Z_STREAM_ERROR;
+
+    /* check for correct dictionary id */
+    if (state->mode == DICT) {
+        id = zlib_adler32(0L, NULL, 0);
+        id = zlib_adler32(id, dictionary, dictLength);
+        if (id != state->check)
+            return Z_DATA_ERROR;
+    }
+
+    /* copy dictionary to window */
+    zlib_updatewindow(strm, strm->avail_out);
 
-int zlib_inflateInit_(
-	z_streamp z,
-	const char *version,
-	int stream_size
-)
+    if (dictLength > state->wsize) {
+        memcpy(state->window, dictionary + dictLength - state->wsize,
+                state->wsize);
+        state->whave = state->wsize;
+    }
+    else {
+        memcpy(state->window + state->wsize - dictLength, dictionary,
+                dictLength);
+        state->whave = dictLength;
+    }
+    state->havedict = 1;
+    return Z_OK;
+}
+#endif
+
+#if 0
+/*
+   Search buf[0..len-1] for the pattern: 0, 0, 0xff, 0xff.  Return when found
+   or when out of input.  When called, *have is the number of pattern bytes
+   found in order so far, in 0..3.  On return *have is updated to the new
+   state.  If on return *have equals four, then the pattern was found and the
+   return value is how many bytes were read including the last byte of the
+   pattern.  If *have is less than four, then the pattern has not been found
+   yet and the return value is len.  In the latter case, zlib_syncsearch() can be
+   called again with more data and the *have state.  *have is initialized to
+   zero for the first call.
+ */
+static unsigned zlib_syncsearch(unsigned *have, unsigned char *buf,
+        unsigned len)
 {
-  return zlib_inflateInit2_(z, DEF_WBITS, version, stream_size);
+    unsigned got;
+    unsigned next;
+
+    got = *have;
+    next = 0;
+    while (next < len && got < 4) {
+        if ((int)(buf[next]) == (got < 2 ? 0 : 0xff))
+            got++;
+        else if (buf[next])
+            got = 0;
+        else
+            got = 4 - got;
+        next++;
+    }
+    *have = got;
+    return next;
 }
+#endif
 
-#undef NEEDBYTE
-#undef NEXTBYTE
-#define NEEDBYTE {if(z->avail_in==0)goto empty;r=trv;}
-#define NEXTBYTE (z->avail_in--,z->total_in++,*z->next_in++)
+#if 0
+int zlib_inflateSync(z_streamp strm)
+{
+    unsigned len;               /* number of bytes to look at or looked at */
+    unsigned long in, out;      /* temporary to save total_in and total_out */
+    unsigned char buf[4];       /* to restore bit buffer to byte string */
+    struct inflate_state *state;
+
+    /* check parameters */
+    if (strm == NULL || strm->state == NULL) return Z_STREAM_ERROR;
+    state = (struct inflate_state *)strm->state;
+    if (strm->avail_in == 0 && state->bits < 8) return Z_BUF_ERROR;
+
+    /* if first time, start search in bit buffer */
+    if (state->mode != SYNC) {
+        state->mode = SYNC;
+        state->hold <<= state->bits & 7;
+        state->bits -= state->bits & 7;
+        len = 0;
+        while (state->bits >= 8) {
+            buf[len++] = (unsigned char)(state->hold);
+            state->hold >>= 8;
+            state->bits -= 8;
+        }
+        state->have = 0;
+        zlib_syncsearch(&(state->have), buf, len);
+    }
+
+    /* search available input */
+    len = zlib_syncsearch(&(state->have), strm->next_in, strm->avail_in);
+    strm->avail_in -= len;
+    strm->next_in += len;
+    strm->total_in += len;
+
+    /* return no joy or set up to restart inflate() on a new block */
+    if (state->have != 4) return Z_DATA_ERROR;
+    in = strm->total_in;  out = strm->total_out;
+    zlib_inflateReset(strm);
+    strm->total_in = in;  strm->total_out = out;
+    state->mode = TYPE;
+    return Z_OK;
+}
+#endif
 
-int zlib_inflate(
-	z_streamp z,
-	int f
-)
+/*
+ * This subroutine adds the data at next_in/avail_in to the output history
+ * without performing any output.  The output buffer must be "caught up";
+ * i.e. no pending output but this should always be the case. The state must
+ * be waiting on the start of a block (i.e. mode == TYPE or HEAD).  On exit,
+ * the output will also be caught up, and the checksum will have been updated
+ * if need be.
+ */
+int zlib_inflateIncomp(z_stream *z)
 {
-  int r, trv;
-  uInt b;
-
-  if (z == NULL || z->state == NULL || z->next_in == NULL)
-    return Z_STREAM_ERROR;
-  trv = f == Z_FINISH ? Z_BUF_ERROR : Z_OK;
-  r = Z_BUF_ERROR;
-  while (1) switch (z->state->mode)
-  {
-    case METHOD:
-      NEEDBYTE
-      if (((z->state->sub.method = NEXTBYTE) & 0xf) != Z_DEFLATED)
-      {
-        z->state->mode = I_BAD;
-        z->msg = (char*)"unknown compression method";
-        z->state->sub.marker = 5;       /* can't try inflateSync */
-        break;
-      }
-      if ((z->state->sub.method >> 4) + 8 > z->state->wbits)
-      {
-        z->state->mode = I_BAD;
-        z->msg = (char*)"invalid window size";
-        z->state->sub.marker = 5;       /* can't try inflateSync */
-        break;
-      }
-      z->state->mode = FLAG;
-    case FLAG:
-      NEEDBYTE
-      b = NEXTBYTE;
-      if (((z->state->sub.method << 8) + b) % 31)
-      {
-        z->state->mode = I_BAD;
-        z->msg = (char*)"incorrect header check";
-        z->state->sub.marker = 5;       /* can't try inflateSync */
-        break;
-      }
-      if (!(b & PRESET_DICT))
-      {
-        z->state->mode = BLOCKS;
-        break;
-      }
-      z->state->mode = DICT4;
-    case DICT4:
-      NEEDBYTE
-      z->state->sub.check.need = (uLong)NEXTBYTE << 24;
-      z->state->mode = DICT3;
-    case DICT3:
-      NEEDBYTE
-      z->state->sub.check.need += (uLong)NEXTBYTE << 16;
-      z->state->mode = DICT2;
-    case DICT2:
-      NEEDBYTE
-      z->state->sub.check.need += (uLong)NEXTBYTE << 8;
-      z->state->mode = DICT1;
-    case DICT1:
-      NEEDBYTE
-      z->state->sub.check.need += (uLong)NEXTBYTE;
-      z->adler = z->state->sub.check.need;
-      z->state->mode = DICT0;
-      return Z_NEED_DICT;
-    case DICT0:
-      z->state->mode = I_BAD;
-      z->msg = (char*)"need dictionary";
-      z->state->sub.marker = 0;       /* can try inflateSync */
-      return Z_STREAM_ERROR;
-    case BLOCKS:
-      r = zlib_inflate_blocks(z->state->blocks, z, r);
-      if (f == Z_PACKET_FLUSH && z->avail_in == 0 && z->avail_out != 0)
-	  r = zlib_inflate_packet_flush(z->state->blocks);
-      if (r == Z_DATA_ERROR)
-      {
-        z->state->mode = I_BAD;
-        z->state->sub.marker = 0;       /* can try inflateSync */
-        break;
-      }
-      if (r == Z_OK)
-        r = trv;
-      if (r != Z_STREAM_END)
-        return r;
-      r = trv;
-      zlib_inflate_blocks_reset(z->state->blocks, z, &z->state->sub.check.was);
-      if (z->state->nowrap)
-      {
-        z->state->mode = I_DONE;
-        break;
-      }
-      z->state->mode = CHECK4;
-    case CHECK4:
-      NEEDBYTE
-      z->state->sub.check.need = (uLong)NEXTBYTE << 24;
-      z->state->mode = CHECK3;
-    case CHECK3:
-      NEEDBYTE
-      z->state->sub.check.need += (uLong)NEXTBYTE << 16;
-      z->state->mode = CHECK2;
-    case CHECK2:
-      NEEDBYTE
-      z->state->sub.check.need += (uLong)NEXTBYTE << 8;
-      z->state->mode = CHECK1;
-    case CHECK1:
-      NEEDBYTE
-      z->state->sub.check.need += (uLong)NEXTBYTE;
-
-      if (z->state->sub.check.was != z->state->sub.check.need)
-      {
-        z->state->mode = I_BAD;
-        z->msg = (char*)"incorrect data check";
-        z->state->sub.marker = 5;       /* can't try inflateSync */
-        break;
-      }
-      z->state->mode = I_DONE;
-    case I_DONE:
-      return Z_STREAM_END;
-    case I_BAD:
-      return Z_DATA_ERROR;
-    default:
-      return Z_STREAM_ERROR;
-  }
- empty:
-  if (f != Z_PACKET_FLUSH)
-    return r;
-  z->state->mode = I_BAD;
-  z->msg = (char *)"need more for packet flush";
-  z->state->sub.marker = 0;       /* can try inflateSync */
-  return Z_DATA_ERROR;
+    struct inflate_state *state = (struct inflate_state *)z->state;
+    Byte *saved_no = z->next_out;
+    uInt saved_ao = z->avail_out;
+
+    if (state->mode != TYPE && state->mode != HEAD)
+	return Z_DATA_ERROR;
+
+    /* Setup some variables to allow misuse of updateWindow */
+    z->avail_out = 0;
+    z->next_out = z->next_in + z->avail_in;
+
+    zlib_updatewindow(z, z->avail_in);
+
+    /* Restore saved variables */
+    z->avail_out = saved_ao;
+    z->next_out = saved_no;
+
+    z->adler = state->check =
+        UPDATE(state->check, z->next_in, z->avail_in);
+
+    z->total_out += z->avail_in;
+    z->total_in += z->avail_in;
+    z->next_in += z->avail_in;
+    state->total += z->avail_in;
+    z->avail_in = 0;
+
+    return Z_OK;
 }
diff --git a/lib/zlib_inflate/inflate.h b/lib/zlib_inflate/inflate.h
new file mode 100644
index 000000000000..df8a6c92052d
--- /dev/null
+++ b/lib/zlib_inflate/inflate.h
@@ -0,0 +1,107 @@
+/* inflate.h -- internal inflate state definition
+ * Copyright (C) 1995-2004 Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+/* WARNING: this file should *not* be used by applications. It is
+   part of the implementation of the compression library and is
+   subject to change. Applications should only use zlib.h.
+ */
+
+/* Possible inflate modes between inflate() calls */
+typedef enum {
+    HEAD,       /* i: waiting for magic header */
+    FLAGS,      /* i: waiting for method and flags (gzip) */
+    TIME,       /* i: waiting for modification time (gzip) */
+    OS,         /* i: waiting for extra flags and operating system (gzip) */
+    EXLEN,      /* i: waiting for extra length (gzip) */
+    EXTRA,      /* i: waiting for extra bytes (gzip) */
+    NAME,       /* i: waiting for end of file name (gzip) */
+    COMMENT,    /* i: waiting for end of comment (gzip) */
+    HCRC,       /* i: waiting for header crc (gzip) */
+    DICTID,     /* i: waiting for dictionary check value */
+    DICT,       /* waiting for inflateSetDictionary() call */
+        TYPE,       /* i: waiting for type bits, including last-flag bit */
+        TYPEDO,     /* i: same, but skip check to exit inflate on new block */
+        STORED,     /* i: waiting for stored size (length and complement) */
+        COPY,       /* i/o: waiting for input or output to copy stored block */
+        TABLE,      /* i: waiting for dynamic block table lengths */
+        LENLENS,    /* i: waiting for code length code lengths */
+        CODELENS,   /* i: waiting for length/lit and distance code lengths */
+            LEN,        /* i: waiting for length/lit code */
+            LENEXT,     /* i: waiting for length extra bits */
+            DIST,       /* i: waiting for distance code */
+            DISTEXT,    /* i: waiting for distance extra bits */
+            MATCH,      /* o: waiting for output space to copy string */
+            LIT,        /* o: waiting for output space to write literal */
+    CHECK,      /* i: waiting for 32-bit check value */
+    LENGTH,     /* i: waiting for 32-bit length (gzip) */
+    DONE,       /* finished check, done -- remain here until reset */
+    BAD,        /* got a data error -- remain here until reset */
+    MEM,        /* got an inflate() memory error -- remain here until reset */
+    SYNC        /* looking for synchronization bytes to restart inflate() */
+} inflate_mode;
+
+/*
+    State transitions between above modes -
+
+    (most modes can go to the BAD or MEM mode -- not shown for clarity)
+
+    Process header:
+        HEAD -> (gzip) or (zlib)
+        (gzip) -> FLAGS -> TIME -> OS -> EXLEN -> EXTRA -> NAME
+        NAME -> COMMENT -> HCRC -> TYPE
+        (zlib) -> DICTID or TYPE
+        DICTID -> DICT -> TYPE
+    Read deflate blocks:
+            TYPE -> STORED or TABLE or LEN or CHECK
+            STORED -> COPY -> TYPE
+            TABLE -> LENLENS -> CODELENS -> LEN
+    Read deflate codes:
+                LEN -> LENEXT or LIT or TYPE
+                LENEXT -> DIST -> DISTEXT -> MATCH -> LEN
+                LIT -> LEN
+    Process trailer:
+        CHECK -> LENGTH -> DONE
+ */
+
+/* state maintained between inflate() calls.  Approximately 7K bytes. */
+struct inflate_state {
+    inflate_mode mode;          /* current inflate mode */
+    int last;                   /* true if processing last block */
+    int wrap;                   /* bit 0 true for zlib, bit 1 true for gzip */
+    int havedict;               /* true if dictionary provided */
+    int flags;                  /* gzip header method and flags (0 if zlib) */
+    unsigned dmax;              /* zlib header max distance (INFLATE_STRICT) */
+    unsigned long check;        /* protected copy of check value */
+    unsigned long total;        /* protected copy of output count */
+ /*   gz_headerp head; */           /* where to save gzip header information */
+        /* sliding window */
+    unsigned wbits;             /* log base 2 of requested window size */
+    unsigned wsize;             /* window size or zero if not using window */
+    unsigned whave;             /* valid bytes in the window */
+    unsigned write;             /* window write index */
+    unsigned char *window;  /* allocated sliding window, if needed */
+        /* bit accumulator */
+    unsigned long hold;         /* input bit accumulator */
+    unsigned bits;              /* number of bits in "in" */
+        /* for string and stored block copying */
+    unsigned length;            /* literal or length of data to copy */
+    unsigned offset;            /* distance back to copy string from */
+        /* for table and code decoding */
+    unsigned extra;             /* extra bits needed */
+        /* fixed and dynamic code tables */
+    code const *lencode;    /* starting table for length/literal codes */
+    code const *distcode;   /* starting table for distance codes */
+    unsigned lenbits;           /* index bits for lencode */
+    unsigned distbits;          /* index bits for distcode */
+        /* dynamic table building */
+    unsigned ncode;             /* number of code length code lengths */
+    unsigned nlen;              /* number of length code lengths */
+    unsigned ndist;             /* number of distance code lengths */
+    unsigned have;              /* number of code lengths in lens[] */
+    code *next;             /* next available space in codes[] */
+    unsigned short lens[320];   /* temporary storage for code lengths */
+    unsigned short work[288];   /* work area for code table building */
+    code codes[ENOUGH];         /* space for code tables */
+};
diff --git a/lib/zlib_inflate/inflate_syms.c b/lib/zlib_inflate/inflate_syms.c
index ef49738f57ec..2061d4f06765 100644
--- a/lib/zlib_inflate/inflate_syms.c
+++ b/lib/zlib_inflate/inflate_syms.c
@@ -12,8 +12,7 @@
 
 EXPORT_SYMBOL(zlib_inflate_workspacesize);
 EXPORT_SYMBOL(zlib_inflate);
-EXPORT_SYMBOL(zlib_inflateInit_);
-EXPORT_SYMBOL(zlib_inflateInit2_);
+EXPORT_SYMBOL(zlib_inflateInit2);
 EXPORT_SYMBOL(zlib_inflateEnd);
 EXPORT_SYMBOL(zlib_inflateReset);
 EXPORT_SYMBOL(zlib_inflateIncomp); 
diff --git a/lib/zlib_inflate/inflate_sync.c b/lib/zlib_inflate/inflate_sync.c
deleted file mode 100644
index 61411ff89d61..000000000000
--- a/lib/zlib_inflate/inflate_sync.c
+++ /dev/null
@@ -1,152 +0,0 @@
-/* inflate.c -- zlib interface to inflate modules
- * Copyright (C) 1995-1998 Mark Adler
- * For conditions of distribution and use, see copyright notice in zlib.h
- */
-
-#include <linux/zutil.h>
-#include "infblock.h"
-#include "infutil.h"
-
-#if 0
-int zlib_inflateSync(
-	z_streamp z
-)
-{
-  uInt n;       /* number of bytes to look at */
-  Byte *p;      /* pointer to bytes */
-  uInt m;       /* number of marker bytes found in a row */
-  uLong r, w;   /* temporaries to save total_in and total_out */
-
-  /* set up */
-  if (z == NULL || z->state == NULL)
-    return Z_STREAM_ERROR;
-  if (z->state->mode != I_BAD)
-  {
-    z->state->mode = I_BAD;
-    z->state->sub.marker = 0;
-  }
-  if ((n = z->avail_in) == 0)
-    return Z_BUF_ERROR;
-  p = z->next_in;
-  m = z->state->sub.marker;
-
-  /* search */
-  while (n && m < 4)
-  {
-    static const Byte mark[4] = {0, 0, 0xff, 0xff};
-    if (*p == mark[m])
-      m++;
-    else if (*p)
-      m = 0;
-    else
-      m = 4 - m;
-    p++, n--;
-  }
-
-  /* restore */
-  z->total_in += p - z->next_in;
-  z->next_in = p;
-  z->avail_in = n;
-  z->state->sub.marker = m;
-
-  /* return no joy or set up to restart on a new block */
-  if (m != 4)
-    return Z_DATA_ERROR;
-  r = z->total_in;  w = z->total_out;
-  zlib_inflateReset(z);
-  z->total_in = r;  z->total_out = w;
-  z->state->mode = BLOCKS;
-  return Z_OK;
-}
-#endif  /*  0  */
-
-
-/* Returns true if inflate is currently at the end of a block generated
- * by Z_SYNC_FLUSH or Z_FULL_FLUSH. This function is used by one PPP
- * implementation to provide an additional safety check. PPP uses Z_SYNC_FLUSH
- * but removes the length bytes of the resulting empty stored block. When
- * decompressing, PPP checks that at the end of input packet, inflate is
- * waiting for these length bytes.
- */
-#if 0
-int zlib_inflateSyncPoint(
-	z_streamp z
-)
-{
-  if (z == NULL || z->state == NULL || z->state->blocks == NULL)
-    return Z_STREAM_ERROR;
-  return zlib_inflate_blocks_sync_point(z->state->blocks);
-}
-#endif  /*  0  */
-
-/*
- * This subroutine adds the data at next_in/avail_in to the output history
- * without performing any output.  The output buffer must be "caught up";
- * i.e. no pending output (hence s->read equals s->write), and the state must
- * be BLOCKS (i.e. we should be willing to see the start of a series of
- * BLOCKS).  On exit, the output will also be caught up, and the checksum
- * will have been updated if need be.
- */
-static int zlib_inflate_addhistory(inflate_blocks_statef *s,
-				      z_stream              *z)
-{
-    uLong b;              /* bit buffer */  /* NOT USED HERE */
-    uInt k;               /* bits in bit buffer */ /* NOT USED HERE */
-    uInt t;               /* temporary storage */
-    Byte *p;              /* input data pointer */
-    uInt n;               /* bytes available there */
-    Byte *q;              /* output window write pointer */
-    uInt m;               /* bytes to end of window or read pointer */
-
-    if (s->read != s->write)
-	return Z_STREAM_ERROR;
-    if (s->mode != TYPE)
-	return Z_DATA_ERROR;
-
-    /* we're ready to rock */
-    LOAD
-    /* while there is input ready, copy to output buffer, moving
-     * pointers as needed.
-     */
-    while (n) {
-	t = n;  /* how many to do */
-	/* is there room until end of buffer? */
-	if (t > m) t = m;
-	/* update check information */
-	if (s->checkfn != NULL)
-	    s->check = (*s->checkfn)(s->check, q, t);
-	memcpy(q, p, t);
-	q += t;
-	p += t;
-	n -= t;
-	z->total_out += t;
-	s->read = q;    /* drag read pointer forward */
-/*      WWRAP  */ 	/* expand WWRAP macro by hand to handle s->read */
-	if (q == s->end) {
-	    s->read = q = s->window;
-	    m = WAVAIL;
-	}
-    }
-    UPDATE
-    return Z_OK;
-}
-
-
-/*
- * This subroutine adds the data at next_in/avail_in to the output history
- * without performing any output.  The output buffer must be "caught up";
- * i.e. no pending output (hence s->read equals s->write), and the state must
- * be BLOCKS (i.e. we should be willing to see the start of a series of
- * BLOCKS).  On exit, the output will also be caught up, and the checksum
- * will have been updated if need be.
- */
-
-int zlib_inflateIncomp(
-	z_stream *z
-
-)
-{
-    if (z->state->mode != BLOCKS)
-	return Z_DATA_ERROR;
-    return zlib_inflate_addhistory(z->state->blocks, z);
-}
diff --git a/lib/zlib_inflate/inftrees.c b/lib/zlib_inflate/inftrees.c
index 874950ec4858..62343c53bf7e 100644
--- a/lib/zlib_inflate/inftrees.c
+++ b/lib/zlib_inflate/inftrees.c
@@ -1,412 +1,329 @@
 /* inftrees.c -- generate Huffman trees for efficient decoding
- * Copyright (C) 1995-1998 Mark Adler
- * For conditions of distribution and use, see copyright notice in zlib.h 
+ * Copyright (C) 1995-2005 Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
  */
 
 #include <linux/zutil.h>
 #include "inftrees.h"
-#include "infutil.h"
 
-static const char inflate_copyright[] __attribute_used__ =
-   " inflate 1.1.3 Copyright 1995-1998 Mark Adler ";
+#define MAXBITS 15
+
+const char inflate_copyright[] =
+   " inflate 1.2.3 Copyright 1995-2005 Mark Adler ";
 /*
   If you use the zlib library in a product, an acknowledgment is welcome
   in the documentation of your product. If for some reason you cannot
   include such an acknowledgment, I would appreciate that you keep this
   copyright string in the executable of your product.
  */
-struct internal_state;
-
-/* simplify the use of the inflate_huft type with some defines */
-#define exop word.what.Exop
-#define bits word.what.Bits
-
-
-static int huft_build (
-    uInt *,             /* code lengths in bits */
-    uInt,               /* number of codes */
-    uInt,               /* number of "simple" codes */
-    const uInt *,       /* list of base values for non-simple codes */
-    const uInt *,       /* list of extra bits for non-simple codes */
-    inflate_huft **,    /* result: starting table */
-    uInt *,             /* maximum lookup bits (returns actual) */
-    inflate_huft *,     /* space for trees */
-    uInt *,             /* hufts used in space */
-    uInt * );           /* space for values */
-
-/* Tables for deflate from PKZIP's appnote.txt. */
-static const uInt cplens[31] = { /* Copy lengths for literal codes 257..285 */
-        3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 15, 17, 19, 23, 27, 31,
-        35, 43, 51, 59, 67, 83, 99, 115, 131, 163, 195, 227, 258, 0, 0};
-        /* see note #13 above about 258 */
-static const uInt cplext[31] = { /* Extra bits for literal codes 257..285 */
-        0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2,
-        3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 0, 112, 112}; /* 112==invalid */
-static const uInt cpdist[30] = { /* Copy offsets for distance codes 0..29 */
-        1, 2, 3, 4, 5, 7, 9, 13, 17, 25, 33, 49, 65, 97, 129, 193,
-        257, 385, 513, 769, 1025, 1537, 2049, 3073, 4097, 6145,
-        8193, 12289, 16385, 24577};
-static const uInt cpdext[30] = { /* Extra bits for distance codes */
-        0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6,
-        7, 7, 8, 8, 9, 9, 10, 10, 11, 11,
-        12, 12, 13, 13};
 
 /*
-   Huffman code decoding is performed using a multi-level table lookup.
-   The fastest way to decode is to simply build a lookup table whose
-   size is determined by the longest code.  However, the time it takes
-   to build this table can also be a factor if the data being decoded
-   is not very long.  The most common codes are necessarily the
-   shortest codes, so those codes dominate the decoding time, and hence
-   the speed.  The idea is you can have a shorter table that decodes the
-   shorter, more probable codes, and then point to subsidiary tables for
-   the longer codes.  The time it costs to decode the longer codes is
-   then traded against the time it takes to make longer tables.
-
-   This results of this trade are in the variables lbits and dbits
-   below.  lbits is the number of bits the first level table for literal/
-   length codes can decode in one step, and dbits is the same thing for
-   the distance codes.  Subsequent tables are also less than or equal to
-   those sizes.  These values may be adjusted either when all of the
-   codes are shorter than that, in which case the longest code length in
-   bits is used, or when the shortest code is *longer* than the requested
-   table size, in which case the length of the shortest code in bits is
-   used.
-
-   There are two different values for the two tables, since they code a
-   different number of possibilities each.  The literal/length table
-   codes 286 possible values, or in a flat code, a little over eight
-   bits.  The distance table codes 30 possible values, or a little less
-   than five bits, flat.  The optimum values for speed end up being
-   about one bit more than those, so lbits is 8+1 and dbits is 5+1.
-   The optimum values may differ though from machine to machine, and
-   possibly even between compilers.  Your mileage may vary.
+   Build a set of tables to decode the provided canonical Huffman code.
+   The code lengths are lens[0..codes-1].  The result starts at *table,
+   whose indices are 0..2^bits-1.  work is a writable array of at least
+   lens shorts, which is used as a work area.  type is the type of code
+   to be generated, CODES, LENS, or DISTS.  On return, zero is success,
+   -1 is an invalid code, and +1 means that ENOUGH isn't enough.  table
+   on return points to the next available entry's address.  bits is the
+   requested root table index bits, and on return it is the actual root
+   table index bits.  It will differ if the request is greater than the
+   longest code or if it is less than the shortest code.
  */
-
-
-/* If BMAX needs to be larger than 16, then h and x[] should be uLong. */
-#define BMAX 15         /* maximum bit length of any code */
-
-static int huft_build(
-	uInt *b,               /* code lengths in bits (all assumed <= BMAX) */
-	uInt n,                /* number of codes (assumed <= 288) */
-	uInt s,                /* number of simple-valued codes (0..s-1) */
-	const uInt *d,         /* list of base values for non-simple codes */
-	const uInt *e,         /* list of extra bits for non-simple codes */
-	inflate_huft **t,      /* result: starting table */
-	uInt *m,               /* maximum lookup bits, returns actual */
-	inflate_huft *hp,      /* space for trees */
-	uInt *hn,              /* hufts used in space */
-	uInt *v                /* working area: values in order of bit length */
-)
-/* Given a list of code lengths and a maximum table size, make a set of
-   tables to decode that set of codes.  Return Z_OK on success, Z_BUF_ERROR
-   if the given code set is incomplete (the tables are still built in this
-   case), Z_DATA_ERROR if the input is invalid (an over-subscribed set of
-   lengths), or Z_MEM_ERROR if not enough memory. */
+int zlib_inflate_table(type, lens, codes, table, bits, work)
+codetype type;
+unsigned short *lens;
+unsigned codes;
+code **table;
+unsigned *bits;
+unsigned short *work;
 {
+    unsigned len;               /* a code's length in bits */
+    unsigned sym;               /* index of code symbols */
+    unsigned min, max;          /* minimum and maximum code lengths */
+    unsigned root;              /* number of index bits for root table */
+    unsigned curr;              /* number of index bits for current table */
+    unsigned drop;              /* code bits to drop for sub-table */
+    int left;                   /* number of prefix codes available */
+    unsigned used;              /* code entries in table used */
+    unsigned huff;              /* Huffman code */
+    unsigned incr;              /* for incrementing code, index */
+    unsigned fill;              /* index for replicating entries */
+    unsigned low;               /* low bits for current root entry */
+    unsigned mask;              /* mask for low root bits */
+    code this;                  /* table entry for duplication */
+    code *next;             /* next available space in table */
+    const unsigned short *base;     /* base value table to use */
+    const unsigned short *extra;    /* extra bits table to use */
+    int end;                    /* use base and extra for symbol > end */
+    unsigned short count[MAXBITS+1];    /* number of codes of each length */
+    unsigned short offs[MAXBITS+1];     /* offsets in table for each length */
+    static const unsigned short lbase[31] = { /* Length codes 257..285 base */
+        3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 15, 17, 19, 23, 27, 31,
+        35, 43, 51, 59, 67, 83, 99, 115, 131, 163, 195, 227, 258, 0, 0};
+    static const unsigned short lext[31] = { /* Length codes 257..285 extra */
+        16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 17, 17, 18, 18, 18, 18,
+        19, 19, 19, 19, 20, 20, 20, 20, 21, 21, 21, 21, 16, 201, 196};
+    static const unsigned short dbase[32] = { /* Distance codes 0..29 base */
+        1, 2, 3, 4, 5, 7, 9, 13, 17, 25, 33, 49, 65, 97, 129, 193,
+        257, 385, 513, 769, 1025, 1537, 2049, 3073, 4097, 6145,
+        8193, 12289, 16385, 24577, 0, 0};
+    static const unsigned short dext[32] = { /* Distance codes 0..29 extra */
+        16, 16, 16, 16, 17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22, 22,
+        23, 23, 24, 24, 25, 25, 26, 26, 27, 27,
+        28, 28, 29, 29, 64, 64};
+
+    /*
+       Process a set of code lengths to create a canonical Huffman code.  The
+       code lengths are lens[0..codes-1].  Each length corresponds to the
+       symbols 0..codes-1.  The Huffman code is generated by first sorting the
+       symbols by length from short to long, and retaining the symbol order
+       for codes with equal lengths.  Then the code starts with all zero bits
+       for the first code of the shortest length, and the codes are integer
+       increments for the same length, and zeros are appended as the length
+       increases.  For the deflate format, these bits are stored backwards
+       from their more natural integer increment ordering, and so when the
+       decoding tables are built in the large loop below, the integer codes
+       are incremented backwards.
+
+       This routine assumes, but does not check, that all of the entries in
+       lens[] are in the range 0..MAXBITS.  The caller must assure this.
+       1..MAXBITS is interpreted as that code length.  zero means that that
+       symbol does not occur in this code.
+
+       The codes are sorted by computing a count of codes for each length,
+       creating from that a table of starting indices for each length in the
+       sorted table, and then entering the symbols in order in the sorted
+       table.  The sorted table is work[], with that space being provided by
+       the caller.
+
+       The length counts are used for other purposes as well, i.e. finding
+       the minimum and maximum length codes, determining if there are any
+       codes at all, checking for a valid set of lengths, and looking ahead
+       at length counts to determine sub-table sizes when building the
+       decoding tables.
+     */
+
+    /* accumulate lengths for codes (assumes lens[] all in 0..MAXBITS) */
+    for (len = 0; len <= MAXBITS; len++)
+        count[len] = 0;
+    for (sym = 0; sym < codes; sym++)
+        count[lens[sym]]++;
+
+    /* bound code lengths, force root to be within code lengths */
+    root = *bits;
+    for (max = MAXBITS; max >= 1; max--)
+        if (count[max] != 0) break;
+    if (root > max) root = max;
+    if (max == 0) {                     /* no symbols to code at all */
+        this.op = (unsigned char)64;    /* invalid code marker */
+        this.bits = (unsigned char)1;
+        this.val = (unsigned short)0;
+        *(*table)++ = this;             /* make a table to force an error */
+        *(*table)++ = this;
+        *bits = 1;
+        return 0;     /* no symbols, but wait for decoding to report error */
+    }
+    for (min = 1; min <= MAXBITS; min++)
+        if (count[min] != 0) break;
+    if (root < min) root = min;
+
+    /* check for an over-subscribed or incomplete set of lengths */
+    left = 1;
+    for (len = 1; len <= MAXBITS; len++) {
+        left <<= 1;
+        left -= count[len];
+        if (left < 0) return -1;        /* over-subscribed */
+    }
+    if (left > 0 && (type == CODES || max != 1))
+        return -1;                      /* incomplete set */
+
+    /* generate offsets into symbol table for each length for sorting */
+    offs[1] = 0;
+    for (len = 1; len < MAXBITS; len++)
+        offs[len + 1] = offs[len] + count[len];
+
+    /* sort symbols by length, by symbol order within each length */
+    for (sym = 0; sym < codes; sym++)
+        if (lens[sym] != 0) work[offs[lens[sym]]++] = (unsigned short)sym;
+
+    /*
+       Create and fill in decoding tables.  In this loop, the table being
+       filled is at next and has curr index bits.  The code being used is huff
+       with length len.  That code is converted to an index by dropping drop
+       bits off of the bottom.  For codes where len is less than drop + curr,
+       those top drop + curr - len bits are incremented through all values to
+       fill the table with replicated entries.
+
+       root is the number of index bits for the root table.  When len exceeds
+       root, sub-tables are created pointed to by the root entry with an index
+       of the low root bits of huff.  This is saved in low to check for when a
+       new sub-table should be started.  drop is zero when the root table is
+       being filled, and drop is root when sub-tables are being filled.
+
+       When a new sub-table is needed, it is necessary to look ahead in the
+       code lengths to determine what size sub-table is needed.  The length
+       counts are used for this, and so count[] is decremented as codes are
+       entered in the tables.
+
+       used keeps track of how many table entries have been allocated from the
+       provided *table space.  It is checked when a LENS table is being made
+       against the space in *table, ENOUGH, minus the maximum space needed by
+       the worst case distance code, MAXD.  This should never happen, but the
+       sufficiency of ENOUGH has not been proven exhaustively, hence the check.
+       This assumes that when type == LENS, bits == 9.
+
+       sym increments through all symbols, and the loop terminates when
+       all codes of length max, i.e. all codes, have been processed.  This
+       routine permits incomplete codes, so another loop after this one fills
+       in the rest of the decoding tables with invalid code markers.
+     */
+
+    /* set up for code type */
+    switch (type) {
+    case CODES:
+        base = extra = work;    /* dummy value--not used */
+        end = 19;
+        break;
+    case LENS:
+        base = lbase;
+        base -= 257;
+        extra = lext;
+        extra -= 257;
+        end = 256;
+        break;
+    default:            /* DISTS */
+        base = dbase;
+        extra = dext;
+        end = -1;
+    }
 
-  uInt a;                       /* counter for codes of length k */
-  uInt c[BMAX+1];               /* bit length count table */
-  uInt f;                       /* i repeats in table every f entries */
-  int g;                        /* maximum code length */
-  int h;                        /* table level */
-  register uInt i;              /* counter, current code */
-  register uInt j;              /* counter */
-  register int k;               /* number of bits in current code */
-  int l;                        /* bits per table (returned in m) */
-  uInt mask;                    /* (1 << w) - 1, to avoid cc -O bug on HP */
-  register uInt *p;             /* pointer into c[], b[], or v[] */
-  inflate_huft *q;              /* points to current table */
-  struct inflate_huft_s r;      /* table entry for structure assignment */
-  inflate_huft *u[BMAX];        /* table stack */
-  register int w;               /* bits before this table == (l * h) */
-  uInt x[BMAX+1];               /* bit offsets, then code stack */
-  uInt *xp;                     /* pointer into x */
-  int y;                        /* number of dummy codes added */
-  uInt z;                       /* number of entries in current table */
-
-
-  /* Generate counts for each bit length */
-  p = c;
-#define C0 *p++ = 0;
-#define C2 C0 C0 C0 C0
-#define C4 C2 C2 C2 C2
-  C4                            /* clear c[]--assume BMAX+1 is 16 */
-  p = b;  i = n;
-  do {
-    c[*p++]++;                  /* assume all entries <= BMAX */
-  } while (--i);
-  if (c[0] == n)                /* null input--all zero length codes */
-  {
-    *t = NULL;
-    *m = 0;
-    return Z_OK;
-  }
-
-
-  /* Find minimum and maximum length, bound *m by those */
-  l = *m;
-  for (j = 1; j <= BMAX; j++)
-    if (c[j])
-      break;
-  k = j;                        /* minimum code length */
-  if ((uInt)l < j)
-    l = j;
-  for (i = BMAX; i; i--)
-    if (c[i])
-      break;
-  g = i;                        /* maximum code length */
-  if ((uInt)l > i)
-    l = i;
-  *m = l;
-
-
-  /* Adjust last length count to fill out codes, if needed */
-  for (y = 1 << j; j < i; j++, y <<= 1)
-    if ((y -= c[j]) < 0)
-      return Z_DATA_ERROR;
-  if ((y -= c[i]) < 0)
-    return Z_DATA_ERROR;
-  c[i] += y;
-
-
-  /* Generate starting offsets into the value table for each length */
-  x[1] = j = 0;
-  p = c + 1;  xp = x + 2;
-  while (--i) {                 /* note that i == g from above */
-    *xp++ = (j += *p++);
-  }
-
-
-  /* Make a table of values in order of bit lengths */
-  p = b;  i = 0;
-  do {
-    if ((j = *p++) != 0)
-      v[x[j]++] = i;
-  } while (++i < n);
-  n = x[g];                     /* set n to length of v */
-
-
-  /* Generate the Huffman codes and for each, make the table entries */
-  x[0] = i = 0;                 /* first Huffman code is zero */
-  p = v;                        /* grab values in bit order */
-  h = -1;                       /* no tables yet--level -1 */
-  w = -l;                       /* bits decoded == (l * h) */
-  u[0] = NULL;                  /* just to keep compilers happy */
-  q = NULL;                     /* ditto */
-  z = 0;                        /* ditto */
-
-  /* go through the bit lengths (k already is bits in shortest code) */
-  for (; k <= g; k++)
-  {
-    a = c[k];
-    while (a--)
-    {
-      /* here i is the Huffman code of length k bits for value *p */
-      /* make tables up to required level */
-      while (k > w + l)
-      {
-        h++;
-        w += l;                 /* previous table always l bits */
-
-        /* compute minimum size table less than or equal to l bits */
-        z = g - w;
-        z = z > (uInt)l ? l : z;        /* table size upper limit */
-        if ((f = 1 << (j = k - w)) > a + 1)     /* try a k-w bit table */
-        {                       /* too few codes for k-w bit table */
-          f -= a + 1;           /* deduct codes from patterns left */
-          xp = c + k;
-          if (j < z)
-            while (++j < z)     /* try smaller tables up to z bits */
-            {
-              if ((f <<= 1) <= *++xp)
-                break;          /* enough codes to use up j bits */
-              f -= *xp;         /* else deduct codes from patterns */
-            }
+    /* initialize state for loop */
+    huff = 0;                   /* starting code */
+    sym = 0;                    /* starting code symbol */
+    len = min;                  /* starting code length */
+    next = *table;              /* current table to fill in */
+    curr = root;                /* current table index bits */
+    drop = 0;                   /* current bits to drop from code for index */
+    low = (unsigned)(-1);       /* trigger new sub-table when len > root */
+    used = 1U << root;          /* use root table entries */
+    mask = used - 1;            /* mask for comparing low */
+
+    /* check available table space */
+    if (type == LENS && used >= ENOUGH - MAXD)
+        return 1;
+
+    /* process all codes and make table entries */
+    for (;;) {
+        /* create table entry */
+        this.bits = (unsigned char)(len - drop);
+        if ((int)(work[sym]) < end) {
+            this.op = (unsigned char)0;
+            this.val = work[sym];
         }
-        z = 1 << j;             /* table entries for j-bit table */
-
-        /* allocate new table */
-        if (*hn + z > MANY)     /* (note: doesn't matter for fixed) */
-          return Z_DATA_ERROR;  /* overflow of MANY */
-        u[h] = q = hp + *hn;
-        *hn += z;
-
-        /* connect to last table, if there is one */
-        if (h)
-        {
-          x[h] = i;             /* save pattern for backing up */
-          r.bits = (Byte)l;     /* bits to dump before this table */
-          r.exop = (Byte)j;     /* bits in this table */
-          j = i >> (w - l);
-          r.base = (uInt)(q - u[h-1] - j);   /* offset to this table */
-          u[h-1][j] = r;        /* connect to last table */
+        else if ((int)(work[sym]) > end) {
+            this.op = (unsigned char)(extra[work[sym]]);
+            this.val = base[work[sym]];
+        }
+        else {
+            this.op = (unsigned char)(32 + 64);         /* end of block */
+            this.val = 0;
         }
-        else
-          *t = q;               /* first table is returned result */
-      }
-
-      /* set up table entry in r */
-      r.bits = (Byte)(k - w);
-      if (p >= v + n)
-        r.exop = 128 + 64;      /* out of values--invalid code */
-      else if (*p < s)
-      {
-        r.exop = (Byte)(*p < 256 ? 0 : 32 + 64);     /* 256 is end-of-block */
-        r.base = *p++;          /* simple code is just the value */
-      }
-      else
-      {
-        r.exop = (Byte)(e[*p - s] + 16 + 64);/* non-simple--look up in lists */
-        r.base = d[*p++ - s];
-      }
-
-      /* fill code-like entries with r */
-      f = 1 << (k - w);
-      for (j = i >> w; j < z; j += f)
-        q[j] = r;
-
-      /* backwards increment the k-bit code i */
-      for (j = 1 << (k - 1); i & j; j >>= 1)
-        i ^= j;
-      i ^= j;
-
-      /* backup over finished tables */
-      mask = (1 << w) - 1;      /* needed on HP, cc -O bug */
-      while ((i & mask) != x[h])
-      {
-        h--;                    /* don't need to update q */
-        w -= l;
-        mask = (1 << w) - 1;
-      }
-    }
-  }
 
+        /* replicate for those indices with low len bits equal to huff */
+        incr = 1U << (len - drop);
+        fill = 1U << curr;
+        min = fill;                 /* save offset to next table */
+        do {
+            fill -= incr;
+            next[(huff >> drop) + fill] = this;
+        } while (fill != 0);
+
+        /* backwards increment the len-bit code huff */
+        incr = 1U << (len - 1);
+        while (huff & incr)
+            incr >>= 1;
+        if (incr != 0) {
+            huff &= incr - 1;
+            huff += incr;
+        }
+        else
+            huff = 0;
 
-  /* Return Z_BUF_ERROR if we were given an incomplete table */
-  return y != 0 && g != 1 ? Z_BUF_ERROR : Z_OK;
-}
+        /* go to next symbol, update count, len */
+        sym++;
+        if (--(count[len]) == 0) {
+            if (len == max) break;
+            len = lens[work[sym]];
+        }
 
+        /* create new sub-table if needed */
+        if (len > root && (huff & mask) != low) {
+            /* if first time, transition to sub-tables */
+            if (drop == 0)
+                drop = root;
+
+            /* increment past last table */
+            next += min;            /* here min is 1 << curr */
+
+            /* determine length of next table */
+            curr = len - drop;
+            left = (int)(1 << curr);
+            while (curr + drop < max) {
+                left -= count[curr + drop];
+                if (left <= 0) break;
+                curr++;
+                left <<= 1;
+            }
 
-int zlib_inflate_trees_bits(
-	uInt *c,                /* 19 code lengths */
-	uInt *bb,               /* bits tree desired/actual depth */
-	inflate_huft **tb,      /* bits tree result */
-	inflate_huft *hp,       /* space for trees */
-	z_streamp z             /* for messages */
-)
-{
-  int r;
-  uInt hn = 0;          /* hufts used in space */
-  uInt *v;              /* work area for huft_build */
-  
-  v = WS(z)->tree_work_area_1;
-  r = huft_build(c, 19, 19, NULL, NULL, tb, bb, hp, &hn, v);
-  if (r == Z_DATA_ERROR)
-    z->msg = (char*)"oversubscribed dynamic bit lengths tree";
-  else if (r == Z_BUF_ERROR || *bb == 0)
-  {
-    z->msg = (char*)"incomplete dynamic bit lengths tree";
-    r = Z_DATA_ERROR;
-  }
-  return r;
-}
+            /* check for enough space */
+            used += 1U << curr;
+            if (type == LENS && used >= ENOUGH - MAXD)
+                return 1;
 
-int zlib_inflate_trees_dynamic(
-	uInt nl,                /* number of literal/length codes */
-	uInt nd,                /* number of distance codes */
-	uInt *c,                /* that many (total) code lengths */
-	uInt *bl,               /* literal desired/actual bit depth */
-	uInt *bd,               /* distance desired/actual bit depth */
-	inflate_huft **tl,      /* literal/length tree result */
-	inflate_huft **td,      /* distance tree result */
-	inflate_huft *hp,       /* space for trees */
-	z_streamp z             /* for messages */
-)
-{
-  int r;
-  uInt hn = 0;          /* hufts used in space */
-  uInt *v;              /* work area for huft_build */
-
-  /* allocate work area */
-  v = WS(z)->tree_work_area_2;
-
-  /* build literal/length tree */
-  r = huft_build(c, nl, 257, cplens, cplext, tl, bl, hp, &hn, v);
-  if (r != Z_OK || *bl == 0)
-  {
-    if (r == Z_DATA_ERROR)
-      z->msg = (char*)"oversubscribed literal/length tree";
-    else if (r != Z_MEM_ERROR)
-    {
-      z->msg = (char*)"incomplete literal/length tree";
-      r = Z_DATA_ERROR;
-    }
-    return r;
-  }
-
-  /* build distance tree */
-  r = huft_build(c + nl, nd, 0, cpdist, cpdext, td, bd, hp, &hn, v);
-  if (r != Z_OK || (*bd == 0 && nl > 257))
-  {
-    if (r == Z_DATA_ERROR)
-      z->msg = (char*)"oversubscribed distance tree";
-    else if (r == Z_BUF_ERROR) {
-#ifdef PKZIP_BUG_WORKAROUND
-      r = Z_OK;
-    }
-#else
-      z->msg = (char*)"incomplete distance tree";
-      r = Z_DATA_ERROR;
-    }
-    else if (r != Z_MEM_ERROR)
-    {
-      z->msg = (char*)"empty distance tree with lengths";
-      r = Z_DATA_ERROR;
+            /* point entry in root table to sub-table */
+            low = huff & mask;
+            (*table)[low].op = (unsigned char)curr;
+            (*table)[low].bits = (unsigned char)root;
+            (*table)[low].val = (unsigned short)(next - *table);
+        }
     }
-    return r;
-#endif
-  }
 
-  /* done */
-  return Z_OK;
-}
+    /*
+       Fill in rest of table for incomplete codes.  This loop is similar to the
+       loop above in incrementing huff for table indices.  It is assumed that
+       len is equal to curr + drop, so there is no loop needed to increment
+       through high index bits.  When the current sub-table is filled, the loop
+       drops back to the root table to fill in any remaining entries there.
+     */
+    this.op = (unsigned char)64;                /* invalid code marker */
+    this.bits = (unsigned char)(len - drop);
+    this.val = (unsigned short)0;
+    while (huff != 0) {
+        /* when done with sub-table, drop back to root table */
+        if (drop != 0 && (huff & mask) != low) {
+            drop = 0;
+            len = root;
+            next = *table;
+            this.bits = (unsigned char)len;
+        }
 
+        /* put invalid code marker in table */
+        next[huff >> drop] = this;
 
-int zlib_inflate_trees_fixed(
-	uInt *bl,                /* literal desired/actual bit depth */
-	uInt *bd,                /* distance desired/actual bit depth */
-	inflate_huft **tl,       /* literal/length tree result */
-	inflate_huft **td,       /* distance tree result */
-	inflate_huft *hp,       /* space for trees */
-	z_streamp z              /* for memory allocation */
-)
-{
-  int i;                /* temporary variable */
-  unsigned l[288];      /* length list for huft_build */
-  uInt *v;              /* work area for huft_build */
-
-  /* set up literal table */
-  for (i = 0; i < 144; i++)
-    l[i] = 8;
-  for (; i < 256; i++)
-    l[i] = 9;
-  for (; i < 280; i++)
-    l[i] = 7;
-  for (; i < 288; i++)          /* make a complete, but wrong code set */
-    l[i] = 8;
-  *bl = 9;
-  v = WS(z)->tree_work_area_1;
-  if ((i = huft_build(l, 288, 257, cplens, cplext, tl, bl, hp,  &i, v)) != 0)
-    return i;
-
-  /* set up distance table */
-  for (i = 0; i < 30; i++)      /* make an incomplete code set */
-    l[i] = 5;
-  *bd = 5;
-  if ((i = huft_build(l, 30, 0, cpdist, cpdext, td, bd, hp, &i, v)) > 1)
-    return i;
-
-  return Z_OK;
+        /* backwards increment the len-bit code huff */
+        incr = 1U << (len - 1);
+        while (huff & incr)
+            incr >>= 1;
+        if (incr != 0) {
+            huff &= incr - 1;
+            huff += incr;
+        }
+        else
+            huff = 0;
+    }
+
+    /* set return parameters */
+    *table += used;
+    *bits = root;
+    return 0;
 }
diff --git a/lib/zlib_inflate/inftrees.h b/lib/zlib_inflate/inftrees.h
index e37705adc008..5f5219b1240e 100644
--- a/lib/zlib_inflate/inftrees.h
+++ b/lib/zlib_inflate/inftrees.h
@@ -1,6 +1,6 @@
 /* inftrees.h -- header to use inftrees.c
- * Copyright (C) 1995-1998 Mark Adler
- * For conditions of distribution and use, see copyright notice in zlib.h 
+ * Copyright (C) 1995-2005 Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
  */
 
 /* WARNING: this file should *not* be used by applications. It is
@@ -8,57 +8,48 @@
    subject to change. Applications should only use zlib.h.
  */
 
-/* Huffman code lookup table entry--this entry is four bytes for machines
-   that have 16-bit pointers (e.g. PC's in the small or medium model). */
-
-#ifndef _INFTREES_H
-#define _INFTREES_H
-
-typedef struct inflate_huft_s inflate_huft;
-
-struct inflate_huft_s {
-  union {
-    struct {
-      Byte Exop;        /* number of extra bits or operation */
-      Byte Bits;        /* number of bits in this code or subcode */
-    } what;
-    uInt pad;           /* pad structure to a power of 2 (4 bytes for */
-  } word;               /*  16-bit, 8 bytes for 32-bit int's) */
-  uInt base;            /* literal, length base, distance base,
-                           or table offset */
-};
+/* Structure for decoding tables.  Each entry provides either the
+   information needed to do the operation requested by the code that
+   indexed that table entry, or it provides a pointer to another
+   table that indexes more bits of the code.  op indicates whether
+   the entry is a pointer to another table, a literal, a length or
+   distance, an end-of-block, or an invalid code.  For a table
+   pointer, the low four bits of op is the number of index bits of
+   that table.  For a length or distance, the low four bits of op
+   is the number of extra bits to get after the code.  bits is
+   the number of bits in this code or part of the code to drop off
+   of the bit buffer.  val is the actual byte to output in the case
+   of a literal, the base length or distance, or the offset from
+   the current table to the next table.  Each entry is four bytes. */
+typedef struct {
+    unsigned char op;           /* operation, extra bits, table bits */
+    unsigned char bits;         /* bits in this part of the code */
+    unsigned short val;         /* offset in table or code value */
+} code;
+
+/* op values as set by inflate_table():
+    00000000 - literal
+    0000tttt - table link, tttt != 0 is the number of table index bits
+    0001eeee - length or distance, eeee is the number of extra bits
+    01100000 - end of block
+    01000000 - invalid code
+ */
 
 /* Maximum size of dynamic tree.  The maximum found in a long but non-
-   exhaustive search was 1004 huft structures (850 for length/literals
-   and 154 for distances, the latter actually the result of an
-   exhaustive search).  The actual maximum is not known, but the
-   value below is more than safe. */
-#define MANY 1440
-
-extern int zlib_inflate_trees_bits (
-    uInt *,                     /* 19 code lengths */
-    uInt *,                     /* bits tree desired/actual depth */
-    inflate_huft **,            /* bits tree result */
-    inflate_huft *,             /* space for trees */
-    z_streamp);                 /* for messages */
-
-extern int zlib_inflate_trees_dynamic (
-    uInt,                       /* number of literal/length codes */
-    uInt,                       /* number of distance codes */
-    uInt *,                     /* that many (total) code lengths */
-    uInt *,                     /* literal desired/actual bit depth */
-    uInt *,                     /* distance desired/actual bit depth */
-    inflate_huft **,            /* literal/length tree result */
-    inflate_huft **,            /* distance tree result */
-    inflate_huft *,             /* space for trees */
-    z_streamp);                 /* for messages */
-
-extern int zlib_inflate_trees_fixed (
-    uInt *,                     /* literal desired/actual bit depth */
-    uInt *,                     /* distance desired/actual bit depth */
-    inflate_huft **,            /* literal/length tree result */
-    inflate_huft **,            /* distance tree result */
-    inflate_huft *,             /* space for trees */
-    z_streamp);                 /* for memory allocation */
-
-#endif /* _INFTREES_H */
+   exhaustive search was 1444 code structures (852 for length/literals
+   and 592 for distances, the latter actually the result of an
+   exhaustive search).  The true maximum is not known, but the value
+   below is more than safe. */
+#define ENOUGH 2048
+#define MAXD 592
+
+/* Type of code to build for inftable() */
+typedef enum {
+    CODES,
+    LENS,
+    DISTS
+} codetype;
+
+extern int zlib_inflate_table (codetype type, unsigned short *lens,
+                             unsigned codes, code **table,
+                             unsigned *bits, unsigned short *work);
diff --git a/lib/zlib_inflate/infutil.c b/lib/zlib_inflate/infutil.c
deleted file mode 100644
index 00202b3438e1..000000000000
--- a/lib/zlib_inflate/infutil.c
+++ /dev/null
@@ -1,88 +0,0 @@
-/* inflate_util.c -- data and routines common to blocks and codes
- * Copyright (C) 1995-1998 Mark Adler
- * For conditions of distribution and use, see copyright notice in zlib.h 
- */
-
-#include <linux/zutil.h>
-#include "infblock.h"
-#include "inftrees.h"
-#include "infcodes.h"
-#include "infutil.h"
-
-struct inflate_codes_state;
-
-/* And'ing with mask[n] masks the lower n bits */
-uInt zlib_inflate_mask[17] = {
-    0x0000,
-    0x0001, 0x0003, 0x0007, 0x000f, 0x001f, 0x003f, 0x007f, 0x00ff,
-    0x01ff, 0x03ff, 0x07ff, 0x0fff, 0x1fff, 0x3fff, 0x7fff, 0xffff
-};
-
-
-/* copy as much as possible from the sliding window to the output area */
-int zlib_inflate_flush(
-	inflate_blocks_statef *s,
-	z_streamp z,
-	int r
-)
-{
-  uInt n;
-  Byte *p;
-  Byte *q;
-
-  /* local copies of source and destination pointers */
-  p = z->next_out;
-  q = s->read;
-
-  /* compute number of bytes to copy as far as end of window */
-  n = (uInt)((q <= s->write ? s->write : s->end) - q);
-  if (n > z->avail_out) n = z->avail_out;
-  if (n && r == Z_BUF_ERROR) r = Z_OK;
-
-  /* update counters */
-  z->avail_out -= n;
-  z->total_out += n;
-
-  /* update check information */
-  if (s->checkfn != NULL)
-    z->adler = s->check = (*s->checkfn)(s->check, q, n);
-
-  /* copy as far as end of window */
-  memcpy(p, q, n);
-  p += n;
-  q += n;
-
-  /* see if more to copy at beginning of window */
-  if (q == s->end)
-  {
-    /* wrap pointers */
-    q = s->window;
-    if (s->write == s->end)
-      s->write = s->window;
-
-    /* compute bytes to copy */
-    n = (uInt)(s->write - q);
-    if (n > z->avail_out) n = z->avail_out;
-    if (n && r == Z_BUF_ERROR) r = Z_OK;
-
-    /* update counters */
-    z->avail_out -= n;
-    z->total_out += n;
-
-    /* update check information */
-    if (s->checkfn != NULL)
-      z->adler = s->check = (*s->checkfn)(s->check, q, n);
-
-    /* copy */
-    memcpy(p, q, n);
-    p += n;
-    q += n;
-  }
-
-  /* update pointers */
-  z->next_out = p;
-  s->read = q;
-
-  /* done */
-  return r;
-}
diff --git a/lib/zlib_inflate/infutil.h b/lib/zlib_inflate/infutil.h
index a15875fc5f72..eb1a9007bd86 100644
--- a/lib/zlib_inflate/infutil.h
+++ b/lib/zlib_inflate/infutil.h
@@ -11,184 +11,12 @@
 #ifndef _INFUTIL_H
 #define _INFUTIL_H
 
-#include <linux/zconf.h>
-#include "inftrees.h"
-#include "infcodes.h"
-
-typedef enum {
-      TYPE,     /* get type bits (3, including end bit) */
-      LENS,     /* get lengths for stored */
-      STORED,   /* processing stored block */
-      TABLE,    /* get table lengths */
-      BTREE,    /* get bit lengths tree for a dynamic block */
-      DTREE,    /* get length, distance trees for a dynamic block */
-      CODES,    /* processing fixed or dynamic block */
-      DRY,      /* output remaining window bytes */
-      B_DONE,   /* finished last block, done */
-      B_BAD}    /* got a data error--stuck here */
-inflate_block_mode;
-
-/* inflate blocks semi-private state */
-struct inflate_blocks_state {
-
-  /* mode */
-  inflate_block_mode  mode;     /* current inflate_block mode */
-
-  /* mode dependent information */
-  union {
-    uInt left;          /* if STORED, bytes left to copy */
-    struct {
-      uInt table;               /* table lengths (14 bits) */
-      uInt index;               /* index into blens (or border) */
-      uInt *blens;              /* bit lengths of codes */
-      uInt bb;                  /* bit length tree depth */
-      inflate_huft *tb;         /* bit length decoding tree */
-    } trees;            /* if DTREE, decoding info for trees */
-    struct {
-      inflate_codes_statef 
-         *codes;
-    } decode;           /* if CODES, current state */
-  } sub;                /* submode */
-  uInt last;            /* true if this block is the last block */
-
-  /* mode independent information */
-  uInt bitk;            /* bits in bit buffer */
-  uLong bitb;           /* bit buffer */
-  inflate_huft *hufts;  /* single malloc for tree space */
-  Byte *window;         /* sliding window */
-  Byte *end;            /* one byte after sliding window */
-  Byte *read;           /* window read pointer */
-  Byte *write;          /* window write pointer */
-  check_func checkfn;   /* check function */
-  uLong check;          /* check on output */
-
-};
-
-
-/* defines for inflate input/output */
-/*   update pointers and return */
-#define UPDBITS {s->bitb=b;s->bitk=k;}
-#define UPDIN {z->avail_in=n;z->total_in+=p-z->next_in;z->next_in=p;}
-#define UPDOUT {s->write=q;}
-#define UPDATE {UPDBITS UPDIN UPDOUT}
-#define LEAVE {UPDATE return zlib_inflate_flush(s,z,r);}
-/*   get bytes and bits */
-#define LOADIN {p=z->next_in;n=z->avail_in;b=s->bitb;k=s->bitk;}
-#define NEEDBYTE {if(n)r=Z_OK;else LEAVE}
-#define NEXTBYTE (n--,*p++)
-#define NEEDBITS(j) {while(k<(j)){NEEDBYTE;b|=((uLong)NEXTBYTE)<<k;k+=8;}}
-#define DUMPBITS(j) {b>>=(j);k-=(j);}
-/*   output bytes */
-#define WAVAIL (uInt)(q<s->read?s->read-q-1:s->end-q)
-#define LOADOUT {q=s->write;m=(uInt)WAVAIL;}
-#define WRAP {if(q==s->end&&s->read!=s->window){q=s->window;m=(uInt)WAVAIL;}}
-#define FLUSH {UPDOUT r=zlib_inflate_flush(s,z,r); LOADOUT}
-#define NEEDOUT {if(m==0){WRAP if(m==0){FLUSH WRAP if(m==0) LEAVE}}r=Z_OK;}
-#define OUTBYTE(a) {*q++=(Byte)(a);m--;}
-/*   load local pointers */
-#define LOAD {LOADIN LOADOUT}
-
-/* masks for lower bits (size given to avoid silly warnings with Visual C++) */
-extern uInt zlib_inflate_mask[17];
-
-/* copy as much as possible from the sliding window to the output area */
-extern int zlib_inflate_flush (
-    inflate_blocks_statef *,
-    z_streamp ,
-    int);
-
-/* inflate private state */
-typedef enum {
-      METHOD,   /* waiting for method byte */
-      FLAG,     /* waiting for flag byte */
-      DICT4,    /* four dictionary check bytes to go */
-      DICT3,    /* three dictionary check bytes to go */
-      DICT2,    /* two dictionary check bytes to go */
-      DICT1,    /* one dictionary check byte to go */
-      DICT0,    /* waiting for inflateSetDictionary */
-      BLOCKS,   /* decompressing blocks */
-      CHECK4,   /* four check bytes to go */
-      CHECK3,   /* three check bytes to go */
-      CHECK2,   /* two check bytes to go */
-      CHECK1,   /* one check byte to go */
-      I_DONE,   /* finished check, done */
-      I_BAD}    /* got an error--stay here */
-inflate_mode;
-
-struct internal_state {
-
-  /* mode */
-  inflate_mode  mode;   /* current inflate mode */
-
-  /* mode dependent information */
-  union {
-    uInt method;        /* if FLAGS, method byte */
-    struct {
-      uLong was;                /* computed check value */
-      uLong need;               /* stream check value */
-    } check;            /* if CHECK, check values to compare */
-    uInt marker;        /* if BAD, inflateSync's marker bytes count */
-  } sub;        /* submode */
-
-  /* mode independent information */
-  int  nowrap;          /* flag for no wrapper */
-  uInt wbits;           /* log2(window size)  (8..15, defaults to 15) */
-  inflate_blocks_statef 
-    *blocks;            /* current inflate_blocks state */
-
-};
-
-/* inflate codes private state */
-typedef enum {        /* waiting for "i:"=input, "o:"=output, "x:"=nothing */
-      START,    /* x: set up for LEN */
-      LEN,      /* i: get length/literal/eob next */
-      LENEXT,   /* i: getting length extra (have base) */
-      DIST,     /* i: get distance next */
-      DISTEXT,  /* i: getting distance extra */
-      COPY,     /* o: copying bytes in window, waiting for space */
-      LIT,      /* o: got literal, waiting for output space */
-      WASH,     /* o: got eob, possibly still output waiting */
-      END,      /* x: got eob and all data flushed */
-      BADCODE}  /* x: got error */
-inflate_codes_mode;
-
-struct inflate_codes_state {
-
-  /* mode */
-  inflate_codes_mode mode;      /* current inflate_codes mode */
-
-  /* mode dependent information */
-  uInt len;
-  union {
-    struct {
-      inflate_huft *tree;       /* pointer into tree */
-      uInt need;                /* bits needed */
-    } code;             /* if LEN or DIST, where in tree */
-    uInt lit;           /* if LIT, literal */
-    struct {
-      uInt get;                 /* bits to get for extra */
-      uInt dist;                /* distance back to copy from */
-    } copy;             /* if EXT or COPY, where and how much */
-  } sub;                /* submode */
-
-  /* mode independent information */
-  Byte lbits;           /* ltree bits decoded per branch */
-  Byte dbits;           /* dtree bits decoder per branch */
-  inflate_huft *ltree;          /* literal/length/eob tree */
-  inflate_huft *dtree;          /* distance tree */
-
-};
+#include <linux/zlib.h>
 
 /* memory allocation for inflation */
 
 struct inflate_workspace {
-	inflate_codes_statef working_state;
-	struct inflate_blocks_state working_blocks_state;
-	struct internal_state internal_state;
-	unsigned int tree_work_area_1[19];
-	unsigned int tree_work_area_2[288];
-	unsigned working_blens[258 + 0x1f + 0x1f];
-	inflate_huft working_hufts[MANY];
+	struct inflate_state inflate_state;
 	unsigned char working_window[1 << MAX_WBITS];
 };
 
-- 
cgit v1.2.3


From 4a31e348e3ecaf54c50240109ac4574b180f8840 Mon Sep 17 00:00:00 2001
From: Krzysztof Halasa <khc@pm.waw.pl>
Date: Thu, 22 Jun 2006 22:20:19 +0200
Subject: [PATCH] WAN: register_hdlc_device() doesn't need dev_alloc_name()

David Boggs noticed that register_hdlc_device() no longer needs
to call dev_alloc_name() as it's called by register_netdev().
register_hdlc_device() is currently equivalent to register_netdev().

hdlc_setup() is now EXPORTed as per David's request.

Signed-off-by: Krzysztof Halasa <khc@pm.waw.pl>
Signed-off-by: Jeff Garzik <jeff@garzik.org>
---
 drivers/net/wan/hdlc_generic.c | 24 ++----------------------
 include/linux/hdlc.h           |  2 +-
 2 files changed, 3 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/wan/hdlc_generic.c b/drivers/net/wan/hdlc_generic.c
index 46cef8f92133..57f9538b8fb5 100644
--- a/drivers/net/wan/hdlc_generic.c
+++ b/drivers/net/wan/hdlc_generic.c
@@ -259,7 +259,7 @@ int hdlc_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 	}
 }
 
-static void hdlc_setup(struct net_device *dev)
+void hdlc_setup(struct net_device *dev)
 {
 	hdlc_device *hdlc = dev_to_hdlc(dev);
 
@@ -288,26 +288,6 @@ struct net_device *alloc_hdlcdev(void *priv)
 	return dev;
 }
 
-int register_hdlc_device(struct net_device *dev)
-{
-	int result = dev_alloc_name(dev, "hdlc%d");
-	if (result < 0)
-		return result;
-
-	result = register_netdev(dev);
-	if (result != 0)
-		return -EIO;
-
-#if 0
-	if (netif_carrier_ok(dev))
-		netif_carrier_off(dev); /* no carrier until DCD goes up */
-#endif
-
-	return 0;
-}
-
-
-
 void unregister_hdlc_device(struct net_device *dev)
 {
 	rtnl_lock();
@@ -326,8 +306,8 @@ EXPORT_SYMBOL(hdlc_open);
 EXPORT_SYMBOL(hdlc_close);
 EXPORT_SYMBOL(hdlc_set_carrier);
 EXPORT_SYMBOL(hdlc_ioctl);
+EXPORT_SYMBOL(hdlc_setup);
 EXPORT_SYMBOL(alloc_hdlcdev);
-EXPORT_SYMBOL(register_hdlc_device);
 EXPORT_SYMBOL(unregister_hdlc_device);
 
 static struct packet_type hdlc_packet_type = {
diff --git a/include/linux/hdlc.h b/include/linux/hdlc.h
index df695e9ae327..4513f9e40937 100644
--- a/include/linux/hdlc.h
+++ b/include/linux/hdlc.h
@@ -188,7 +188,7 @@ int hdlc_x25_ioctl(struct net_device *dev, struct ifreq *ifr);
 int hdlc_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd);
 
 /* Must be used by hardware driver on module startup/exit */
-int register_hdlc_device(struct net_device *dev);
+#define register_hdlc_device(dev)	register_netdev(dev)
 void unregister_hdlc_device(struct net_device *dev);
 
 struct net_device *alloc_hdlcdev(void *priv);
-- 
cgit v1.2.3


From 47005f255ed126a4b48a1a2f63164fb1d83bcb0a Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Mon, 19 Jun 2006 18:27:23 +0900
Subject: [PATCH] libata: implement per-dev EH action mask
 eh_info->dev_action[]

Currently, the only per-dev EH action is REVALIDATE.  EH used to
exploit ehi->dev to do selective revalidation on a ATA bus.  However,
this is a bit hacky and makes it impossible to request selective
revalidation from outside of EH or add another per-dev EH action.

This patch adds per-dev EH action mask eh_info->dev_action[] and
update EH to use this field for REVALIDATE.  Note that per-dev actions
can still be specified at port-level and it has the same effect of
specifying the action for all devices on the port.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Signed-off-by: Jeff Garzik <jeff@garzik.org>
---
 drivers/scsi/libata-eh.c | 85 +++++++++++++++++++++++++++++++++++++++---------
 include/linux/libata.h   |  2 ++
 2 files changed, 71 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/scsi/libata-eh.c b/drivers/scsi/libata-eh.c
index 531a4e11c078..70b623988a9f 100644
--- a/drivers/scsi/libata-eh.c
+++ b/drivers/scsi/libata-eh.c
@@ -706,9 +706,35 @@ static void ata_eh_detach_dev(struct ata_device *dev)
 	spin_unlock_irqrestore(&ap->host_set->lock, flags);
 }
 
+static void ata_eh_clear_action(struct ata_device *dev,
+				struct ata_eh_info *ehi, unsigned int action)
+{
+	int i;
+
+	if (!dev) {
+		ehi->action &= ~action;
+		for (i = 0; i < ATA_MAX_DEVICES; i++)
+			ehi->dev_action[i] &= ~action;
+	} else {
+		/* doesn't make sense for port-wide EH actions */
+		WARN_ON(!(action & ATA_EH_PERDEV_MASK));
+
+		/* break ehi->action into ehi->dev_action */
+		if (ehi->action & action) {
+			for (i = 0; i < ATA_MAX_DEVICES; i++)
+				ehi->dev_action[i] |= ehi->action & action;
+			ehi->action &= ~action;
+		}
+
+		/* turn off the specified per-dev action */
+		ehi->dev_action[dev->devno] &= ~action;
+	}
+}
+
 /**
  *	ata_eh_about_to_do - about to perform eh_action
  *	@ap: target ATA port
+ *	@dev: target ATA dev for per-dev action (can be NULL)
  *	@action: action about to be performed
  *
  *	Called just before performing EH actions to clear related bits
@@ -718,16 +744,35 @@ static void ata_eh_detach_dev(struct ata_device *dev)
  *	LOCKING:
  *	None.
  */
-static void ata_eh_about_to_do(struct ata_port *ap, unsigned int action)
+static void ata_eh_about_to_do(struct ata_port *ap, struct ata_device *dev,
+			       unsigned int action)
 {
 	unsigned long flags;
 
 	spin_lock_irqsave(&ap->host_set->lock, flags);
-	ap->eh_info.action &= ~action;
+	ata_eh_clear_action(dev, &ap->eh_info, action);
 	ap->flags |= ATA_FLAG_RECOVERED;
 	spin_unlock_irqrestore(&ap->host_set->lock, flags);
 }
 
+/**
+ *	ata_eh_done - EH action complete
+ *	@ap: target ATA port
+ *	@dev: target ATA dev for per-dev action (can be NULL)
+ *	@action: action just completed
+ *
+ *	Called right after performing EH actions to clear related bits
+ *	in @ap->eh_context.
+ *
+ *	LOCKING:
+ *	None.
+ */
+static void ata_eh_done(struct ata_port *ap, struct ata_device *dev,
+			unsigned int action)
+{
+	ata_eh_clear_action(dev, &ap->eh_context.i, action);
+}
+
 /**
  *	ata_err_string - convert err_mask to descriptive string
  *	@err_mask: error mask to convert to string
@@ -1271,10 +1316,6 @@ static void ata_eh_autopsy(struct ata_port *ap)
 			is_io = 1;
 	}
 
-	/* speed down iff command was in progress */
-	if (failed_dev)
-		action |= ata_eh_speed_down(failed_dev, is_io, all_err_mask);
-
 	/* enforce default EH actions */
 	if (ap->flags & ATA_FLAG_FROZEN ||
 	    all_err_mask & (AC_ERR_HSM | AC_ERR_TIMEOUT))
@@ -1282,6 +1323,17 @@ static void ata_eh_autopsy(struct ata_port *ap)
 	else if (all_err_mask)
 		action |= ATA_EH_REVALIDATE;
 
+	/* if we have offending qcs and the associated failed device */
+	if (failed_dev) {
+		/* speed down */
+		action |= ata_eh_speed_down(failed_dev, is_io, all_err_mask);
+
+		/* perform per-dev EH action only on the offending device */
+		ehc->i.dev_action[failed_dev->devno] |=
+			action & ATA_EH_PERDEV_MASK;
+		action &= ~ATA_EH_PERDEV_MASK;
+	}
+
 	/* record autopsy result */
 	ehc->i.dev = failed_dev;
 	ehc->i.action = action;
@@ -1457,7 +1509,7 @@ static int ata_eh_reset(struct ata_port *ap, int classify,
 				reset == softreset ? "soft" : "hard");
 
 	/* reset */
-	ata_eh_about_to_do(ap, ATA_EH_RESET_MASK);
+	ata_eh_about_to_do(ap, NULL, ATA_EH_RESET_MASK);
 	ehc->i.flags |= ATA_EHI_DID_RESET;
 
 	rc = ata_do_reset(ap, reset, classes);
@@ -1476,7 +1528,7 @@ static int ata_eh_reset(struct ata_port *ap, int classify,
 			return -EINVAL;
 		}
 
-		ata_eh_about_to_do(ap, ATA_EH_RESET_MASK);
+		ata_eh_about_to_do(ap, NULL, ATA_EH_RESET_MASK);
 		rc = ata_do_reset(ap, reset, classes);
 
 		if (rc == 0 && classify &&
@@ -1520,8 +1572,7 @@ static int ata_eh_reset(struct ata_port *ap, int classify,
 			postreset(ap, classes);
 
 		/* reset successful, schedule revalidation */
-		ehc->i.dev = NULL;
-		ehc->i.action &= ~ATA_EH_RESET_MASK;
+		ata_eh_done(ap, NULL, ATA_EH_RESET_MASK);
 		ehc->i.action |= ATA_EH_REVALIDATE;
 	}
 
@@ -1539,21 +1590,25 @@ static int ata_eh_revalidate_and_attach(struct ata_port *ap,
 	DPRINTK("ENTER\n");
 
 	for (i = 0; i < ATA_MAX_DEVICES; i++) {
+		unsigned int action;
+
 		dev = &ap->device[i];
+		action = ehc->i.action | ehc->i.dev_action[dev->devno];
 
-		if (ehc->i.action & ATA_EH_REVALIDATE && ata_dev_enabled(dev) &&
-		    (!ehc->i.dev || ehc->i.dev == dev)) {
+		if (action & ATA_EH_REVALIDATE && ata_dev_enabled(dev)) {
 			if (ata_port_offline(ap)) {
 				rc = -EIO;
 				break;
 			}
 
-			ata_eh_about_to_do(ap, ATA_EH_REVALIDATE);
+			ata_eh_about_to_do(ap, dev, ATA_EH_REVALIDATE);
 			rc = ata_dev_revalidate(dev,
 					ehc->i.flags & ATA_EHI_DID_RESET);
 			if (rc)
 				break;
 
+			ata_eh_done(ap, dev, ATA_EH_REVALIDATE);
+
 			/* schedule the scsi_rescan_device() here */
 			queue_work(ata_aux_wq, &(ap->scsi_rescan_task));
 		} else if (dev->class == ATA_DEV_UNKNOWN &&
@@ -1576,9 +1631,7 @@ static int ata_eh_revalidate_and_attach(struct ata_port *ap,
 		}
 	}
 
-	if (rc == 0)
-		ehc->i.action &= ~ATA_EH_REVALIDATE;
-	else
+	if (rc)
 		*r_failed_dev = dev;
 
 	DPRINTK("EXIT\n");
diff --git a/include/linux/libata.h b/include/linux/libata.h
index f03b8664af11..6b3c3af2c75f 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -249,6 +249,7 @@ enum {
 	ATA_EH_HARDRESET	= (1 << 2),
 
 	ATA_EH_RESET_MASK	= ATA_EH_SOFTRESET | ATA_EH_HARDRESET,
+	ATA_EH_PERDEV_MASK	= ATA_EH_REVALIDATE,
 
 	/* ata_eh_info->flags */
 	ATA_EHI_HOTPLUGGED	= (1 << 0),  /* could have been hotplugged */
@@ -462,6 +463,7 @@ struct ata_eh_info {
 	u32			serror;		/* SError from LLDD */
 	unsigned int		err_mask;	/* port-wide err_mask */
 	unsigned int		action;		/* ATA_EH_* action mask */
+	unsigned int		dev_action[ATA_MAX_DEVICES]; /* dev EH action */
 	unsigned int		flags;		/* ATA_EHI_* flags */
 
 	unsigned long		hotplug_timestamp;
-- 
cgit v1.2.3


From ba6a13083c1b720a47c05bee7bedbb6ef06c4611 Mon Sep 17 00:00:00 2001
From: Jeff Garzik <jeff@garzik.org>
Date: Thu, 22 Jun 2006 23:46:10 -0400
Subject: [libata] Add host lock to struct ata_port

Prepare for changes required to support SATA devices
attached to SAS HBAs. For these devices we don't want to
use host_set at all, since libata will not be the owner
of struct scsi_host.

Signed-off-by: Brian King <brking@us.ibm.com>

(with slight merge modifications made by...)
Signed-off-by: Jeff Garzik <jeff@garzik.org>
---
 drivers/scsi/libata-bmdma.c |  5 ++--
 drivers/scsi/libata-core.c  | 55 +++++++++++++++++++-------------------
 drivers/scsi/libata-eh.c    | 65 ++++++++++++++++++++++-----------------------
 drivers/scsi/libata-scsi.c  | 20 +++++++-------
 include/linux/libata.h      |  1 +
 5 files changed, 73 insertions(+), 73 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/scsi/libata-bmdma.c b/drivers/scsi/libata-bmdma.c
index 13fab97c840e..004e1a0d8b71 100644
--- a/drivers/scsi/libata-bmdma.c
+++ b/drivers/scsi/libata-bmdma.c
@@ -715,7 +715,6 @@ void ata_bmdma_drive_eh(struct ata_port *ap, ata_prereset_fn_t prereset,
 			ata_reset_fn_t softreset, ata_reset_fn_t hardreset,
 			ata_postreset_fn_t postreset)
 {
-	struct ata_host_set *host_set = ap->host_set;
 	struct ata_eh_context *ehc = &ap->eh_context;
 	struct ata_queued_cmd *qc;
 	unsigned long flags;
@@ -726,7 +725,7 @@ void ata_bmdma_drive_eh(struct ata_port *ap, ata_prereset_fn_t prereset,
 		qc = NULL;
 
 	/* reset PIO HSM and stop DMA engine */
-	spin_lock_irqsave(&host_set->lock, flags);
+	spin_lock_irqsave(ap->lock, flags);
 
 	ap->hsm_task_state = HSM_ST_IDLE;
 
@@ -755,7 +754,7 @@ void ata_bmdma_drive_eh(struct ata_port *ap, ata_prereset_fn_t prereset,
 	ata_chk_status(ap);
 	ap->ops->irq_clear(ap);
 
-	spin_unlock_irqrestore(&host_set->lock, flags);
+	spin_unlock_irqrestore(ap->lock, flags);
 
 	if (thaw)
 		ata_eh_thaw_port(ap);
diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c
index 425ab1493fd3..24d340aeb518 100644
--- a/drivers/scsi/libata-core.c
+++ b/drivers/scsi/libata-core.c
@@ -933,9 +933,9 @@ void ata_port_flush_task(struct ata_port *ap)
 
 	DPRINTK("ENTER\n");
 
-	spin_lock_irqsave(&ap->host_set->lock, flags);
+	spin_lock_irqsave(ap->lock, flags);
 	ap->flags |= ATA_FLAG_FLUSH_PORT_TASK;
-	spin_unlock_irqrestore(&ap->host_set->lock, flags);
+	spin_unlock_irqrestore(ap->lock, flags);
 
 	DPRINTK("flush #1\n");
 	flush_workqueue(ata_wq);
@@ -950,9 +950,9 @@ void ata_port_flush_task(struct ata_port *ap)
 		flush_workqueue(ata_wq);
 	}
 
-	spin_lock_irqsave(&ap->host_set->lock, flags);
+	spin_lock_irqsave(ap->lock, flags);
 	ap->flags &= ~ATA_FLAG_FLUSH_PORT_TASK;
-	spin_unlock_irqrestore(&ap->host_set->lock, flags);
+	spin_unlock_irqrestore(ap->lock, flags);
 
 	DPRINTK("EXIT\n");
 }
@@ -999,11 +999,11 @@ unsigned ata_exec_internal(struct ata_device *dev,
 	unsigned int err_mask;
 	int rc;
 
-	spin_lock_irqsave(&ap->host_set->lock, flags);
+	spin_lock_irqsave(ap->lock, flags);
 
 	/* no internal command while frozen */
 	if (ap->flags & ATA_FLAG_FROZEN) {
-		spin_unlock_irqrestore(&ap->host_set->lock, flags);
+		spin_unlock_irqrestore(ap->lock, flags);
 		return AC_ERR_SYSTEM;
 	}
 
@@ -1052,14 +1052,14 @@ unsigned ata_exec_internal(struct ata_device *dev,
 
 	ata_qc_issue(qc);
 
-	spin_unlock_irqrestore(&ap->host_set->lock, flags);
+	spin_unlock_irqrestore(ap->lock, flags);
 
 	rc = wait_for_completion_timeout(&wait, ATA_TMOUT_INTERNAL);
 
 	ata_port_flush_task(ap);
 
 	if (!rc) {
-		spin_lock_irqsave(&ap->host_set->lock, flags);
+		spin_lock_irqsave(ap->lock, flags);
 
 		/* We're racing with irq here.  If we lose, the
 		 * following test prevents us from completing the qc
@@ -1078,7 +1078,7 @@ unsigned ata_exec_internal(struct ata_device *dev,
 				       "qc timeout (cmd 0x%x)\n", command);
 		}
 
-		spin_unlock_irqrestore(&ap->host_set->lock, flags);
+		spin_unlock_irqrestore(ap->lock, flags);
 	}
 
 	/* do post_internal_cmd */
@@ -1092,7 +1092,7 @@ unsigned ata_exec_internal(struct ata_device *dev,
 	}
 
 	/* finish up */
-	spin_lock_irqsave(&ap->host_set->lock, flags);
+	spin_lock_irqsave(ap->lock, flags);
 
 	*tf = qc->result_tf;
 	err_mask = qc->err_mask;
@@ -1118,7 +1118,7 @@ unsigned ata_exec_internal(struct ata_device *dev,
 		ata_port_probe(ap);
 	}
 
-	spin_unlock_irqrestore(&ap->host_set->lock, flags);
+	spin_unlock_irqrestore(ap->lock, flags);
 
 	return err_mask;
 }
@@ -3912,7 +3912,7 @@ static void ata_hsm_qc_complete(struct ata_queued_cmd *qc, int in_wq)
 
 	if (ap->ops->error_handler) {
 		if (in_wq) {
-			spin_lock_irqsave(&ap->host_set->lock, flags);
+			spin_lock_irqsave(ap->lock, flags);
 
 			/* EH might have kicked in while host_set lock
 			 * is released.
@@ -3926,7 +3926,7 @@ static void ata_hsm_qc_complete(struct ata_queued_cmd *qc, int in_wq)
 					ata_port_freeze(ap);
 			}
 
-			spin_unlock_irqrestore(&ap->host_set->lock, flags);
+			spin_unlock_irqrestore(ap->lock, flags);
 		} else {
 			if (likely(!(qc->err_mask & AC_ERR_HSM)))
 				ata_qc_complete(qc);
@@ -3935,10 +3935,10 @@ static void ata_hsm_qc_complete(struct ata_queued_cmd *qc, int in_wq)
 		}
 	} else {
 		if (in_wq) {
-			spin_lock_irqsave(&ap->host_set->lock, flags);
+			spin_lock_irqsave(ap->lock, flags);
 			ata_irq_on(ap);
 			ata_qc_complete(qc);
-			spin_unlock_irqrestore(&ap->host_set->lock, flags);
+			spin_unlock_irqrestore(ap->lock, flags);
 		} else
 			ata_qc_complete(qc);
 	}
@@ -4018,7 +4018,7 @@ fsm_start:
 		 * hsm_task_state is changed. Hence, the following locking.
 		 */
 		if (in_wq)
-			spin_lock_irqsave(&ap->host_set->lock, flags);
+			spin_lock_irqsave(ap->lock, flags);
 
 		if (qc->tf.protocol == ATA_PROT_PIO) {
 			/* PIO data out protocol.
@@ -4037,7 +4037,7 @@ fsm_start:
 			atapi_send_cdb(ap, qc);
 
 		if (in_wq)
-			spin_unlock_irqrestore(&ap->host_set->lock, flags);
+			spin_unlock_irqrestore(ap->lock, flags);
 
 		/* if polling, ata_pio_task() handles the rest.
 		 * otherwise, interrupt handler takes over from here.
@@ -5130,9 +5130,9 @@ void ata_dev_init(struct ata_device *dev)
 	 * requests which occur asynchronously.  Synchronize using
 	 * host_set lock.
 	 */
-	spin_lock_irqsave(&ap->host_set->lock, flags);
+	spin_lock_irqsave(ap->lock, flags);
 	dev->flags &= ~ATA_DFLAG_INIT_MASK;
-	spin_unlock_irqrestore(&ap->host_set->lock, flags);
+	spin_unlock_irqrestore(ap->lock, flags);
 
 	memset((void *)dev + ATA_DEVICE_CLEAR_OFFSET, 0,
 	       sizeof(*dev) - ATA_DEVICE_CLEAR_OFFSET);
@@ -5167,6 +5167,7 @@ static void ata_host_init(struct ata_port *ap, struct Scsi_Host *host,
 	host->unique_id = ata_unique_id++;
 	host->max_cmd_len = 12;
 
+	ap->lock = &host_set->lock;
 	ap->flags = ATA_FLAG_DISABLED;
 	ap->id = host->unique_id;
 	ap->host = host;
@@ -5388,7 +5389,7 @@ int ata_device_add(const struct ata_probe_ent *ent)
 			ata_port_probe(ap);
 
 			/* kick EH for boot probing */
-			spin_lock_irqsave(&ap->host_set->lock, flags);
+			spin_lock_irqsave(ap->lock, flags);
 
 			ap->eh_info.probe_mask = (1 << ATA_MAX_DEVICES) - 1;
 			ap->eh_info.action |= ATA_EH_SOFTRESET;
@@ -5396,7 +5397,7 @@ int ata_device_add(const struct ata_probe_ent *ent)
 			ap->flags |= ATA_FLAG_LOADING;
 			ata_port_schedule_eh(ap);
 
-			spin_unlock_irqrestore(&ap->host_set->lock, flags);
+			spin_unlock_irqrestore(ap->lock, flags);
 
 			/* wait for EH to finish */
 			ata_port_wait_eh(ap);
@@ -5460,29 +5461,29 @@ void ata_port_detach(struct ata_port *ap)
 		return;
 
 	/* tell EH we're leaving & flush EH */
-	spin_lock_irqsave(&ap->host_set->lock, flags);
+	spin_lock_irqsave(ap->lock, flags);
 	ap->flags |= ATA_FLAG_UNLOADING;
-	spin_unlock_irqrestore(&ap->host_set->lock, flags);
+	spin_unlock_irqrestore(ap->lock, flags);
 
 	ata_port_wait_eh(ap);
 
 	/* EH is now guaranteed to see UNLOADING, so no new device
 	 * will be attached.  Disable all existing devices.
 	 */
-	spin_lock_irqsave(&ap->host_set->lock, flags);
+	spin_lock_irqsave(ap->lock, flags);
 
 	for (i = 0; i < ATA_MAX_DEVICES; i++)
 		ata_dev_disable(&ap->device[i]);
 
-	spin_unlock_irqrestore(&ap->host_set->lock, flags);
+	spin_unlock_irqrestore(ap->lock, flags);
 
 	/* Final freeze & EH.  All in-flight commands are aborted.  EH
 	 * will be skipped and retrials will be terminated with bad
 	 * target.
 	 */
-	spin_lock_irqsave(&ap->host_set->lock, flags);
+	spin_lock_irqsave(ap->lock, flags);
 	ata_port_freeze(ap);	/* won't be thawed */
-	spin_unlock_irqrestore(&ap->host_set->lock, flags);
+	spin_unlock_irqrestore(ap->lock, flags);
 
 	ata_port_wait_eh(ap);
 
diff --git a/drivers/scsi/libata-eh.c b/drivers/scsi/libata-eh.c
index 70b623988a9f..823385981a7a 100644
--- a/drivers/scsi/libata-eh.c
+++ b/drivers/scsi/libata-eh.c
@@ -128,7 +128,7 @@ enum scsi_eh_timer_return ata_scsi_timed_out(struct scsi_cmnd *cmd)
 	}
 
 	ret = EH_HANDLED;
-	spin_lock_irqsave(&ap->host_set->lock, flags);
+	spin_lock_irqsave(ap->lock, flags);
 	qc = ata_qc_from_tag(ap, ap->active_tag);
 	if (qc) {
 		WARN_ON(qc->scsicmd != cmd);
@@ -136,7 +136,7 @@ enum scsi_eh_timer_return ata_scsi_timed_out(struct scsi_cmnd *cmd)
 		qc->err_mask |= AC_ERR_TIMEOUT;
 		ret = EH_NOT_HANDLED;
 	}
-	spin_unlock_irqrestore(&ap->host_set->lock, flags);
+	spin_unlock_irqrestore(ap->lock, flags);
 
  out:
 	DPRINTK("EXIT, ret=%d\n", ret);
@@ -158,7 +158,7 @@ enum scsi_eh_timer_return ata_scsi_timed_out(struct scsi_cmnd *cmd)
 void ata_scsi_error(struct Scsi_Host *host)
 {
 	struct ata_port *ap = ata_shost_to_port(host);
-	spinlock_t *hs_lock = &ap->host_set->lock;
+	spinlock_t *ap_lock = ap->lock;
 	int i, repeat_cnt = ATA_EH_MAX_REPEAT;
 	unsigned long flags;
 
@@ -185,7 +185,7 @@ void ata_scsi_error(struct Scsi_Host *host)
 		struct scsi_cmnd *scmd, *tmp;
 		int nr_timedout = 0;
 
-		spin_lock_irqsave(hs_lock, flags);
+		spin_lock_irqsave(ap_lock, flags);
 
 		list_for_each_entry_safe(scmd, tmp, &host->eh_cmd_q, eh_entry) {
 			struct ata_queued_cmd *qc;
@@ -224,15 +224,15 @@ void ata_scsi_error(struct Scsi_Host *host)
 		if (nr_timedout)
 			__ata_port_freeze(ap);
 
-		spin_unlock_irqrestore(hs_lock, flags);
+		spin_unlock_irqrestore(ap_lock, flags);
 	} else
-		spin_unlock_wait(hs_lock);
+		spin_unlock_wait(ap_lock);
 
  repeat:
 	/* invoke error handler */
 	if (ap->ops->error_handler) {
 		/* fetch & clear EH info */
-		spin_lock_irqsave(hs_lock, flags);
+		spin_lock_irqsave(ap_lock, flags);
 
 		memset(&ap->eh_context, 0, sizeof(ap->eh_context));
 		ap->eh_context.i = ap->eh_info;
@@ -241,7 +241,7 @@ void ata_scsi_error(struct Scsi_Host *host)
 		ap->flags |= ATA_FLAG_EH_IN_PROGRESS;
 		ap->flags &= ~ATA_FLAG_EH_PENDING;
 
-		spin_unlock_irqrestore(hs_lock, flags);
+		spin_unlock_irqrestore(ap_lock, flags);
 
 		/* invoke EH.  if unloading, just finish failed qcs */
 		if (!(ap->flags & ATA_FLAG_UNLOADING))
@@ -253,14 +253,14 @@ void ata_scsi_error(struct Scsi_Host *host)
 		 * recovered the port but before this point.  Repeat
 		 * EH in such case.
 		 */
-		spin_lock_irqsave(hs_lock, flags);
+		spin_lock_irqsave(ap_lock, flags);
 
 		if (ap->flags & ATA_FLAG_EH_PENDING) {
 			if (--repeat_cnt) {
 				ata_port_printk(ap, KERN_INFO,
 					"EH pending after completion, "
 					"repeating EH (cnt=%d)\n", repeat_cnt);
-				spin_unlock_irqrestore(hs_lock, flags);
+				spin_unlock_irqrestore(ap_lock, flags);
 				goto repeat;
 			}
 			ata_port_printk(ap, KERN_ERR, "EH pending after %d "
@@ -270,14 +270,14 @@ void ata_scsi_error(struct Scsi_Host *host)
 		/* this run is complete, make sure EH info is clear */
 		memset(&ap->eh_info, 0, sizeof(ap->eh_info));
 
-		/* Clear host_eh_scheduled while holding hs_lock such
+		/* Clear host_eh_scheduled while holding ap_lock such
 		 * that if exception occurs after this point but
 		 * before EH completion, SCSI midlayer will
 		 * re-initiate EH.
 		 */
 		host->host_eh_scheduled = 0;
 
-		spin_unlock_irqrestore(hs_lock, flags);
+		spin_unlock_irqrestore(ap_lock, flags);
 	} else {
 		WARN_ON(ata_qc_from_tag(ap, ap->active_tag) == NULL);
 		ap->ops->eng_timeout(ap);
@@ -289,7 +289,7 @@ void ata_scsi_error(struct Scsi_Host *host)
 	scsi_eh_flush_done_q(&ap->eh_done_q);
 
 	/* clean up */
-	spin_lock_irqsave(hs_lock, flags);
+	spin_lock_irqsave(ap_lock, flags);
 
 	if (ap->flags & ATA_FLAG_LOADING) {
 		ap->flags &= ~ATA_FLAG_LOADING;
@@ -306,7 +306,7 @@ void ata_scsi_error(struct Scsi_Host *host)
 	ap->flags &= ~ATA_FLAG_EH_IN_PROGRESS;
 	wake_up_all(&ap->eh_wait_q);
 
-	spin_unlock_irqrestore(hs_lock, flags);
+	spin_unlock_irqrestore(ap_lock, flags);
 
 	DPRINTK("EXIT\n");
 }
@@ -326,17 +326,17 @@ void ata_port_wait_eh(struct ata_port *ap)
 	DEFINE_WAIT(wait);
 
  retry:
-	spin_lock_irqsave(&ap->host_set->lock, flags);
+	spin_lock_irqsave(ap->lock, flags);
 
 	while (ap->flags & (ATA_FLAG_EH_PENDING | ATA_FLAG_EH_IN_PROGRESS)) {
 		prepare_to_wait(&ap->eh_wait_q, &wait, TASK_UNINTERRUPTIBLE);
-		spin_unlock_irqrestore(&ap->host_set->lock, flags);
+		spin_unlock_irqrestore(ap->lock, flags);
 		schedule();
-		spin_lock_irqsave(&ap->host_set->lock, flags);
+		spin_lock_irqsave(ap->lock, flags);
 	}
 	finish_wait(&ap->eh_wait_q, &wait);
 
-	spin_unlock_irqrestore(&ap->host_set->lock, flags);
+	spin_unlock_irqrestore(ap->lock, flags);
 
 	/* make sure SCSI EH is complete */
 	if (scsi_host_in_recovery(ap->host)) {
@@ -368,7 +368,6 @@ void ata_port_wait_eh(struct ata_port *ap)
 static void ata_qc_timeout(struct ata_queued_cmd *qc)
 {
 	struct ata_port *ap = qc->ap;
-	struct ata_host_set *host_set = ap->host_set;
 	u8 host_stat = 0, drv_stat;
 	unsigned long flags;
 
@@ -376,7 +375,7 @@ static void ata_qc_timeout(struct ata_queued_cmd *qc)
 
 	ap->hsm_task_state = HSM_ST_IDLE;
 
-	spin_lock_irqsave(&host_set->lock, flags);
+	spin_lock_irqsave(ap->lock, flags);
 
 	switch (qc->tf.protocol) {
 
@@ -405,7 +404,7 @@ static void ata_qc_timeout(struct ata_queued_cmd *qc)
 		break;
 	}
 
-	spin_unlock_irqrestore(&host_set->lock, flags);
+	spin_unlock_irqrestore(ap->lock, flags);
 
 	ata_eh_qc_complete(qc);
 
@@ -592,9 +591,9 @@ void ata_eh_freeze_port(struct ata_port *ap)
 	if (!ap->ops->error_handler)
 		return;
 
-	spin_lock_irqsave(&ap->host_set->lock, flags);
+	spin_lock_irqsave(ap->lock, flags);
 	__ata_port_freeze(ap);
-	spin_unlock_irqrestore(&ap->host_set->lock, flags);
+	spin_unlock_irqrestore(ap->lock, flags);
 }
 
 /**
@@ -613,14 +612,14 @@ void ata_eh_thaw_port(struct ata_port *ap)
 	if (!ap->ops->error_handler)
 		return;
 
-	spin_lock_irqsave(&ap->host_set->lock, flags);
+	spin_lock_irqsave(ap->lock, flags);
 
 	ap->flags &= ~ATA_FLAG_FROZEN;
 
 	if (ap->ops->thaw)
 		ap->ops->thaw(ap);
 
-	spin_unlock_irqrestore(&ap->host_set->lock, flags);
+	spin_unlock_irqrestore(ap->lock, flags);
 
 	DPRINTK("ata%u port thawed\n", ap->id);
 }
@@ -636,11 +635,11 @@ static void __ata_eh_qc_complete(struct ata_queued_cmd *qc)
 	struct scsi_cmnd *scmd = qc->scsicmd;
 	unsigned long flags;
 
-	spin_lock_irqsave(&ap->host_set->lock, flags);
+	spin_lock_irqsave(ap->lock, flags);
 	qc->scsidone = ata_eh_scsidone;
 	__ata_qc_complete(qc);
 	WARN_ON(ata_tag_valid(qc->tag));
-	spin_unlock_irqrestore(&ap->host_set->lock, flags);
+	spin_unlock_irqrestore(ap->lock, flags);
 
 	scsi_eh_finish_cmd(scmd, &ap->eh_done_q);
 }
@@ -694,7 +693,7 @@ static void ata_eh_detach_dev(struct ata_device *dev)
 
 	ata_dev_disable(dev);
 
-	spin_lock_irqsave(&ap->host_set->lock, flags);
+	spin_lock_irqsave(ap->lock, flags);
 
 	dev->flags &= ~ATA_DFLAG_DETACH;
 
@@ -703,7 +702,7 @@ static void ata_eh_detach_dev(struct ata_device *dev)
 		ap->flags |= ATA_FLAG_SCSI_HOTPLUG;
 	}
 
-	spin_unlock_irqrestore(&ap->host_set->lock, flags);
+	spin_unlock_irqrestore(ap->lock, flags);
 }
 
 static void ata_eh_clear_action(struct ata_device *dev,
@@ -749,10 +748,10 @@ static void ata_eh_about_to_do(struct ata_port *ap, struct ata_device *dev,
 {
 	unsigned long flags;
 
-	spin_lock_irqsave(&ap->host_set->lock, flags);
+	spin_lock_irqsave(ap->lock, flags);
 	ata_eh_clear_action(dev, &ap->eh_info, action);
 	ap->flags |= ATA_FLAG_RECOVERED;
-	spin_unlock_irqrestore(&ap->host_set->lock, flags);
+	spin_unlock_irqrestore(ap->lock, flags);
 }
 
 /**
@@ -1625,9 +1624,9 @@ static int ata_eh_revalidate_and_attach(struct ata_port *ap,
 				break;
 			}
 
-			spin_lock_irqsave(&ap->host_set->lock, flags);
+			spin_lock_irqsave(ap->lock, flags);
 			ap->flags |= ATA_FLAG_SCSI_HOTPLUG;
-			spin_unlock_irqrestore(&ap->host_set->lock, flags);
+			spin_unlock_irqrestore(ap->lock, flags);
 		}
 	}
 
diff --git a/drivers/scsi/libata-scsi.c b/drivers/scsi/libata-scsi.c
index 9698949fa52a..d86abed62007 100644
--- a/drivers/scsi/libata-scsi.c
+++ b/drivers/scsi/libata-scsi.c
@@ -752,7 +752,7 @@ void ata_scsi_slave_destroy(struct scsi_device *sdev)
 	if (!ap->ops->error_handler)
 		return;
 
-	spin_lock_irqsave(&ap->host_set->lock, flags);
+	spin_lock_irqsave(ap->lock, flags);
 	dev = __ata_scsi_find_dev(ap, sdev);
 	if (dev && dev->sdev) {
 		/* SCSI device already in CANCEL state, no need to offline it */
@@ -760,7 +760,7 @@ void ata_scsi_slave_destroy(struct scsi_device *sdev)
 		dev->flags |= ATA_DFLAG_DETACH;
 		ata_port_schedule_eh(ap);
 	}
-	spin_unlock_irqrestore(&ap->host_set->lock, flags);
+	spin_unlock_irqrestore(ap->lock, flags);
 }
 
 /**
@@ -2684,7 +2684,7 @@ int ata_scsi_queuecmd(struct scsi_cmnd *cmd, void (*done)(struct scsi_cmnd *))
 	ap = ata_shost_to_port(shost);
 
 	spin_unlock(shost->host_lock);
-	spin_lock(&ap->host_set->lock);
+	spin_lock(ap->lock);
 
 	ata_scsi_dump_cdb(ap, cmd);
 
@@ -2696,7 +2696,7 @@ int ata_scsi_queuecmd(struct scsi_cmnd *cmd, void (*done)(struct scsi_cmnd *))
 		done(cmd);
 	}
 
-	spin_unlock(&ap->host_set->lock);
+	spin_unlock(ap->lock);
 	spin_lock(shost->host_lock);
 	return rc;
 }
@@ -2858,7 +2858,7 @@ static void ata_scsi_remove_dev(struct ata_device *dev)
 	 * increments reference counts regardless of device state.
 	 */
 	mutex_lock(&ap->host->scan_mutex);
-	spin_lock_irqsave(&ap->host_set->lock, flags);
+	spin_lock_irqsave(ap->lock, flags);
 
 	/* clearing dev->sdev is protected by host_set lock */
 	sdev = dev->sdev;
@@ -2882,7 +2882,7 @@ static void ata_scsi_remove_dev(struct ata_device *dev)
 		}
 	}
 
-	spin_unlock_irqrestore(&ap->host_set->lock, flags);
+	spin_unlock_irqrestore(ap->lock, flags);
 	mutex_unlock(&ap->host->scan_mutex);
 
 	if (sdev) {
@@ -2926,9 +2926,9 @@ void ata_scsi_hotplug(void *data)
 		if (!(dev->flags & ATA_DFLAG_DETACHED))
 			continue;
 
-		spin_lock_irqsave(&ap->host_set->lock, flags);
+		spin_lock_irqsave(ap->lock, flags);
 		dev->flags &= ~ATA_DFLAG_DETACHED;
-		spin_unlock_irqrestore(&ap->host_set->lock, flags);
+		spin_unlock_irqrestore(ap->lock, flags);
 
 		ata_scsi_remove_dev(dev);
 	}
@@ -2981,7 +2981,7 @@ static int ata_scsi_user_scan(struct Scsi_Host *shost, unsigned int channel,
 	    (lun != SCAN_WILD_CARD && lun != 0))
 		return -EINVAL;
 
-	spin_lock_irqsave(&ap->host_set->lock, flags);
+	spin_lock_irqsave(ap->lock, flags);
 
 	if (id == SCAN_WILD_CARD) {
 		ap->eh_info.probe_mask |= (1 << ATA_MAX_DEVICES) - 1;
@@ -2999,7 +2999,7 @@ static int ata_scsi_user_scan(struct Scsi_Host *shost, unsigned int channel,
 	if (rc == 0)
 		ata_port_schedule_eh(ap);
 
-	spin_unlock_irqrestore(&ap->host_set->lock, flags);
+	spin_unlock_irqrestore(ap->lock, flags);
 
 	return rc;
 }
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 6b3c3af2c75f..20b1cf527c60 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -483,6 +483,7 @@ struct ata_eh_context {
 struct ata_port {
 	struct Scsi_Host	*host;	/* our co-allocated scsi host */
 	const struct ata_port_operations *ops;
+	spinlock_t		*lock;
 	unsigned long		flags;	/* ATA_FLAG_xxx */
 	unsigned int		id;	/* unique id req'd by scsi midlyr */
 	unsigned int		port_no; /* unique port #; from zero */
-- 
cgit v1.2.3


From 5b057c6b1a25d57edf2b4d1e956e50936480a9ff Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Fri, 23 Jun 2006 02:06:41 -0700
Subject: [NET]: Avoid allocating skb in skb_pad

First of all it is unnecessary to allocate a new skb in skb_pad since
the existing one is not shared.  More importantly, our hard_start_xmit
interface does not allow a new skb to be allocated since that breaks
requeueing.

This patch uses pskb_expand_head to expand the existing skb and linearize
it if needed.  Actually, someone should sift through every instance of
skb_pad on a non-linear skb as they do not fit the reasons why this was
originally created.

Incidentally, this fixes a minor bug when the skb is cloned (tcpdump,
TCP, etc.).  As it is skb_pad will simply write over a cloned skb.  Because
of the position of the write it is unlikely to cause problems but still
it's best if we don't do it.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/3c527.c               |  3 +--
 drivers/net/82596.c               |  3 +--
 drivers/net/a2065.c               |  3 +--
 drivers/net/ariadne.c             |  3 +--
 drivers/net/arm/ether1.c          |  3 +--
 drivers/net/arm/ether3.c          |  3 +--
 drivers/net/atarilance.c          |  3 +--
 drivers/net/cassini.c             |  3 +--
 drivers/net/declance.c            |  3 +--
 drivers/net/depca.c               |  7 ++-----
 drivers/net/eepro.c               |  3 +--
 drivers/net/eexpress.c            |  3 +--
 drivers/net/epic100.c             |  7 ++-----
 drivers/net/eth16i.c              |  3 +--
 drivers/net/hp100.c               |  7 ++-----
 drivers/net/lance.c               |  3 +--
 drivers/net/lasi_82596.c          |  3 +--
 drivers/net/lp486e.c              |  3 +--
 drivers/net/myri10ge/myri10ge.c   |  3 +--
 drivers/net/pcmcia/fmvj18x_cs.c   |  3 +--
 drivers/net/pcmcia/xirc2ps_cs.c   |  3 +--
 drivers/net/r8169.c               |  3 +--
 drivers/net/seeq8005.c            |  3 +--
 drivers/net/sis190.c              |  3 +--
 drivers/net/sk98lin/skge.c        |  2 +-
 drivers/net/skge.c                |  3 +--
 drivers/net/smc9194.c             |  3 +--
 drivers/net/sonic.c               |  3 +--
 drivers/net/starfire.c            |  3 +--
 drivers/net/via-rhine.c           |  7 ++-----
 drivers/net/wireless/ray_cs.c     |  3 +--
 drivers/net/wireless/wavelan_cs.c |  7 ++-----
 drivers/net/yellowfin.c           | 12 +++++-------
 drivers/net/znet.c                |  3 +--
 include/linux/skbuff.h            | 11 +++++------
 net/core/skbuff.c                 | 36 ++++++++++++++++++++++++++----------
 36 files changed, 74 insertions(+), 103 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/3c527.c b/drivers/net/3c527.c
index 1b1cb0026072..157eda573925 100644
--- a/drivers/net/3c527.c
+++ b/drivers/net/3c527.c
@@ -1031,8 +1031,7 @@ static int mc32_send_packet(struct sk_buff *skb, struct net_device *dev)
 		return 1;
 	}
 
-	skb = skb_padto(skb, ETH_ZLEN);
-	if (skb == NULL) {
+	if (skb_padto(skb, ETH_ZLEN)) {
 		netif_wake_queue(dev);
 		return 0;
 	}
diff --git a/drivers/net/82596.c b/drivers/net/82596.c
index da0c878dcba8..8a9f7d61b9b1 100644
--- a/drivers/net/82596.c
+++ b/drivers/net/82596.c
@@ -1070,8 +1070,7 @@ static int i596_start_xmit(struct sk_buff *skb, struct net_device *dev)
 				skb->len, (unsigned int)skb->data));
 
 	if (skb->len < ETH_ZLEN) {
-		skb = skb_padto(skb, ETH_ZLEN);
-		if (skb == NULL)
+		if (skb_padto(skb, ETH_ZLEN))
 			return 0;
 		length = ETH_ZLEN;
 	}
diff --git a/drivers/net/a2065.c b/drivers/net/a2065.c
index 79bb56b8dcef..71165ac0257a 100644
--- a/drivers/net/a2065.c
+++ b/drivers/net/a2065.c
@@ -573,8 +573,7 @@ static int lance_start_xmit (struct sk_buff *skb, struct net_device *dev)
 	
 	if (len < ETH_ZLEN) {
 		len = ETH_ZLEN;
-		skb = skb_padto(skb, ETH_ZLEN);
-		if (skb == NULL)
+		if (skb_padto(skb, ETH_ZLEN))
 			return 0;
 	}
 
diff --git a/drivers/net/ariadne.c b/drivers/net/ariadne.c
index d1b6b1f794e2..a9bb7a4aff98 100644
--- a/drivers/net/ariadne.c
+++ b/drivers/net/ariadne.c
@@ -607,8 +607,7 @@ static int ariadne_start_xmit(struct sk_buff *skb, struct net_device *dev)
     /* FIXME: is the 79C960 new enough to do its own padding right ? */
     if (skb->len < ETH_ZLEN)
     {
-    	skb = skb_padto(skb, ETH_ZLEN);
-    	if (skb == NULL)
+    	if (skb_padto(skb, ETH_ZLEN))
     	    return 0;
     	len = ETH_ZLEN;
     }
diff --git a/drivers/net/arm/ether1.c b/drivers/net/arm/ether1.c
index 36475eb2727f..312955d07b28 100644
--- a/drivers/net/arm/ether1.c
+++ b/drivers/net/arm/ether1.c
@@ -700,8 +700,7 @@ ether1_sendpacket (struct sk_buff *skb, struct net_device *dev)
 	}
 
 	if (skb->len < ETH_ZLEN) {
-		skb = skb_padto(skb, ETH_ZLEN);
-		if (skb == NULL)
+		if (skb_padto(skb, ETH_ZLEN))
 			goto out;
 	}
 
diff --git a/drivers/net/arm/ether3.c b/drivers/net/arm/ether3.c
index f1d5b1027ff7..081074180e62 100644
--- a/drivers/net/arm/ether3.c
+++ b/drivers/net/arm/ether3.c
@@ -518,8 +518,7 @@ ether3_sendpacket(struct sk_buff *skb, struct net_device *dev)
 
 	length = (length + 1) & ~1;
 	if (length != skb->len) {
-		skb = skb_padto(skb, length);
-		if (skb == NULL)
+		if (skb_padto(skb, length))
 			goto out;
 	}
 
diff --git a/drivers/net/atarilance.c b/drivers/net/atarilance.c
index 442b2cbeb58a..91783a8008be 100644
--- a/drivers/net/atarilance.c
+++ b/drivers/net/atarilance.c
@@ -804,8 +804,7 @@ static int lance_start_xmit( struct sk_buff *skb, struct net_device *dev )
 		++len;
 		
 	if (len > skb->len) {
-		skb = skb_padto(skb, len);
-		if (skb == NULL)
+		if (skb_padto(skb, len))
 			return 0;
 	}
 		
diff --git a/drivers/net/cassini.c b/drivers/net/cassini.c
index 39f36aa05aa8..565a54f1d06a 100644
--- a/drivers/net/cassini.c
+++ b/drivers/net/cassini.c
@@ -2915,8 +2915,7 @@ static int cas_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	 */
 	static int ring; 
 
-	skb = skb_padto(skb, cp->min_frame_size);
-	if (!skb)
+	if (skb_padto(skb, cp->min_frame_size))
 		return 0;
 
 	/* XXX: we need some higher-level QoS hooks to steer packets to
diff --git a/drivers/net/declance.c b/drivers/net/declance.c
index f130bdab3fd3..d3d958e7ac56 100644
--- a/drivers/net/declance.c
+++ b/drivers/net/declance.c
@@ -885,8 +885,7 @@ static int lance_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	len = skblen;
 	
 	if (len < ETH_ZLEN) {
-		skb = skb_padto(skb, ETH_ZLEN);
-		if (skb == NULL)
+		if (skb_padto(skb, ETH_ZLEN))
 			return 0;
 		len = ETH_ZLEN;
 	}
diff --git a/drivers/net/depca.c b/drivers/net/depca.c
index 0941d40f046f..e946c43d3b10 100644
--- a/drivers/net/depca.c
+++ b/drivers/net/depca.c
@@ -938,11 +938,8 @@ static int depca_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	if (skb->len < 1)
 		goto out;
 
-	if (skb->len < ETH_ZLEN) {
-		skb = skb_padto(skb, ETH_ZLEN);
-		if (skb == NULL)
-			goto out;
-	}
+	if (skb_padto(skb, ETH_ZLEN))
+		goto out;
 	
 	netif_stop_queue(dev);
 
diff --git a/drivers/net/eepro.c b/drivers/net/eepro.c
index a806dfe54d23..e70f172699db 100644
--- a/drivers/net/eepro.c
+++ b/drivers/net/eepro.c
@@ -1154,8 +1154,7 @@ static int eepro_send_packet(struct sk_buff *skb, struct net_device *dev)
 		printk(KERN_DEBUG  "%s: entering eepro_send_packet routine.\n", dev->name);
 
 	if (length < ETH_ZLEN) {
-		skb = skb_padto(skb, ETH_ZLEN);
-		if (skb == NULL)
+		if (skb_padto(skb, ETH_ZLEN))
 			return 0;
 		length = ETH_ZLEN;
 	}
diff --git a/drivers/net/eexpress.c b/drivers/net/eexpress.c
index 82bd356e4f3a..a74b20715755 100644
--- a/drivers/net/eexpress.c
+++ b/drivers/net/eexpress.c
@@ -677,8 +677,7 @@ static int eexp_xmit(struct sk_buff *buf, struct net_device *dev)
 #endif
 
 	if (buf->len < ETH_ZLEN) {
-		buf = skb_padto(buf, ETH_ZLEN);
-		if (buf == NULL)
+		if (skb_padto(buf, ETH_ZLEN))
 			return 0;
 		length = ETH_ZLEN;
 	}
diff --git a/drivers/net/epic100.c b/drivers/net/epic100.c
index 8d680ce600d7..724d7dc35fa3 100644
--- a/drivers/net/epic100.c
+++ b/drivers/net/epic100.c
@@ -1027,11 +1027,8 @@ static int epic_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	u32 ctrl_word;
 	unsigned long flags;
 
-	if (skb->len < ETH_ZLEN) {
-		skb = skb_padto(skb, ETH_ZLEN);
-		if (skb == NULL)
-			return 0;
-	}
+	if (skb_padto(skb, ETH_ZLEN))
+		return 0;
 
 	/* Caution: the write order is important here, set the field with the
 	   "ownership" bit last. */
diff --git a/drivers/net/eth16i.c b/drivers/net/eth16i.c
index b67545be2caa..4bf76f86d8e9 100644
--- a/drivers/net/eth16i.c
+++ b/drivers/net/eth16i.c
@@ -1064,8 +1064,7 @@ static int eth16i_tx(struct sk_buff *skb, struct net_device *dev)
 	unsigned long flags;
 
 	if (length < ETH_ZLEN) {
-		skb = skb_padto(skb, ETH_ZLEN);
-		if (skb == NULL)
+		if (skb_padto(skb, ETH_ZLEN))
 			return 0;
 		length = ETH_ZLEN;
 	}
diff --git a/drivers/net/hp100.c b/drivers/net/hp100.c
index 247c8ca86033..dd1dc32dc98d 100644
--- a/drivers/net/hp100.c
+++ b/drivers/net/hp100.c
@@ -1487,11 +1487,8 @@ static int hp100_start_xmit_bm(struct sk_buff *skb, struct net_device *dev)
 	if (skb->len <= 0)
 		return 0;
 		
-	if (skb->len < ETH_ZLEN && lp->chip == HP100_CHIPID_SHASTA) {
-		skb = skb_padto(skb, ETH_ZLEN);
-		if (skb == NULL)
-			return 0;
-	}
+	if (lp->chip == HP100_CHIPID_SHASTA && skb_padto(skb, ETH_ZLEN))
+		return 0;
 
 	/* Get Tx ring tail pointer */
 	if (lp->txrtail->next == lp->txrhead) {
diff --git a/drivers/net/lance.c b/drivers/net/lance.c
index bb5ad479210b..c1c3452c90ca 100644
--- a/drivers/net/lance.c
+++ b/drivers/net/lance.c
@@ -968,8 +968,7 @@ static int lance_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	/* The old LANCE chips doesn't automatically pad buffers to min. size. */
 	if (chip_table[lp->chip_version].flags & LANCE_MUST_PAD) {
 		if (skb->len < ETH_ZLEN) {
-			skb = skb_padto(skb, ETH_ZLEN);
-			if (skb == NULL)
+			if (skb_padto(skb, ETH_ZLEN))
 				goto out;
 			lp->tx_ring[entry].length = -ETH_ZLEN;
 		}
diff --git a/drivers/net/lasi_82596.c b/drivers/net/lasi_82596.c
index 957888de3d7e..1ab09447baa5 100644
--- a/drivers/net/lasi_82596.c
+++ b/drivers/net/lasi_82596.c
@@ -1083,8 +1083,7 @@ static int i596_start_xmit(struct sk_buff *skb, struct net_device *dev)
 				skb->len, skb->data));
 
 	if (length < ETH_ZLEN) {
-		skb = skb_padto(skb, ETH_ZLEN);
-		if (skb == NULL)
+		if (skb_padto(skb, ETH_ZLEN))
 			return 0;
 		length = ETH_ZLEN;
 	}
diff --git a/drivers/net/lp486e.c b/drivers/net/lp486e.c
index 94d5ea1ce8bd..bf3f343ae715 100644
--- a/drivers/net/lp486e.c
+++ b/drivers/net/lp486e.c
@@ -877,8 +877,7 @@ static int i596_start_xmit (struct sk_buff *skb, struct net_device *dev) {
 	length = skb->len;
 	
 	if (length < ETH_ZLEN) {
-		skb = skb_padto(skb, ETH_ZLEN);
-		if (skb == NULL)
+		if (skb_padto(skb, ETH_ZLEN))
 			return 0;
 		length = ETH_ZLEN;
 	}
diff --git a/drivers/net/myri10ge/myri10ge.c b/drivers/net/myri10ge/myri10ge.c
index 5a74f63618bc..b983e1e04348 100644
--- a/drivers/net/myri10ge/myri10ge.c
+++ b/drivers/net/myri10ge/myri10ge.c
@@ -1939,8 +1939,7 @@ again:
 
 		/* pad frames to at least ETH_ZLEN bytes */
 		if (unlikely(skb->len < ETH_ZLEN)) {
-			skb = skb_padto(skb, ETH_ZLEN);
-			if (skb == NULL) {
+			if (skb_padto(skb, ETH_ZLEN)) {
 				/* The packet is gone, so we must
 				 * return 0 */
 				mgp->stats.tx_dropped += 1;
diff --git a/drivers/net/pcmcia/fmvj18x_cs.c b/drivers/net/pcmcia/fmvj18x_cs.c
index 09b11761cdfa..ea93b8f18605 100644
--- a/drivers/net/pcmcia/fmvj18x_cs.c
+++ b/drivers/net/pcmcia/fmvj18x_cs.c
@@ -831,8 +831,7 @@ static int fjn_start_xmit(struct sk_buff *skb, struct net_device *dev)
     
     if (length < ETH_ZLEN)
     {
-    	skb = skb_padto(skb, ETH_ZLEN);
-    	if (skb == NULL)
+    	if (skb_padto(skb, ETH_ZLEN))
     		return 0;
     	length = ETH_ZLEN;
     }
diff --git a/drivers/net/pcmcia/xirc2ps_cs.c b/drivers/net/pcmcia/xirc2ps_cs.c
index e80d1e3aec68..9bae77ce1314 100644
--- a/drivers/net/pcmcia/xirc2ps_cs.c
+++ b/drivers/net/pcmcia/xirc2ps_cs.c
@@ -1374,8 +1374,7 @@ do_start_xmit(struct sk_buff *skb, struct net_device *dev)
      */
     if (pktlen < ETH_ZLEN)
     {
-        skb = skb_padto(skb, ETH_ZLEN);
-        if (skb == NULL)
+        if (skb_padto(skb, ETH_ZLEN))
         	return 0;
 	pktlen = ETH_ZLEN;
     }
diff --git a/drivers/net/r8169.c b/drivers/net/r8169.c
index 9945cc6b8d90..985afe0e6273 100644
--- a/drivers/net/r8169.c
+++ b/drivers/net/r8169.c
@@ -2222,8 +2222,7 @@ static int rtl8169_start_xmit(struct sk_buff *skb, struct net_device *dev)
 		len = skb->len;
 
 		if (unlikely(len < ETH_ZLEN)) {
-			skb = skb_padto(skb, ETH_ZLEN);
-			if (!skb)
+			if (skb_padto(skb, ETH_ZLEN))
 				goto err_update_stats;
 			len = ETH_ZLEN;
 		}
diff --git a/drivers/net/seeq8005.c b/drivers/net/seeq8005.c
index bcef03feb2fc..efd0f235020f 100644
--- a/drivers/net/seeq8005.c
+++ b/drivers/net/seeq8005.c
@@ -396,8 +396,7 @@ static int seeq8005_send_packet(struct sk_buff *skb, struct net_device *dev)
 	unsigned char *buf;
 
 	if (length < ETH_ZLEN) {
-		skb = skb_padto(skb, ETH_ZLEN);
-		if (skb == NULL)
+		if (skb_padto(skb, ETH_ZLEN))
 			return 0;
 		length = ETH_ZLEN;
 	}
diff --git a/drivers/net/sis190.c b/drivers/net/sis190.c
index 31dd3f036fa8..df39f3447655 100644
--- a/drivers/net/sis190.c
+++ b/drivers/net/sis190.c
@@ -1156,8 +1156,7 @@ static int sis190_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	dma_addr_t mapping;
 
 	if (unlikely(skb->len < ETH_ZLEN)) {
-		skb = skb_padto(skb, ETH_ZLEN);
-		if (!skb) {
+		if (skb_padto(skb, ETH_ZLEN)) {
 			tp->stats.tx_dropped++;
 			goto out;
 		}
diff --git a/drivers/net/sk98lin/skge.c b/drivers/net/sk98lin/skge.c
index 38a26df4095f..f3efbd177ae7 100644
--- a/drivers/net/sk98lin/skge.c
+++ b/drivers/net/sk98lin/skge.c
@@ -1525,7 +1525,7 @@ struct sk_buff	*pMessage)	/* pointer to send-message              */
 	** This is to resolve faulty padding by the HW with 0xaa bytes.
 	*/
 	if (BytesSend < C_LEN_ETHERNET_MINSIZE) {
-		if ((pMessage = skb_padto(pMessage, C_LEN_ETHERNET_MINSIZE)) == NULL) {
+		if (skb_padto(pMessage, C_LEN_ETHERNET_MINSIZE)) {
 			spin_unlock_irqrestore(&pTxPort->TxDesRingLock, Flags);
 			return 0;
 		}
diff --git a/drivers/net/skge.c b/drivers/net/skge.c
index 536dd1cf7f79..19a4a16055dc 100644
--- a/drivers/net/skge.c
+++ b/drivers/net/skge.c
@@ -2310,8 +2310,7 @@ static int skge_xmit_frame(struct sk_buff *skb, struct net_device *dev)
 	u64 map;
 	unsigned long flags;
 
-	skb = skb_padto(skb, ETH_ZLEN);
-	if (!skb)
+	if (skb_padto(skb, ETH_ZLEN))
 		return NETDEV_TX_OK;
 
 	if (!spin_trylock_irqsave(&skge->tx_lock, flags))
diff --git a/drivers/net/smc9194.c b/drivers/net/smc9194.c
index 6cf16f322ad5..8b0321f1976c 100644
--- a/drivers/net/smc9194.c
+++ b/drivers/net/smc9194.c
@@ -523,8 +523,7 @@ static int smc_wait_to_send_packet( struct sk_buff * skb, struct net_device * de
 	length = skb->len;
 
 	if (length < ETH_ZLEN) {
-		skb = skb_padto(skb, ETH_ZLEN);
-		if (skb == NULL) {
+		if (skb_padto(skb, ETH_ZLEN)) {
 			netif_wake_queue(dev);
 			return 0;
 		}
diff --git a/drivers/net/sonic.c b/drivers/net/sonic.c
index 90b818a8de6e..cab0dd958492 100644
--- a/drivers/net/sonic.c
+++ b/drivers/net/sonic.c
@@ -231,8 +231,7 @@ static int sonic_send_packet(struct sk_buff *skb, struct net_device *dev)
 
 	length = skb->len;
 	if (length < ETH_ZLEN) {
-		skb = skb_padto(skb, ETH_ZLEN);
-		if (skb == NULL)
+		if (skb_padto(skb, ETH_ZLEN))
 			return 0;
 		length = ETH_ZLEN;
 	}
diff --git a/drivers/net/starfire.c b/drivers/net/starfire.c
index 9b7805be21da..c158eedc7813 100644
--- a/drivers/net/starfire.c
+++ b/drivers/net/starfire.c
@@ -1349,8 +1349,7 @@ static int start_tx(struct sk_buff *skb, struct net_device *dev)
 
 #if defined(ZEROCOPY) && defined(HAS_BROKEN_FIRMWARE)
 	if (skb->ip_summed == CHECKSUM_HW) {
-		skb = skb_padto(skb, (skb->len + PADDING_MASK) & ~PADDING_MASK);
-		if (skb == NULL)
+		if (skb_padto(skb, (skb->len + PADDING_MASK) & ~PADDING_MASK))
 			return NETDEV_TX_OK;
 	}
 #endif /* ZEROCOPY && HAS_BROKEN_FIRMWARE */
diff --git a/drivers/net/via-rhine.c b/drivers/net/via-rhine.c
index fdc21037f6dc..c80a4f1d5f7a 100644
--- a/drivers/net/via-rhine.c
+++ b/drivers/net/via-rhine.c
@@ -1284,11 +1284,8 @@ static int rhine_start_tx(struct sk_buff *skb, struct net_device *dev)
 	/* Calculate the next Tx descriptor entry. */
 	entry = rp->cur_tx % TX_RING_SIZE;
 
-	if (skb->len < ETH_ZLEN) {
-		skb = skb_padto(skb, ETH_ZLEN);
-		if (skb == NULL)
-			return 0;
-	}
+	if (skb_padto(skb, ETH_ZLEN))
+		return 0;
 
 	rp->tx_skbuff[entry] = skb;
 
diff --git a/drivers/net/wireless/ray_cs.c b/drivers/net/wireless/ray_cs.c
index 879eb427607c..a915fe6c6aa5 100644
--- a/drivers/net/wireless/ray_cs.c
+++ b/drivers/net/wireless/ray_cs.c
@@ -924,8 +924,7 @@ static int ray_dev_start_xmit(struct sk_buff *skb, struct net_device *dev)
 
     if (length < ETH_ZLEN)
     {
-    	skb = skb_padto(skb, ETH_ZLEN);
-    	if (skb == NULL)
+    	if (skb_padto(skb, ETH_ZLEN))
     		return 0;
     	length = ETH_ZLEN;
     }
diff --git a/drivers/net/wireless/wavelan_cs.c b/drivers/net/wireless/wavelan_cs.c
index f7724eb2fa7e..561250f73fd3 100644
--- a/drivers/net/wireless/wavelan_cs.c
+++ b/drivers/net/wireless/wavelan_cs.c
@@ -3194,11 +3194,8 @@ wavelan_packet_xmit(struct sk_buff *	skb,
 	 * and we don't have the Ethernet specific requirement of beeing
 	 * able to detect collisions, therefore in theory we don't really
 	 * need to pad. Jean II */
-	if (skb->len < ETH_ZLEN) {
-		skb = skb_padto(skb, ETH_ZLEN);
-		if (skb == NULL)
-			return 0;
-	}
+	if (skb_padto(skb, ETH_ZLEN))
+		return 0;
 
   wv_packet_write(dev, skb->data, skb->len);
 
diff --git a/drivers/net/yellowfin.c b/drivers/net/yellowfin.c
index fd0f43b7db5b..ecec8e5db786 100644
--- a/drivers/net/yellowfin.c
+++ b/drivers/net/yellowfin.c
@@ -862,13 +862,11 @@ static int yellowfin_start_xmit(struct sk_buff *skb, struct net_device *dev)
 		/* Fix GX chipset errata. */
 		if (cacheline_end > 24  || cacheline_end == 0) {
 			len = skb->len + 32 - cacheline_end + 1;
-			if (len != skb->len)
-				skb = skb_padto(skb, len);
-		}
-		if (skb == NULL) {
-			yp->tx_skbuff[entry] = NULL;
-			netif_wake_queue(dev);
-			return 0;
+			if (skb_padto(skb, len)) {
+				yp->tx_skbuff[entry] = NULL;
+				netif_wake_queue(dev);
+				return 0;
+			}
 		}
 	}
 	yp->tx_skbuff[entry] = skb;
diff --git a/drivers/net/znet.c b/drivers/net/znet.c
index 3ac047bc727d..a7c089df66e6 100644
--- a/drivers/net/znet.c
+++ b/drivers/net/znet.c
@@ -544,8 +544,7 @@ static int znet_send_packet(struct sk_buff *skb, struct net_device *dev)
 		printk(KERN_DEBUG "%s: ZNet_send_packet.\n", dev->name);
 
 	if (length < ETH_ZLEN) {
-		skb = skb_padto(skb, ETH_ZLEN);
-		if (skb == NULL)
+		if (skb_padto(skb, ETH_ZLEN))
 			return 0;
 		length = ETH_ZLEN;
 	}
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 66f8819f9568..f8c7eb79a27f 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -345,7 +345,7 @@ extern struct sk_buff *skb_realloc_headroom(struct sk_buff *skb,
 extern struct sk_buff *skb_copy_expand(const struct sk_buff *skb,
 				       int newheadroom, int newtailroom,
 				       gfp_t priority);
-extern struct sk_buff *		skb_pad(struct sk_buff *skb, int pad);
+extern int	       skb_pad(struct sk_buff *skb, int pad);
 #define dev_kfree_skb(a)	kfree_skb(a)
 extern void	      skb_over_panic(struct sk_buff *skb, int len,
 				     void *here);
@@ -1122,16 +1122,15 @@ static inline int skb_cow(struct sk_buff *skb, unsigned int headroom)
  *
  *	Pads up a buffer to ensure the trailing bytes exist and are
  *	blanked. If the buffer already contains sufficient data it
- *	is untouched. Returns the buffer, which may be a replacement
- *	for the original, or NULL for out of memory - in which case
- *	the original buffer is still freed.
+ *	is untouched. Otherwise it is extended. Returns zero on
+ *	success. The skb is freed on error.
  */
  
-static inline struct sk_buff *skb_padto(struct sk_buff *skb, unsigned int len)
+static inline int skb_padto(struct sk_buff *skb, unsigned int len)
 {
 	unsigned int size = skb->len;
 	if (likely(size >= len))
-		return skb;
+		return 0;
 	return skb_pad(skb, len-size);
 }
 
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index bb7210f4005e..fe63d4efbd4d 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -781,24 +781,40 @@ struct sk_buff *skb_copy_expand(const struct sk_buff *skb,
  *	filled. Used by network drivers which may DMA or transfer data
  *	beyond the buffer end onto the wire.
  *
- *	May return NULL in out of memory cases.
+ *	May return error in out of memory cases. The skb is freed on error.
  */
  
-struct sk_buff *skb_pad(struct sk_buff *skb, int pad)
+int skb_pad(struct sk_buff *skb, int pad)
 {
-	struct sk_buff *nskb;
+	int err;
+	int ntail;
 	
 	/* If the skbuff is non linear tailroom is always zero.. */
-	if (skb_tailroom(skb) >= pad) {
+	if (!skb_cloned(skb) && skb_tailroom(skb) >= pad) {
 		memset(skb->data+skb->len, 0, pad);
-		return skb;
+		return 0;
 	}
-	
-	nskb = skb_copy_expand(skb, skb_headroom(skb), skb_tailroom(skb) + pad, GFP_ATOMIC);
+
+	ntail = skb->data_len + pad - (skb->end - skb->tail);
+	if (likely(skb_cloned(skb) || ntail > 0)) {
+		err = pskb_expand_head(skb, 0, ntail, GFP_ATOMIC);
+		if (unlikely(err))
+			goto free_skb;
+	}
+
+	/* FIXME: The use of this function with non-linear skb's really needs
+	 * to be audited.
+	 */
+	err = skb_linearize(skb);
+	if (unlikely(err))
+		goto free_skb;
+
+	memset(skb->data + skb->len, 0, pad);
+	return 0;
+
+free_skb:
 	kfree_skb(skb);
-	if (nskb)
-		memset(nskb->data+nskb->len, 0, pad);
-	return nskb;
+	return err;
 }	
  
 /* Trims skb to length len. It can change skb pointers.
-- 
cgit v1.2.3


From 7967168cefdbc63bf332d6b1548eca7cd65ebbcc Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Thu, 22 Jun 2006 02:40:14 -0700
Subject: [NET]: Merge TSO/UFO fields in sk_buff

Having separate fields in sk_buff for TSO/UFO (tso_size/ufo_size) is not
going to scale if we add any more segmentation methods (e.g., DCCP).  So
let's merge them.

They were used to tell the protocol of a packet.  This function has been
subsumed by the new gso_type field.  This is essentially a set of netdev
feature bits (shifted by 16 bits) that are required to process a specific
skb.  As such it's easy to tell whether a given device can process a GSO
skb: you just have to and the gso_type field and the netdev's features
field.

I've made gso_type a conjunction.  The idea is that you have a base type
(e.g., SKB_GSO_TCPV4) that can be modified further to support new features.
For example, if we add a hardware TSO type that supports ECN, they would
declare NETIF_F_TSO | NETIF_F_TSO_ECN.  All TSO packets with CWR set would
have a gso_type of SKB_GSO_TCPV4 | SKB_GSO_TCPV4_ECN while all other TSO
packets would be SKB_GSO_TCPV4.  This means that only the CWR packets need
to be emulated in software.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/8139cp.c            |  2 +-
 drivers/net/bnx2.c              |  4 ++--
 drivers/net/chelsio/sge.c       |  4 ++--
 drivers/net/e1000/e1000_main.c  | 10 ++++-----
 drivers/net/forcedeth.c         |  4 ++--
 drivers/net/ixgb/ixgb_main.c    |  4 ++--
 drivers/net/loopback.c          |  4 ++--
 drivers/net/myri10ge/myri10ge.c |  4 ++--
 drivers/net/r8169.c             |  2 +-
 drivers/net/s2io.c              | 16 +++++++-------
 drivers/net/sky2.c              |  4 ++--
 drivers/net/tg3.c               |  4 ++--
 drivers/net/typhoon.c           |  2 +-
 drivers/s390/net/qeth_eddp.c    | 12 +++++------
 drivers/s390/net/qeth_main.c    |  4 ++--
 drivers/s390/net/qeth_tso.h     |  2 +-
 include/linux/netdevice.h       | 14 ++++++++++--
 include/linux/skbuff.h          | 12 ++++++++---
 include/net/tcp.h               |  4 ++--
 net/bridge/br_forward.c         |  4 ++--
 net/bridge/br_netfilter.c       |  2 +-
 net/core/skbuff.c               | 16 ++++++++------
 net/ipv4/ip_output.c            | 16 ++++++++------
 net/ipv4/tcp.c                  |  4 ++--
 net/ipv4/tcp_input.c            |  2 +-
 net/ipv4/tcp_output.c           | 47 ++++++++++++++++++++++++-----------------
 net/ipv6/ip6_output.c           |  7 +++---
 27 files changed, 120 insertions(+), 90 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/8139cp.c b/drivers/net/8139cp.c
index a26077a175ad..0cdc830449d8 100644
--- a/drivers/net/8139cp.c
+++ b/drivers/net/8139cp.c
@@ -797,7 +797,7 @@ static int cp_start_xmit (struct sk_buff *skb, struct net_device *dev)
 	entry = cp->tx_head;
 	eor = (entry == (CP_TX_RING_SIZE - 1)) ? RingEnd : 0;
 	if (dev->features & NETIF_F_TSO)
-		mss = skb_shinfo(skb)->tso_size;
+		mss = skb_shinfo(skb)->gso_size;
 
 	if (skb_shinfo(skb)->nr_frags == 0) {
 		struct cp_desc *txd = &cp->tx_ring[entry];
diff --git a/drivers/net/bnx2.c b/drivers/net/bnx2.c
index 702d546567ad..7635736cc791 100644
--- a/drivers/net/bnx2.c
+++ b/drivers/net/bnx2.c
@@ -1640,7 +1640,7 @@ bnx2_tx_int(struct bnx2 *bp)
 		skb = tx_buf->skb;
 #ifdef BCM_TSO 
 		/* partial BD completions possible with TSO packets */
-		if (skb_shinfo(skb)->tso_size) {
+		if (skb_shinfo(skb)->gso_size) {
 			u16 last_idx, last_ring_idx;
 
 			last_idx = sw_cons +
@@ -4428,7 +4428,7 @@ bnx2_start_xmit(struct sk_buff *skb, struct net_device *dev)
 			(TX_BD_FLAGS_VLAN_TAG | (vlan_tx_tag_get(skb) << 16));
 	}
 #ifdef BCM_TSO 
-	if ((mss = skb_shinfo(skb)->tso_size) &&
+	if ((mss = skb_shinfo(skb)->gso_size) &&
 		(skb->len > (bp->dev->mtu + ETH_HLEN))) {
 		u32 tcp_opt_len, ip_tcp_len;
 
diff --git a/drivers/net/chelsio/sge.c b/drivers/net/chelsio/sge.c
index 4391bf4bf573..53efff6da784 100644
--- a/drivers/net/chelsio/sge.c
+++ b/drivers/net/chelsio/sge.c
@@ -1418,7 +1418,7 @@ int t1_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	struct cpl_tx_pkt *cpl;
 
 #ifdef NETIF_F_TSO
-	if (skb_shinfo(skb)->tso_size) {
+	if (skb_shinfo(skb)->gso_size) {
 		int eth_type;
 		struct cpl_tx_pkt_lso *hdr;
 
@@ -1433,7 +1433,7 @@ int t1_start_xmit(struct sk_buff *skb, struct net_device *dev)
 		hdr->ip_hdr_words = skb->nh.iph->ihl;
 		hdr->tcp_hdr_words = skb->h.th->doff;
 		hdr->eth_type_mss = htons(MK_ETH_TYPE_MSS(eth_type,
-						skb_shinfo(skb)->tso_size));
+						skb_shinfo(skb)->gso_size));
 		hdr->len = htonl(skb->len - sizeof(*hdr));
 		cpl = (struct cpl_tx_pkt *)hdr;
 		sge->stats.tx_lso_pkts++;
diff --git a/drivers/net/e1000/e1000_main.c b/drivers/net/e1000/e1000_main.c
index a373ccb308d8..32b7d444b374 100644
--- a/drivers/net/e1000/e1000_main.c
+++ b/drivers/net/e1000/e1000_main.c
@@ -2394,7 +2394,7 @@ e1000_tso(struct e1000_adapter *adapter, struct e1000_tx_ring *tx_ring,
 	uint8_t ipcss, ipcso, tucss, tucso, hdr_len;
 	int err;
 
-	if (skb_shinfo(skb)->tso_size) {
+	if (skb_shinfo(skb)->gso_size) {
 		if (skb_header_cloned(skb)) {
 			err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
 			if (err)
@@ -2402,7 +2402,7 @@ e1000_tso(struct e1000_adapter *adapter, struct e1000_tx_ring *tx_ring,
 		}
 
 		hdr_len = ((skb->h.raw - skb->data) + (skb->h.th->doff << 2));
-		mss = skb_shinfo(skb)->tso_size;
+		mss = skb_shinfo(skb)->gso_size;
 		if (skb->protocol == htons(ETH_P_IP)) {
 			skb->nh.iph->tot_len = 0;
 			skb->nh.iph->check = 0;
@@ -2519,7 +2519,7 @@ e1000_tx_map(struct e1000_adapter *adapter, struct e1000_tx_ring *tx_ring,
 		 * tso gets written back prematurely before the data is fully
 		 * DMA'd to the controller */
 		if (!skb->data_len && tx_ring->last_tx_tso &&
-		    !skb_shinfo(skb)->tso_size) {
+		    !skb_shinfo(skb)->gso_size) {
 			tx_ring->last_tx_tso = 0;
 			size -= 4;
 		}
@@ -2757,7 +2757,7 @@ e1000_xmit_frame(struct sk_buff *skb, struct net_device *netdev)
 	}
 
 #ifdef NETIF_F_TSO
-	mss = skb_shinfo(skb)->tso_size;
+	mss = skb_shinfo(skb)->gso_size;
 	/* The controller does a simple calculation to
 	 * make sure there is enough room in the FIFO before
 	 * initiating the DMA for each buffer.  The calc is:
@@ -2807,7 +2807,7 @@ e1000_xmit_frame(struct sk_buff *skb, struct net_device *netdev)
 #ifdef NETIF_F_TSO
 	/* Controller Erratum workaround */
 	if (!skb->data_len && tx_ring->last_tx_tso &&
-	    !skb_shinfo(skb)->tso_size)
+	    !skb_shinfo(skb)->gso_size)
 		count++;
 #endif
 
diff --git a/drivers/net/forcedeth.c b/drivers/net/forcedeth.c
index 191383d461d7..21be4fa071b5 100644
--- a/drivers/net/forcedeth.c
+++ b/drivers/net/forcedeth.c
@@ -1495,8 +1495,8 @@ static int nv_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	np->tx_skbuff[nr] = skb;
 
 #ifdef NETIF_F_TSO
-	if (skb_shinfo(skb)->tso_size)
-		tx_flags_extra = NV_TX2_TSO | (skb_shinfo(skb)->tso_size << NV_TX2_TSO_SHIFT);
+	if (skb_shinfo(skb)->gso_size)
+		tx_flags_extra = NV_TX2_TSO | (skb_shinfo(skb)->gso_size << NV_TX2_TSO_SHIFT);
 	else
 #endif
 	tx_flags_extra = (skb->ip_summed == CHECKSUM_HW ? (NV_TX2_CHECKSUM_L3|NV_TX2_CHECKSUM_L4) : 0);
diff --git a/drivers/net/ixgb/ixgb_main.c b/drivers/net/ixgb/ixgb_main.c
index 57006fb8840e..8bb32f946993 100644
--- a/drivers/net/ixgb/ixgb_main.c
+++ b/drivers/net/ixgb/ixgb_main.c
@@ -1173,7 +1173,7 @@ ixgb_tso(struct ixgb_adapter *adapter, struct sk_buff *skb)
 	uint16_t ipcse, tucse, mss;
 	int err;
 
-	if(likely(skb_shinfo(skb)->tso_size)) {
+	if(likely(skb_shinfo(skb)->gso_size)) {
 		if (skb_header_cloned(skb)) {
 			err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
 			if (err)
@@ -1181,7 +1181,7 @@ ixgb_tso(struct ixgb_adapter *adapter, struct sk_buff *skb)
 		}
 
 		hdr_len = ((skb->h.raw - skb->data) + (skb->h.th->doff << 2));
-		mss = skb_shinfo(skb)->tso_size;
+		mss = skb_shinfo(skb)->gso_size;
 		skb->nh.iph->tot_len = 0;
 		skb->nh.iph->check = 0;
 		skb->h.th->check = ~csum_tcpudp_magic(skb->nh.iph->saddr,
diff --git a/drivers/net/loopback.c b/drivers/net/loopback.c
index b79d6e8d3045..43fef7de8cb9 100644
--- a/drivers/net/loopback.c
+++ b/drivers/net/loopback.c
@@ -74,7 +74,7 @@ static void emulate_large_send_offload(struct sk_buff *skb)
 	struct iphdr *iph = skb->nh.iph;
 	struct tcphdr *th = (struct tcphdr*)(skb->nh.raw + (iph->ihl * 4));
 	unsigned int doffset = (iph->ihl + th->doff) * 4;
-	unsigned int mtu = skb_shinfo(skb)->tso_size + doffset;
+	unsigned int mtu = skb_shinfo(skb)->gso_size + doffset;
 	unsigned int offset = 0;
 	u32 seq = ntohl(th->seq);
 	u16 id  = ntohs(iph->id);
@@ -139,7 +139,7 @@ static int loopback_xmit(struct sk_buff *skb, struct net_device *dev)
 #endif
 
 #ifdef LOOPBACK_TSO
-	if (skb_shinfo(skb)->tso_size) {
+	if (skb_shinfo(skb)->gso_size) {
 		BUG_ON(skb->protocol != htons(ETH_P_IP));
 		BUG_ON(skb->nh.iph->protocol != IPPROTO_TCP);
 
diff --git a/drivers/net/myri10ge/myri10ge.c b/drivers/net/myri10ge/myri10ge.c
index b983e1e04348..dbdf189436fa 100644
--- a/drivers/net/myri10ge/myri10ge.c
+++ b/drivers/net/myri10ge/myri10ge.c
@@ -1879,7 +1879,7 @@ again:
 
 #ifdef NETIF_F_TSO
 	if (skb->len > (dev->mtu + ETH_HLEN)) {
-		mss = skb_shinfo(skb)->tso_size;
+		mss = skb_shinfo(skb)->gso_size;
 		if (mss != 0)
 			max_segments = MYRI10GE_MAX_SEND_DESC_TSO;
 	}
@@ -2112,7 +2112,7 @@ abort_linearize:
 		}
 		idx = (idx + 1) & tx->mask;
 	} while (idx != last_idx);
-	if (skb_shinfo(skb)->tso_size) {
+	if (skb_shinfo(skb)->gso_size) {
 		printk(KERN_ERR
 		       "myri10ge: %s: TSO but wanted to linearize?!?!?\n",
 		       mgp->dev->name);
diff --git a/drivers/net/r8169.c b/drivers/net/r8169.c
index 985afe0e6273..12d1cb289bb0 100644
--- a/drivers/net/r8169.c
+++ b/drivers/net/r8169.c
@@ -2172,7 +2172,7 @@ static int rtl8169_xmit_frags(struct rtl8169_private *tp, struct sk_buff *skb,
 static inline u32 rtl8169_tso_csum(struct sk_buff *skb, struct net_device *dev)
 {
 	if (dev->features & NETIF_F_TSO) {
-		u32 mss = skb_shinfo(skb)->tso_size;
+		u32 mss = skb_shinfo(skb)->gso_size;
 
 		if (mss)
 			return LargeSend | ((mss & MSSMask) << MSSShift);
diff --git a/drivers/net/s2io.c b/drivers/net/s2io.c
index 11daed495b97..3defe5d4f7d3 100644
--- a/drivers/net/s2io.c
+++ b/drivers/net/s2io.c
@@ -3959,8 +3959,8 @@ static int s2io_xmit(struct sk_buff *skb, struct net_device *dev)
 	txdp->Control_1 = 0;
 	txdp->Control_2 = 0;
 #ifdef NETIF_F_TSO
-	mss = skb_shinfo(skb)->tso_size;
-	if (mss) {
+	mss = skb_shinfo(skb)->gso_size;
+	if (skb_shinfo(skb)->gso_type == SKB_GSO_TCPV4) {
 		txdp->Control_1 |= TXD_TCP_LSO_EN;
 		txdp->Control_1 |= TXD_TCP_LSO_MSS(mss);
 	}
@@ -3980,10 +3980,10 @@ static int s2io_xmit(struct sk_buff *skb, struct net_device *dev)
 	}
 
 	frg_len = skb->len - skb->data_len;
-	if (skb_shinfo(skb)->ufo_size) {
+	if (skb_shinfo(skb)->gso_type == SKB_GSO_UDPV4) {
 		int ufo_size;
 
-		ufo_size = skb_shinfo(skb)->ufo_size;
+		ufo_size = skb_shinfo(skb)->gso_size;
 		ufo_size &= ~7;
 		txdp->Control_1 |= TXD_UFO_EN;
 		txdp->Control_1 |= TXD_UFO_MSS(ufo_size);
@@ -4009,7 +4009,7 @@ static int s2io_xmit(struct sk_buff *skb, struct net_device *dev)
 	txdp->Host_Control = (unsigned long) skb;
 	txdp->Control_1 |= TXD_BUFFER0_SIZE(frg_len);
 
-	if (skb_shinfo(skb)->ufo_size)
+	if (skb_shinfo(skb)->gso_type == SKB_GSO_UDPV4)
 		txdp->Control_1 |= TXD_UFO_EN;
 
 	frg_cnt = skb_shinfo(skb)->nr_frags;
@@ -4024,12 +4024,12 @@ static int s2io_xmit(struct sk_buff *skb, struct net_device *dev)
 		    (sp->pdev, frag->page, frag->page_offset,
 		     frag->size, PCI_DMA_TODEVICE);
 		txdp->Control_1 = TXD_BUFFER0_SIZE(frag->size);
-		if (skb_shinfo(skb)->ufo_size)
+		if (skb_shinfo(skb)->gso_type == SKB_GSO_UDPV4)
 			txdp->Control_1 |= TXD_UFO_EN;
 	}
 	txdp->Control_1 |= TXD_GATHER_CODE_LAST;
 
-	if (skb_shinfo(skb)->ufo_size)
+	if (skb_shinfo(skb)->gso_type == SKB_GSO_UDPV4)
 		frg_cnt++; /* as Txd0 was used for inband header */
 
 	tx_fifo = mac_control->tx_FIFO_start[queue];
@@ -4043,7 +4043,7 @@ static int s2io_xmit(struct sk_buff *skb, struct net_device *dev)
 	if (mss)
 		val64 |= TX_FIFO_SPECIAL_FUNC;
 #endif
-	if (skb_shinfo(skb)->ufo_size)
+	if (skb_shinfo(skb)->gso_type == SKB_GSO_UDPV4)
 		val64 |= TX_FIFO_SPECIAL_FUNC;
 	writeq(val64, &tx_fifo->List_Control);
 
diff --git a/drivers/net/sky2.c b/drivers/net/sky2.c
index fba1e4d4d83d..d3577871be28 100644
--- a/drivers/net/sky2.c
+++ b/drivers/net/sky2.c
@@ -1160,7 +1160,7 @@ static unsigned tx_le_req(const struct sk_buff *skb)
 	count = sizeof(dma_addr_t) / sizeof(u32);
 	count += skb_shinfo(skb)->nr_frags * count;
 
-	if (skb_shinfo(skb)->tso_size)
+	if (skb_shinfo(skb)->gso_size)
 		++count;
 
 	if (skb->ip_summed == CHECKSUM_HW)
@@ -1232,7 +1232,7 @@ static int sky2_xmit_frame(struct sk_buff *skb, struct net_device *dev)
 	}
 
 	/* Check for TCP Segmentation Offload */
-	mss = skb_shinfo(skb)->tso_size;
+	mss = skb_shinfo(skb)->gso_size;
 	if (mss != 0) {
 		/* just drop the packet if non-linear expansion fails */
 		if (skb_header_cloned(skb) &&
diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c
index b2ddd4522a87..e3e380f90f86 100644
--- a/drivers/net/tg3.c
+++ b/drivers/net/tg3.c
@@ -3780,7 +3780,7 @@ static int tg3_start_xmit(struct sk_buff *skb, struct net_device *dev)
 #if TG3_TSO_SUPPORT != 0
 	mss = 0;
 	if (skb->len > (tp->dev->mtu + ETH_HLEN) &&
-	    (mss = skb_shinfo(skb)->tso_size) != 0) {
+	    (mss = skb_shinfo(skb)->gso_size) != 0) {
 		int tcp_opt_len, ip_tcp_len;
 
 		if (skb_header_cloned(skb) &&
@@ -3905,7 +3905,7 @@ static int tg3_start_xmit_dma_bug(struct sk_buff *skb, struct net_device *dev)
 #if TG3_TSO_SUPPORT != 0
 	mss = 0;
 	if (skb->len > (tp->dev->mtu + ETH_HLEN) &&
-	    (mss = skb_shinfo(skb)->tso_size) != 0) {
+	    (mss = skb_shinfo(skb)->gso_size) != 0) {
 		int tcp_opt_len, ip_tcp_len;
 
 		if (skb_header_cloned(skb) &&
diff --git a/drivers/net/typhoon.c b/drivers/net/typhoon.c
index d9258d42090c..e49e8b520c28 100644
--- a/drivers/net/typhoon.c
+++ b/drivers/net/typhoon.c
@@ -340,7 +340,7 @@ enum state_values {
 #endif
 
 #if defined(NETIF_F_TSO)
-#define skb_tso_size(x)		(skb_shinfo(x)->tso_size)
+#define skb_tso_size(x)		(skb_shinfo(x)->gso_size)
 #define TSO_NUM_DESCRIPTORS	2
 #define TSO_OFFLOAD_ON		TYPHOON_OFFLOAD_TCP_SEGMENT
 #else
diff --git a/drivers/s390/net/qeth_eddp.c b/drivers/s390/net/qeth_eddp.c
index 0bab60a20309..38aad8321456 100644
--- a/drivers/s390/net/qeth_eddp.c
+++ b/drivers/s390/net/qeth_eddp.c
@@ -420,7 +420,7 @@ __qeth_eddp_fill_context_tcp(struct qeth_eddp_context *ctx,
        }
 	tcph = eddp->skb->h.th;
 	while (eddp->skb_offset < eddp->skb->len) {
-		data_len = min((int)skb_shinfo(eddp->skb)->tso_size,
+		data_len = min((int)skb_shinfo(eddp->skb)->gso_size,
 			       (int)(eddp->skb->len - eddp->skb_offset));
 		/* prepare qdio hdr */
 		if (eddp->qh.hdr.l2.id == QETH_HEADER_TYPE_LAYER2){
@@ -515,20 +515,20 @@ qeth_eddp_calc_num_pages(struct qeth_eddp_context *ctx, struct sk_buff *skb,
 
 	QETH_DBF_TEXT(trace, 5, "eddpcanp");
 	/* can we put multiple skbs in one page? */
-	skbs_per_page = PAGE_SIZE / (skb_shinfo(skb)->tso_size + hdr_len);
+	skbs_per_page = PAGE_SIZE / (skb_shinfo(skb)->gso_size + hdr_len);
 	if (skbs_per_page > 1){
-		ctx->num_pages = (skb_shinfo(skb)->tso_segs + 1) /
+		ctx->num_pages = (skb_shinfo(skb)->gso_segs + 1) /
 				 skbs_per_page + 1;
 		ctx->elements_per_skb = 1;
 	} else {
 		/* no -> how many elements per skb? */
-		ctx->elements_per_skb = (skb_shinfo(skb)->tso_size + hdr_len +
+		ctx->elements_per_skb = (skb_shinfo(skb)->gso_size + hdr_len +
 				     PAGE_SIZE) >> PAGE_SHIFT;
 		ctx->num_pages = ctx->elements_per_skb *
-				 (skb_shinfo(skb)->tso_segs + 1);
+				 (skb_shinfo(skb)->gso_segs + 1);
 	}
 	ctx->num_elements = ctx->elements_per_skb *
-			    (skb_shinfo(skb)->tso_segs + 1);
+			    (skb_shinfo(skb)->gso_segs + 1);
 }
 
 static inline struct qeth_eddp_context *
diff --git a/drivers/s390/net/qeth_main.c b/drivers/s390/net/qeth_main.c
index 9e671a48cd2f..56009d768326 100644
--- a/drivers/s390/net/qeth_main.c
+++ b/drivers/s390/net/qeth_main.c
@@ -4417,7 +4417,7 @@ qeth_send_packet(struct qeth_card *card, struct sk_buff *skb)
 	struct qeth_eddp_context *ctx = NULL;
 	int tx_bytes = skb->len;
 	unsigned short nr_frags = skb_shinfo(skb)->nr_frags;
-	unsigned short tso_size = skb_shinfo(skb)->tso_size;
+	unsigned short tso_size = skb_shinfo(skb)->gso_size;
 	int rc;
 
 	QETH_DBF_TEXT(trace, 6, "sendpkt");
@@ -4453,7 +4453,7 @@ qeth_send_packet(struct qeth_card *card, struct sk_buff *skb)
 	queue = card->qdio.out_qs
 		[qeth_get_priority_queue(card, skb, ipv, cast_type)];
 
-	if (skb_shinfo(skb)->tso_size)
+	if (skb_shinfo(skb)->gso_size)
 		large_send = card->options.large_send;
 
 	/*are we able to do TSO ? If so ,prepare and send it from here */
diff --git a/drivers/s390/net/qeth_tso.h b/drivers/s390/net/qeth_tso.h
index 24ef40ca9562..593f298142c1 100644
--- a/drivers/s390/net/qeth_tso.h
+++ b/drivers/s390/net/qeth_tso.h
@@ -51,7 +51,7 @@ qeth_tso_fill_header(struct qeth_card *card, struct sk_buff *skb)
 	hdr->ext.hdr_version = 1;
 	hdr->ext.hdr_len     = 28;
 	/*insert non-fix values */
-	hdr->ext.mss = skb_shinfo(skb)->tso_size;
+	hdr->ext.mss = skb_shinfo(skb)->gso_size;
 	hdr->ext.dg_hdr_len = (__u16)(iph->ihl*4 + tcph->doff*4);
 	hdr->ext.payload_len = (__u16)(skb->len - hdr->ext.dg_hdr_len -
 				       sizeof(struct qeth_hdr_tso));
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index cead6be467ed..fa5671307b90 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -308,9 +308,12 @@ struct net_device
 #define NETIF_F_HW_VLAN_RX	256	/* Receive VLAN hw acceleration */
 #define NETIF_F_HW_VLAN_FILTER	512	/* Receive filtering on VLAN */
 #define NETIF_F_VLAN_CHALLENGED	1024	/* Device cannot handle VLAN packets */
-#define NETIF_F_TSO		2048	/* Can offload TCP/IP segmentation */
 #define NETIF_F_LLTX		4096	/* LockLess TX */
-#define NETIF_F_UFO             8192    /* Can offload UDP Large Send*/
+
+	/* Segmentation offload features */
+#define NETIF_F_GSO_SHIFT	16
+#define NETIF_F_TSO		(SKB_GSO_TCPV4 << NETIF_F_GSO_SHIFT)
+#define NETIF_F_UFO		(SKB_GSO_UDPV4 << NETIF_F_GSO_SHIFT)
 
 #define NETIF_F_GEN_CSUM	(NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)
 #define NETIF_F_ALL_CSUM	(NETIF_F_IP_CSUM | NETIF_F_GEN_CSUM)
@@ -979,6 +982,13 @@ extern void dev_seq_stop(struct seq_file *seq, void *v);
 
 extern void linkwatch_run_queue(void);
 
+static inline int netif_needs_gso(struct net_device *dev, struct sk_buff *skb)
+{
+	int feature = skb_shinfo(skb)->gso_type << NETIF_F_GSO_SHIFT;
+	return skb_shinfo(skb)->gso_size &&
+	       (dev->features & feature) != feature;
+}
+
 #endif /* __KERNEL__ */
 
 #endif	/* _LINUX_DEV_H */
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index f8c7eb79a27f..97b0d2d1a6b0 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -134,9 +134,10 @@ struct skb_frag_struct {
 struct skb_shared_info {
 	atomic_t	dataref;
 	unsigned short	nr_frags;
-	unsigned short	tso_size;
-	unsigned short	tso_segs;
-	unsigned short  ufo_size;
+	unsigned short	gso_size;
+	/* Warning: this field is not always filled in (UFO)! */
+	unsigned short	gso_segs;
+	unsigned short  gso_type;
 	unsigned int    ip6_frag_id;
 	struct sk_buff	*frag_list;
 	skb_frag_t	frags[MAX_SKB_FRAGS];
@@ -168,6 +169,11 @@ enum {
 	SKB_FCLONE_CLONE,
 };
 
+enum {
+	SKB_GSO_TCPV4 = 1 << 0,
+	SKB_GSO_UDPV4 = 1 << 1,
+};
+
 /** 
  *	struct sk_buff - socket buffer
  *	@next: Next buffer in list
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 5f4eb5c79689..b197a9e615c1 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -569,13 +569,13 @@ struct tcp_skb_cb {
  */
 static inline int tcp_skb_pcount(const struct sk_buff *skb)
 {
-	return skb_shinfo(skb)->tso_segs;
+	return skb_shinfo(skb)->gso_segs;
 }
 
 /* This is valid iff tcp_skb_pcount() > 1. */
 static inline int tcp_skb_mss(const struct sk_buff *skb)
 {
-	return skb_shinfo(skb)->tso_size;
+	return skb_shinfo(skb)->gso_size;
 }
 
 static inline void tcp_dec_pcount_approx(__u32 *count,
diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c
index 0dca027ceb80..8be9f2123e54 100644
--- a/net/bridge/br_forward.c
+++ b/net/bridge/br_forward.c
@@ -34,8 +34,8 @@ static inline unsigned packet_length(const struct sk_buff *skb)
 
 int br_dev_queue_push_xmit(struct sk_buff *skb)
 {
-	/* drop mtu oversized packets except tso */
-	if (packet_length(skb) > skb->dev->mtu && !skb_shinfo(skb)->tso_size)
+	/* drop mtu oversized packets except gso */
+	if (packet_length(skb) > skb->dev->mtu && !skb_shinfo(skb)->gso_size)
 		kfree_skb(skb);
 	else {
 #ifdef CONFIG_BRIDGE_NETFILTER
diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c
index 3e41f9d6d51c..8298a5179aef 100644
--- a/net/bridge/br_netfilter.c
+++ b/net/bridge/br_netfilter.c
@@ -761,7 +761,7 @@ static int br_nf_dev_queue_xmit(struct sk_buff *skb)
 {
 	if (skb->protocol == htons(ETH_P_IP) &&
 	    skb->len > skb->dev->mtu &&
-	    !(skb_shinfo(skb)->ufo_size || skb_shinfo(skb)->tso_size))
+	    !skb_shinfo(skb)->gso_size)
 		return ip_fragment(skb, br_dev_queue_push_xmit);
 	else
 		return br_dev_queue_push_xmit(skb);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index fe63d4efbd4d..368d98578c14 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -172,9 +172,9 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
 	shinfo = skb_shinfo(skb);
 	atomic_set(&shinfo->dataref, 1);
 	shinfo->nr_frags  = 0;
-	shinfo->tso_size = 0;
-	shinfo->tso_segs = 0;
-	shinfo->ufo_size = 0;
+	shinfo->gso_size = 0;
+	shinfo->gso_segs = 0;
+	shinfo->gso_type = 0;
 	shinfo->ip6_frag_id = 0;
 	shinfo->frag_list = NULL;
 
@@ -238,8 +238,9 @@ struct sk_buff *alloc_skb_from_cache(kmem_cache_t *cp,
 
 	atomic_set(&(skb_shinfo(skb)->dataref), 1);
 	skb_shinfo(skb)->nr_frags  = 0;
-	skb_shinfo(skb)->tso_size = 0;
-	skb_shinfo(skb)->tso_segs = 0;
+	skb_shinfo(skb)->gso_size = 0;
+	skb_shinfo(skb)->gso_segs = 0;
+	skb_shinfo(skb)->gso_type = 0;
 	skb_shinfo(skb)->frag_list = NULL;
 out:
 	return skb;
@@ -528,8 +529,9 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
 #endif
 	skb_copy_secmark(new, old);
 	atomic_set(&new->users, 1);
-	skb_shinfo(new)->tso_size = skb_shinfo(old)->tso_size;
-	skb_shinfo(new)->tso_segs = skb_shinfo(old)->tso_segs;
+	skb_shinfo(new)->gso_size = skb_shinfo(old)->gso_size;
+	skb_shinfo(new)->gso_segs = skb_shinfo(old)->gso_segs;
+	skb_shinfo(new)->gso_type = skb_shinfo(old)->gso_type;
 }
 
 /**
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 8538aac3d148..7624fd1d8f9f 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -210,8 +210,7 @@ static inline int ip_finish_output(struct sk_buff *skb)
 		return dst_output(skb);
 	}
 #endif
-	if (skb->len > dst_mtu(skb->dst) &&
-	    !(skb_shinfo(skb)->ufo_size || skb_shinfo(skb)->tso_size))
+	if (skb->len > dst_mtu(skb->dst) && !skb_shinfo(skb)->gso_size)
 		return ip_fragment(skb, ip_finish_output2);
 	else
 		return ip_finish_output2(skb);
@@ -362,7 +361,7 @@ packet_routed:
 	}
 
 	ip_select_ident_more(iph, &rt->u.dst, sk,
-			     (skb_shinfo(skb)->tso_segs ?: 1) - 1);
+			     (skb_shinfo(skb)->gso_segs ?: 1) - 1);
 
 	/* Add an IP checksum. */
 	ip_send_check(iph);
@@ -744,7 +743,8 @@ static inline int ip_ufo_append_data(struct sock *sk,
 			       (length - transhdrlen));
 	if (!err) {
 		/* specify the length of each IP datagram fragment*/
-		skb_shinfo(skb)->ufo_size = (mtu - fragheaderlen);
+		skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
+		skb_shinfo(skb)->gso_type = SKB_GSO_UDPV4;
 		__skb_queue_tail(&sk->sk_write_queue, skb);
 
 		return 0;
@@ -1087,14 +1087,16 @@ ssize_t	ip_append_page(struct sock *sk, struct page *page,
 
 	inet->cork.length += size;
 	if ((sk->sk_protocol == IPPROTO_UDP) &&
-	    (rt->u.dst.dev->features & NETIF_F_UFO))
-		skb_shinfo(skb)->ufo_size = (mtu - fragheaderlen);
+	    (rt->u.dst.dev->features & NETIF_F_UFO)) {
+		skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
+		skb_shinfo(skb)->gso_type = SKB_GSO_UDPV4;
+	}
 
 
 	while (size > 0) {
 		int i;
 
-		if (skb_shinfo(skb)->ufo_size)
+		if (skb_shinfo(skb)->gso_size)
 			len = size;
 		else {
 
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 74998f250071..062dd1a0d8a8 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -571,7 +571,7 @@ new_segment:
 		skb->ip_summed = CHECKSUM_HW;
 		tp->write_seq += copy;
 		TCP_SKB_CB(skb)->end_seq += copy;
-		skb_shinfo(skb)->tso_segs = 0;
+		skb_shinfo(skb)->gso_segs = 0;
 
 		if (!copied)
 			TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
@@ -818,7 +818,7 @@ new_segment:
 
 			tp->write_seq += copy;
 			TCP_SKB_CB(skb)->end_seq += copy;
-			skb_shinfo(skb)->tso_segs = 0;
+			skb_shinfo(skb)->gso_segs = 0;
 
 			from += copy;
 			copied += copy;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index e08245bdda3a..94fe5b1f9dcb 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1073,7 +1073,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
 				else
 					pkt_len = (end_seq -
 						   TCP_SKB_CB(skb)->seq);
-				if (tcp_fragment(sk, skb, pkt_len, skb_shinfo(skb)->tso_size))
+				if (tcp_fragment(sk, skb, pkt_len, skb_shinfo(skb)->gso_size))
 					break;
 				pcount = tcp_skb_pcount(skb);
 			}
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 07bb5a2b375e..bdd71db8bf90 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -515,15 +515,17 @@ static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned
 		/* Avoid the costly divide in the normal
 		 * non-TSO case.
 		 */
-		skb_shinfo(skb)->tso_segs = 1;
-		skb_shinfo(skb)->tso_size = 0;
+		skb_shinfo(skb)->gso_segs = 1;
+		skb_shinfo(skb)->gso_size = 0;
+		skb_shinfo(skb)->gso_type = 0;
 	} else {
 		unsigned int factor;
 
 		factor = skb->len + (mss_now - 1);
 		factor /= mss_now;
-		skb_shinfo(skb)->tso_segs = factor;
-		skb_shinfo(skb)->tso_size = mss_now;
+		skb_shinfo(skb)->gso_segs = factor;
+		skb_shinfo(skb)->gso_size = mss_now;
+		skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
 	}
 }
 
@@ -914,7 +916,7 @@ static int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned int
 
 	if (!tso_segs ||
 	    (tso_segs > 1 &&
-	     skb_shinfo(skb)->tso_size != mss_now)) {
+	     tcp_skb_mss(skb) != mss_now)) {
 		tcp_set_skb_tso_segs(sk, skb, mss_now);
 		tso_segs = tcp_skb_pcount(skb);
 	}
@@ -1724,8 +1726,9 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
 	   tp->snd_una == (TCP_SKB_CB(skb)->end_seq - 1)) {
 		if (!pskb_trim(skb, 0)) {
 			TCP_SKB_CB(skb)->seq = TCP_SKB_CB(skb)->end_seq - 1;
-			skb_shinfo(skb)->tso_segs = 1;
-			skb_shinfo(skb)->tso_size = 0;
+			skb_shinfo(skb)->gso_segs = 1;
+			skb_shinfo(skb)->gso_size = 0;
+			skb_shinfo(skb)->gso_type = 0;
 			skb->ip_summed = CHECKSUM_NONE;
 			skb->csum = 0;
 		}
@@ -1930,8 +1933,9 @@ void tcp_send_fin(struct sock *sk)
 		skb->csum = 0;
 		TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_FIN);
 		TCP_SKB_CB(skb)->sacked = 0;
-		skb_shinfo(skb)->tso_segs = 1;
-		skb_shinfo(skb)->tso_size = 0;
+		skb_shinfo(skb)->gso_segs = 1;
+		skb_shinfo(skb)->gso_size = 0;
+		skb_shinfo(skb)->gso_type = 0;
 
 		/* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */
 		TCP_SKB_CB(skb)->seq = tp->write_seq;
@@ -1963,8 +1967,9 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority)
 	skb->csum = 0;
 	TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_RST);
 	TCP_SKB_CB(skb)->sacked = 0;
-	skb_shinfo(skb)->tso_segs = 1;
-	skb_shinfo(skb)->tso_size = 0;
+	skb_shinfo(skb)->gso_segs = 1;
+	skb_shinfo(skb)->gso_size = 0;
+	skb_shinfo(skb)->gso_type = 0;
 
 	/* Send it off. */
 	TCP_SKB_CB(skb)->seq = tcp_acceptable_seq(sk, tp);
@@ -2047,8 +2052,9 @@ struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst,
 	TCP_SKB_CB(skb)->seq = tcp_rsk(req)->snt_isn;
 	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1;
 	TCP_SKB_CB(skb)->sacked = 0;
-	skb_shinfo(skb)->tso_segs = 1;
-	skb_shinfo(skb)->tso_size = 0;
+	skb_shinfo(skb)->gso_segs = 1;
+	skb_shinfo(skb)->gso_size = 0;
+	skb_shinfo(skb)->gso_type = 0;
 	th->seq = htonl(TCP_SKB_CB(skb)->seq);
 	th->ack_seq = htonl(tcp_rsk(req)->rcv_isn + 1);
 	if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */
@@ -2152,8 +2158,9 @@ int tcp_connect(struct sock *sk)
 	TCP_SKB_CB(buff)->flags = TCPCB_FLAG_SYN;
 	TCP_ECN_send_syn(sk, tp, buff);
 	TCP_SKB_CB(buff)->sacked = 0;
-	skb_shinfo(buff)->tso_segs = 1;
-	skb_shinfo(buff)->tso_size = 0;
+	skb_shinfo(buff)->gso_segs = 1;
+	skb_shinfo(buff)->gso_size = 0;
+	skb_shinfo(buff)->gso_type = 0;
 	buff->csum = 0;
 	TCP_SKB_CB(buff)->seq = tp->write_seq++;
 	TCP_SKB_CB(buff)->end_seq = tp->write_seq;
@@ -2257,8 +2264,9 @@ void tcp_send_ack(struct sock *sk)
 		buff->csum = 0;
 		TCP_SKB_CB(buff)->flags = TCPCB_FLAG_ACK;
 		TCP_SKB_CB(buff)->sacked = 0;
-		skb_shinfo(buff)->tso_segs = 1;
-		skb_shinfo(buff)->tso_size = 0;
+		skb_shinfo(buff)->gso_segs = 1;
+		skb_shinfo(buff)->gso_size = 0;
+		skb_shinfo(buff)->gso_type = 0;
 
 		/* Send it off, this clears delayed acks for us. */
 		TCP_SKB_CB(buff)->seq = TCP_SKB_CB(buff)->end_seq = tcp_acceptable_seq(sk, tp);
@@ -2293,8 +2301,9 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent)
 	skb->csum = 0;
 	TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
 	TCP_SKB_CB(skb)->sacked = urgent;
-	skb_shinfo(skb)->tso_segs = 1;
-	skb_shinfo(skb)->tso_size = 0;
+	skb_shinfo(skb)->gso_segs = 1;
+	skb_shinfo(skb)->gso_size = 0;
+	skb_shinfo(skb)->gso_type = 0;
 
 	/* Use a previous sequence.  This should cause the other
 	 * end to send an ack.  Don't queue or clone SKB, just
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index d29620f4910e..abb94de33768 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -148,7 +148,7 @@ static int ip6_output2(struct sk_buff *skb)
 
 int ip6_output(struct sk_buff *skb)
 {
-	if ((skb->len > dst_mtu(skb->dst) && !skb_shinfo(skb)->ufo_size) ||
+	if ((skb->len > dst_mtu(skb->dst) && !skb_shinfo(skb)->gso_size) ||
 				dst_allfrag(skb->dst))
 		return ip6_fragment(skb, ip6_output2);
 	else
@@ -833,8 +833,9 @@ static inline int ip6_ufo_append_data(struct sock *sk,
 		struct frag_hdr fhdr;
 
 		/* specify the length of each IP datagram fragment*/
-		skb_shinfo(skb)->ufo_size = (mtu - fragheaderlen) - 
-						sizeof(struct frag_hdr);
+		skb_shinfo(skb)->gso_size = mtu - fragheaderlen - 
+					    sizeof(struct frag_hdr);
+		skb_shinfo(skb)->gso_type = SKB_GSO_UDPV4;
 		ipv6_select_ident(skb, &fhdr);
 		skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
 		__skb_queue_tail(&sk->sk_write_queue, skb);
-- 
cgit v1.2.3


From f6a78bfcb141f963187464bac838d46a81c3882a Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Thu, 22 Jun 2006 02:57:17 -0700
Subject: [NET]: Add generic segmentation offload

This patch adds the infrastructure for generic segmentation offload.
The idea is to tap into the potential savings of TSO without hardware
support by postponing the allocation of segmented skb's until just
before the entry point into the NIC driver.

The same structure can be used to support software IPv6 TSO, as well as
UFO and segmentation offload for other relevant protocols, e.g., DCCP.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |   8 ++-
 net/core/dev.c            | 127 ++++++++++++++++++++++++++++++++++++++++++++--
 net/sched/sch_generic.c   |  19 +++++--
 3 files changed, 143 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index fa5671307b90..b4eae18390cc 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -405,6 +405,9 @@ struct net_device
 	struct list_head	qdisc_list;
 	unsigned long		tx_queue_len;	/* Max frames per queue allowed */
 
+	/* Partially transmitted GSO packet. */
+	struct sk_buff		*gso_skb;
+
 	/* ingress path synchronizer */
 	spinlock_t		ingress_lock;
 	struct Qdisc		*qdisc_ingress;
@@ -539,6 +542,7 @@ struct packet_type {
 					 struct net_device *,
 					 struct packet_type *,
 					 struct net_device *);
+	struct sk_buff		*(*gso_segment)(struct sk_buff *skb, int sg);
 	void			*af_packet_priv;
 	struct list_head	list;
 };
@@ -689,7 +693,8 @@ extern int		dev_change_name(struct net_device *, char *);
 extern int		dev_set_mtu(struct net_device *, int);
 extern int		dev_set_mac_address(struct net_device *,
 					    struct sockaddr *);
-extern void		dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev);
+extern int		dev_hard_start_xmit(struct sk_buff *skb,
+					    struct net_device *dev);
 
 extern void		dev_init(void);
 
@@ -963,6 +968,7 @@ extern int		netdev_max_backlog;
 extern int		weight_p;
 extern int		netdev_set_master(struct net_device *dev, struct net_device *master);
 extern int skb_checksum_help(struct sk_buff *skb, int inward);
+extern struct sk_buff *skb_gso_segment(struct sk_buff *skb, int sg);
 #ifdef CONFIG_BUG
 extern void netdev_rx_csum_fault(struct net_device *dev);
 #else
diff --git a/net/core/dev.c b/net/core/dev.c
index 29e3888102bc..d293e0f90a0c 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -116,6 +116,7 @@
 #include <asm/current.h>
 #include <linux/audit.h>
 #include <linux/dmaengine.h>
+#include <linux/err.h>
 
 /*
  *	The list of packet types we will receive (as opposed to discard)
@@ -1048,7 +1049,7 @@ static inline void net_timestamp(struct sk_buff *skb)
  *	taps currently in use.
  */
 
-void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
+static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
 {
 	struct packet_type *ptype;
 
@@ -1186,6 +1187,40 @@ out:
 	return ret;
 }
 
+/**
+ *	skb_gso_segment - Perform segmentation on skb.
+ *	@skb: buffer to segment
+ *	@sg: whether scatter-gather is supported on the target.
+ *
+ *	This function segments the given skb and returns a list of segments.
+ */
+struct sk_buff *skb_gso_segment(struct sk_buff *skb, int sg)
+{
+	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
+	struct packet_type *ptype;
+	int type = skb->protocol;
+
+	BUG_ON(skb_shinfo(skb)->frag_list);
+	BUG_ON(skb->ip_summed != CHECKSUM_HW);
+
+	skb->mac.raw = skb->data;
+	skb->mac_len = skb->nh.raw - skb->data;
+	__skb_pull(skb, skb->mac_len);
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(ptype, &ptype_base[ntohs(type) & 15], list) {
+		if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
+			segs = ptype->gso_segment(skb, sg);
+			break;
+		}
+	}
+	rcu_read_unlock();
+
+	return segs;
+}
+
+EXPORT_SYMBOL(skb_gso_segment);
+
 /* Take action when hardware reception checksum errors are detected. */
 #ifdef CONFIG_BUG
 void netdev_rx_csum_fault(struct net_device *dev)
@@ -1222,6 +1257,86 @@ static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
 #define illegal_highdma(dev, skb)	(0)
 #endif
 
+struct dev_gso_cb {
+	void (*destructor)(struct sk_buff *skb);
+};
+
+#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
+
+static void dev_gso_skb_destructor(struct sk_buff *skb)
+{
+	struct dev_gso_cb *cb;
+
+	do {
+		struct sk_buff *nskb = skb->next;
+
+		skb->next = nskb->next;
+		nskb->next = NULL;
+		kfree_skb(nskb);
+	} while (skb->next);
+
+	cb = DEV_GSO_CB(skb);
+	if (cb->destructor)
+		cb->destructor(skb);
+}
+
+/**
+ *	dev_gso_segment - Perform emulated hardware segmentation on skb.
+ *	@skb: buffer to segment
+ *
+ *	This function segments the given skb and stores the list of segments
+ *	in skb->next.
+ */
+static int dev_gso_segment(struct sk_buff *skb)
+{
+	struct net_device *dev = skb->dev;
+	struct sk_buff *segs;
+
+	segs = skb_gso_segment(skb, dev->features & NETIF_F_SG &&
+				    !illegal_highdma(dev, skb));
+	if (unlikely(IS_ERR(segs)))
+		return PTR_ERR(segs);
+
+	skb->next = segs;
+	DEV_GSO_CB(skb)->destructor = skb->destructor;
+	skb->destructor = dev_gso_skb_destructor;
+
+	return 0;
+}
+
+int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	if (likely(!skb->next)) {
+		if (netdev_nit)
+			dev_queue_xmit_nit(skb, dev);
+
+		if (!netif_needs_gso(dev, skb))
+			return dev->hard_start_xmit(skb, dev);
+
+		if (unlikely(dev_gso_segment(skb)))
+			goto out_kfree_skb;
+	}
+
+	do {
+		struct sk_buff *nskb = skb->next;
+		int rc;
+
+		skb->next = nskb->next;
+		nskb->next = NULL;
+		rc = dev->hard_start_xmit(nskb, dev);
+		if (unlikely(rc)) {
+			skb->next = nskb;
+			return rc;
+		}
+	} while (skb->next);
+	
+	skb->destructor = DEV_GSO_CB(skb)->destructor;
+
+out_kfree_skb:
+	kfree_skb(skb);
+	return 0;
+}
+
 #define HARD_TX_LOCK(dev, cpu) {			\
 	if ((dev->features & NETIF_F_LLTX) == 0) {	\
 		netif_tx_lock(dev);			\
@@ -1266,6 +1381,10 @@ int dev_queue_xmit(struct sk_buff *skb)
 	struct Qdisc *q;
 	int rc = -ENOMEM;
 
+	/* GSO will handle the following emulations directly. */
+	if (netif_needs_gso(dev, skb))
+		goto gso;
+
 	if (skb_shinfo(skb)->frag_list &&
 	    !(dev->features & NETIF_F_FRAGLIST) &&
 	    __skb_linearize(skb))
@@ -1290,6 +1409,7 @@ int dev_queue_xmit(struct sk_buff *skb)
 	      	if (skb_checksum_help(skb, 0))
 	      		goto out_kfree_skb;
 
+gso:
 	spin_lock_prefetch(&dev->queue_lock);
 
 	/* Disable soft irqs for various locks below. Also 
@@ -1346,11 +1466,8 @@ int dev_queue_xmit(struct sk_buff *skb)
 			HARD_TX_LOCK(dev, cpu);
 
 			if (!netif_queue_stopped(dev)) {
-				if (netdev_nit)
-					dev_queue_xmit_nit(skb, dev);
-
 				rc = 0;
-				if (!dev->hard_start_xmit(skb, dev)) {
+				if (!dev_hard_start_xmit(skb, dev)) {
 					HARD_TX_UNLOCK(dev);
 					goto out;
 				}
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 7aad0121232c..74d4a1dceeec 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -96,8 +96,11 @@ static inline int qdisc_restart(struct net_device *dev)
 	struct sk_buff *skb;
 
 	/* Dequeue packet */
-	if ((skb = q->dequeue(q)) != NULL) {
+	if (((skb = dev->gso_skb)) || ((skb = q->dequeue(q)))) {
 		unsigned nolock = (dev->features & NETIF_F_LLTX);
+
+		dev->gso_skb = NULL;
+
 		/*
 		 * When the driver has LLTX set it does its own locking
 		 * in start_xmit. No need to add additional overhead by
@@ -134,10 +137,8 @@ static inline int qdisc_restart(struct net_device *dev)
 
 			if (!netif_queue_stopped(dev)) {
 				int ret;
-				if (netdev_nit)
-					dev_queue_xmit_nit(skb, dev);
 
-				ret = dev->hard_start_xmit(skb, dev);
+				ret = dev_hard_start_xmit(skb, dev);
 				if (ret == NETDEV_TX_OK) { 
 					if (!nolock) {
 						netif_tx_unlock(dev);
@@ -171,7 +172,10 @@ static inline int qdisc_restart(struct net_device *dev)
 		 */
 
 requeue:
-		q->ops->requeue(skb, q);
+		if (skb->next)
+			dev->gso_skb = skb;
+		else
+			q->ops->requeue(skb, q);
 		netif_schedule(dev);
 		return 1;
 	}
@@ -593,6 +597,11 @@ void dev_deactivate(struct net_device *dev)
 	/* Wait for outstanding qdisc_run calls. */
 	while (test_bit(__LINK_STATE_QDISC_RUNNING, &dev->state))
 		yield();
+
+	if (dev->gso_skb) {
+		kfree_skb(dev->gso_skb);
+		dev->gso_skb = NULL;
+	}
 }
 
 void dev_init_scheduler(struct net_device *dev)
-- 
cgit v1.2.3


From f4c50d990dcf11a296679dc05de3873783236711 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Thu, 22 Jun 2006 03:02:40 -0700
Subject: [NET]: Add software TSOv4

This patch adds the GSO implementation for IPv4 TCP.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h |   1 +
 include/net/protocol.h |   1 +
 include/net/tcp.h      |   2 +
 net/core/skbuff.c      | 126 +++++++++++++++++++++++++++++++++++++++++++++++++
 net/ipv4/af_inet.c     |  51 ++++++++++++++++++++
 net/ipv4/tcp.c         |  62 ++++++++++++++++++++++++
 6 files changed, 243 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 97b0d2d1a6b0..a45bba9b8cbd 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1297,6 +1297,7 @@ extern void	       skb_split(struct sk_buff *skb,
 				 struct sk_buff *skb1, const u32 len);
 
 extern void	       skb_release_data(struct sk_buff *skb);
+extern struct sk_buff *skb_segment(struct sk_buff *skb, int sg);
 
 static inline void *skb_header_pointer(const struct sk_buff *skb, int offset,
 				       int len, void *buffer)
diff --git a/include/net/protocol.h b/include/net/protocol.h
index bcaee39bd2ff..3b6dc15c68a5 100644
--- a/include/net/protocol.h
+++ b/include/net/protocol.h
@@ -36,6 +36,7 @@
 struct net_protocol {
 	int			(*handler)(struct sk_buff *skb);
 	void			(*err_handler)(struct sk_buff *skb, u32 info);
+	struct sk_buff	       *(*gso_segment)(struct sk_buff *skb, int sg);
 	int			no_policy;
 };
 
diff --git a/include/net/tcp.h b/include/net/tcp.h
index b197a9e615c1..ca3d38dfc00b 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1086,6 +1086,8 @@ extern struct request_sock_ops tcp_request_sock_ops;
 
 extern int tcp_v4_destroy_sock(struct sock *sk);
 
+extern struct sk_buff *tcp_tso_segment(struct sk_buff *skb, int sg);
+
 #ifdef CONFIG_PROC_FS
 extern int  tcp4_proc_init(void);
 extern void tcp4_proc_exit(void);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 368d98578c14..8e5044ba3ab6 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1842,6 +1842,132 @@ unsigned char *skb_pull_rcsum(struct sk_buff *skb, unsigned int len)
 
 EXPORT_SYMBOL_GPL(skb_pull_rcsum);
 
+/**
+ *	skb_segment - Perform protocol segmentation on skb.
+ *	@skb: buffer to segment
+ *	@sg: whether scatter-gather can be used for generated segments
+ *
+ *	This function performs segmentation on the given skb.  It returns
+ *	the segment at the given position.  It returns NULL if there are
+ *	no more segments to generate, or when an error is encountered.
+ */
+struct sk_buff *skb_segment(struct sk_buff *skb, int sg)
+{
+	struct sk_buff *segs = NULL;
+	struct sk_buff *tail = NULL;
+	unsigned int mss = skb_shinfo(skb)->gso_size;
+	unsigned int doffset = skb->data - skb->mac.raw;
+	unsigned int offset = doffset;
+	unsigned int headroom;
+	unsigned int len;
+	int nfrags = skb_shinfo(skb)->nr_frags;
+	int err = -ENOMEM;
+	int i = 0;
+	int pos;
+
+	__skb_push(skb, doffset);
+	headroom = skb_headroom(skb);
+	pos = skb_headlen(skb);
+
+	do {
+		struct sk_buff *nskb;
+		skb_frag_t *frag;
+		int hsize, nsize;
+		int k;
+		int size;
+
+		len = skb->len - offset;
+		if (len > mss)
+			len = mss;
+
+		hsize = skb_headlen(skb) - offset;
+		if (hsize < 0)
+			hsize = 0;
+		nsize = hsize + doffset;
+		if (nsize > len + doffset || !sg)
+			nsize = len + doffset;
+
+		nskb = alloc_skb(nsize + headroom, GFP_ATOMIC);
+		if (unlikely(!nskb))
+			goto err;
+
+		if (segs)
+			tail->next = nskb;
+		else
+			segs = nskb;
+		tail = nskb;
+
+		nskb->dev = skb->dev;
+		nskb->priority = skb->priority;
+		nskb->protocol = skb->protocol;
+		nskb->dst = dst_clone(skb->dst);
+		memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
+		nskb->pkt_type = skb->pkt_type;
+		nskb->mac_len = skb->mac_len;
+
+		skb_reserve(nskb, headroom);
+		nskb->mac.raw = nskb->data;
+		nskb->nh.raw = nskb->data + skb->mac_len;
+		nskb->h.raw = nskb->nh.raw + (skb->h.raw - skb->nh.raw);
+		memcpy(skb_put(nskb, doffset), skb->data, doffset);
+
+		if (!sg) {
+			nskb->csum = skb_copy_and_csum_bits(skb, offset,
+							    skb_put(nskb, len),
+							    len, 0);
+			continue;
+		}
+
+		frag = skb_shinfo(nskb)->frags;
+		k = 0;
+
+		nskb->ip_summed = CHECKSUM_HW;
+		nskb->csum = skb->csum;
+		memcpy(skb_put(nskb, hsize), skb->data + offset, hsize);
+
+		while (pos < offset + len) {
+			BUG_ON(i >= nfrags);
+
+			*frag = skb_shinfo(skb)->frags[i];
+			get_page(frag->page);
+			size = frag->size;
+
+			if (pos < offset) {
+				frag->page_offset += offset - pos;
+				frag->size -= offset - pos;
+			}
+
+			k++;
+
+			if (pos + size <= offset + len) {
+				i++;
+				pos += size;
+			} else {
+				frag->size -= pos + size - (offset + len);
+				break;
+			}
+
+			frag++;
+		}
+
+		skb_shinfo(nskb)->nr_frags = k;
+		nskb->data_len = len - hsize;
+		nskb->len += nskb->data_len;
+		nskb->truesize += nskb->data_len;
+	} while ((offset += len) < skb->len);
+
+	return segs;
+
+err:
+	while ((skb = segs)) {
+		segs = skb->next;
+		kfree(skb);
+	}
+	return ERR_PTR(err);
+}
+
+EXPORT_SYMBOL_GPL(skb_segment);
+
 void __init skb_init(void)
 {
 	skbuff_head_cache = kmem_cache_create("skbuff_head_cache",
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 0a277453526b..461216b47948 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -68,6 +68,7 @@
  */
 
 #include <linux/config.h>
+#include <linux/err.h>
 #include <linux/errno.h>
 #include <linux/types.h>
 #include <linux/socket.h>
@@ -1096,6 +1097,54 @@ int inet_sk_rebuild_header(struct sock *sk)
 
 EXPORT_SYMBOL(inet_sk_rebuild_header);
 
+static struct sk_buff *inet_gso_segment(struct sk_buff *skb, int sg)
+{
+	struct sk_buff *segs = ERR_PTR(-EINVAL);
+	struct iphdr *iph;
+	struct net_protocol *ops;
+	int proto;
+	int ihl;
+	int id;
+
+	if (!pskb_may_pull(skb, sizeof(*iph)))
+		goto out;
+
+	iph = skb->nh.iph;
+	ihl = iph->ihl * 4;
+	if (ihl < sizeof(*iph))
+		goto out;
+
+	if (!pskb_may_pull(skb, ihl))
+		goto out;
+
+	skb->h.raw = __skb_pull(skb, ihl);
+	iph = skb->nh.iph;
+	id = ntohs(iph->id);
+	proto = iph->protocol & (MAX_INET_PROTOS - 1);
+	segs = ERR_PTR(-EPROTONOSUPPORT);
+
+	rcu_read_lock();
+	ops = rcu_dereference(inet_protos[proto]);
+	if (ops && ops->gso_segment)
+		segs = ops->gso_segment(skb, sg);
+	rcu_read_unlock();
+
+	if (IS_ERR(segs))
+		goto out;
+
+	skb = segs;
+	do {
+		iph = skb->nh.iph;
+		iph->id = htons(id++);
+		iph->tot_len = htons(skb->len - skb->mac_len);
+		iph->check = 0;
+		iph->check = ip_fast_csum(skb->nh.raw, iph->ihl);
+	} while ((skb = skb->next));
+
+out:
+	return segs;
+}
+
 #ifdef CONFIG_IP_MULTICAST
 static struct net_protocol igmp_protocol = {
 	.handler =	igmp_rcv,
@@ -1105,6 +1154,7 @@ static struct net_protocol igmp_protocol = {
 static struct net_protocol tcp_protocol = {
 	.handler =	tcp_v4_rcv,
 	.err_handler =	tcp_v4_err,
+	.gso_segment =	tcp_tso_segment,
 	.no_policy =	1,
 };
 
@@ -1150,6 +1200,7 @@ static int ipv4_proc_init(void);
 static struct packet_type ip_packet_type = {
 	.type = __constant_htons(ETH_P_IP),
 	.func = ip_rcv,
+	.gso_segment = inet_gso_segment,
 };
 
 static int __init inet_init(void)
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 062dd1a0d8a8..0e029c4e2903 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -258,6 +258,7 @@
 #include <linux/random.h>
 #include <linux/bootmem.h>
 #include <linux/cache.h>
+#include <linux/err.h>
 
 #include <net/icmp.h>
 #include <net/tcp.h>
@@ -2144,6 +2145,67 @@ int compat_tcp_getsockopt(struct sock *sk, int level, int optname,
 EXPORT_SYMBOL(compat_tcp_getsockopt);
 #endif
 
+struct sk_buff *tcp_tso_segment(struct sk_buff *skb, int sg)
+{
+	struct sk_buff *segs = ERR_PTR(-EINVAL);
+	struct tcphdr *th;
+	unsigned thlen;
+	unsigned int seq;
+	unsigned int delta;
+	unsigned int oldlen;
+	unsigned int len;
+
+	if (!pskb_may_pull(skb, sizeof(*th)))
+		goto out;
+
+	th = skb->h.th;
+	thlen = th->doff * 4;
+	if (thlen < sizeof(*th))
+		goto out;
+
+	if (!pskb_may_pull(skb, thlen))
+		goto out;
+
+	oldlen = ~htonl(skb->len);
+	__skb_pull(skb, thlen);
+
+	segs = skb_segment(skb, sg);
+	if (IS_ERR(segs))
+		goto out;
+
+	len = skb_shinfo(skb)->gso_size;
+	delta = csum_add(oldlen, htonl(thlen + len));
+
+	skb = segs;
+	th = skb->h.th;
+	seq = ntohl(th->seq);
+
+	do {
+		th->fin = th->psh = 0;
+
+		if (skb->ip_summed == CHECKSUM_NONE) {
+			th->check = csum_fold(csum_partial(
+				skb->h.raw, thlen, csum_add(skb->csum, delta)));
+		}
+
+		seq += len;
+		skb = skb->next;
+		th = skb->h.th;
+
+		th->seq = htonl(seq);
+		th->cwr = 0;
+	} while (skb->next);
+
+	if (skb->ip_summed == CHECKSUM_NONE) {
+		delta = csum_add(oldlen, htonl(skb->tail - skb->h.raw));
+		th->check = csum_fold(csum_partial(
+			skb->h.raw, thlen, csum_add(skb->csum, delta)));
+	}
+
+out:
+	return segs;
+}
+
 extern void __skb_cb_too_small_for_tcp(int, int);
 extern struct tcp_congestion_ops tcp_reno;
 
-- 
cgit v1.2.3


From 37c3185a02d4b85fbe134bf5204535405dd2c957 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Thu, 22 Jun 2006 03:07:29 -0700
Subject: [NET]: Added GSO toggle

This patch adds a generic segmentation offload toggle that can be turned
on/off for each net device.  For now it only supports in TCPv4.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ethtool.h   |  2 ++
 include/linux/netdevice.h |  1 +
 include/net/sock.h        |  4 ++++
 net/bridge/br_if.c        | 17 +++++++++++------
 net/core/ethtool.c        | 29 +++++++++++++++++++++++++++++
 5 files changed, 47 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index cf2abeca92a0..c6310aef5ab0 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -411,6 +411,8 @@ struct ethtool_ops {
 #define ETHTOOL_GPERMADDR	0x00000020 /* Get permanent hardware address */
 #define ETHTOOL_GUFO		0x00000021 /* Get UFO enable (ethtool_value) */
 #define ETHTOOL_SUFO		0x00000022 /* Set UFO enable (ethtool_value) */
+#define ETHTOOL_GGSO		0x00000023 /* Get GSO enable (ethtool_value) */
+#define ETHTOOL_SGSO		0x00000024 /* Set GSO enable (ethtool_value) */
 
 /* compatibility with older code */
 #define SPARC_ETH_GSET		ETHTOOL_GSET
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index b4eae18390cc..bc747e5d7138 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -308,6 +308,7 @@ struct net_device
 #define NETIF_F_HW_VLAN_RX	256	/* Receive VLAN hw acceleration */
 #define NETIF_F_HW_VLAN_FILTER	512	/* Receive filtering on VLAN */
 #define NETIF_F_VLAN_CHALLENGED	1024	/* Device cannot handle VLAN packets */
+#define NETIF_F_GSO		2048	/* Enable software GSO. */
 #define NETIF_F_LLTX		4096	/* LockLess TX */
 
 	/* Segmentation offload features */
diff --git a/include/net/sock.h b/include/net/sock.h
index d10dfecb6cbd..a897f05de3b5 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1030,9 +1030,13 @@ static inline void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
 {
 	__sk_dst_set(sk, dst);
 	sk->sk_route_caps = dst->dev->features;
+	if (sk->sk_route_caps & NETIF_F_GSO)
+		sk->sk_route_caps |= NETIF_F_TSO;
 	if (sk->sk_route_caps & NETIF_F_TSO) {
 		if (sock_flag(sk, SOCK_NO_LARGESEND) || dst->header_len)
 			sk->sk_route_caps &= ~NETIF_F_TSO;
+		else 
+			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
 	}
 }
 
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index fdec773f5b52..07956ecf545e 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -376,15 +376,20 @@ void br_features_recompute(struct net_bridge *br)
 	features = br->feature_mask & ~NETIF_F_ALL_CSUM;
 
 	list_for_each_entry(p, &br->port_list, list) {
-		if (checksum & NETIF_F_NO_CSUM &&
-		    !(p->dev->features & NETIF_F_NO_CSUM))
+		unsigned long feature = p->dev->features;
+
+		if (checksum & NETIF_F_NO_CSUM && !(feature & NETIF_F_NO_CSUM))
 			checksum ^= NETIF_F_NO_CSUM | NETIF_F_HW_CSUM;
-		if (checksum & NETIF_F_HW_CSUM &&
-		    !(p->dev->features & NETIF_F_HW_CSUM))
+		if (checksum & NETIF_F_HW_CSUM && !(feature & NETIF_F_HW_CSUM))
 			checksum ^= NETIF_F_HW_CSUM | NETIF_F_IP_CSUM;
-		if (!(p->dev->features & NETIF_F_IP_CSUM))
+		if (!(feature & NETIF_F_IP_CSUM))
 			checksum = 0;
-		features &= p->dev->features;
+
+		if (feature & NETIF_F_GSO)
+			feature |= NETIF_F_TSO;
+		feature |= NETIF_F_GSO;
+
+		features &= feature;
 	}
 
 	br->dev->features = features | checksum | NETIF_F_LLTX;
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 33ce7ed6afc6..27ce1683caf5 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -614,6 +614,29 @@ static int ethtool_set_ufo(struct net_device *dev, char __user *useraddr)
 	return dev->ethtool_ops->set_ufo(dev, edata.data);
 }
 
+static int ethtool_get_gso(struct net_device *dev, char __user *useraddr)
+{
+	struct ethtool_value edata = { ETHTOOL_GGSO };
+
+	edata.data = dev->features & NETIF_F_GSO;
+	if (copy_to_user(useraddr, &edata, sizeof(edata)))
+		 return -EFAULT;
+	return 0;
+}
+
+static int ethtool_set_gso(struct net_device *dev, char __user *useraddr)
+{
+	struct ethtool_value edata;
+
+	if (copy_from_user(&edata, useraddr, sizeof(edata)))
+		return -EFAULT;
+	if (edata.data)
+		dev->features |= NETIF_F_GSO;
+	else
+		dev->features &= ~NETIF_F_GSO;
+	return 0;
+}
+
 static int ethtool_self_test(struct net_device *dev, char __user *useraddr)
 {
 	struct ethtool_test test;
@@ -905,6 +928,12 @@ int dev_ethtool(struct ifreq *ifr)
 	case ETHTOOL_SUFO:
 		rc = ethtool_set_ufo(dev, useraddr);
 		break;
+	case ETHTOOL_GGSO:
+		rc = ethtool_get_gso(dev, useraddr);
+		break;
+	case ETHTOOL_SGSO:
+		rc = ethtool_set_gso(dev, useraddr);
+		break;
 	default:
 		rc =  -EOPNOTSUPP;
 	}
-- 
cgit v1.2.3


From c8a553ad7f0bf943047943a758cf07017819cb3c Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Thu, 22 Jun 2006 14:28:09 -0700
Subject: [TCP]: Move inclusion of <linux/dmaengine.h> to correct place in
 <linux/tcp.h>

The new <linux/dmaengine.h> header shouldn't be included from
the !__KERNEL__ portion of tcp.h

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 420a689c3fb4..8ebf497907f8 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -18,7 +18,6 @@
 #define _LINUX_TCP_H
 
 #include <linux/types.h>
-#include <linux/dmaengine.h>
 #include <asm/byteorder.h>
 
 struct tcphdr {
@@ -161,6 +160,7 @@ struct tcp_info
 #ifdef __KERNEL__
 
 #include <linux/skbuff.h>
+#include <linux/dmaengine.h>
 #include <net/sock.h>
 #include <net/inet_connection_sock.h>
 #include <net/inet_timewait_sock.h>
-- 
cgit v1.2.3


From f4b8ea7849544114e9d3d682df4d400180854677 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@xenotime.net>
Date: Thu, 22 Jun 2006 16:00:11 -0700
Subject: [NET]: fix net-core kernel-doc

Warning(/var/linsrc/linux-2617-g4//include/linux/skbuff.h:304): No description found for parameter 'dma_cookie'
Warning(/var/linsrc/linux-2617-g4//include/net/sock.h:1274): No description found for parameter 'copied_early'
Warning(/var/linsrc/linux-2617-g4//net/core/dev.c:3309): No description found for parameter 'chan'
Warning(/var/linsrc/linux-2617-g4//net/core/dev.c:3309): No description found for parameter 'event'

Signed-off-by: Randy Dunlap <rdunlap@xenotime.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 2 ++
 include/net/sock.h     | 1 +
 net/core/dev.c         | 4 ++--
 3 files changed, 5 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index a45bba9b8cbd..16eef03ce0eb 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -215,6 +215,8 @@ enum {
  *	@nf_bridge: Saved data about a bridged frame - see br_netfilter.c
  *	@tc_index: Traffic control index
  *	@tc_verd: traffic control verdict
+ *	@dma_cookie: a cookie to one of several possible DMA operations
+ *		done by skb DMA functions
  *	@secmark: security marking
  */
 
diff --git a/include/net/sock.h b/include/net/sock.h
index a897f05de3b5..2d8d6adf1616 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1269,6 +1269,7 @@ sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb)
  * sk_eat_skb - Release a skb if it is no longer needed
  * @sk: socket to eat this skb from
  * @skb: socket buffer to eat
+ * @copied_early: flag indicating whether DMA operations copied this data early
  *
  * This routine must be called with interrupts disabled or with the socket
  * locked so that the sk_buff queue operation is ok.
diff --git a/net/core/dev.c b/net/core/dev.c
index d293e0f90a0c..9b8f0f22c81d 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3418,8 +3418,8 @@ static void net_dma_rebalance(void)
 /**
  * netdev_dma_event - event callback for the net_dma_client
  * @client: should always be net_dma_client
- * @chan:
- * @event:
+ * @chan: DMA channel for the event
+ * @event: event type
  */
 static void netdev_dma_event(struct dma_client *client, struct dma_chan *chan,
 	enum dma_event event)
-- 
cgit v1.2.3


From 454e2398be9b9fa30433fccc548db34d19aa9958 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 23 Jun 2006 02:02:57 -0700
Subject: [PATCH] VFS: Permit filesystem to override root dentry on mount

Extend the get_sb() filesystem operation to take an extra argument that
permits the VFS to pass in the target vfsmount that defines the mountpoint.

The filesystem is then required to manually set the superblock and root dentry
pointers.  For most filesystems, this should be done with simple_set_mnt()
which will set the superblock pointer and then set the root dentry to the
superblock's s_root (as per the old default behaviour).

The get_sb() op now returns an integer as there's now no need to return the
superblock pointer.

This patch permits a superblock to be implicitly shared amongst several mount
points, such as can be done with NFS to avoid potential inode aliasing.  In
such a case, simple_set_mnt() would not be called, and instead the mnt_root
and mnt_sb would be set directly.

The patch also makes the following changes:

 (*) the get_sb_*() convenience functions in the core kernel now take a vfsmount
     pointer argument and return an integer, so most filesystems have to change
     very little.

 (*) If one of the convenience function is not used, then get_sb() should
     normally call simple_set_mnt() to instantiate the vfsmount. This will
     always return 0, and so can be tail-called from get_sb().

 (*) generic_shutdown_super() now calls shrink_dcache_sb() to clean up the
     dcache upon superblock destruction rather than shrink_dcache_anon().

     This is required because the superblock may now have multiple trees that
     aren't actually bound to s_root, but that still need to be cleaned up. The
     currently called functions assume that the whole tree is rooted at s_root,
     and that anonymous dentries are not the roots of trees which results in
     dentries being left unculled.

     However, with the way NFS superblock sharing are currently set to be
     implemented, these assumptions are violated: the root of the filesystem is
     simply a dummy dentry and inode (the real inode for '/' may well be
     inaccessible), and all the vfsmounts are rooted on anonymous[*] dentries
     with child trees.

     [*] Anonymous until discovered from another tree.

 (*) The documentation has been adjusted, including the additional bit of
     changing ext2_* into foo_* in the documentation.

[akpm@osdl.org: convert ipath_fs, do other stuff]
Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: Al Viro <viro@zeniv.linux.org.uk>
Cc: Nathan Scott <nathans@sgi.com>
Cc: Roland Dreier <rolandd@cisco.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/filesystems/Locking         |   7 +-
 Documentation/filesystems/porting         |   7 +-
 Documentation/filesystems/vfs.txt         |   4 +-
 arch/ia64/kernel/perfmon.c                |   7 +-
 arch/powerpc/platforms/cell/spufs/inode.c |   6 +-
 drivers/infiniband/core/uverbs_main.c     |   7 +-
 drivers/infiniband/hw/ipath/ipath_fs.c    |  13 ++--
 drivers/isdn/capi/capifs.c                |   6 +-
 drivers/misc/ibmasm/ibmasmfs.c            |   7 +-
 drivers/oprofile/oprofilefs.c             |   6 +-
 drivers/usb/core/inode.c                  |   6 +-
 drivers/usb/gadget/inode.c                |   6 +-
 fs/9p/vfs_super.c                         |  21 +++---
 fs/adfs/super.c                           |   7 +-
 fs/affs/super.c                           |   7 +-
 fs/afs/super.c                            |  24 ++++---
 fs/autofs/init.c                          |   6 +-
 fs/autofs4/init.c                         |   6 +-
 fs/befs/linuxvfs.c                        |   7 +-
 fs/bfs/inode.c                            |   6 +-
 fs/binfmt_misc.c                          |   6 +-
 fs/block_dev.c                            |   6 +-
 fs/cifs/cifsfs.c                          |  10 +--
 fs/coda/inode.c                           |   6 +-
 fs/configfs/mount.c                       |   6 +-
 fs/cramfs/inode.c                         |   7 +-
 fs/dcache.c                               |  40 -----------
 fs/debugfs/inode.c                        |   8 +--
 fs/devfs/base.c                           |   8 +--
 fs/devpts/inode.c                         |   6 +-
 fs/efs/super.c                            |   6 +-
 fs/eventpoll.c                            |  13 ++--
 fs/ext2/super.c                           |   6 +-
 fs/ext3/super.c                           |   6 +-
 fs/freevxfs/vxfs_super.c                  |   7 +-
 fs/fuse/inode.c                           |   8 +--
 fs/hfs/super.c                            |   7 +-
 fs/hfsplus/super.c                        |   8 ++-
 fs/hostfs/hostfs_kern.c                   |   8 +--
 fs/hpfs/super.c                           |   7 +-
 fs/hppfs/hppfs_kern.c                     |   8 +--
 fs/hugetlbfs/inode.c                      |   6 +-
 fs/inotify_user.c                         |   6 +-
 fs/isofs/inode.c                          |   7 +-
 fs/jffs/inode-v23.c                       |   7 +-
 fs/jffs2/super.c                          |  49 ++++++++------
 fs/jfs/super.c                            |   7 +-
 fs/libfs.c                                |  12 ++--
 fs/minix/inode.c                          |   7 +-
 fs/msdos/namei.c                          |   9 +--
 fs/namespace.c                            |   9 +++
 fs/ncpfs/inode.c                          |   6 +-
 fs/nfs/inode.c                            |  96 +++++++++++++++-----------
 fs/nfsd/nfsctl.c                          |   6 +-
 fs/ntfs/super.c                           |   7 +-
 fs/ocfs2/dlm/dlmfs.c                      |   6 +-
 fs/ocfs2/super.c                          |  12 ++--
 fs/openpromfs/inode.c                     |   6 +-
 fs/pipe.c                                 |   9 ++-
 fs/proc/root.c                            |   6 +-
 fs/qnx4/inode.c                           |   7 +-
 fs/ramfs/inode.c                          |  13 ++--
 fs/reiserfs/super.c                       |   9 +--
 fs/romfs/inode.c                          |   7 +-
 fs/smbfs/inode.c                          |   6 +-
 fs/super.c                                | 109 +++++++++++++++++-------------
 fs/sysfs/mount.c                          |   6 +-
 fs/sysv/super.c                           |  13 ++--
 fs/udf/super.c                            |   6 +-
 fs/ufs/super.c                            |   6 +-
 fs/vfat/namei.c                           |   9 +--
 fs/xfs/linux-2.6/xfs_super.c              |   8 ++-
 include/linux/dcache.h                    |   1 -
 include/linux/fs.h                        |  25 ++++---
 include/linux/ramfs.h                     |   4 +-
 ipc/mqueue.c                              |   8 +--
 kernel/cpuset.c                           |   8 +--
 kernel/futex.c                            |   8 +--
 mm/shmem.c                                |   6 +-
 net/socket.c                              |   7 +-
 net/sunrpc/rpc_pipe.c                     |   6 +-
 security/inode.c                          |   8 +--
 security/selinux/selinuxfs.c              |   7 +-
 83 files changed, 482 insertions(+), 431 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index 1045da582b9b..3abf08f1b14a 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -142,15 +142,16 @@ see also dquot_operations section.
 
 --------------------------- file_system_type ---------------------------
 prototypes:
-	struct super_block *(*get_sb) (struct file_system_type *, int,
-			const char *, void *);
+	struct int (*get_sb) (struct file_system_type *, int,
+			const char *, void *, struct vfsmount *);
 	void (*kill_sb) (struct super_block *);
 locking rules:
 		may block	BKL
 get_sb		yes		yes
 kill_sb		yes		yes
 
-->get_sb() returns error or a locked superblock (exclusive on ->s_umount).
+->get_sb() returns error or 0 with locked superblock attached to the vfsmount
+(exclusive on ->s_umount).
 ->kill_sb() takes a write-locked superblock, does all shutdown work on it,
 unlocks and drops the reference.
 
diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting
index 2f388460cbe7..5531694059ab 100644
--- a/Documentation/filesystems/porting
+++ b/Documentation/filesystems/porting
@@ -50,10 +50,11 @@ Turn your foo_read_super() into a function that would return 0 in case of
 success and negative number in case of error (-EINVAL unless you have more
 informative error value to report).  Call it foo_fill_super().  Now declare
 
-struct super_block foo_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+int foo_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, ext2_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, foo_fill_super,
+			   mnt);
 }
 
 (or similar with s/bdev/nodev/ or s/bdev/single/, depending on the kind of
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index 3a2e5520c1e3..dd7d0dcedc87 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -113,8 +113,8 @@ members are defined:
 struct file_system_type {
 	const char *name;
 	int fs_flags;
-        struct super_block *(*get_sb) (struct file_system_type *, int,
-                                       const char *, void *);
+        struct int (*get_sb) (struct file_system_type *, int,
+                              const char *, void *, struct vfsmount *);
         void (*kill_sb) (struct super_block *);
         struct module *owner;
         struct file_system_type * next;
diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c
index 077f21216b65..2359e2809f50 100644
--- a/arch/ia64/kernel/perfmon.c
+++ b/arch/ia64/kernel/perfmon.c
@@ -595,10 +595,11 @@ pfm_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
 }
 
 
-static struct super_block *
-pfmfs_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, void *data)
+static int
+pfmfs_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, void *data,
+	     struct vfsmount *mnt)
 {
-	return get_sb_pseudo(fs_type, "pfm:", NULL, PFMFS_MAGIC);
+	return get_sb_pseudo(fs_type, "pfm:", NULL, PFMFS_MAGIC, mnt);
 }
 
 static struct file_system_type pfm_fs_type = {
diff --git a/arch/powerpc/platforms/cell/spufs/inode.c b/arch/powerpc/platforms/cell/spufs/inode.c
index 1987697b23a0..7b4572805db9 100644
--- a/arch/powerpc/platforms/cell/spufs/inode.c
+++ b/arch/powerpc/platforms/cell/spufs/inode.c
@@ -436,11 +436,11 @@ spufs_fill_super(struct super_block *sb, void *data, int silent)
 	return spufs_create_root(sb, data);
 }
 
-static struct super_block *
+static int
 spufs_get_sb(struct file_system_type *fstype, int flags,
-		const char *name, void *data)
+		const char *name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_single(fstype, flags, data, spufs_fill_super);
+	return get_sb_single(fstype, flags, data, spufs_fill_super, mnt);
 }
 
 static struct file_system_type spufs_type = {
diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c
index 5ec2d49e9bb6..e57d3c50f75f 100644
--- a/drivers/infiniband/core/uverbs_main.c
+++ b/drivers/infiniband/core/uverbs_main.c
@@ -821,11 +821,12 @@ static void ib_uverbs_remove_one(struct ib_device *device)
 	kref_put(&uverbs_dev->ref, ib_uverbs_release_dev);
 }
 
-static struct super_block *uverbs_event_get_sb(struct file_system_type *fs_type, int flags,
-					       const char *dev_name, void *data)
+static int uverbs_event_get_sb(struct file_system_type *fs_type, int flags,
+			       const char *dev_name, void *data,
+			       struct vfsmount *mnt)
 {
 	return get_sb_pseudo(fs_type, "infinibandevent:", NULL,
-			     INFINIBANDEVENTFS_MAGIC);
+			     INFINIBANDEVENTFS_MAGIC, mnt);
 }
 
 static struct file_system_type uverbs_event_fs = {
diff --git a/drivers/infiniband/hw/ipath/ipath_fs.c b/drivers/infiniband/hw/ipath/ipath_fs.c
index e274120567e1..63de3046aff3 100644
--- a/drivers/infiniband/hw/ipath/ipath_fs.c
+++ b/drivers/infiniband/hw/ipath/ipath_fs.c
@@ -542,13 +542,14 @@ bail:
 	return ret;
 }
 
-static struct super_block *ipathfs_get_sb(struct file_system_type *fs_type,
-				        int flags, const char *dev_name,
-					void *data)
+static int ipathfs_get_sb(struct file_system_type *fs_type, int flags,
+			const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	ipath_super = get_sb_single(fs_type, flags, data,
-				    ipathfs_fill_super);
-	return ipath_super;
+	int ret = get_sb_single(fs_type, flags, data,
+				    ipathfs_fill_super, mnt);
+	if (ret >= 0)
+		ipath_super = mnt->mnt_sb;
+	return ret;
 }
 
 static void ipathfs_kill_super(struct super_block *s)
diff --git a/drivers/isdn/capi/capifs.c b/drivers/isdn/capi/capifs.c
index 0a37aded4b54..9ea6bd0ddc35 100644
--- a/drivers/isdn/capi/capifs.c
+++ b/drivers/isdn/capi/capifs.c
@@ -121,10 +121,10 @@ fail:
 	return -ENOMEM;
 }
 
-static struct super_block *capifs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int capifs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_single(fs_type, flags, data, capifs_fill_super);
+	return get_sb_single(fs_type, flags, data, capifs_fill_super, mnt);
 }
 
 static struct file_system_type capifs_fs_type = {
diff --git a/drivers/misc/ibmasm/ibmasmfs.c b/drivers/misc/ibmasm/ibmasmfs.c
index 26a230b6ff80..4a35caff5d02 100644
--- a/drivers/misc/ibmasm/ibmasmfs.c
+++ b/drivers/misc/ibmasm/ibmasmfs.c
@@ -90,10 +90,11 @@ static void ibmasmfs_create_files (struct super_block *sb, struct dentry *root);
 static int ibmasmfs_fill_super (struct super_block *sb, void *data, int silent);
 
 
-static struct super_block *ibmasmfs_get_super(struct file_system_type *fst,
-			int flags, const char *name, void *data)
+static int ibmasmfs_get_super(struct file_system_type *fst,
+			int flags, const char *name, void *data,
+			struct vfsmount *mnt)
 {
-	return get_sb_single(fst, flags, data, ibmasmfs_fill_super);
+	return get_sb_single(fst, flags, data, ibmasmfs_fill_super, mnt);
 }
 
 static struct super_operations ibmasmfs_s_ops = {
diff --git a/drivers/oprofile/oprofilefs.c b/drivers/oprofile/oprofilefs.c
index b62da9b0cbf0..71c2da277d6e 100644
--- a/drivers/oprofile/oprofilefs.c
+++ b/drivers/oprofile/oprofilefs.c
@@ -272,10 +272,10 @@ static int oprofilefs_fill_super(struct super_block * sb, void * data, int silen
 }
 
 
-static struct super_block *oprofilefs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int oprofilefs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_single(fs_type, flags, data, oprofilefs_fill_super);
+	return get_sb_single(fs_type, flags, data, oprofilefs_fill_super, mnt);
 }
 
 
diff --git a/drivers/usb/core/inode.c b/drivers/usb/core/inode.c
index 3cf945cc5b9a..95f5ad923b0f 100644
--- a/drivers/usb/core/inode.c
+++ b/drivers/usb/core/inode.c
@@ -543,10 +543,10 @@ static void fs_remove_file (struct dentry *dentry)
 
 /* --------------------------------------------------------------------- */
 
-static struct super_block *usb_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int usb_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_single(fs_type, flags, data, usbfs_fill_super);
+	return get_sb_single(fs_type, flags, data, usbfs_fill_super, mnt);
 }
 
 static struct file_system_type usb_fs_type = {
diff --git a/drivers/usb/gadget/inode.c b/drivers/usb/gadget/inode.c
index aef0722b8f17..3bdc5e3ba234 100644
--- a/drivers/usb/gadget/inode.c
+++ b/drivers/usb/gadget/inode.c
@@ -2070,11 +2070,11 @@ enomem0:
 }
 
 /* "mount -t gadgetfs path /dev/gadget" ends up here */
-static struct super_block *
+static int
 gadgetfs_get_sb (struct file_system_type *t, int flags,
-		const char *path, void *opts)
+		const char *path, void *opts, struct vfsmount *mnt)
 {
-	return get_sb_single (t, flags, opts, gadgetfs_fill_super);
+	return get_sb_single (t, flags, opts, gadgetfs_fill_super, mnt);
 }
 
 static void
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 61c599b4a1e3..872943004e59 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -99,12 +99,13 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
  * @flags: mount flags
  * @dev_name: device name that was mounted
  * @data: mount options
+ * @mnt: mountpoint record to be instantiated
  *
  */
 
-static struct super_block *v9fs_get_sb(struct file_system_type
-				       *fs_type, int flags,
-				       const char *dev_name, void *data)
+static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
+		       const char *dev_name, void *data,
+		       struct vfsmount *mnt)
 {
 	struct super_block *sb = NULL;
 	struct v9fs_fcall *fcall = NULL;
@@ -123,17 +124,19 @@ static struct super_block *v9fs_get_sb(struct file_system_type
 
 	v9ses = kzalloc(sizeof(struct v9fs_session_info), GFP_KERNEL);
 	if (!v9ses)
-		return ERR_PTR(-ENOMEM);
+		return -ENOMEM;
 
 	if ((newfid = v9fs_session_init(v9ses, dev_name, data)) < 0) {
 		dprintk(DEBUG_ERROR, "problem initiating session\n");
-		sb = ERR_PTR(newfid);
+		retval = newfid;
 		goto out_free_session;
 	}
 
 	sb = sget(fs_type, NULL, v9fs_set_super, v9ses);
-	if (IS_ERR(sb))
+	if (IS_ERR(sb)) {
+		retval = PTR_ERR(sb);
 		goto out_close_session;
+	}
 	v9fs_fill_super(sb, v9ses, flags);
 
 	inode = v9fs_get_inode(sb, S_IFDIR | mode);
@@ -184,19 +187,19 @@ static struct super_block *v9fs_get_sb(struct file_system_type
 		goto put_back_sb;
 	}
 
-	return sb;
+	return simple_set_mnt(mnt, sb);
 
 out_close_session:
 	v9fs_session_close(v9ses);
 out_free_session:
 	kfree(v9ses);
-	return sb;
+	return retval;
 
 put_back_sb:
 	/* deactivate_super calls v9fs_kill_super which will frees the rest */
 	up_write(&sb->s_umount);
 	deactivate_super(sb);
-	return ERR_PTR(retval);
+	return retval;
 }
 
 /**
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index 252abda0d200..1b58a9b7f6aa 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -470,10 +470,11 @@ error:
 	return -EINVAL;
 }
 
-static struct super_block *adfs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int adfs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, adfs_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, adfs_fill_super,
+			   mnt);
 }
 
 static struct file_system_type adfs_fs_type = {
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 4d7e5b19e5cd..6a52e7875403 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -524,10 +524,11 @@ affs_statfs(struct super_block *sb, struct kstatfs *buf)
 	return 0;
 }
 
-static struct super_block *affs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int affs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, affs_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, affs_fill_super,
+			   mnt);
 }
 
 static struct file_system_type affs_fs_type = {
diff --git a/fs/afs/super.c b/fs/afs/super.c
index 53c56e7231ab..82468df0ba54 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -38,9 +38,9 @@ struct afs_mount_params {
 static void afs_i_init_once(void *foo, kmem_cache_t *cachep,
 			    unsigned long flags);
 
-static struct super_block *afs_get_sb(struct file_system_type *fs_type,
-				      int flags, const char *dev_name,
-				      void *data);
+static int afs_get_sb(struct file_system_type *fs_type,
+		      int flags, const char *dev_name,
+		      void *data, struct vfsmount *mnt);
 
 static struct inode *afs_alloc_inode(struct super_block *sb);
 
@@ -294,10 +294,11 @@ static int afs_fill_super(struct super_block *sb, void *data, int silent)
  * get an AFS superblock
  * - TODO: don't use get_sb_nodev(), but rather call sget() directly
  */
-static struct super_block *afs_get_sb(struct file_system_type *fs_type,
-				      int flags,
-				      const char *dev_name,
-				      void *options)
+static int afs_get_sb(struct file_system_type *fs_type,
+		      int flags,
+		      const char *dev_name,
+		      void *options,
+		      struct vfsmount *mnt)
 {
 	struct afs_mount_params params;
 	struct super_block *sb;
@@ -311,7 +312,7 @@ static struct super_block *afs_get_sb(struct file_system_type *fs_type,
 	ret = afscm_start();
 	if (ret < 0) {
 		_leave(" = %d", ret);
-		return ERR_PTR(ret);
+		return ret;
 	}
 
 	/* parse the options */
@@ -348,18 +349,19 @@ static struct super_block *afs_get_sb(struct file_system_type *fs_type,
 		goto error;
 	}
 	sb->s_flags |= MS_ACTIVE;
+	simple_set_mnt(mnt, sb);
 
 	afs_put_volume(params.volume);
 	afs_put_cell(params.default_cell);
-	_leave(" = %p", sb);
-	return sb;
+	_leave(" = 0 [%p]", 0, sb);
+	return 0;
 
  error:
 	afs_put_volume(params.volume);
 	afs_put_cell(params.default_cell);
 	afscm_stop();
 	_leave(" = %d", ret);
-	return ERR_PTR(ret);
+	return ret;
 } /* end afs_get_sb() */
 
 /*****************************************************************************/
diff --git a/fs/autofs/init.c b/fs/autofs/init.c
index b977ece69f0c..aca123752406 100644
--- a/fs/autofs/init.c
+++ b/fs/autofs/init.c
@@ -14,10 +14,10 @@
 #include <linux/init.h>
 #include "autofs_i.h"
 
-static struct super_block *autofs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int autofs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_nodev(fs_type, flags, data, autofs_fill_super);
+	return get_sb_nodev(fs_type, flags, data, autofs_fill_super, mnt);
 }
 
 static struct file_system_type autofs_fs_type = {
diff --git a/fs/autofs4/init.c b/fs/autofs4/init.c
index acecec8578ce..5d9193332bef 100644
--- a/fs/autofs4/init.c
+++ b/fs/autofs4/init.c
@@ -14,10 +14,10 @@
 #include <linux/init.h>
 #include "autofs_i.h"
 
-static struct super_block *autofs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int autofs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_nodev(fs_type, flags, data, autofs4_fill_super);
+	return get_sb_nodev(fs_type, flags, data, autofs4_fill_super, mnt);
 }
 
 static struct file_system_type autofs_fs_type = {
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 68ebd10f345d..6ed07a5f10c6 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -899,11 +899,12 @@ befs_statfs(struct super_block *sb, struct kstatfs *buf)
 	return 0;
 }
 
-static struct super_block *
+static int
 befs_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name,
-	    void *data)
+	    void *data, struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, befs_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, befs_fill_super,
+			   mnt);
 }
 
 static struct file_system_type befs_fs_type = {
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index 55a7a78332f8..e7da03f63a5a 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -410,10 +410,10 @@ out:
 	return -EINVAL;
 }
 
-static struct super_block *bfs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int bfs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, bfs_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, bfs_fill_super, mnt);
 }
 
 static struct file_system_type bfs_fs_type = {
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index 599f36fd0f67..07a4996cca3f 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -739,10 +739,10 @@ static int bm_fill_super(struct super_block * sb, void * data, int silent)
 	return err;
 }
 
-static struct super_block *bm_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int bm_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_single(fs_type, flags, data, bm_fill_super);
+	return get_sb_single(fs_type, flags, data, bm_fill_super, mnt);
 }
 
 static struct linux_binfmt misc_format = {
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 44aaba202f78..028d9fb9c2d5 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -300,10 +300,10 @@ static struct super_operations bdev_sops = {
 	.clear_inode = bdev_clear_inode,
 };
 
-static struct super_block *bd_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int bd_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_pseudo(fs_type, "bdev:", &bdev_sops, 0x62646576);
+	return get_sb_pseudo(fs_type, "bdev:", &bdev_sops, 0x62646576, mnt);
 }
 
 static struct file_system_type bd_type = {
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index c262d8874ce9..08b35801dfed 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -460,9 +460,9 @@ struct super_operations cifs_super_ops = {
 	.remount_fs = cifs_remount,
 };
 
-static struct super_block *
+static int
 cifs_get_sb(struct file_system_type *fs_type,
-	    int flags, const char *dev_name, void *data)
+	    int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
 	int rc;
 	struct super_block *sb = sget(fs_type, NULL, set_anon_super, NULL);
@@ -470,7 +470,7 @@ cifs_get_sb(struct file_system_type *fs_type,
 	cFYI(1, ("Devname: %s flags: %d ", dev_name, flags));
 
 	if (IS_ERR(sb))
-		return sb;
+		return PTR_ERR(sb);
 
 	sb->s_flags = flags;
 
@@ -478,10 +478,10 @@ cifs_get_sb(struct file_system_type *fs_type,
 	if (rc) {
 		up_write(&sb->s_umount);
 		deactivate_super(sb);
-		return ERR_PTR(rc);
+		return rc;
 	}
 	sb->s_flags |= MS_ACTIVE;
-	return sb;
+	return simple_set_mnt(mnt, sb);
 }
 
 static ssize_t cifs_file_writev(struct file *file, const struct iovec *iov,
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index ada1a81df6bd..cba70201567d 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -307,10 +307,10 @@ static int coda_statfs(struct super_block *sb, struct kstatfs *buf)
 
 /* init_coda: used by filesystems.c to register coda */
 
-static struct super_block *coda_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int coda_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_nodev(fs_type, flags, data, coda_fill_super);
+	return get_sb_nodev(fs_type, flags, data, coda_fill_super, mnt);
 }
 
 struct file_system_type coda_fs_type = {
diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c
index f920d30478e5..94dab7bdd851 100644
--- a/fs/configfs/mount.c
+++ b/fs/configfs/mount.c
@@ -103,10 +103,10 @@ static int configfs_fill_super(struct super_block *sb, void *data, int silent)
 	return 0;
 }
 
-static struct super_block *configfs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int configfs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_single(fs_type, flags, data, configfs_fill_super);
+	return get_sb_single(fs_type, flags, data, configfs_fill_super, mnt);
 }
 
 static struct file_system_type configfs_fs_type = {
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index 9efcc3a164e8..37a91a153aa5 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -528,10 +528,11 @@ static struct super_operations cramfs_ops = {
 	.statfs		= cramfs_statfs,
 };
 
-static struct super_block *cramfs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int cramfs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, cramfs_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, cramfs_fill_super,
+			   mnt);
 }
 
 static struct file_system_type cramfs_fs_type = {
diff --git a/fs/dcache.c b/fs/dcache.c
index 59dbc92c2079..313b54b2b8f2 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -687,46 +687,6 @@ void shrink_dcache_parent(struct dentry * parent)
 		prune_dcache(found, parent->d_sb);
 }
 
-/**
- * shrink_dcache_anon - further prune the cache
- * @head: head of d_hash list of dentries to prune
- *
- * Prune the dentries that are anonymous
- *
- * parsing d_hash list does not hlist_for_each_entry_rcu() as it
- * done under dcache_lock.
- *
- */
-void shrink_dcache_anon(struct super_block *sb)
-{
-	struct hlist_node *lp;
-	struct hlist_head *head = &sb->s_anon;
-	int found;
-	do {
-		found = 0;
-		spin_lock(&dcache_lock);
-		hlist_for_each(lp, head) {
-			struct dentry *this = hlist_entry(lp, struct dentry, d_hash);
-			if (!list_empty(&this->d_lru)) {
-				dentry_stat.nr_unused--;
-				list_del_init(&this->d_lru);
-			}
-
-			/* 
-			 * move only zero ref count dentries to the end 
-			 * of the unused list for prune_dcache
-			 */
-			if (!atomic_read(&this->d_count)) {
-				list_add_tail(&this->d_lru, &dentry_unused);
-				dentry_stat.nr_unused++;
-				found++;
-			}
-		}
-		spin_unlock(&dcache_lock);
-		prune_dcache(found, sb);
-	} while(found);
-}
-
 /*
  * Scan `nr' dentries and return the number which remain.
  *
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index b55b4ea9a676..440128ebef3b 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -111,11 +111,11 @@ static int debug_fill_super(struct super_block *sb, void *data, int silent)
 	return simple_fill_super(sb, DEBUGFS_MAGIC, debug_files);
 }
 
-static struct super_block *debug_get_sb(struct file_system_type *fs_type,
-				        int flags, const char *dev_name,
-					void *data)
+static int debug_get_sb(struct file_system_type *fs_type,
+			int flags, const char *dev_name,
+			void *data, struct vfsmount *mnt)
 {
-	return get_sb_single(fs_type, flags, data, debug_fill_super);
+	return get_sb_single(fs_type, flags, data, debug_fill_super, mnt);
 }
 
 static struct file_system_type debug_fs_type = {
diff --git a/fs/devfs/base.c b/fs/devfs/base.c
index 52f5059c4f31..51a97f132745 100644
--- a/fs/devfs/base.c
+++ b/fs/devfs/base.c
@@ -2549,11 +2549,11 @@ static int devfs_fill_super(struct super_block *sb, void *data, int silent)
 	return -EINVAL;
 }				/*  End Function devfs_fill_super  */
 
-static struct super_block *devfs_get_sb(struct file_system_type *fs_type,
-					int flags, const char *dev_name,
-					void *data)
+static int devfs_get_sb(struct file_system_type *fs_type,
+			int flags, const char *dev_name,
+			void *data, struct vfsmount *mnt)
 {
-	return get_sb_single(fs_type, flags, data, devfs_fill_super);
+	return get_sb_single(fs_type, flags, data, devfs_fill_super, mnt);
 }
 
 static struct file_system_type devfs_fs_type = {
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 14c5620b5cab..f7aef5bb584a 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -130,10 +130,10 @@ fail:
 	return -ENOMEM;
 }
 
-static struct super_block *devpts_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int devpts_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_single(fs_type, flags, data, devpts_fill_super);
+	return get_sb_single(fs_type, flags, data, devpts_fill_super, mnt);
 }
 
 static struct file_system_type devpts_fs_type = {
diff --git a/fs/efs/super.c b/fs/efs/super.c
index dff623e3ddbf..1ba5e14f879f 100644
--- a/fs/efs/super.c
+++ b/fs/efs/super.c
@@ -18,10 +18,10 @@
 static int efs_statfs(struct super_block *s, struct kstatfs *buf);
 static int efs_fill_super(struct super_block *s, void *d, int silent);
 
-static struct super_block *efs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int efs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, efs_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, efs_fill_super, mnt);
 }
 
 static struct file_system_type efs_fs_type = {
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 2695337d4d64..08e7e6a555ca 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -268,9 +268,9 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
 		   int maxevents, long timeout);
 static int eventpollfs_delete_dentry(struct dentry *dentry);
 static struct inode *ep_eventpoll_inode(void);
-static struct super_block *eventpollfs_get_sb(struct file_system_type *fs_type,
-					      int flags, const char *dev_name,
-					      void *data);
+static int eventpollfs_get_sb(struct file_system_type *fs_type,
+			      int flags, const char *dev_name,
+			      void *data, struct vfsmount *mnt);
 
 /*
  * This semaphore is used to serialize ep_free() and eventpoll_release_file().
@@ -1595,11 +1595,12 @@ eexit_1:
 }
 
 
-static struct super_block *
+static int
 eventpollfs_get_sb(struct file_system_type *fs_type, int flags,
-		   const char *dev_name, void *data)
+		   const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_pseudo(fs_type, "eventpoll:", NULL, EVENTPOLLFS_MAGIC);
+	return get_sb_pseudo(fs_type, "eventpoll:", NULL, EVENTPOLLFS_MAGIC,
+			     mnt);
 }
 
 
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 7e30bae174ed..a4dfffac5967 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -1087,10 +1087,10 @@ static int ext2_statfs (struct super_block * sb, struct kstatfs * buf)
 	return 0;
 }
 
-static struct super_block *ext2_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int ext2_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, ext2_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, ext2_fill_super, mnt);
 }
 
 #ifdef CONFIG_QUOTA
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index f8a5266ea1ff..657f8e73b62f 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -2646,10 +2646,10 @@ out:
 
 #endif
 
-static struct super_block *ext3_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int ext3_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, ext3_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, ext3_fill_super, mnt);
 }
 
 static struct file_system_type ext3_fs_type = {
diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c
index b44c916d24a1..d76eeaafbde2 100644
--- a/fs/freevxfs/vxfs_super.c
+++ b/fs/freevxfs/vxfs_super.c
@@ -241,10 +241,11 @@ out:
 /*
  * The usual module blurb.
  */
-static struct super_block *vxfs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int vxfs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, vxfs_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, vxfs_fill_super,
+			   mnt);
 }
 
 static struct file_system_type vxfs_fs_type = {
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 7627022446b2..c91f0a50aadb 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -569,11 +569,11 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 	return err;
 }
 
-static struct super_block *fuse_get_sb(struct file_system_type *fs_type,
-				       int flags, const char *dev_name,
-				       void *raw_data)
+static int fuse_get_sb(struct file_system_type *fs_type,
+		       int flags, const char *dev_name,
+		       void *raw_data, struct vfsmount *mnt)
 {
-	return get_sb_nodev(fs_type, flags, raw_data, fuse_fill_super);
+	return get_sb_nodev(fs_type, flags, raw_data, fuse_fill_super, mnt);
 }
 
 static struct file_system_type fuse_fs_type = {
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index 1181d116117d..ee5b80a409e8 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -413,10 +413,11 @@ bail:
 	return res;
 }
 
-static struct super_block *hfs_get_sb(struct file_system_type *fs_type,
-				      int flags, const char *dev_name, void *data)
+static int hfs_get_sb(struct file_system_type *fs_type,
+		      int flags, const char *dev_name, void *data,
+		      struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, hfs_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, hfs_fill_super, mnt);
 }
 
 static struct file_system_type hfs_fs_type = {
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 7843f792a4b7..0ed8b7e8e87f 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -450,10 +450,12 @@ static void hfsplus_destroy_inode(struct inode *inode)
 
 #define HFSPLUS_INODE_SIZE	sizeof(struct hfsplus_inode_info)
 
-static struct super_block *hfsplus_get_sb(struct file_system_type *fs_type,
-					  int flags, const char *dev_name, void *data)
+static int hfsplus_get_sb(struct file_system_type *fs_type,
+			  int flags, const char *dev_name, void *data,
+			  struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, hfsplus_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, hfsplus_fill_super,
+			   mnt);
 }
 
 static struct file_system_type hfsplus_fs_type = {
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index bf0f8e16e433..04035e08f5c1 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -993,11 +993,11 @@ static int hostfs_fill_sb_common(struct super_block *sb, void *d, int silent)
 	return(err);
 }
 
-static struct super_block *hostfs_read_sb(struct file_system_type *type,
-					     int flags, const char *dev_name,
-					     void *data)
+static int hostfs_read_sb(struct file_system_type *type,
+			  int flags, const char *dev_name,
+			  void *data, struct vfsmount *mnt)
 {
-	return(get_sb_nodev(type, flags, data, hostfs_fill_sb_common));
+	return get_sb_nodev(type, flags, data, hostfs_fill_sb_common, mnt);
 }
 
 static struct file_system_type hostfs_type = {
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index d72d8c87c996..3b25cf3e2e65 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -662,10 +662,11 @@ bail0:
 	return -EINVAL;
 }
 
-static struct super_block *hpfs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int hpfs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, hpfs_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, hpfs_fill_super,
+			   mnt);
 }
 
 static struct file_system_type hpfs_fs_type = {
diff --git a/fs/hppfs/hppfs_kern.c b/fs/hppfs/hppfs_kern.c
index 5e6363be246f..ec43c22bc9c0 100644
--- a/fs/hppfs/hppfs_kern.c
+++ b/fs/hppfs/hppfs_kern.c
@@ -769,11 +769,11 @@ static int hppfs_fill_super(struct super_block *sb, void *d, int silent)
 	return(err);
 }
 
-static struct super_block *hppfs_read_super(struct file_system_type *type,
-					     int flags, const char *dev_name,
-					     void *data)
+static int hppfs_read_super(struct file_system_type *type,
+			    int flags, const char *dev_name,
+			    void *data, struct vfsmount *mnt)
 {
-	return(get_sb_nodev(type, flags, data, hppfs_fill_super));
+	return get_sb_nodev(type, flags, data, hppfs_fill_super, mnt);
 }
 
 static struct file_system_type hppfs_type = {
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 3a5b4e923455..4665c26171f7 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -723,10 +723,10 @@ void hugetlb_put_quota(struct address_space *mapping)
 	}
 }
 
-static struct super_block *hugetlbfs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int hugetlbfs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_nodev(fs_type, flags, data, hugetlbfs_fill_super);
+	return get_sb_nodev(fs_type, flags, data, hugetlbfs_fill_super, mnt);
 }
 
 static struct file_system_type hugetlbfs_fs_type = {
diff --git a/fs/inotify_user.c b/fs/inotify_user.c
index 9e9931e2badd..f2386442adee 100644
--- a/fs/inotify_user.c
+++ b/fs/inotify_user.c
@@ -672,11 +672,11 @@ out:
 	return ret;
 }
 
-static struct super_block *
+static int
 inotify_get_sb(struct file_system_type *fs_type, int flags,
-	       const char *dev_name, void *data)
+	       const char *dev_name, void *data, struct vfsmount *mnt)
 {
-    return get_sb_pseudo(fs_type, "inotify", NULL, 0xBAD1DEA);
+	return get_sb_pseudo(fs_type, "inotify", NULL, 0xBAD1DEA, mnt);
 }
 
 static struct file_system_type inotify_fs_type = {
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 70adbb98bad1..17268da63a49 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -1399,10 +1399,11 @@ struct inode *isofs_iget(struct super_block *sb,
 	return inode;
 }
 
-static struct super_block *isofs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int isofs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, isofs_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, isofs_fill_super,
+			   mnt);
 }
 
 static struct file_system_type iso9660_fs_type = {
diff --git a/fs/jffs/inode-v23.c b/fs/jffs/inode-v23.c
index 020cc097c539..dd93a091ad67 100644
--- a/fs/jffs/inode-v23.c
+++ b/fs/jffs/inode-v23.c
@@ -1785,10 +1785,11 @@ static struct super_operations jffs_ops =
 	.remount_fs	= jffs_remount,
 };
 
-static struct super_block *jffs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int jffs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, jffs_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, jffs_fill_super,
+			   mnt);
 }
 
 static struct file_system_type jffs_fs_type = {
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index 9d0521451f59..2378a662c256 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -111,9 +111,10 @@ static int jffs2_sb_set(struct super_block *sb, void *data)
 	return 0;
 }
 
-static struct super_block *jffs2_get_sb_mtd(struct file_system_type *fs_type,
-					      int flags, const char *dev_name,
-					      void *data, struct mtd_info *mtd)
+static int jffs2_get_sb_mtd(struct file_system_type *fs_type,
+			    int flags, const char *dev_name,
+			    void *data, struct mtd_info *mtd,
+			    struct vfsmount *mnt)
 {
 	struct super_block *sb;
 	struct jffs2_sb_info *c;
@@ -121,19 +122,20 @@ static struct super_block *jffs2_get_sb_mtd(struct file_system_type *fs_type,
 
 	c = kmalloc(sizeof(*c), GFP_KERNEL);
 	if (!c)
-		return ERR_PTR(-ENOMEM);
+		return -ENOMEM;
 	memset(c, 0, sizeof(*c));
 	c->mtd = mtd;
 
 	sb = sget(fs_type, jffs2_sb_compare, jffs2_sb_set, c);
 
 	if (IS_ERR(sb))
-		goto out_put;
+		goto out_error;
 
 	if (sb->s_root) {
 		/* New mountpoint for JFFS2 which is already mounted */
 		D1(printk(KERN_DEBUG "jffs2_get_sb_mtd(): Device %d (\"%s\") is already mounted\n",
 			  mtd->index, mtd->name));
+		ret = simple_set_mnt(mnt, sb);
 		goto out_put;
 	}
 
@@ -161,44 +163,47 @@ static struct super_block *jffs2_get_sb_mtd(struct file_system_type *fs_type,
 		/* Failure case... */
 		up_write(&sb->s_umount);
 		deactivate_super(sb);
-		return ERR_PTR(ret);
+		return ret;
 	}
 
 	sb->s_flags |= MS_ACTIVE;
-	return sb;
+	return simple_set_mnt(mnt, sb);
 
+out_error:
+	ret = PTR_ERR(sb);
  out_put:
 	kfree(c);
 	put_mtd_device(mtd);
 
-	return sb;
+	return ret;
 }
 
-static struct super_block *jffs2_get_sb_mtdnr(struct file_system_type *fs_type,
-					      int flags, const char *dev_name,
-					      void *data, int mtdnr)
+static int jffs2_get_sb_mtdnr(struct file_system_type *fs_type,
+			      int flags, const char *dev_name,
+			      void *data, int mtdnr,
+			      struct vfsmount *mnt)
 {
 	struct mtd_info *mtd;
 
 	mtd = get_mtd_device(NULL, mtdnr);
 	if (!mtd) {
 		D1(printk(KERN_DEBUG "jffs2: MTD device #%u doesn't appear to exist\n", mtdnr));
-		return ERR_PTR(-EINVAL);
+		return -EINVAL;
 	}
 
-	return jffs2_get_sb_mtd(fs_type, flags, dev_name, data, mtd);
+	return jffs2_get_sb_mtd(fs_type, flags, dev_name, data, mtd, mnt);
 }
 
-static struct super_block *jffs2_get_sb(struct file_system_type *fs_type,
-					int flags, const char *dev_name,
-					void *data)
+static int jffs2_get_sb(struct file_system_type *fs_type,
+			int flags, const char *dev_name,
+			void *data, struct vfsmount *mnt)
 {
 	int err;
 	struct nameidata nd;
 	int mtdnr;
 
 	if (!dev_name)
-		return ERR_PTR(-EINVAL);
+		return -EINVAL;
 
 	D1(printk(KERN_DEBUG "jffs2_get_sb(): dev_name \"%s\"\n", dev_name));
 
@@ -220,7 +225,7 @@ static struct super_block *jffs2_get_sb(struct file_system_type *fs_type,
 				mtd = get_mtd_device(NULL, mtdnr);
 				if (mtd) {
 					if (!strcmp(mtd->name, dev_name+4))
-						return jffs2_get_sb_mtd(fs_type, flags, dev_name, data, mtd);
+						return jffs2_get_sb_mtd(fs_type, flags, dev_name, data, mtd, mnt);
 					put_mtd_device(mtd);
 				}
 			}
@@ -233,7 +238,7 @@ static struct super_block *jffs2_get_sb(struct file_system_type *fs_type,
 			if (!*endptr) {
 				/* It was a valid number */
 				D1(printk(KERN_DEBUG "jffs2_get_sb(): mtd%%d, mtdnr %d\n", mtdnr));
-				return jffs2_get_sb_mtdnr(fs_type, flags, dev_name, data, mtdnr);
+				return jffs2_get_sb_mtdnr(fs_type, flags, dev_name, data, mtdnr, mnt);
 			}
 		}
 	}
@@ -247,7 +252,7 @@ static struct super_block *jffs2_get_sb(struct file_system_type *fs_type,
 		  err, nd.dentry->d_inode));
 
 	if (err)
-		return ERR_PTR(err);
+		return err;
 
 	err = -EINVAL;
 
@@ -269,11 +274,11 @@ static struct super_block *jffs2_get_sb(struct file_system_type *fs_type,
 	mtdnr = iminor(nd.dentry->d_inode);
 	path_release(&nd);
 
-	return jffs2_get_sb_mtdnr(fs_type, flags, dev_name, data, mtdnr);
+	return jffs2_get_sb_mtdnr(fs_type, flags, dev_name, data, mtdnr, mnt);
 
 out:
 	path_release(&nd);
-	return ERR_PTR(err);
+	return err;
 }
 
 static void jffs2_put_super (struct super_block *sb)
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index db6f41d6dd60..18a28137b90e 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -565,10 +565,11 @@ static void jfs_unlockfs(struct super_block *sb)
 	}
 }
 
-static struct super_block *jfs_get_sb(struct file_system_type *fs_type, 
-	int flags, const char *dev_name, void *data)
+static int jfs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, jfs_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, jfs_fill_super,
+			   mnt);
 }
 
 static int jfs_sync_fs(struct super_block *sb, int wait)
diff --git a/fs/libfs.c b/fs/libfs.c
index 7145ba7a48d0..7d70efa46da9 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -196,9 +196,9 @@ struct inode_operations simple_dir_inode_operations = {
  * Common helper for pseudo-filesystems (sockfs, pipefs, bdev - stuff that
  * will never be mountable)
  */
-struct super_block *
-get_sb_pseudo(struct file_system_type *fs_type, char *name,
-	struct super_operations *ops, unsigned long magic)
+int get_sb_pseudo(struct file_system_type *fs_type, char *name,
+	struct super_operations *ops, unsigned long magic,
+	struct vfsmount *mnt)
 {
 	struct super_block *s = sget(fs_type, NULL, set_anon_super, NULL);
 	static struct super_operations default_ops = {.statfs = simple_statfs};
@@ -207,7 +207,7 @@ get_sb_pseudo(struct file_system_type *fs_type, char *name,
 	struct qstr d_name = {.name = name, .len = strlen(name)};
 
 	if (IS_ERR(s))
-		return s;
+		return PTR_ERR(s);
 
 	s->s_flags = MS_NOUSER;
 	s->s_maxbytes = ~0ULL;
@@ -232,12 +232,12 @@ get_sb_pseudo(struct file_system_type *fs_type, char *name,
 	d_instantiate(dentry, root);
 	s->s_root = dentry;
 	s->s_flags |= MS_ACTIVE;
-	return s;
+	return simple_set_mnt(mnt, s);
 
 Enomem:
 	up_write(&s->s_umount);
 	deactivate_super(s);
-	return ERR_PTR(-ENOMEM);
+	return -ENOMEM;
 }
 
 int simple_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index 2dcccf1d1b7f..14f24dfbfe30 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -559,10 +559,11 @@ void minix_truncate(struct inode * inode)
 		V2_minix_truncate(inode);
 }
 
-static struct super_block *minix_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int minix_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, minix_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, minix_fill_super,
+			   mnt);
 }
 
 static struct file_system_type minix_fs_type = {
diff --git a/fs/msdos/namei.c b/fs/msdos/namei.c
index 5b76ccd19e3f..9e44158a7540 100644
--- a/fs/msdos/namei.c
+++ b/fs/msdos/namei.c
@@ -661,11 +661,12 @@ static int msdos_fill_super(struct super_block *sb, void *data, int silent)
 	return 0;
 }
 
-static struct super_block *msdos_get_sb(struct file_system_type *fs_type,
-					int flags, const char *dev_name,
-					void *data)
+static int msdos_get_sb(struct file_system_type *fs_type,
+			int flags, const char *dev_name,
+			void *data, struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, msdos_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, msdos_fill_super,
+			   mnt);
 }
 
 static struct file_system_type msdos_fs_type = {
diff --git a/fs/namespace.c b/fs/namespace.c
index bf478addb852..c13072a5f1ee 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -86,6 +86,15 @@ struct vfsmount *alloc_vfsmnt(const char *name)
 	return mnt;
 }
 
+int simple_set_mnt(struct vfsmount *mnt, struct super_block *sb)
+{
+	mnt->mnt_sb = sb;
+	mnt->mnt_root = dget(sb->s_root);
+	return 0;
+}
+
+EXPORT_SYMBOL(simple_set_mnt);
+
 void free_vfsmnt(struct vfsmount *mnt)
 {
 	kfree(mnt->mnt_devname);
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index a1f3e972c6ef..8db033fab3fd 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -957,10 +957,10 @@ out:
 	return result;
 }
 
-static struct super_block *ncp_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int ncp_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_nodev(fs_type, flags, data, ncp_fill_super);
+	return get_sb_nodev(fs_type, flags, data, ncp_fill_super, mnt);
 }
 
 static struct file_system_type ncp_fs_type = {
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index d0b991a92327..ff645a961bc8 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1690,8 +1690,8 @@ static int nfs_compare_super(struct super_block *sb, void *data)
 	return !nfs_compare_fh(&old->fh, &server->fh);
 }
 
-static struct super_block *nfs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *raw_data)
+static int nfs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt)
 {
 	int error;
 	struct nfs_server *server = NULL;
@@ -1699,14 +1699,14 @@ static struct super_block *nfs_get_sb(struct file_system_type *fs_type,
 	struct nfs_fh *root;
 	struct nfs_mount_data *data = raw_data;
 
-	s = ERR_PTR(-EINVAL);
+	error = -EINVAL;
 	if (data == NULL) {
 		dprintk("%s: missing data argument\n", __FUNCTION__);
-		goto out_err;
+		goto out_err_noserver;
 	}
 	if (data->version <= 0 || data->version > NFS_MOUNT_VERSION) {
 		dprintk("%s: bad mount version\n", __FUNCTION__);
-		goto out_err;
+		goto out_err_noserver;
 	}
 	switch (data->version) {
 		case 1:
@@ -1718,7 +1718,7 @@ static struct super_block *nfs_get_sb(struct file_system_type *fs_type,
 				dprintk("%s: mount structure version %d does not support NFSv3\n",
 						__FUNCTION__,
 						data->version);
-				goto out_err;
+				goto out_err_noserver;
 			}
 			data->root.size = NFS2_FHSIZE;
 			memcpy(data->root.data, data->old_root.data, NFS2_FHSIZE);
@@ -1727,24 +1727,24 @@ static struct super_block *nfs_get_sb(struct file_system_type *fs_type,
 				dprintk("%s: mount structure version %d does not support strong security\n",
 						__FUNCTION__,
 						data->version);
-				goto out_err;
+				goto out_err_noserver;
 			}
 		case 5:
 			memset(data->context, 0, sizeof(data->context));
 	}
 #ifndef CONFIG_NFS_V3
 	/* If NFSv3 is not compiled in, return -EPROTONOSUPPORT */
-	s = ERR_PTR(-EPROTONOSUPPORT);
+	error = -EPROTONOSUPPORT;
 	if (data->flags & NFS_MOUNT_VER3) {
 		dprintk("%s: NFSv3 not compiled into kernel\n", __FUNCTION__);
-		goto out_err;
+		goto out_err_noserver;
 	}
 #endif /* CONFIG_NFS_V3 */
 
-	s = ERR_PTR(-ENOMEM);
+	error = -ENOMEM;
 	server = kzalloc(sizeof(struct nfs_server), GFP_KERNEL);
 	if (!server)
-		goto out_err;
+		goto out_err_noserver;
 	/* Zero out the NFS state stuff */
 	init_nfsv4_state(server);
 	server->client = server->client_sys = server->client_acl = ERR_PTR(-EINVAL);
@@ -1754,7 +1754,7 @@ static struct super_block *nfs_get_sb(struct file_system_type *fs_type,
 		root->size = data->root.size;
 	else
 		root->size = NFS2_FHSIZE;
-	s = ERR_PTR(-EINVAL);
+	error = -EINVAL;
 	if (root->size > sizeof(root->data)) {
 		dprintk("%s: invalid root filehandle\n", __FUNCTION__);
 		goto out_err;
@@ -1770,15 +1770,20 @@ static struct super_block *nfs_get_sb(struct file_system_type *fs_type,
 	}
 
 	/* Fire up rpciod if not yet running */
-	s = ERR_PTR(rpciod_up());
-	if (IS_ERR(s)) {
-		dprintk("%s: couldn't start rpciod! Error = %ld\n",
-				__FUNCTION__, PTR_ERR(s));
+	error = rpciod_up();
+	if (error < 0) {
+		dprintk("%s: couldn't start rpciod! Error = %d\n",
+				__FUNCTION__, error);
 		goto out_err;
 	}
 
 	s = sget(fs_type, nfs_compare_super, nfs_set_super, server);
-	if (IS_ERR(s) || s->s_root)
+	if (IS_ERR(s)) {
+		error = PTR_ERR(s);
+		goto out_err_rpciod;
+	}
+
+	if (s->s_root)
 		goto out_rpciod_down;
 
 	s->s_flags = flags;
@@ -1787,15 +1792,22 @@ static struct super_block *nfs_get_sb(struct file_system_type *fs_type,
 	if (error) {
 		up_write(&s->s_umount);
 		deactivate_super(s);
-		return ERR_PTR(error);
+		return error;
 	}
 	s->s_flags |= MS_ACTIVE;
-	return s;
+	return simple_set_mnt(mnt, s);
+
 out_rpciod_down:
 	rpciod_down();
+	kfree(server);
+	return simple_set_mnt(mnt, s);
+
+out_err_rpciod:
+	rpciod_down();
 out_err:
 	kfree(server);
-	return s;
+out_err_noserver:
+	return error;
 }
 
 static void nfs_kill_super(struct super_block *s)
@@ -2032,8 +2044,8 @@ nfs_copy_user_string(char *dst, struct nfs_string *src, int maxlen)
 	return dst;
 }
 
-static struct super_block *nfs4_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *raw_data)
+static int nfs4_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt)
 {
 	int error;
 	struct nfs_server *server;
@@ -2043,16 +2055,16 @@ static struct super_block *nfs4_get_sb(struct file_system_type *fs_type,
 
 	if (data == NULL) {
 		dprintk("%s: missing data argument\n", __FUNCTION__);
-		return ERR_PTR(-EINVAL);
+		return -EINVAL;
 	}
 	if (data->version <= 0 || data->version > NFS4_MOUNT_VERSION) {
 		dprintk("%s: bad mount version\n", __FUNCTION__);
-		return ERR_PTR(-EINVAL);
+		return -EINVAL;
 	}
 
 	server = kzalloc(sizeof(struct nfs_server), GFP_KERNEL);
 	if (!server)
-		return ERR_PTR(-ENOMEM);
+		return -ENOMEM;
 	/* Zero out the NFS state stuff */
 	init_nfsv4_state(server);
 	server->client = server->client_sys = server->client_acl = ERR_PTR(-EINVAL);
@@ -2074,33 +2086,41 @@ static struct super_block *nfs4_get_sb(struct file_system_type *fs_type,
 
 	/* We now require that the mount process passes the remote address */
 	if (data->host_addrlen != sizeof(server->addr)) {
-		s = ERR_PTR(-EINVAL);
+		error = -EINVAL;
 		goto out_free;
 	}
 	if (copy_from_user(&server->addr, data->host_addr, sizeof(server->addr))) {
-		s = ERR_PTR(-EFAULT);
+		error = -EFAULT;
 		goto out_free;
 	}
 	if (server->addr.sin_family != AF_INET ||
 	    server->addr.sin_addr.s_addr == INADDR_ANY) {
 		dprintk("%s: mount program didn't pass remote IP address!\n",
 				__FUNCTION__);
-		s = ERR_PTR(-EINVAL);
+		error = -EINVAL;
 		goto out_free;
 	}
 
 	/* Fire up rpciod if not yet running */
-	s = ERR_PTR(rpciod_up());
-	if (IS_ERR(s)) {
-		dprintk("%s: couldn't start rpciod! Error = %ld\n",
-				__FUNCTION__, PTR_ERR(s));
+	error = rpciod_up();
+	if (error < 0) {
+		dprintk("%s: couldn't start rpciod! Error = %d\n",
+				__FUNCTION__, error);
 		goto out_free;
 	}
 
 	s = sget(fs_type, nfs4_compare_super, nfs_set_super, server);
-
-	if (IS_ERR(s) || s->s_root)
+	if (IS_ERR(s)) {
+		error = PTR_ERR(s);
 		goto out_free;
+	}
+
+	if (s->s_root) {
+		kfree(server->mnt_path);
+		kfree(server->hostname);
+		kfree(server);
+		return simple_set_mnt(mnt, s);
+	}
 
 	s->s_flags = flags;
 
@@ -2108,17 +2128,17 @@ static struct super_block *nfs4_get_sb(struct file_system_type *fs_type,
 	if (error) {
 		up_write(&s->s_umount);
 		deactivate_super(s);
-		return ERR_PTR(error);
+		return error;
 	}
 	s->s_flags |= MS_ACTIVE;
-	return s;
+	return simple_set_mnt(mnt, s);
 out_err:
-	s = (struct super_block *)p;
+	error = PTR_ERR(p);
 out_free:
 	kfree(server->mnt_path);
 	kfree(server->hostname);
 	kfree(server);
-	return s;
+	return error;
 }
 
 static void nfs4_kill_super(struct super_block *sb)
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 3ef017b3b5bd..a1810e6a93e5 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -494,10 +494,10 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent)
 	return simple_fill_super(sb, 0x6e667364, nfsd_files);
 }
 
-static struct super_block *nfsd_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int nfsd_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_single(fs_type, flags, data, nfsd_fill_super);
+	return get_sb_single(fs_type, flags, data, nfsd_fill_super, mnt);
 }
 
 static struct file_system_type nfsd_fs_type = {
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 27833f6df49f..d5d5e969294f 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -3093,10 +3093,11 @@ struct kmem_cache *ntfs_index_ctx_cache;
 /* Driver wide mutex. */
 DEFINE_MUTEX(ntfs_lock);
 
-static struct super_block *ntfs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int ntfs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, ntfs_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, ntfs_fill_super,
+			   mnt);
 }
 
 static struct file_system_type ntfs_fs_type = {
diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlm/dlmfs.c
index 7e88e24b3471..7273d9fa6bab 100644
--- a/fs/ocfs2/dlm/dlmfs.c
+++ b/fs/ocfs2/dlm/dlmfs.c
@@ -574,10 +574,10 @@ static struct inode_operations dlmfs_file_inode_operations = {
 	.getattr	= simple_getattr,
 };
 
-static struct super_block *dlmfs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int dlmfs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_nodev(fs_type, flags, data, dlmfs_fill_super);
+	return get_sb_nodev(fs_type, flags, data, dlmfs_fill_super, mnt);
 }
 
 static struct file_system_type dlmfs_fs_type = {
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 949b3dac30f1..788b8b50dc4c 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -672,12 +672,14 @@ read_super_error:
 	return status;
 }
 
-static struct super_block *ocfs2_get_sb(struct file_system_type *fs_type,
-					int flags,
-					const char *dev_name,
-					void *data)
+static int ocfs2_get_sb(struct file_system_type *fs_type,
+			int flags,
+			const char *dev_name,
+			void *data,
+			struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, ocfs2_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, ocfs2_fill_super,
+			   mnt);
 }
 
 static struct file_system_type ocfs2_fs_type = {
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index 0f14276a2e51..464e2bce0203 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -1054,10 +1054,10 @@ out_no_root:
 	return -ENOMEM;
 }
 
-static struct super_block *openprom_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int openprom_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_single(fs_type, flags, data, openprom_fill_super);
+	return get_sb_single(fs_type, flags, data, openprom_fill_super, mnt);
 }
 
 static struct file_system_type openprom_fs_type = {
diff --git a/fs/pipe.c b/fs/pipe.c
index 5acd8954aaa0..20352573e025 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -979,12 +979,11 @@ no_files:
  * any operations on the root directory. However, we need a non-trivial
  * d_name - pipe: will go nicely and kill the special-casing in procfs.
  */
-
-static struct super_block *
-pipefs_get_sb(struct file_system_type *fs_type, int flags,
-	      const char *dev_name, void *data)
+static int pipefs_get_sb(struct file_system_type *fs_type,
+			 int flags, const char *dev_name, void *data,
+			 struct vfsmount *mnt)
 {
-	return get_sb_pseudo(fs_type, "pipe:", NULL, PIPEFS_MAGIC);
+	return get_sb_pseudo(fs_type, "pipe:", NULL, PIPEFS_MAGIC, mnt);
 }
 
 static struct file_system_type pipe_fs_type = {
diff --git a/fs/proc/root.c b/fs/proc/root.c
index c3fd3611112f..9995356ce73e 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -26,10 +26,10 @@ struct proc_dir_entry *proc_net, *proc_net_stat, *proc_bus, *proc_root_fs, *proc
 struct proc_dir_entry *proc_sys_root;
 #endif
 
-static struct super_block *proc_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int proc_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_single(fs_type, flags, data, proc_fill_super);
+	return get_sb_single(fs_type, flags, data, proc_fill_super, mnt);
 }
 
 static struct file_system_type proc_fs_type = {
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index 2ecd46f85e9f..e6cca5cd4b44 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -561,10 +561,11 @@ static void destroy_inodecache(void)
 		       "qnx4_inode_cache: not all structures were freed\n");
 }
 
-static struct super_block *qnx4_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int qnx4_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, qnx4_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, qnx4_fill_super,
+			   mnt);
 }
 
 static struct file_system_type qnx4_fs_type = {
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index 14bd2246fb6d..b9677335cc8d 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -185,16 +185,17 @@ static int ramfs_fill_super(struct super_block * sb, void * data, int silent)
 	return 0;
 }
 
-struct super_block *ramfs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+int ramfs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_nodev(fs_type, flags, data, ramfs_fill_super);
+	return get_sb_nodev(fs_type, flags, data, ramfs_fill_super, mnt);
 }
 
-static struct super_block *rootfs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int rootfs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_nodev(fs_type, flags|MS_NOUSER, data, ramfs_fill_super);
+	return get_sb_nodev(fs_type, flags|MS_NOUSER, data, ramfs_fill_super,
+			    mnt);
 }
 
 static struct file_system_type ramfs_fs_type = {
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index cae2abbc0c71..f3ff41d33989 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -2249,11 +2249,12 @@ static ssize_t reiserfs_quota_write(struct super_block *sb, int type,
 
 #endif
 
-static struct super_block *get_super_block(struct file_system_type *fs_type,
-					   int flags, const char *dev_name,
-					   void *data)
+static int get_super_block(struct file_system_type *fs_type,
+			   int flags, const char *dev_name,
+			   void *data, struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, reiserfs_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, reiserfs_fill_super,
+			   mnt);
 }
 
 static int __init init_reiserfs_fs(void)
diff --git a/fs/romfs/inode.c b/fs/romfs/inode.c
index 9b9eda7b335c..4d6cd6621062 100644
--- a/fs/romfs/inode.c
+++ b/fs/romfs/inode.c
@@ -607,10 +607,11 @@ static struct super_operations romfs_ops = {
 	.remount_fs	= romfs_remount,
 };
 
-static struct super_block *romfs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int romfs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, romfs_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, romfs_fill_super,
+			   mnt);
 }
 
 static struct file_system_type romfs_fs_type = {
diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c
index fdeabc0a34f7..4a37c2bbfa3f 100644
--- a/fs/smbfs/inode.c
+++ b/fs/smbfs/inode.c
@@ -782,10 +782,10 @@ out:
 	return error;
 }
 
-static struct super_block *smb_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int smb_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_nodev(fs_type, flags, data, smb_fill_super);
+	return get_sb_nodev(fs_type, flags, data, smb_fill_super, mnt);
 }
 
 static struct file_system_type smb_fs_type = {
diff --git a/fs/super.c b/fs/super.c
index 9d5c2add7228..324c2d232f54 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -231,7 +231,7 @@ void generic_shutdown_super(struct super_block *sb)
 	if (root) {
 		sb->s_root = NULL;
 		shrink_dcache_parent(root);
-		shrink_dcache_anon(sb);
+		shrink_dcache_sb(sb);
 		dput(root);
 		fsync_super(sb);
 		lock_super(sb);
@@ -676,9 +676,10 @@ static void bdev_uevent(struct block_device *bdev, enum kobject_action action)
 	}
 }
 
-struct super_block *get_sb_bdev(struct file_system_type *fs_type,
+int get_sb_bdev(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *data,
-	int (*fill_super)(struct super_block *, void *, int))
+	int (*fill_super)(struct super_block *, void *, int),
+	struct vfsmount *mnt)
 {
 	struct block_device *bdev;
 	struct super_block *s;
@@ -686,7 +687,7 @@ struct super_block *get_sb_bdev(struct file_system_type *fs_type,
 
 	bdev = open_bdev_excl(dev_name, flags, fs_type);
 	if (IS_ERR(bdev))
-		return (struct super_block *)bdev;
+		return PTR_ERR(bdev);
 
 	/*
 	 * once the super is inserted into the list by sget, s_umount
@@ -697,15 +698,17 @@ struct super_block *get_sb_bdev(struct file_system_type *fs_type,
 	s = sget(fs_type, test_bdev_super, set_bdev_super, bdev);
 	mutex_unlock(&bdev->bd_mount_mutex);
 	if (IS_ERR(s))
-		goto out;
+		goto error_s;
 
 	if (s->s_root) {
 		if ((flags ^ s->s_flags) & MS_RDONLY) {
 			up_write(&s->s_umount);
 			deactivate_super(s);
-			s = ERR_PTR(-EBUSY);
+			error = -EBUSY;
+			goto error_bdev;
 		}
-		goto out;
+
+		close_bdev_excl(bdev);
 	} else {
 		char b[BDEVNAME_SIZE];
 
@@ -716,18 +719,21 @@ struct super_block *get_sb_bdev(struct file_system_type *fs_type,
 		if (error) {
 			up_write(&s->s_umount);
 			deactivate_super(s);
-			s = ERR_PTR(error);
-		} else {
-			s->s_flags |= MS_ACTIVE;
-			bdev_uevent(bdev, KOBJ_MOUNT);
+			goto error;
 		}
+
+		s->s_flags |= MS_ACTIVE;
+		bdev_uevent(bdev, KOBJ_MOUNT);
 	}
 
-	return s;
+	return simple_set_mnt(mnt, s);
 
-out:
+error_s:
+	error = PTR_ERR(s);
+error_bdev:
 	close_bdev_excl(bdev);
-	return s;
+error:
+	return error;
 }
 
 EXPORT_SYMBOL(get_sb_bdev);
@@ -744,15 +750,16 @@ void kill_block_super(struct super_block *sb)
 
 EXPORT_SYMBOL(kill_block_super);
 
-struct super_block *get_sb_nodev(struct file_system_type *fs_type,
+int get_sb_nodev(struct file_system_type *fs_type,
 	int flags, void *data,
-	int (*fill_super)(struct super_block *, void *, int))
+	int (*fill_super)(struct super_block *, void *, int),
+	struct vfsmount *mnt)
 {
 	int error;
 	struct super_block *s = sget(fs_type, NULL, set_anon_super, NULL);
 
 	if (IS_ERR(s))
-		return s;
+		return PTR_ERR(s);
 
 	s->s_flags = flags;
 
@@ -760,10 +767,10 @@ struct super_block *get_sb_nodev(struct file_system_type *fs_type,
 	if (error) {
 		up_write(&s->s_umount);
 		deactivate_super(s);
-		return ERR_PTR(error);
+		return error;
 	}
 	s->s_flags |= MS_ACTIVE;
-	return s;
+	return simple_set_mnt(mnt, s);
 }
 
 EXPORT_SYMBOL(get_sb_nodev);
@@ -773,94 +780,102 @@ static int compare_single(struct super_block *s, void *p)
 	return 1;
 }
 
-struct super_block *get_sb_single(struct file_system_type *fs_type,
+int get_sb_single(struct file_system_type *fs_type,
 	int flags, void *data,
-	int (*fill_super)(struct super_block *, void *, int))
+	int (*fill_super)(struct super_block *, void *, int),
+	struct vfsmount *mnt)
 {
 	struct super_block *s;
 	int error;
 
 	s = sget(fs_type, compare_single, set_anon_super, NULL);
 	if (IS_ERR(s))
-		return s;
+		return PTR_ERR(s);
 	if (!s->s_root) {
 		s->s_flags = flags;
 		error = fill_super(s, data, flags & MS_SILENT ? 1 : 0);
 		if (error) {
 			up_write(&s->s_umount);
 			deactivate_super(s);
-			return ERR_PTR(error);
+			return error;
 		}
 		s->s_flags |= MS_ACTIVE;
 	}
 	do_remount_sb(s, flags, data, 0);
-	return s;
+	return simple_set_mnt(mnt, s);
 }
 
 EXPORT_SYMBOL(get_sb_single);
 
 struct vfsmount *
-do_kern_mount(const char *fstype, int flags, const char *name, void *data)
+vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data)
 {
-	struct file_system_type *type = get_fs_type(fstype);
-	struct super_block *sb = ERR_PTR(-ENOMEM);
 	struct vfsmount *mnt;
-	int error;
 	char *secdata = NULL;
+	int error;
 
 	if (!type)
 		return ERR_PTR(-ENODEV);
 
+	error = -ENOMEM;
 	mnt = alloc_vfsmnt(name);
 	if (!mnt)
 		goto out;
 
 	if (data) {
 		secdata = alloc_secdata();
-		if (!secdata) {
-			sb = ERR_PTR(-ENOMEM);
+		if (!secdata)
 			goto out_mnt;
-		}
 
 		error = security_sb_copy_data(type, data, secdata);
-		if (error) {
-			sb = ERR_PTR(error);
+		if (error)
 			goto out_free_secdata;
-		}
 	}
 
-	sb = type->get_sb(type, flags, name, data);
-	if (IS_ERR(sb))
+	error = type->get_sb(type, flags, name, data, mnt);
+	if (error < 0)
 		goto out_free_secdata;
- 	error = security_sb_kern_mount(sb, secdata);
+
+ 	error = security_sb_kern_mount(mnt->mnt_sb, secdata);
  	if (error)
  		goto out_sb;
-	mnt->mnt_sb = sb;
-	mnt->mnt_root = dget(sb->s_root);
-	mnt->mnt_mountpoint = sb->s_root;
+
+	mnt->mnt_mountpoint = mnt->mnt_root;
 	mnt->mnt_parent = mnt;
-	up_write(&sb->s_umount);
+	up_write(&mnt->mnt_sb->s_umount);
 	free_secdata(secdata);
-	put_filesystem(type);
 	return mnt;
 out_sb:
-	up_write(&sb->s_umount);
-	deactivate_super(sb);
-	sb = ERR_PTR(error);
+	dput(mnt->mnt_root);
+	up_write(&mnt->mnt_sb->s_umount);
+	deactivate_super(mnt->mnt_sb);
 out_free_secdata:
 	free_secdata(secdata);
 out_mnt:
 	free_vfsmnt(mnt);
 out:
+	return ERR_PTR(error);
+}
+
+EXPORT_SYMBOL_GPL(vfs_kern_mount);
+
+struct vfsmount *
+do_kern_mount(const char *fstype, int flags, const char *name, void *data)
+{
+	struct file_system_type *type = get_fs_type(fstype);
+	struct vfsmount *mnt;
+	if (!type)
+		return ERR_PTR(-ENODEV);
+	mnt = vfs_kern_mount(type, flags, name, data);
 	put_filesystem(type);
-	return (struct vfsmount *)sb;
+	return mnt;
 }
 
 EXPORT_SYMBOL_GPL(do_kern_mount);
 
 struct vfsmount *kern_mount(struct file_system_type *type)
 {
-	return do_kern_mount(type->name, 0, type->name, NULL);
+	return vfs_kern_mount(type, 0, type->name, NULL);
 }
 
 EXPORT_SYMBOL(kern_mount);
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index f1117e885bd6..40190c489271 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -66,10 +66,10 @@ static int sysfs_fill_super(struct super_block *sb, void *data, int silent)
 	return 0;
 }
 
-static struct super_block *sysfs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int sysfs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_single(fs_type, flags, data, sysfs_fill_super);
+	return get_sb_single(fs_type, flags, data, sysfs_fill_super, mnt);
 }
 
 static struct file_system_type sysfs_fs_type = {
diff --git a/fs/sysv/super.c b/fs/sysv/super.c
index e92b991e6dda..876639b93321 100644
--- a/fs/sysv/super.c
+++ b/fs/sysv/super.c
@@ -506,16 +506,17 @@ failed:
 
 /* Every kernel module contains stuff like this. */
 
-static struct super_block *sysv_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int sysv_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, sysv_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, sysv_fill_super,
+			   mnt);
 }
 
-static struct super_block *v7_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int v7_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, v7_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, v7_fill_super, mnt);
 }
 
 static struct file_system_type sysv_fs_type = {
diff --git a/fs/udf/super.c b/fs/udf/super.c
index e45789fe38e8..2250774a831d 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -94,10 +94,10 @@ static unsigned int udf_count_free(struct super_block *);
 static int udf_statfs(struct super_block *, struct kstatfs *);
 
 /* UDF filesystem type */
-static struct super_block *udf_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int udf_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, udf_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, udf_fill_super, mnt);
 }
 
 static struct file_system_type udf_fstype = {
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index db98a4c71e63..768fb8d9e67a 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -1311,10 +1311,10 @@ out:
 
 #endif
 
-static struct super_block *ufs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int ufs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, ufs_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, ufs_fill_super, mnt);
 }
 
 static struct file_system_type ufs_fs_type = {
diff --git a/fs/vfat/namei.c b/fs/vfat/namei.c
index a56cec3be5f0..9a8f48bae956 100644
--- a/fs/vfat/namei.c
+++ b/fs/vfat/namei.c
@@ -1023,11 +1023,12 @@ static int vfat_fill_super(struct super_block *sb, void *data, int silent)
 	return 0;
 }
 
-static struct super_block *vfat_get_sb(struct file_system_type *fs_type,
-				       int flags, const char *dev_name,
-				       void *data)
+static int vfat_get_sb(struct file_system_type *fs_type,
+		       int flags, const char *dev_name,
+		       void *data, struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, vfat_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, vfat_fill_super,
+			   mnt);
 }
 
 static struct file_system_type vfat_fs_type = {
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index f2a0778536f4..d03c89a36655 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -853,14 +853,16 @@ fail_vfsop:
 	return -error;
 }
 
-STATIC struct super_block *
+STATIC int
 xfs_fs_get_sb(
 	struct file_system_type	*fs_type,
 	int			flags,
 	const char		*dev_name,
-	void			*data)
+	void			*data,
+	struct vfsmount		*mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, xfs_fs_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, xfs_fs_fill_super,
+			   mnt);
 }
 
 STATIC struct super_operations xfs_super_operations = {
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 46d0e079735d..0dd1610a94a9 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -217,7 +217,6 @@ extern struct dentry * d_alloc_anon(struct inode *);
 extern struct dentry * d_splice_alias(struct inode *, struct dentry *);
 extern void shrink_dcache_sb(struct super_block *);
 extern void shrink_dcache_parent(struct dentry *);
-extern void shrink_dcache_anon(struct super_block *);
 extern int d_invalidate(struct dentry *);
 
 /* only used at mount-time */
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 73c7d6f04b31..3e50dd24af87 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1269,23 +1269,26 @@ find_exported_dentry(struct super_block *sb, void *obj, void *parent,
 struct file_system_type {
 	const char *name;
 	int fs_flags;
-	struct super_block *(*get_sb) (struct file_system_type *, int,
-				       const char *, void *);
+	int (*get_sb) (struct file_system_type *, int,
+		       const char *, void *, struct vfsmount *);
 	void (*kill_sb) (struct super_block *);
 	struct module *owner;
 	struct file_system_type * next;
 	struct list_head fs_supers;
 };
 
-struct super_block *get_sb_bdev(struct file_system_type *fs_type,
+extern int get_sb_bdev(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *data,
-	int (*fill_super)(struct super_block *, void *, int));
-struct super_block *get_sb_single(struct file_system_type *fs_type,
+	int (*fill_super)(struct super_block *, void *, int),
+	struct vfsmount *mnt);
+extern int get_sb_single(struct file_system_type *fs_type,
 	int flags, void *data,
-	int (*fill_super)(struct super_block *, void *, int));
-struct super_block *get_sb_nodev(struct file_system_type *fs_type,
+	int (*fill_super)(struct super_block *, void *, int),
+	struct vfsmount *mnt);
+extern int get_sb_nodev(struct file_system_type *fs_type,
 	int flags, void *data,
-	int (*fill_super)(struct super_block *, void *, int));
+	int (*fill_super)(struct super_block *, void *, int),
+	struct vfsmount *mnt);
 void generic_shutdown_super(struct super_block *sb);
 void kill_block_super(struct super_block *sb);
 void kill_anon_super(struct super_block *sb);
@@ -1296,8 +1299,10 @@ struct super_block *sget(struct file_system_type *type,
 			int (*test)(struct super_block *,void *),
 			int (*set)(struct super_block *,void *),
 			void *data);
-struct super_block *get_sb_pseudo(struct file_system_type *, char *,
-			struct super_operations *ops, unsigned long);
+extern int get_sb_pseudo(struct file_system_type *, char *,
+	struct super_operations *ops, unsigned long,
+	struct vfsmount *mnt);
+extern int simple_set_mnt(struct vfsmount *mnt, struct super_block *sb);
 int __put_super(struct super_block *sb);
 int __put_super_and_need_restart(struct super_block *sb);
 void unnamed_dev_init(void);
diff --git a/include/linux/ramfs.h b/include/linux/ramfs.h
index 78ecfa28b1c2..00b340ba6612 100644
--- a/include/linux/ramfs.h
+++ b/include/linux/ramfs.h
@@ -2,8 +2,8 @@
 #define _LINUX_RAMFS_H
 
 struct inode *ramfs_get_inode(struct super_block *sb, int mode, dev_t dev);
-struct super_block *ramfs_get_sb(struct file_system_type *fs_type,
-	 int flags, const char *dev_name, void *data);
+extern int ramfs_get_sb(struct file_system_type *fs_type,
+	 int flags, const char *dev_name, void *data, struct vfsmount *mnt);
 
 #ifndef CONFIG_MMU
 extern unsigned long ramfs_nommu_get_unmapped_area(struct file *file,
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index 1511714a9585..0a2a24b6ebe4 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -205,11 +205,11 @@ static int mqueue_fill_super(struct super_block *sb, void *data, int silent)
 	return 0;
 }
 
-static struct super_block *mqueue_get_sb(struct file_system_type *fs_type,
-					 int flags, const char *dev_name,
-					 void *data)
+static int mqueue_get_sb(struct file_system_type *fs_type,
+			 int flags, const char *dev_name,
+			 void *data, struct vfsmount *mnt)
 {
-	return get_sb_single(fs_type, flags, data, mqueue_fill_super);
+	return get_sb_single(fs_type, flags, data, mqueue_fill_super, mnt);
 }
 
 static void init_once(void *foo, kmem_cache_t * cachep, unsigned long flags)
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index ab81fdd4572b..77f45ffd5ea1 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -392,11 +392,11 @@ static int cpuset_fill_super(struct super_block *sb, void *unused_data,
 	return 0;
 }
 
-static struct super_block *cpuset_get_sb(struct file_system_type *fs_type,
-					int flags, const char *unused_dev_name,
-					void *data)
+static int cpuset_get_sb(struct file_system_type *fs_type,
+			 int flags, const char *unused_dev_name,
+			 void *data, struct vfsmount *mnt)
 {
-	return get_sb_single(fs_type, flags, data, cpuset_fill_super);
+	return get_sb_single(fs_type, flags, data, cpuset_fill_super, mnt);
 }
 
 static struct file_system_type cpuset_fs_type = {
diff --git a/kernel/futex.c b/kernel/futex.c
index 5699c512057b..e1a380c77a5a 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1056,11 +1056,11 @@ asmlinkage long sys_futex(u32 __user *uaddr, int op, int val,
 			(unsigned long)uaddr2, val2, val3);
 }
 
-static struct super_block *
-futexfs_get_sb(struct file_system_type *fs_type,
-	       int flags, const char *dev_name, void *data)
+static int futexfs_get_sb(struct file_system_type *fs_type,
+			  int flags, const char *dev_name, void *data,
+			  struct vfsmount *mnt)
 {
-	return get_sb_pseudo(fs_type, "futex", NULL, 0xBAD1DEA);
+	return get_sb_pseudo(fs_type, "futex", NULL, 0xBAD1DEA, mnt);
 }
 
 static struct file_system_type futex_fs_type = {
diff --git a/mm/shmem.c b/mm/shmem.c
index 1e43c8a865ba..7617bb1c6bf7 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2233,10 +2233,10 @@ static struct vm_operations_struct shmem_vm_ops = {
 };
 
 
-static struct super_block *shmem_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int shmem_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_nodev(fs_type, flags, data, shmem_fill_super);
+	return get_sb_nodev(fs_type, flags, data, shmem_fill_super, mnt);
 }
 
 static struct file_system_type tmpfs_fs_type = {
diff --git a/net/socket.c b/net/socket.c
index 02948b622bd2..565f5e8d1191 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -335,10 +335,11 @@ static struct super_operations sockfs_ops = {
 	.statfs =	simple_statfs,
 };
 
-static struct super_block *sockfs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int sockfs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_pseudo(fs_type, "socket:", &sockfs_ops, SOCKFS_MAGIC);
+	return get_sb_pseudo(fs_type, "socket:", &sockfs_ops, SOCKFS_MAGIC,
+			     mnt);
 }
 
 static struct vfsmount *sock_mnt __read_mostly;
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index cc673dd8433f..8241fa726803 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -815,11 +815,11 @@ out:
 	return -ENOMEM;
 }
 
-static struct super_block *
+static int
 rpc_get_sb(struct file_system_type *fs_type,
-		int flags, const char *dev_name, void *data)
+		int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_single(fs_type, flags, data, rpc_fill_super);
+	return get_sb_single(fs_type, flags, data, rpc_fill_super, mnt);
 }
 
 static struct file_system_type rpc_pipe_fs_type = {
diff --git a/security/inode.c b/security/inode.c
index 0f77b0223662..e6fc29ac8564 100644
--- a/security/inode.c
+++ b/security/inode.c
@@ -135,11 +135,11 @@ static int fill_super(struct super_block *sb, void *data, int silent)
 	return simple_fill_super(sb, SECURITYFS_MAGIC, files);
 }
 
-static struct super_block *get_sb(struct file_system_type *fs_type,
-				        int flags, const char *dev_name,
-					void *data)
+static int get_sb(struct file_system_type *fs_type,
+		  int flags, const char *dev_name,
+		  void *data, struct vfsmount *mnt)
 {
-	return get_sb_single(fs_type, flags, data, fill_super);
+	return get_sb_single(fs_type, flags, data, fill_super, mnt);
 }
 
 static struct file_system_type fs_type = {
diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c
index 2e73d3279f2d..7029bbc9bef8 100644
--- a/security/selinux/selinuxfs.c
+++ b/security/selinux/selinuxfs.c
@@ -1345,10 +1345,11 @@ err:
 	goto out;
 }
 
-static struct super_block *sel_get_sb(struct file_system_type *fs_type,
-				      int flags, const char *dev_name, void *data)
+static int sel_get_sb(struct file_system_type *fs_type,
+		      int flags, const char *dev_name, void *data,
+		      struct vfsmount *mnt)
 {
-	return get_sb_single(fs_type, flags, data, sel_fill_super);
+	return get_sb_single(fs_type, flags, data, sel_fill_super, mnt);
 }
 
 static struct file_system_type sel_fs_type = {
-- 
cgit v1.2.3


From 726c334223180e3c0197cc980a432681370d4baf Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 23 Jun 2006 02:02:58 -0700
Subject: [PATCH] VFS: Permit filesystem to perform statfs with a known root
 dentry

Give the statfs superblock operation a dentry pointer rather than a superblock
pointer.

This complements the get_sb() patch.  That reduced the significance of
sb->s_root, allowing NFS to place a fake root there.  However, NFS does
require a dentry to use as a target for the statfs operation.  This permits
the root in the vfsmount to be used instead.

linux/mount.h has been added where necessary to make allyesconfig build
successfully.

Interest has also been expressed for use with the FUSE and XFS filesystems.

Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: Al Viro <viro@zeniv.linux.org.uk>
Cc: Nathan Scott <nathans@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/filesystems/Locking |  2 +-
 Documentation/filesystems/vfs.txt |  2 +-
 arch/alpha/kernel/osf_sys.c       |  2 +-
 arch/mips/kernel/sysirix.c        | 12 ++++++------
 arch/parisc/hpux/sys_hpux.c       | 10 +++++-----
 arch/sparc64/solaris/fs.c         |  4 ++--
 fs/adfs/super.c                   |  8 ++++----
 fs/affs/super.c                   |  5 +++--
 fs/befs/linuxvfs.c                |  5 +++--
 fs/bfs/inode.c                    |  3 ++-
 fs/cifs/cifsfs.c                  |  3 ++-
 fs/coda/inode.c                   |  6 +++---
 fs/coda/upcall.c                  |  4 ++--
 fs/compat.c                       |  8 ++++----
 fs/cramfs/inode.c                 |  4 +++-
 fs/efs/super.c                    |  6 +++---
 fs/ext2/super.c                   |  5 +++--
 fs/ext3/super.c                   |  5 +++--
 fs/fat/inode.c                    |  8 ++++----
 fs/freevxfs/vxfs_super.c          | 13 +++++++------
 fs/fuse/inode.c                   |  3 ++-
 fs/hfs/super.c                    |  4 +++-
 fs/hfsplus/super.c                |  4 +++-
 fs/hostfs/hostfs_kern.c           |  4 ++--
 fs/hpfs/super.c                   |  3 ++-
 fs/hppfs/hppfs_kern.c             |  2 +-
 fs/hugetlbfs/inode.c              |  4 ++--
 fs/isofs/inode.c                  |  6 ++++--
 fs/jffs/inode-v23.c               |  4 ++--
 fs/jffs2/fs.c                     |  4 ++--
 fs/jffs2/os-linux.h               |  2 +-
 fs/jfs/super.c                    |  4 ++--
 fs/libfs.c                        |  4 ++--
 fs/minix/inode.c                  | 10 +++++-----
 fs/ncpfs/inode.c                  |  5 +++--
 fs/nfs/inode.c                    |  5 +++--
 fs/nfsd/nfs4xdr.c                 |  2 +-
 fs/nfsd/vfs.c                     |  2 +-
 fs/ntfs/super.c                   |  7 ++++---
 fs/ocfs2/super.c                  | 10 +++++-----
 fs/open.c                         | 26 +++++++++++++-------------
 fs/qnx4/inode.c                   |  6 ++++--
 fs/reiserfs/super.c               |  8 ++++----
 fs/romfs/inode.c                  |  4 ++--
 fs/smbfs/inode.c                  |  6 +++---
 fs/smbfs/proc.c                   |  4 ++--
 fs/smbfs/proto.h                  |  2 +-
 fs/super.c                        |  2 +-
 fs/sysv/inode.c                   |  3 ++-
 fs/udf/super.c                    |  6 ++++--
 fs/ufs/super.c                    |  3 ++-
 fs/xfs/linux-2.6/xfs_super.c      |  4 ++--
 include/linux/coda_psdev.h        |  2 +-
 include/linux/fs.h                |  6 +++---
 include/linux/mount.h             |  5 +++++
 include/linux/security.h          | 14 +++++++-------
 kernel/acct.c                     |  2 +-
 mm/shmem.c                        |  4 ++--
 security/dummy.c                  |  2 +-
 security/selinux/hooks.c          |  6 +++---
 60 files changed, 175 insertions(+), 144 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index 3abf08f1b14a..d31efbbdfe50 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -99,7 +99,7 @@ prototypes:
 	int (*sync_fs)(struct super_block *sb, int wait);
 	void (*write_super_lockfs) (struct super_block *);
 	void (*unlockfs) (struct super_block *);
-	int (*statfs) (struct super_block *, struct kstatfs *);
+	int (*statfs) (struct dentry *, struct kstatfs *);
 	int (*remount_fs) (struct super_block *, int *, char *);
 	void (*clear_inode) (struct inode *);
 	void (*umount_begin) (struct super_block *);
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index dd7d0dcedc87..9d3aed628bc1 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -211,7 +211,7 @@ struct super_operations {
         int (*sync_fs)(struct super_block *sb, int wait);
         void (*write_super_lockfs) (struct super_block *);
         void (*unlockfs) (struct super_block *);
-        int (*statfs) (struct super_block *, struct kstatfs *);
+        int (*statfs) (struct dentry *, struct kstatfs *);
         int (*remount_fs) (struct super_block *, int *, char *);
         void (*clear_inode) (struct inode *);
         void (*umount_begin) (struct super_block *);
diff --git a/arch/alpha/kernel/osf_sys.c b/arch/alpha/kernel/osf_sys.c
index 31afe3d91ac6..e15dcf4f3dcd 100644
--- a/arch/alpha/kernel/osf_sys.c
+++ b/arch/alpha/kernel/osf_sys.c
@@ -244,7 +244,7 @@ do_osf_statfs(struct dentry * dentry, struct osf_statfs __user *buffer,
 	      unsigned long bufsiz)
 {
 	struct kstatfs linux_stat;
-	int error = vfs_statfs(dentry->d_inode->i_sb, &linux_stat);
+	int error = vfs_statfs(dentry, &linux_stat);
 	if (!error)
 		error = linux_to_osf_statfs(&linux_stat, buffer, bufsiz);
 	return error;	
diff --git a/arch/mips/kernel/sysirix.c b/arch/mips/kernel/sysirix.c
index 5407b784cd01..19e1ef43eb4b 100644
--- a/arch/mips/kernel/sysirix.c
+++ b/arch/mips/kernel/sysirix.c
@@ -694,7 +694,7 @@ asmlinkage int irix_statfs(const char __user *path,
 	if (error)
 		goto out;
 
-	error = vfs_statfs(nd.dentry->d_inode->i_sb, &kbuf);
+	error = vfs_statfs(nd.dentry, &kbuf);
 	if (error)
 		goto dput_and_out;
 
@@ -732,7 +732,7 @@ asmlinkage int irix_fstatfs(unsigned int fd, struct irix_statfs __user *buf)
 		goto out;
 	}
 
-	error = vfs_statfs(file->f_dentry->d_inode->i_sb, &kbuf);
+	error = vfs_statfs(file->f_dentry, &kbuf);
 	if (error)
 		goto out_f;
 
@@ -1360,7 +1360,7 @@ asmlinkage int irix_statvfs(char __user *fname, struct irix_statvfs __user *buf)
 	error = user_path_walk(fname, &nd);
 	if (error)
 		goto out;
-	error = vfs_statfs(nd.dentry->d_inode->i_sb, &kbuf);
+	error = vfs_statfs(nd.dentry, &kbuf);
 	if (error)
 		goto dput_and_out;
 
@@ -1406,7 +1406,7 @@ asmlinkage int irix_fstatvfs(int fd, struct irix_statvfs __user *buf)
 		error = -EBADF;
 		goto out;
 	}
-	error = vfs_statfs(file->f_dentry->d_inode->i_sb, &kbuf);
+	error = vfs_statfs(file->f_dentry, &kbuf);
 	if (error)
 		goto out_f;
 
@@ -1611,7 +1611,7 @@ asmlinkage int irix_statvfs64(char __user *fname, struct irix_statvfs64 __user *
 	error = user_path_walk(fname, &nd);
 	if (error)
 		goto out;
-	error = vfs_statfs(nd.dentry->d_inode->i_sb, &kbuf);
+	error = vfs_statfs(nd.dentry, &kbuf);
 	if (error)
 		goto dput_and_out;
 
@@ -1658,7 +1658,7 @@ asmlinkage int irix_fstatvfs64(int fd, struct irix_statvfs __user *buf)
 		error = -EBADF;
 		goto out;
 	}
-	error = vfs_statfs(file->f_dentry->d_inode->i_sb, &kbuf);
+	error = vfs_statfs(file->f_dentry, &kbuf);
 	if (error)
 		goto out_f;
 
diff --git a/arch/parisc/hpux/sys_hpux.c b/arch/parisc/hpux/sys_hpux.c
index 05273ccced0e..cb69727027ae 100644
--- a/arch/parisc/hpux/sys_hpux.c
+++ b/arch/parisc/hpux/sys_hpux.c
@@ -145,7 +145,7 @@ static int hpux_ustat(dev_t dev, struct hpux_ustat __user *ubuf)
 	s = user_get_super(dev);
 	if (s == NULL)
 		goto out;
-	err = vfs_statfs(s, &sbuf);
+	err = vfs_statfs(s->s_root, &sbuf);
 	drop_super(s);
 	if (err)
 		goto out;
@@ -186,12 +186,12 @@ struct hpux_statfs {
      int16_t f_pad;
 };
 
-static int vfs_statfs_hpux(struct super_block *sb, struct hpux_statfs *buf)
+static int vfs_statfs_hpux(struct dentry *dentry, struct hpux_statfs *buf)
 {
 	struct kstatfs st;
 	int retval;
 	
-	retval = vfs_statfs(sb, &st);
+	retval = vfs_statfs(dentry, &st);
 	if (retval)
 		return retval;
 
@@ -219,7 +219,7 @@ asmlinkage long hpux_statfs(const char __user *path,
 	error = user_path_walk(path, &nd);
 	if (!error) {
 		struct hpux_statfs tmp;
-		error = vfs_statfs_hpux(nd.dentry->d_inode->i_sb, &tmp);
+		error = vfs_statfs_hpux(nd.dentry, &tmp);
 		if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
 			error = -EFAULT;
 		path_release(&nd);
@@ -237,7 +237,7 @@ asmlinkage long hpux_fstatfs(unsigned int fd, struct hpux_statfs __user * buf)
 	file = fget(fd);
 	if (!file)
 		goto out;
-	error = vfs_statfs_hpux(file->f_dentry->d_inode->i_sb, &tmp);
+	error = vfs_statfs_hpux(file->f_dentry, &tmp);
 	if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
 		error = -EFAULT;
 	fput(file);
diff --git a/arch/sparc64/solaris/fs.c b/arch/sparc64/solaris/fs.c
index 4885ca6cbc77..0f0eb6aa1c40 100644
--- a/arch/sparc64/solaris/fs.c
+++ b/arch/sparc64/solaris/fs.c
@@ -356,7 +356,7 @@ static int report_statvfs(struct vfsmount *mnt, struct inode *inode, u32 buf)
 	int error;
 	struct sol_statvfs __user *ss = A(buf);
 
-	error = vfs_statfs(mnt->mnt_sb, &s);
+	error = vfs_statfs(mnt->mnt_root, &s);
 	if (!error) {
 		const char *p = mnt->mnt_sb->s_type->name;
 		int i = 0;
@@ -392,7 +392,7 @@ static int report_statvfs64(struct vfsmount *mnt, struct inode *inode, u32 buf)
 	int error;
 	struct sol_statvfs64 __user *ss = A(buf);
 			
-	error = vfs_statfs(mnt->mnt_sb, &s);
+	error = vfs_statfs(mnt->mnt_root, &s);
 	if (!error) {
 		const char *p = mnt->mnt_sb->s_type->name;
 		int i = 0;
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index 1b58a9b7f6aa..ba1c88af49fe 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -196,17 +196,17 @@ static int adfs_remount(struct super_block *sb, int *flags, char *data)
 	return parse_options(sb, data);
 }
 
-static int adfs_statfs(struct super_block *sb, struct kstatfs *buf)
+static int adfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
-	struct adfs_sb_info *asb = ADFS_SB(sb);
+	struct adfs_sb_info *asb = ADFS_SB(dentry->d_sb);
 
 	buf->f_type    = ADFS_SUPER_MAGIC;
 	buf->f_namelen = asb->s_namelen;
-	buf->f_bsize   = sb->s_blocksize;
+	buf->f_bsize   = dentry->d_sb->s_blocksize;
 	buf->f_blocks  = asb->s_size;
 	buf->f_files   = asb->s_ids_per_zone * asb->s_map_size;
 	buf->f_bavail  =
-	buf->f_bfree   = adfs_map_free(sb);
+	buf->f_bfree   = adfs_map_free(dentry->d_sb);
 	buf->f_ffree   = (long)(buf->f_bfree * buf->f_files) / (long)buf->f_blocks;
 
 	return 0;
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 6a52e7875403..8765cba35bb9 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -18,7 +18,7 @@
 
 extern struct timezone sys_tz;
 
-static int affs_statfs(struct super_block *sb, struct kstatfs *buf);
+static int affs_statfs(struct dentry *dentry, struct kstatfs *buf);
 static int affs_remount (struct super_block *sb, int *flags, char *data);
 
 static void
@@ -508,8 +508,9 @@ affs_remount(struct super_block *sb, int *flags, char *data)
 }
 
 static int
-affs_statfs(struct super_block *sb, struct kstatfs *buf)
+affs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
+	struct super_block *sb = dentry->d_sb;
 	int		 free;
 
 	pr_debug("AFFS: statfs() partsize=%d, reserved=%d\n",AFFS_SB(sb)->s_partition_size,
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 6ed07a5f10c6..08201fab26cd 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -49,7 +49,7 @@ static int befs_nls2utf(struct super_block *sb, const char *in, int in_len,
 			char **out, int *out_len);
 static void befs_put_super(struct super_block *);
 static int befs_remount(struct super_block *, int *, char *);
-static int befs_statfs(struct super_block *, struct kstatfs *);
+static int befs_statfs(struct dentry *, struct kstatfs *);
 static int parse_options(char *, befs_mount_options *);
 
 static const struct super_operations befs_sops = {
@@ -880,8 +880,9 @@ befs_remount(struct super_block *sb, int *flags, char *data)
 }
 
 static int
-befs_statfs(struct super_block *sb, struct kstatfs *buf)
+befs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
+	struct super_block *sb = dentry->d_sb;
 
 	befs_debug(sb, "---> befs_statfs()");
 
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index e7da03f63a5a..cf74f3d4d966 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -203,8 +203,9 @@ static void bfs_put_super(struct super_block *s)
 	s->s_fs_info = NULL;
 }
 
-static int bfs_statfs(struct super_block *s, struct kstatfs *buf)
+static int bfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
+	struct super_block *s = dentry->d_sb;
 	struct bfs_sb_info *info = BFS_SB(s);
 	u64 id = huge_encode_dev(s->s_bdev->bd_dev);
 	buf->f_type = BFS_MAGIC;
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 08b35801dfed..7520f4687158 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -166,8 +166,9 @@ cifs_put_super(struct super_block *sb)
 }
 
 static int
-cifs_statfs(struct super_block *sb, struct kstatfs *buf)
+cifs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
+	struct super_block *sb = dentry->d_sb;
 	int xid; 
 	int rc = -EOPNOTSUPP;
 	struct cifs_sb_info *cifs_sb;
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index cba70201567d..87f1dc8aa24b 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -36,7 +36,7 @@
 /* VFS super_block ops */
 static void coda_clear_inode(struct inode *);
 static void coda_put_super(struct super_block *);
-static int coda_statfs(struct super_block *sb, struct kstatfs *buf);
+static int coda_statfs(struct dentry *dentry, struct kstatfs *buf);
 
 static kmem_cache_t * coda_inode_cachep;
 
@@ -278,13 +278,13 @@ struct inode_operations coda_file_inode_operations = {
 	.setattr	= coda_setattr,
 };
 
-static int coda_statfs(struct super_block *sb, struct kstatfs *buf)
+static int coda_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	int error;
 	
 	lock_kernel();
 
-	error = venus_statfs(sb, buf);
+	error = venus_statfs(dentry, buf);
 
 	unlock_kernel();
 
diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c
index 1bae99650a91..b040eba13a7d 100644
--- a/fs/coda/upcall.c
+++ b/fs/coda/upcall.c
@@ -611,7 +611,7 @@ int venus_pioctl(struct super_block *sb, struct CodaFid *fid,
 	return error;
 }
 
-int venus_statfs(struct super_block *sb, struct kstatfs *sfs) 
+int venus_statfs(struct dentry *dentry, struct kstatfs *sfs)
 { 
         union inputArgs *inp;
         union outputArgs *outp;
@@ -620,7 +620,7 @@ int venus_statfs(struct super_block *sb, struct kstatfs *sfs)
 	insize = max_t(unsigned int, INSIZE(statfs), OUTSIZE(statfs));
 	UPARG(CODA_STATFS);
 
-        error = coda_upcall(coda_sbp(sb), insize, &outsize, inp);
+        error = coda_upcall(coda_sbp(dentry->d_sb), insize, &outsize, inp);
 	
         if (!error) {
 		sfs->f_blocks = outp->coda_statfs.stat.f_blocks;
diff --git a/fs/compat.c b/fs/compat.c
index b1f64786a613..7e7e5bc4f3cf 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -197,7 +197,7 @@ asmlinkage long compat_sys_statfs(const char __user *path, struct compat_statfs
 	error = user_path_walk(path, &nd);
 	if (!error) {
 		struct kstatfs tmp;
-		error = vfs_statfs(nd.dentry->d_inode->i_sb, &tmp);
+		error = vfs_statfs(nd.dentry, &tmp);
 		if (!error)
 			error = put_compat_statfs(buf, &tmp);
 		path_release(&nd);
@@ -215,7 +215,7 @@ asmlinkage long compat_sys_fstatfs(unsigned int fd, struct compat_statfs __user
 	file = fget(fd);
 	if (!file)
 		goto out;
-	error = vfs_statfs(file->f_dentry->d_inode->i_sb, &tmp);
+	error = vfs_statfs(file->f_dentry, &tmp);
 	if (!error)
 		error = put_compat_statfs(buf, &tmp);
 	fput(file);
@@ -265,7 +265,7 @@ asmlinkage long compat_sys_statfs64(const char __user *path, compat_size_t sz, s
 	error = user_path_walk(path, &nd);
 	if (!error) {
 		struct kstatfs tmp;
-		error = vfs_statfs(nd.dentry->d_inode->i_sb, &tmp);
+		error = vfs_statfs(nd.dentry, &tmp);
 		if (!error)
 			error = put_compat_statfs64(buf, &tmp);
 		path_release(&nd);
@@ -286,7 +286,7 @@ asmlinkage long compat_sys_fstatfs64(unsigned int fd, compat_size_t sz, struct c
 	file = fget(fd);
 	if (!file)
 		goto out;
-	error = vfs_statfs(file->f_dentry->d_inode->i_sb, &tmp);
+	error = vfs_statfs(file->f_dentry, &tmp);
 	if (!error)
 		error = put_compat_statfs64(buf, &tmp);
 	fput(file);
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index 37a91a153aa5..8a9d5d3b3262 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -322,8 +322,10 @@ out:
 	return -EINVAL;
 }
 
-static int cramfs_statfs(struct super_block *sb, struct kstatfs *buf)
+static int cramfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
+	struct super_block *sb = dentry->d_sb;
+
 	buf->f_type = CRAMFS_MAGIC;
 	buf->f_bsize = PAGE_CACHE_SIZE;
 	buf->f_blocks = CRAMFS_SB(sb)->blocks;
diff --git a/fs/efs/super.c b/fs/efs/super.c
index 1ba5e14f879f..8ac2462ae5dd 100644
--- a/fs/efs/super.c
+++ b/fs/efs/super.c
@@ -15,7 +15,7 @@
 #include <linux/buffer_head.h>
 #include <linux/vfs.h>
 
-static int efs_statfs(struct super_block *s, struct kstatfs *buf);
+static int efs_statfs(struct dentry *dentry, struct kstatfs *buf);
 static int efs_fill_super(struct super_block *s, void *d, int silent);
 
 static int efs_get_sb(struct file_system_type *fs_type,
@@ -322,8 +322,8 @@ out_no_fs:
 	return -EINVAL;
 }
 
-static int efs_statfs(struct super_block *s, struct kstatfs *buf) {
-	struct efs_sb_info *sb = SUPER_INFO(s);
+static int efs_statfs(struct dentry *dentry, struct kstatfs *buf) {
+	struct efs_sb_info *sb = SUPER_INFO(dentry->d_sb);
 
 	buf->f_type    = EFS_SUPER_MAGIC;	/* efs magic number */
 	buf->f_bsize   = EFS_BLOCKSIZE;		/* blocksize */
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index a4dfffac5967..a6c4d6e02324 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -39,7 +39,7 @@
 static void ext2_sync_super(struct super_block *sb,
 			    struct ext2_super_block *es);
 static int ext2_remount (struct super_block * sb, int * flags, char * data);
-static int ext2_statfs (struct super_block * sb, struct kstatfs * buf);
+static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf);
 
 void ext2_error (struct super_block * sb, const char * function,
 		 const char * fmt, ...)
@@ -1038,8 +1038,9 @@ restore_opts:
 	return err;
 }
 
-static int ext2_statfs (struct super_block * sb, struct kstatfs * buf)
+static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf)
 {
+	struct super_block *sb = dentry->d_sb;
 	struct ext2_sb_info *sbi = EXT2_SB(sb);
 	unsigned long overhead;
 	int i;
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 657f8e73b62f..e08b6439617c 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -58,7 +58,7 @@ static int ext3_sync_fs(struct super_block *sb, int wait);
 static const char *ext3_decode_error(struct super_block * sb, int errno,
 				     char nbuf[16]);
 static int ext3_remount (struct super_block * sb, int * flags, char * data);
-static int ext3_statfs (struct super_block * sb, struct kstatfs * buf);
+static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf);
 static void ext3_unlockfs(struct super_block *sb);
 static void ext3_write_super (struct super_block * sb);
 static void ext3_write_super_lockfs(struct super_block *sb);
@@ -2318,8 +2318,9 @@ restore_opts:
 	return err;
 }
 
-static int ext3_statfs (struct super_block * sb, struct kstatfs * buf)
+static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf)
 {
+	struct super_block *sb = dentry->d_sb;
 	struct ext3_sb_info *sbi = EXT3_SB(sb);
 	struct ext3_super_block *es = sbi->s_es;
 	unsigned long overhead;
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index c1ce284f8a94..7c35d582ec10 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -539,18 +539,18 @@ static int fat_remount(struct super_block *sb, int *flags, char *data)
 	return 0;
 }
 
-static int fat_statfs(struct super_block *sb, struct kstatfs *buf)
+static int fat_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
-	struct msdos_sb_info *sbi = MSDOS_SB(sb);
+	struct msdos_sb_info *sbi = MSDOS_SB(dentry->d_sb);
 
 	/* If the count of free cluster is still unknown, counts it here. */
 	if (sbi->free_clusters == -1) {
-		int err = fat_count_free_clusters(sb);
+		int err = fat_count_free_clusters(dentry->d_sb);
 		if (err)
 			return err;
 	}
 
-	buf->f_type = sb->s_magic;
+	buf->f_type = dentry->d_sb->s_magic;
 	buf->f_bsize = sbi->cluster_size;
 	buf->f_blocks = sbi->max_cluster - FAT_START_ENT;
 	buf->f_bfree = sbi->free_clusters;
diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c
index d76eeaafbde2..b74b791fc23b 100644
--- a/fs/freevxfs/vxfs_super.c
+++ b/fs/freevxfs/vxfs_super.c
@@ -40,6 +40,7 @@
 #include <linux/slab.h>
 #include <linux/stat.h>
 #include <linux/vfs.h>
+#include <linux/mount.h>
 
 #include "vxfs.h"
 #include "vxfs_extern.h"
@@ -55,7 +56,7 @@ MODULE_ALIAS("vxfs"); /* makes mount -t vxfs autoload the module */
 
 
 static void		vxfs_put_super(struct super_block *);
-static int		vxfs_statfs(struct super_block *, struct kstatfs *);
+static int		vxfs_statfs(struct dentry *, struct kstatfs *);
 static int		vxfs_remount(struct super_block *, int *, char *);
 
 static struct super_operations vxfs_super_ops = {
@@ -90,12 +91,12 @@ vxfs_put_super(struct super_block *sbp)
 
 /**
  * vxfs_statfs - get filesystem information
- * @sbp:	VFS superblock
+ * @dentry:	VFS dentry to locate superblock
  * @bufp:	output buffer
  *
  * Description:
  *   vxfs_statfs fills the statfs buffer @bufp with information
- *   about the filesystem described by @sbp.
+ *   about the filesystem described by @dentry.
  *
  * Returns:
  *   Zero.
@@ -107,12 +108,12 @@ vxfs_put_super(struct super_block *sbp)
  *   This is everything but complete...
  */
 static int
-vxfs_statfs(struct super_block *sbp, struct kstatfs *bufp)
+vxfs_statfs(struct dentry *dentry, struct kstatfs *bufp)
 {
-	struct vxfs_sb_info		*infp = VXFS_SBI(sbp);
+	struct vxfs_sb_info		*infp = VXFS_SBI(dentry->d_sb);
 
 	bufp->f_type = VXFS_SUPER_MAGIC;
-	bufp->f_bsize = sbp->s_blocksize;
+	bufp->f_bsize = dentry->d_sb->s_blocksize;
 	bufp->f_blocks = infp->vsi_raw->vs_dsize;
 	bufp->f_bfree = infp->vsi_raw->vs_free;
 	bufp->f_bavail = 0;
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index c91f0a50aadb..a13c0f529058 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -236,8 +236,9 @@ static void convert_fuse_statfs(struct kstatfs *stbuf, struct fuse_kstatfs *attr
 	/* fsid is left zero */
 }
 
-static int fuse_statfs(struct super_block *sb, struct kstatfs *buf)
+static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
+	struct super_block *sb = dentry->d_sb;
 	struct fuse_conn *fc = get_fuse_conn_super(sb);
 	struct fuse_req *req;
 	struct fuse_statfs_out outarg;
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index ee5b80a409e8..d9227bf14e86 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -80,8 +80,10 @@ static void hfs_put_super(struct super_block *sb)
  *
  * changed f_files/f_ffree to reflect the fs_ablock/free_ablocks.
  */
-static int hfs_statfs(struct super_block *sb, struct kstatfs *buf)
+static int hfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
+	struct super_block *sb = dentry->d_sb;
+
 	buf->f_type = HFS_SUPER_MAGIC;
 	buf->f_bsize = sb->s_blocksize;
 	buf->f_blocks = (u32)HFS_SB(sb)->fs_ablocks * HFS_SB(sb)->fs_div;
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 0ed8b7e8e87f..0a92fa2336a2 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -212,8 +212,10 @@ static void hfsplus_put_super(struct super_block *sb)
 	sb->s_fs_info = NULL;
 }
 
-static int hfsplus_statfs(struct super_block *sb, struct kstatfs *buf)
+static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
+	struct super_block *sb = dentry->d_sb;
+
 	buf->f_type = HFSPLUS_SUPER_MAGIC;
 	buf->f_bsize = sb->s_blocksize;
 	buf->f_blocks = HFSPLUS_SB(sb).total_blocks << HFSPLUS_SB(sb).fs_shift;
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 04035e08f5c1..8e0d37743e7c 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -239,7 +239,7 @@ static int read_inode(struct inode *ino)
 	return(err);
 }
 
-int hostfs_statfs(struct super_block *sb, struct kstatfs *sf)
+int hostfs_statfs(struct dentry *dentry, struct kstatfs *sf)
 {
 	/* do_statfs uses struct statfs64 internally, but the linux kernel
 	 * struct statfs still has 32-bit versions for most of these fields,
@@ -252,7 +252,7 @@ int hostfs_statfs(struct super_block *sb, struct kstatfs *sf)
 	long long f_files;
 	long long f_ffree;
 
-	err = do_statfs(HOSTFS_I(sb->s_root->d_inode)->host_filename,
+	err = do_statfs(HOSTFS_I(dentry->d_sb->s_root->d_inode)->host_filename,
 			&sf->f_bsize, &f_blocks, &f_bfree, &f_bavail, &f_files,
 			&f_ffree, &sf->f_fsid, sizeof(sf->f_fsid),
 			&sf->f_namelen, sf->f_spare);
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index 3b25cf3e2e65..f798480a363f 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -135,8 +135,9 @@ static unsigned count_bitmaps(struct super_block *s)
 	return count;
 }
 
-static int hpfs_statfs(struct super_block *s, struct kstatfs *buf)
+static int hpfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
+	struct super_block *s = dentry->d_sb;
 	struct hpfs_sb_info *sbi = hpfs_sb(s);
 	lock_kernel();
 
diff --git a/fs/hppfs/hppfs_kern.c b/fs/hppfs/hppfs_kern.c
index ec43c22bc9c0..3a9bdf58166f 100644
--- a/fs/hppfs/hppfs_kern.c
+++ b/fs/hppfs/hppfs_kern.c
@@ -616,7 +616,7 @@ static const struct file_operations hppfs_dir_fops = {
 	.fsync		= hppfs_fsync,
 };
 
-static int hppfs_statfs(struct super_block *sb, struct kstatfs *sf)
+static int hppfs_statfs(struct dentry *dentry, struct kstatfs *sf)
 {
 	sf->f_blocks = 0;
 	sf->f_bfree = 0;
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 4665c26171f7..678fc72c3646 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -467,9 +467,9 @@ static int hugetlbfs_set_page_dirty(struct page *page)
 	return 0;
 }
 
-static int hugetlbfs_statfs(struct super_block *sb, struct kstatfs *buf)
+static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
-	struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(sb);
+	struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb);
 
 	buf->f_type = HUGETLBFS_MAGIC;
 	buf->f_bsize = HPAGE_SIZE;
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 17268da63a49..3f9c8ba1fa1f 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -56,7 +56,7 @@ static void isofs_put_super(struct super_block *sb)
 }
 
 static void isofs_read_inode(struct inode *);
-static int isofs_statfs (struct super_block *, struct kstatfs *);
+static int isofs_statfs (struct dentry *, struct kstatfs *);
 
 static kmem_cache_t *isofs_inode_cachep;
 
@@ -901,8 +901,10 @@ out_freesbi:
 	return -EINVAL;
 }
 
-static int isofs_statfs (struct super_block *sb, struct kstatfs *buf)
+static int isofs_statfs (struct dentry *dentry, struct kstatfs *buf)
 {
+	struct super_block *sb = dentry->d_sb;
+
 	buf->f_type = ISOFS_SUPER_MAGIC;
 	buf->f_bsize = sb->s_blocksize;
 	buf->f_blocks = (ISOFS_SB(sb)->s_nzones
diff --git a/fs/jffs/inode-v23.c b/fs/jffs/inode-v23.c
index dd93a091ad67..9e46ea6da752 100644
--- a/fs/jffs/inode-v23.c
+++ b/fs/jffs/inode-v23.c
@@ -377,9 +377,9 @@ jffs_new_inode(const struct inode * dir, struct jffs_raw_inode *raw_inode,
 
 /* Get statistics of the file system.  */
 static int
-jffs_statfs(struct super_block *sb, struct kstatfs *buf)
+jffs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
-	struct jffs_control *c = (struct jffs_control *) sb->s_fs_info;
+	struct jffs_control *c = (struct jffs_control *) dentry->d_sb->s_fs_info;
 	struct jffs_fmcontrol *fmc;
 
 	lock_kernel();
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index 7b6c24b14f85..2900ec3ec3af 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -192,9 +192,9 @@ int jffs2_setattr(struct dentry *dentry, struct iattr *iattr)
 	return rc;
 }
 
-int jffs2_statfs(struct super_block *sb, struct kstatfs *buf)
+int jffs2_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
-	struct jffs2_sb_info *c = JFFS2_SB_INFO(sb);
+	struct jffs2_sb_info *c = JFFS2_SB_INFO(dentry->d_sb);
 	unsigned long avail;
 
 	buf->f_type = JFFS2_SUPER_MAGIC;
diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h
index cd4021bcb944..6b5223565405 100644
--- a/fs/jffs2/os-linux.h
+++ b/fs/jffs2/os-linux.h
@@ -175,7 +175,7 @@ void jffs2_clear_inode (struct inode *);
 void jffs2_dirty_inode(struct inode *inode);
 struct inode *jffs2_new_inode (struct inode *dir_i, int mode,
 			       struct jffs2_raw_inode *ri);
-int jffs2_statfs (struct super_block *, struct kstatfs *);
+int jffs2_statfs (struct dentry *, struct kstatfs *);
 void jffs2_write_super (struct super_block *);
 int jffs2_remount_fs (struct super_block *, int *, char *);
 int jffs2_do_fill_super(struct super_block *sb, void *data, int silent);
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 18a28137b90e..73d2aba084c6 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -139,9 +139,9 @@ static void jfs_destroy_inode(struct inode *inode)
 	kmem_cache_free(jfs_inode_cachep, ji);
 }
 
-static int jfs_statfs(struct super_block *sb, struct kstatfs *buf)
+static int jfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
-	struct jfs_sb_info *sbi = JFS_SBI(sb);
+	struct jfs_sb_info *sbi = JFS_SBI(dentry->d_sb);
 	s64 maxinodes;
 	struct inomap *imap = JFS_IP(sbi->ipimap)->i_imap;
 
diff --git a/fs/libfs.c b/fs/libfs.c
index 7d70efa46da9..1b1156381787 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -20,9 +20,9 @@ int simple_getattr(struct vfsmount *mnt, struct dentry *dentry,
 	return 0;
 }
 
-int simple_statfs(struct super_block *sb, struct kstatfs *buf)
+int simple_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
-	buf->f_type = sb->s_magic;
+	buf->f_type = dentry->d_sb->s_magic;
 	buf->f_bsize = PAGE_CACHE_SIZE;
 	buf->f_namelen = NAME_MAX;
 	return 0;
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index 14f24dfbfe30..a6fb509b7341 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -19,7 +19,7 @@
 
 static void minix_read_inode(struct inode * inode);
 static int minix_write_inode(struct inode * inode, int wait);
-static int minix_statfs(struct super_block *sb, struct kstatfs *buf);
+static int minix_statfs(struct dentry *dentry, struct kstatfs *buf);
 static int minix_remount (struct super_block * sb, int * flags, char * data);
 
 static void minix_delete_inode(struct inode *inode)
@@ -296,11 +296,11 @@ out_bad_sb:
 	return -EINVAL;
 }
 
-static int minix_statfs(struct super_block *sb, struct kstatfs *buf)
+static int minix_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
-	struct minix_sb_info *sbi = minix_sb(sb);
-	buf->f_type = sb->s_magic;
-	buf->f_bsize = sb->s_blocksize;
+	struct minix_sb_info *sbi = minix_sb(dentry->d_sb);
+	buf->f_type = dentry->d_sb->s_magic;
+	buf->f_bsize = dentry->d_sb->s_blocksize;
 	buf->f_blocks = (sbi->s_nzones - sbi->s_firstdatazone) << sbi->s_log_zone_size;
 	buf->f_bfree = minix_count_free_blocks(sbi);
 	buf->f_bavail = buf->f_bfree;
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index 8db033fab3fd..90d2ea28f333 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -39,7 +39,7 @@
 
 static void ncp_delete_inode(struct inode *);
 static void ncp_put_super(struct super_block *);
-static int  ncp_statfs(struct super_block *, struct kstatfs *);
+static int  ncp_statfs(struct dentry *, struct kstatfs *);
 
 static kmem_cache_t * ncp_inode_cachep;
 
@@ -724,13 +724,14 @@ static void ncp_put_super(struct super_block *sb)
 	kfree(server);
 }
 
-static int ncp_statfs(struct super_block *sb, struct kstatfs *buf)
+static int ncp_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	struct dentry* d;
 	struct inode* i;
 	struct ncp_inode_info* ni;
 	struct ncp_server* s;
 	struct ncp_volume_info vi;
+	struct super_block *sb = dentry->d_sb;
 	int err;
 	__u8 dh;
 	
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index ff645a961bc8..937fbfc381bb 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -65,7 +65,7 @@ static int nfs_write_inode(struct inode *,int);
 static void nfs_delete_inode(struct inode *);
 static void nfs_clear_inode(struct inode *);
 static void nfs_umount_begin(struct super_block *);
-static int  nfs_statfs(struct super_block *, struct kstatfs *);
+static int  nfs_statfs(struct dentry *, struct kstatfs *);
 static int  nfs_show_options(struct seq_file *, struct vfsmount *);
 static int  nfs_show_stats(struct seq_file *, struct vfsmount *);
 static void nfs_zap_acl_cache(struct inode *);
@@ -534,8 +534,9 @@ nfs_fill_super(struct super_block *sb, struct nfs_mount_data *data, int silent)
 }
 
 static int
-nfs_statfs(struct super_block *sb, struct kstatfs *buf)
+nfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
+	struct super_block *sb = dentry->d_sb;
 	struct nfs_server *server = NFS_SB(sb);
 	unsigned char blockbits;
 	unsigned long blockres;
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index de3998f15f10..5446a0861d1d 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -1310,7 +1310,7 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
 	if ((bmval0 & (FATTR4_WORD0_FILES_FREE | FATTR4_WORD0_FILES_TOTAL)) ||
 	    (bmval1 & (FATTR4_WORD1_SPACE_AVAIL | FATTR4_WORD1_SPACE_FREE |
 		       FATTR4_WORD1_SPACE_TOTAL))) {
-		status = vfs_statfs(dentry->d_inode->i_sb, &statfs);
+		status = vfs_statfs(dentry, &statfs);
 		if (status)
 			goto out_nfserr;
 	}
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 1d65f13f458c..245eaa1fb59b 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1737,7 +1737,7 @@ int
 nfsd_statfs(struct svc_rqst *rqstp, struct svc_fh *fhp, struct kstatfs *stat)
 {
 	int err = fh_verify(rqstp, fhp, 0, MAY_NOP);
-	if (!err && vfs_statfs(fhp->fh_dentry->d_inode->i_sb,stat))
+	if (!err && vfs_statfs(fhp->fh_dentry,stat))
 		err = nfserr_io;
 	return err;
 }
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index d5d5e969294f..0e14acea3f8b 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -2601,10 +2601,10 @@ static unsigned long __get_nr_free_mft_records(ntfs_volume *vol,
 
 /**
  * ntfs_statfs - return information about mounted NTFS volume
- * @sb:		super block of mounted volume
+ * @dentry:	dentry from mounted volume
  * @sfs:	statfs structure in which to return the information
  *
- * Return information about the mounted NTFS volume @sb in the statfs structure
+ * Return information about the mounted NTFS volume @dentry in the statfs structure
  * pointed to by @sfs (this is initialized with zeros before ntfs_statfs is
  * called). We interpret the values to be correct of the moment in time at
  * which we are called. Most values are variable otherwise and this isn't just
@@ -2617,8 +2617,9 @@ static unsigned long __get_nr_free_mft_records(ntfs_volume *vol,
  *
  * Return 0 on success or -errno on error.
  */
-static int ntfs_statfs(struct super_block *sb, struct kstatfs *sfs)
+static int ntfs_statfs(struct dentry *dentry, struct kstatfs *sfs)
 {
+	struct super_block *sb = dentry->d_sb;
 	s64 size;
 	ntfs_volume *vol = NTFS_SB(sb);
 	ntfs_inode *mft_ni = NTFS_I(vol->mft_ino);
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 788b8b50dc4c..cdf73393f094 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -100,7 +100,7 @@ static int ocfs2_initialize_mem_caches(void);
 static void ocfs2_free_mem_caches(void);
 static void ocfs2_delete_osb(struct ocfs2_super *osb);
 
-static int ocfs2_statfs(struct super_block *sb, struct kstatfs *buf);
+static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf);
 
 static int ocfs2_sync_fs(struct super_block *sb, int wait);
 
@@ -857,7 +857,7 @@ static void ocfs2_put_super(struct super_block *sb)
 	mlog_exit_void();
 }
 
-static int ocfs2_statfs(struct super_block *sb, struct kstatfs *buf)
+static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	struct ocfs2_super *osb;
 	u32 numbits, freebits;
@@ -866,9 +866,9 @@ static int ocfs2_statfs(struct super_block *sb, struct kstatfs *buf)
 	struct buffer_head *bh = NULL;
 	struct inode *inode = NULL;
 
-	mlog_entry("(%p, %p)\n", sb, buf);
+	mlog_entry("(%p, %p)\n", dentry->d_sb, buf);
 
-	osb = OCFS2_SB(sb);
+	osb = OCFS2_SB(dentry->d_sb);
 
 	inode = ocfs2_get_system_file_inode(osb,
 					    GLOBAL_BITMAP_SYSTEM_INODE,
@@ -891,7 +891,7 @@ static int ocfs2_statfs(struct super_block *sb, struct kstatfs *buf)
 	freebits = numbits - le32_to_cpu(bm_lock->id1.bitmap1.i_used);
 
 	buf->f_type = OCFS2_SUPER_MAGIC;
-	buf->f_bsize = sb->s_blocksize;
+	buf->f_bsize = dentry->d_sb->s_blocksize;
 	buf->f_namelen = OCFS2_MAX_FILENAME_LEN;
 	buf->f_blocks = ((sector_t) numbits) *
 			(osb->s_clustersize >> osb->sb->s_blocksize_bits);
diff --git a/fs/open.c b/fs/open.c
index 4f178acd4c09..a37ff861108f 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -31,18 +31,18 @@
 
 #include <asm/unistd.h>
 
-int vfs_statfs(struct super_block *sb, struct kstatfs *buf)
+int vfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	int retval = -ENODEV;
 
-	if (sb) {
+	if (dentry) {
 		retval = -ENOSYS;
-		if (sb->s_op->statfs) {
+		if (dentry->d_sb->s_op->statfs) {
 			memset(buf, 0, sizeof(*buf));
-			retval = security_sb_statfs(sb);
+			retval = security_sb_statfs(dentry);
 			if (retval)
 				return retval;
-			retval = sb->s_op->statfs(sb, buf);
+			retval = dentry->d_sb->s_op->statfs(dentry, buf);
 			if (retval == 0 && buf->f_frsize == 0)
 				buf->f_frsize = buf->f_bsize;
 		}
@@ -52,12 +52,12 @@ int vfs_statfs(struct super_block *sb, struct kstatfs *buf)
 
 EXPORT_SYMBOL(vfs_statfs);
 
-static int vfs_statfs_native(struct super_block *sb, struct statfs *buf)
+static int vfs_statfs_native(struct dentry *dentry, struct statfs *buf)
 {
 	struct kstatfs st;
 	int retval;
 
-	retval = vfs_statfs(sb, &st);
+	retval = vfs_statfs(dentry, &st);
 	if (retval)
 		return retval;
 
@@ -95,12 +95,12 @@ static int vfs_statfs_native(struct super_block *sb, struct statfs *buf)
 	return 0;
 }
 
-static int vfs_statfs64(struct super_block *sb, struct statfs64 *buf)
+static int vfs_statfs64(struct dentry *dentry, struct statfs64 *buf)
 {
 	struct kstatfs st;
 	int retval;
 
-	retval = vfs_statfs(sb, &st);
+	retval = vfs_statfs(dentry, &st);
 	if (retval)
 		return retval;
 
@@ -130,7 +130,7 @@ asmlinkage long sys_statfs(const char __user * path, struct statfs __user * buf)
 	error = user_path_walk(path, &nd);
 	if (!error) {
 		struct statfs tmp;
-		error = vfs_statfs_native(nd.dentry->d_inode->i_sb, &tmp);
+		error = vfs_statfs_native(nd.dentry, &tmp);
 		if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
 			error = -EFAULT;
 		path_release(&nd);
@@ -149,7 +149,7 @@ asmlinkage long sys_statfs64(const char __user *path, size_t sz, struct statfs64
 	error = user_path_walk(path, &nd);
 	if (!error) {
 		struct statfs64 tmp;
-		error = vfs_statfs64(nd.dentry->d_inode->i_sb, &tmp);
+		error = vfs_statfs64(nd.dentry, &tmp);
 		if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
 			error = -EFAULT;
 		path_release(&nd);
@@ -168,7 +168,7 @@ asmlinkage long sys_fstatfs(unsigned int fd, struct statfs __user * buf)
 	file = fget(fd);
 	if (!file)
 		goto out;
-	error = vfs_statfs_native(file->f_dentry->d_inode->i_sb, &tmp);
+	error = vfs_statfs_native(file->f_dentry, &tmp);
 	if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
 		error = -EFAULT;
 	fput(file);
@@ -189,7 +189,7 @@ asmlinkage long sys_fstatfs64(unsigned int fd, size_t sz, struct statfs64 __user
 	file = fget(fd);
 	if (!file)
 		goto out;
-	error = vfs_statfs64(file->f_dentry->d_inode->i_sb, &tmp);
+	error = vfs_statfs64(file->f_dentry, &tmp);
 	if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
 		error = -EFAULT;
 	fput(file);
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index e6cca5cd4b44..2f24c46f72a1 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -128,7 +128,7 @@ static struct inode *qnx4_alloc_inode(struct super_block *sb);
 static void qnx4_destroy_inode(struct inode *inode);
 static void qnx4_read_inode(struct inode *);
 static int qnx4_remount(struct super_block *sb, int *flags, char *data);
-static int qnx4_statfs(struct super_block *, struct kstatfs *);
+static int qnx4_statfs(struct dentry *, struct kstatfs *);
 
 static struct super_operations qnx4_sops =
 {
@@ -282,8 +282,10 @@ unsigned long qnx4_block_map( struct inode *inode, long iblock )
 	return block;
 }
 
-static int qnx4_statfs(struct super_block *sb, struct kstatfs *buf)
+static int qnx4_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
+	struct super_block *sb = dentry->d_sb;
+
 	lock_kernel();
 
 	buf->f_type    = sb->s_magic;
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index f3ff41d33989..00f1321e9209 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -60,7 +60,7 @@ static int is_any_reiserfs_magic_string(struct reiserfs_super_block *rs)
 }
 
 static int reiserfs_remount(struct super_block *s, int *flags, char *data);
-static int reiserfs_statfs(struct super_block *s, struct kstatfs *buf);
+static int reiserfs_statfs(struct dentry *dentry, struct kstatfs *buf);
 
 static int reiserfs_sync_fs(struct super_block *s, int wait)
 {
@@ -1938,15 +1938,15 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
 	return errval;
 }
 
-static int reiserfs_statfs(struct super_block *s, struct kstatfs *buf)
+static int reiserfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
-	struct reiserfs_super_block *rs = SB_DISK_SUPER_BLOCK(s);
+	struct reiserfs_super_block *rs = SB_DISK_SUPER_BLOCK(dentry->d_sb);
 
 	buf->f_namelen = (REISERFS_MAX_NAME(s->s_blocksize));
 	buf->f_bfree = sb_free_blocks(rs);
 	buf->f_bavail = buf->f_bfree;
 	buf->f_blocks = sb_block_count(rs) - sb_bmap_nr(rs) - 1;
-	buf->f_bsize = s->s_blocksize;
+	buf->f_bsize = dentry->d_sb->s_blocksize;
 	/* changed to accommodate gcc folks. */
 	buf->f_type = REISERFS_SUPER_MAGIC;
 	return 0;
diff --git a/fs/romfs/inode.c b/fs/romfs/inode.c
index 4d6cd6621062..283fbc6b8eea 100644
--- a/fs/romfs/inode.c
+++ b/fs/romfs/inode.c
@@ -179,12 +179,12 @@ outnobh:
 /* That's simple too. */
 
 static int
-romfs_statfs(struct super_block *sb, struct kstatfs *buf)
+romfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	buf->f_type = ROMFS_MAGIC;
 	buf->f_bsize = ROMBSIZE;
 	buf->f_bfree = buf->f_bavail = buf->f_ffree;
-	buf->f_blocks = (romfs_maxsize(sb)+ROMBSIZE-1)>>ROMBSBITS;
+	buf->f_blocks = (romfs_maxsize(dentry->d_sb)+ROMBSIZE-1)>>ROMBSBITS;
 	buf->f_namelen = ROMFS_MAXFN;
 	return 0;
 }
diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c
index 4a37c2bbfa3f..506ff87c1d4b 100644
--- a/fs/smbfs/inode.c
+++ b/fs/smbfs/inode.c
@@ -48,7 +48,7 @@
 
 static void smb_delete_inode(struct inode *);
 static void smb_put_super(struct super_block *);
-static int  smb_statfs(struct super_block *, struct kstatfs *);
+static int  smb_statfs(struct dentry *, struct kstatfs *);
 static int  smb_show_options(struct seq_file *, struct vfsmount *);
 
 static kmem_cache_t *smb_inode_cachep;
@@ -641,13 +641,13 @@ out_no_server:
 }
 
 static int
-smb_statfs(struct super_block *sb, struct kstatfs *buf)
+smb_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	int result;
 	
 	lock_kernel();
 
-	result = smb_proc_dskattr(sb, buf);
+	result = smb_proc_dskattr(dentry, buf);
 
 	unlock_kernel();
 
diff --git a/fs/smbfs/proc.c b/fs/smbfs/proc.c
index b1b878b81730..c3495059889d 100644
--- a/fs/smbfs/proc.c
+++ b/fs/smbfs/proc.c
@@ -3226,9 +3226,9 @@ smb_proc_settime(struct dentry *dentry, struct smb_fattr *fattr)
 }
 
 int
-smb_proc_dskattr(struct super_block *sb, struct kstatfs *attr)
+smb_proc_dskattr(struct dentry *dentry, struct kstatfs *attr)
 {
-	struct smb_sb_info *server = SMB_SB(sb);
+	struct smb_sb_info *server = SMB_SB(dentry->d_sb);
 	int result;
 	char *p;
 	long unit;
diff --git a/fs/smbfs/proto.h b/fs/smbfs/proto.h
index 47664597e6b1..972ed7dad388 100644
--- a/fs/smbfs/proto.h
+++ b/fs/smbfs/proto.h
@@ -29,7 +29,7 @@ extern int smb_proc_getattr(struct dentry *dir, struct smb_fattr *fattr);
 extern int smb_proc_setattr(struct dentry *dir, struct smb_fattr *fattr);
 extern int smb_proc_setattr_unix(struct dentry *d, struct iattr *attr, unsigned int major, unsigned int minor);
 extern int smb_proc_settime(struct dentry *dentry, struct smb_fattr *fattr);
-extern int smb_proc_dskattr(struct super_block *sb, struct kstatfs *attr);
+extern int smb_proc_dskattr(struct dentry *dentry, struct kstatfs *attr);
 extern int smb_proc_read_link(struct smb_sb_info *server, struct dentry *d, char *buffer, int len);
 extern int smb_proc_symlink(struct smb_sb_info *server, struct dentry *d, const char *oldpath);
 extern int smb_proc_link(struct smb_sb_info *server, struct dentry *dentry, struct dentry *new_dentry);
diff --git a/fs/super.c b/fs/super.c
index 324c2d232f54..057b5325b7ef 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -486,7 +486,7 @@ asmlinkage long sys_ustat(unsigned dev, struct ustat __user * ubuf)
         s = user_get_super(new_decode_dev(dev));
         if (s == NULL)
                 goto out;
-	err = vfs_statfs(s, &sbuf);
+	err = vfs_statfs(s->s_root, &sbuf);
 	drop_super(s);
 	if (err)
 		goto out;
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index 3ff89cc5833a..58b2d22142ba 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -85,8 +85,9 @@ static void sysv_put_super(struct super_block *sb)
 	kfree(sbi);
 }
 
-static int sysv_statfs(struct super_block *sb, struct kstatfs *buf)
+static int sysv_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
+	struct super_block *sb = dentry->d_sb;
 	struct sysv_sb_info *sbi = SYSV_SB(sb);
 
 	buf->f_type = sb->s_magic;
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 2250774a831d..44fe2cb0bbb2 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -91,7 +91,7 @@ static void udf_load_partdesc(struct super_block *, struct buffer_head *);
 static void udf_open_lvid(struct super_block *);
 static void udf_close_lvid(struct super_block *);
 static unsigned int udf_count_free(struct super_block *);
-static int udf_statfs(struct super_block *, struct kstatfs *);
+static int udf_statfs(struct dentry *, struct kstatfs *);
 
 /* UDF filesystem type */
 static int udf_get_sb(struct file_system_type *fs_type,
@@ -1779,8 +1779,10 @@ udf_put_super(struct super_block *sb)
  *	Written, tested, and released.
  */
 static int
-udf_statfs(struct super_block *sb, struct kstatfs *buf)
+udf_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
+	struct super_block *sb = dentry->d_sb;
+
 	buf->f_type = UDF_SUPER_MAGIC;
 	buf->f_bsize = sb->s_blocksize;
 	buf->f_blocks = UDF_SB_PARTLEN(sb, UDF_SB_PARTITION(sb));
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 768fb8d9e67a..fe5ab2aa2899 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -1113,8 +1113,9 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
 	return 0;
 }
 
-static int ufs_statfs (struct super_block *sb, struct kstatfs *buf)
+static int ufs_statfs (struct dentry *dentry, struct kstatfs *buf)
 {
+	struct super_block *sb = dentry->d_sb;
 	struct ufs_sb_private_info * uspi;
 	struct ufs_super_block_first * usb1;
 	struct ufs_super_block * usb;
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index d03c89a36655..4fb0fc65af34 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -684,10 +684,10 @@ xfs_fs_sync_super(
 
 STATIC int
 xfs_fs_statfs(
-	struct super_block	*sb,
+	struct dentry		*dentry,
 	struct kstatfs		*statp)
 {
-	return -bhv_vfs_statvfs(vfs_from_sb(sb), statp, NULL);
+	return -bhv_vfs_statvfs(vfs_from_sb(dentry->d_sb), statp, NULL);
 }
 
 STATIC int
diff --git a/include/linux/coda_psdev.h b/include/linux/coda_psdev.h
index d539262a8f89..98f6c52c152b 100644
--- a/include/linux/coda_psdev.h
+++ b/include/linux/coda_psdev.h
@@ -70,7 +70,7 @@ int venus_pioctl(struct super_block *sb, struct CodaFid *fid,
 		 unsigned int cmd, struct PioctlData *data);
 int coda_downcall(int opcode, union outputArgs *out, struct super_block *sb);
 int venus_fsync(struct super_block *sb, struct CodaFid *fid);
-int venus_statfs(struct super_block *sb, struct kstatfs *sfs);
+int venus_statfs(struct dentry *dentry, struct kstatfs *sfs);
 
 
 /* messages between coda filesystem in kernel and Venus */
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 3e50dd24af87..c823a3815e24 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1096,7 +1096,7 @@ struct super_operations {
 	int (*sync_fs)(struct super_block *sb, int wait);
 	void (*write_super_lockfs) (struct super_block *);
 	void (*unlockfs) (struct super_block *);
-	int (*statfs) (struct super_block *, struct kstatfs *);
+	int (*statfs) (struct dentry *, struct kstatfs *);
 	int (*remount_fs) (struct super_block *, int *, char *);
 	void (*clear_inode) (struct inode *);
 	void (*umount_begin) (struct super_block *);
@@ -1325,7 +1325,7 @@ extern struct vfsmount *copy_tree(struct vfsmount *, struct dentry *, int);
 extern void mnt_set_mountpoint(struct vfsmount *, struct dentry *,
 				  struct vfsmount *);
 
-extern int vfs_statfs(struct super_block *, struct kstatfs *);
+extern int vfs_statfs(struct dentry *, struct kstatfs *);
 
 /* /sys/fs */
 extern struct subsystem fs_subsys;
@@ -1746,7 +1746,7 @@ extern int dcache_dir_close(struct inode *, struct file *);
 extern loff_t dcache_dir_lseek(struct file *, loff_t, int);
 extern int dcache_readdir(struct file *, void *, filldir_t);
 extern int simple_getattr(struct vfsmount *, struct dentry *, struct kstat *);
-extern int simple_statfs(struct super_block *, struct kstatfs *);
+extern int simple_statfs(struct dentry *, struct kstatfs *);
 extern int simple_link(struct dentry *, struct inode *, struct dentry *);
 extern int simple_unlink(struct inode *, struct dentry *);
 extern int simple_rmdir(struct inode *, struct dentry *);
diff --git a/include/linux/mount.h b/include/linux/mount.h
index b7472ae91fa4..60718f12caa9 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -17,6 +17,11 @@
 #include <linux/spinlock.h>
 #include <asm/atomic.h>
 
+struct super_block;
+struct vfsmount;
+struct dentry;
+struct namespace;
+
 #define MNT_NOSUID	0x01
 #define MNT_NODEV	0x02
 #define MNT_NOEXEC	0x04
diff --git a/include/linux/security.h b/include/linux/security.h
index 47722d355532..383c320fc834 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -171,9 +171,9 @@ struct swap_info_struct;
  *	Deallocate and clear the sb->s_security field.
  *	@sb contains the super_block structure to be modified.
  * @sb_statfs:
- *	Check permission before obtaining filesystem statistics for the @sb
- *	filesystem.
- *	@sb contains the super_block structure for the filesystem.
+ *	Check permission before obtaining filesystem statistics for the @mnt
+ *	mountpoint.
+ *	@dentry is a handle on the superblock for the filesystem.
  *	Return 0 if permission is granted.  
  * @sb_mount:
  *	Check permission before an object specified by @dev_name is mounted on
@@ -1127,7 +1127,7 @@ struct security_operations {
 	int (*sb_copy_data)(struct file_system_type *type,
 			    void *orig, void *copy);
 	int (*sb_kern_mount) (struct super_block *sb, void *data);
-	int (*sb_statfs) (struct super_block * sb);
+	int (*sb_statfs) (struct dentry *dentry);
 	int (*sb_mount) (char *dev_name, struct nameidata * nd,
 			 char *type, unsigned long flags, void *data);
 	int (*sb_check_sb) (struct vfsmount * mnt, struct nameidata * nd);
@@ -1450,9 +1450,9 @@ static inline int security_sb_kern_mount (struct super_block *sb, void *data)
 	return security_ops->sb_kern_mount (sb, data);
 }
 
-static inline int security_sb_statfs (struct super_block *sb)
+static inline int security_sb_statfs (struct dentry *dentry)
 {
-	return security_ops->sb_statfs (sb);
+	return security_ops->sb_statfs (dentry);
 }
 
 static inline int security_sb_mount (char *dev_name, struct nameidata *nd,
@@ -2162,7 +2162,7 @@ static inline int security_sb_kern_mount (struct super_block *sb, void *data)
 	return 0;
 }
 
-static inline int security_sb_statfs (struct super_block *sb)
+static inline int security_sb_statfs (struct dentry *dentry)
 {
 	return 0;
 }
diff --git a/kernel/acct.c b/kernel/acct.c
index b327f4d20104..6802020e0ceb 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -118,7 +118,7 @@ static int check_free_space(struct file *file)
 	spin_unlock(&acct_globals.lock);
 
 	/* May block */
-	if (vfs_statfs(file->f_dentry->d_inode->i_sb, &sbuf))
+	if (vfs_statfs(file->f_dentry, &sbuf))
 		return res;
 	suspend = sbuf.f_blocks * SUSPEND;
 	resume = sbuf.f_blocks * RESUME;
diff --git a/mm/shmem.c b/mm/shmem.c
index 7617bb1c6bf7..10020d8b4073 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1654,9 +1654,9 @@ static ssize_t shmem_file_sendfile(struct file *in_file, loff_t *ppos,
 	return desc.error;
 }
 
-static int shmem_statfs(struct super_block *sb, struct kstatfs *buf)
+static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
-	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
+	struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb);
 
 	buf->f_type = TMPFS_MAGIC;
 	buf->f_bsize = PAGE_CACHE_SIZE;
diff --git a/security/dummy.c b/security/dummy.c
index 6de4a4a5eb13..c98d553984ec 100644
--- a/security/dummy.c
+++ b/security/dummy.c
@@ -191,7 +191,7 @@ static int dummy_sb_kern_mount (struct super_block *sb, void *data)
 	return 0;
 }
 
-static int dummy_sb_statfs (struct super_block *sb)
+static int dummy_sb_statfs (struct dentry *dentry)
 {
 	return 0;
 }
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 524915dfda64..093efba4d9b6 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -1903,13 +1903,13 @@ static int selinux_sb_kern_mount(struct super_block *sb, void *data)
 	return superblock_has_perm(current, sb, FILESYSTEM__MOUNT, &ad);
 }
 
-static int selinux_sb_statfs(struct super_block *sb)
+static int selinux_sb_statfs(struct dentry *dentry)
 {
 	struct avc_audit_data ad;
 
 	AVC_AUDIT_DATA_INIT(&ad,FS);
-	ad.u.fs.dentry = sb->s_root;
-	return superblock_has_perm(current, sb, FILESYSTEM__GETATTR, &ad);
+	ad.u.fs.dentry = dentry->d_sb->s_root;
+	return superblock_has_perm(current, dentry->d_sb, FILESYSTEM__GETATTR, &ad);
 }
 
 static int selinux_mount(char * dev_name,
-- 
cgit v1.2.3


From cb2b95e1c6b56e3d2369d3a5f4bc97f4fa180683 Mon Sep 17 00:00:00 2001
From: Andy Whitcroft <apw@shadowen.org>
Date: Fri, 23 Jun 2006 02:03:01 -0700
Subject: [PATCH] zone handle unaligned zone boundaries

The buddy allocator has a requirement that boundaries between contigious
zones occur aligned with the the MAX_ORDER ranges.  Where they do not we
will incorrectly merge pages cross zone boundaries.  This can lead to pages
from the wrong zone being handed out.

Originally the buddy allocator would check that buddies were in the same
zone by referencing the zone start and end page frame numbers.  This was
removed as it became very expensive and the buddy allocator already made
the assumption that zones boundaries were aligned.

It is clear that not all configurations and architectures are honouring
this alignment requirement.  Therefore it seems safest to reintroduce
support for non-aligned zone boundaries.  This patch introduces a new check
when considering a page a buddy it compares the zone_table index for the
two pages and refuses to merge the pages where they do not match.  The
zone_table index is unique for each node/zone combination when
FLATMEM/DISCONTIGMEM is enabled and for each section/zone combination when
SPARSEMEM is enabled (a SPARSEMEM section is at least a MAX_ORDER size).

Signed-off-by: Andy Whitcroft <apw@shadowen.org>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Yasunori Goto <y-goto@jp.fujitsu.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mm.h |  7 +++++--
 mm/page_alloc.c    | 17 +++++++++++------
 2 files changed, 16 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index e2fa375e478e..697c6bf248c2 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -465,10 +465,13 @@ static inline unsigned long page_zonenum(struct page *page)
 struct zone;
 extern struct zone *zone_table[];
 
+static inline int page_zone_id(struct page *page)
+{
+	return (page->flags >> ZONETABLE_PGSHIFT) & ZONETABLE_MASK;
+}
 static inline struct zone *page_zone(struct page *page)
 {
-	return zone_table[(page->flags >> ZONETABLE_PGSHIFT) &
-			ZONETABLE_MASK];
+	return zone_table[page_zone_id(page)];
 }
 
 static inline unsigned long page_to_nid(struct page *page)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 253a450c400d..fd631c2536a5 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -286,22 +286,27 @@ __find_combined_index(unsigned long page_idx, unsigned int order)
  * we can do coalesce a page and its buddy if
  * (a) the buddy is not in a hole &&
  * (b) the buddy is in the buddy system &&
- * (c) a page and its buddy have the same order.
+ * (c) a page and its buddy have the same order &&
+ * (d) a page and its buddy are in the same zone.
  *
  * For recording whether a page is in the buddy system, we use PG_buddy.
  * Setting, clearing, and testing PG_buddy is serialized by zone->lock.
  *
  * For recording page's order, we use page_private(page).
  */
-static inline int page_is_buddy(struct page *page, int order)
+static inline int page_is_buddy(struct page *page, struct page *buddy,
+								int order)
 {
 #ifdef CONFIG_HOLES_IN_ZONE
-	if (!pfn_valid(page_to_pfn(page)))
+	if (!pfn_valid(page_to_pfn(buddy)))
 		return 0;
 #endif
 
-	if (PageBuddy(page) && page_order(page) == order) {
-		BUG_ON(page_count(page) != 0);
+	if (page_zone_id(page) != page_zone_id(buddy))
+		return 0;
+
+	if (PageBuddy(buddy) && page_order(buddy) == order) {
+		BUG_ON(page_count(buddy) != 0);
 		return 1;
 	}
 	return 0;
@@ -352,7 +357,7 @@ static inline void __free_one_page(struct page *page,
 		struct page *buddy;
 
 		buddy = __page_find_buddy(page, page_idx, order);
-		if (!page_is_buddy(buddy, order))
+		if (!page_is_buddy(page, buddy, order))
 			break;		/* Move the buddy up one level. */
 
 		list_del(&buddy->lru);
-- 
cgit v1.2.3


From f886ed443fedb109e2062988bf120a531f0ec80a Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Fri, 23 Jun 2006 02:03:06 -0700
Subject: [PATCH] PG_uncached is ia64 only

As Nick points out, only ia64 uses PG_uncached.  So we can push it up into the
higher bits of the lower half of page->flags and make room for another flag on
32-bit machines.

Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: Jesse Barnes <jbarnes@sgi.com>
Cc: Jes Sorensen <jes@trained-monkey.org>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/page-flags.h | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index d276a4e2f825..0c076d58c676 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -7,6 +7,8 @@
 
 #include <linux/percpu.h>
 #include <linux/cache.h>
+#include <linux/types.h>
+
 #include <asm/pgtable.h>
 
 /*
@@ -88,7 +90,17 @@
 #define PG_nosave_free		18	/* Free, should not be written */
 #define PG_buddy		19	/* Page is free, on buddy lists */
 
-#define PG_uncached		20	/* Page has been mapped as uncached */
+
+#if (BITS_PER_LONG > 32)
+/*
+ * 64-bit-only flags build down from bit 31
+ *
+ * 32 bit  -------------------------------| FIELDS |       FLAGS         |
+ * 64 bit  |           FIELDS             | ??????         FLAGS         |
+ *         63                            32                              0
+ */
+#define PG_uncached		31	/* Page has been mapped as uncached */
+#endif
 
 /*
  * Global page accounting.  One instance per CPU.  Only unsigned longs are
-- 
cgit v1.2.3


From 02b694dea473ad3db1e2d1b14c1fef8fbd92e5e6 Mon Sep 17 00:00:00 2001
From: Yasunori Goto <y-goto@jp.fujitsu.com>
Date: Fri, 23 Jun 2006 02:03:08 -0700
Subject: [PATCH] wait_table and zonelist initializing for memory hotadd:
 change name of wait_table_size()

This is just to rename from wait_table_size() to wait_table_hash_nr_entries().

Signed-off-by: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mmzone.h |  4 ++--
 mm/page_alloc.c        | 12 +++++++-----
 2 files changed, 9 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 9742e3c16222..652673ea92f1 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -197,7 +197,7 @@ struct zone {
 
 	/*
 	 * wait_table		-- the array holding the hash table
-	 * wait_table_size	-- the size of the hash table array
+	 * wait_table_hash_nr_entries	-- the size of the hash table array
 	 * wait_table_bits	-- wait_table_size == (1 << wait_table_bits)
 	 *
 	 * The purpose of all these is to keep track of the people
@@ -220,7 +220,7 @@ struct zone {
 	 * free_area_init_core() performs the initialization of them.
 	 */
 	wait_queue_head_t	* wait_table;
-	unsigned long		wait_table_size;
+	unsigned long		wait_table_hash_nr_entries;
 	unsigned long		wait_table_bits;
 
 	/*
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index fd631c2536a5..27320a0542d3 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1727,7 +1727,7 @@ void __init build_all_zonelists(void)
  */
 #define PAGES_PER_WAITQUEUE	256
 
-static inline unsigned long wait_table_size(unsigned long pages)
+static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
 {
 	unsigned long size = 1;
 
@@ -2019,13 +2019,15 @@ void zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
 	 * The per-page waitqueue mechanism uses hashed waitqueues
 	 * per zone.
 	 */
-	zone->wait_table_size = wait_table_size(zone_size_pages);
-	zone->wait_table_bits =	wait_table_bits(zone->wait_table_size);
+	zone->wait_table_hash_nr_entries =
+		 wait_table_hash_nr_entries(zone_size_pages);
+	zone->wait_table_bits =
+		wait_table_bits(zone->wait_table_hash_nr_entries);
 	zone->wait_table = (wait_queue_head_t *)
-		alloc_bootmem_node(pgdat, zone->wait_table_size
+		alloc_bootmem_node(pgdat, zone->wait_table_hash_nr_entries
 					* sizeof(wait_queue_head_t));
 
-	for(i = 0; i < zone->wait_table_size; ++i)
+	for(i = 0; i < zone->wait_table_hash_nr_entries; ++i)
 		init_waitqueue_head(zone->wait_table + i);
 }
 
-- 
cgit v1.2.3


From 86356ab147669bd3bcb2149fd9561d1280835c24 Mon Sep 17 00:00:00 2001
From: Yasunori Goto <y-goto@jp.fujitsu.com>
Date: Fri, 23 Jun 2006 02:03:09 -0700
Subject: [PATCH] wait_table and zonelist initializing for memory hotadd:
 change to meminit for build_zonelist

Change definitions of some functions and data from __init to __meminit.

These functions and data can be used after bootup by this patch to be used for
hot-add codes.

Signed-off-by: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/bootmem.h |  4 ++--
 mm/page_alloc.c         | 18 +++++++++---------
 2 files changed, 11 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h
index da2d107fe2cf..22866fa2d960 100644
--- a/include/linux/bootmem.h
+++ b/include/linux/bootmem.h
@@ -91,8 +91,8 @@ static inline void *alloc_remap(int nid, unsigned long size)
 }
 #endif
 
-extern unsigned long __initdata nr_kernel_pages;
-extern unsigned long __initdata nr_all_pages;
+extern unsigned long nr_kernel_pages;
+extern unsigned long nr_all_pages;
 
 extern void *__init alloc_large_system_hash(const char *tablename,
 					    unsigned long bucketsize,
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 27320a0542d3..5ae75bead4df 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -83,8 +83,8 @@ EXPORT_SYMBOL(zone_table);
 static char *zone_names[MAX_NR_ZONES] = { "DMA", "DMA32", "Normal", "HighMem" };
 int min_free_kbytes = 1024;
 
-unsigned long __initdata nr_kernel_pages;
-unsigned long __initdata nr_all_pages;
+unsigned long __meminitdata nr_kernel_pages;
+unsigned long __meminitdata nr_all_pages;
 
 #ifdef CONFIG_DEBUG_VM
 static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
@@ -1517,7 +1517,7 @@ void show_free_areas(void)
  *
  * Add all populated zones of a node to the zonelist.
  */
-static int __init build_zonelists_node(pg_data_t *pgdat,
+static int __meminit build_zonelists_node(pg_data_t *pgdat,
 			struct zonelist *zonelist, int nr_zones, int zone_type)
 {
 	struct zone *zone;
@@ -1553,7 +1553,7 @@ static inline int highest_zone(int zone_bits)
 
 #ifdef CONFIG_NUMA
 #define MAX_NODE_LOAD (num_online_nodes())
-static int __initdata node_load[MAX_NUMNODES];
+static int __meminitdata node_load[MAX_NUMNODES];
 /**
  * find_next_best_node - find the next node that should appear in a given node's fallback list
  * @node: node whose fallback list we're appending
@@ -1568,7 +1568,7 @@ static int __initdata node_load[MAX_NUMNODES];
  * on them otherwise.
  * It returns -1 if no node is found.
  */
-static int __init find_next_best_node(int node, nodemask_t *used_node_mask)
+static int __meminit find_next_best_node(int node, nodemask_t *used_node_mask)
 {
 	int n, val;
 	int min_val = INT_MAX;
@@ -1614,7 +1614,7 @@ static int __init find_next_best_node(int node, nodemask_t *used_node_mask)
 	return best_node;
 }
 
-static void __init build_zonelists(pg_data_t *pgdat)
+static void __meminit build_zonelists(pg_data_t *pgdat)
 {
 	int i, j, k, node, local_node;
 	int prev_node, load;
@@ -1666,7 +1666,7 @@ static void __init build_zonelists(pg_data_t *pgdat)
 
 #else	/* CONFIG_NUMA */
 
-static void __init build_zonelists(pg_data_t *pgdat)
+static void __meminit build_zonelists(pg_data_t *pgdat)
 {
 	int i, j, k, node, local_node;
 
@@ -2071,7 +2071,7 @@ static __meminit void init_currently_empty_zone(struct zone *zone,
  *   - mark all memory queues empty
  *   - clear the memory bitmaps
  */
-static void __init free_area_init_core(struct pglist_data *pgdat,
+static void __meminit free_area_init_core(struct pglist_data *pgdat,
 		unsigned long *zones_size, unsigned long *zholes_size)
 {
 	unsigned long j;
@@ -2159,7 +2159,7 @@ static void __init alloc_node_mem_map(struct pglist_data *pgdat)
 #endif /* CONFIG_FLAT_NODE_MEM_MAP */
 }
 
-void __init free_area_init_node(int nid, struct pglist_data *pgdat,
+void __meminit free_area_init_node(int nid, struct pglist_data *pgdat,
 		unsigned long *zones_size, unsigned long node_start_pfn,
 		unsigned long *zholes_size)
 {
-- 
cgit v1.2.3


From 718127cc3170454f4aa274fdd2f1e01574fecd66 Mon Sep 17 00:00:00 2001
From: Yasunori Goto <y-goto@jp.fujitsu.com>
Date: Fri, 23 Jun 2006 02:03:10 -0700
Subject: [PATCH] wait_table and zonelist initializing for memory hotadd: add
 return code for init_current_empty_zone

When add_zone() is called against empty zone (not populated zone), we have to
initialize the zone which didn't initialize at boot time.  But,
init_currently_empty_zone() may fail due to allocation of wait table.  So,
this patch is to catch its error code.

Changes against wait_table is in the next patch.

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mmzone.h |  3 +++
 mm/memory_hotplug.c    | 15 +++++++++++++--
 mm/page_alloc.c        | 11 ++++++++---
 3 files changed, 24 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 652673ea92f1..e82fc1a52cd0 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -333,6 +333,9 @@ void wakeup_kswapd(struct zone *zone, int order);
 int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
 		int classzone_idx, int alloc_flags);
 
+extern int init_currently_empty_zone(struct zone *zone, unsigned long start_pfn,
+				     unsigned long size);
+
 #ifdef CONFIG_HAVE_MEMORY_PRESENT
 void memory_present(int nid, unsigned long start, unsigned long end);
 #else
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 70df5c0d957e..71da5c98c9c1 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -26,7 +26,7 @@
 
 extern void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn,
 			  unsigned long size);
-static void __add_zone(struct zone *zone, unsigned long phys_start_pfn)
+static int __add_zone(struct zone *zone, unsigned long phys_start_pfn)
 {
 	struct pglist_data *pgdat = zone->zone_pgdat;
 	int nr_pages = PAGES_PER_SECTION;
@@ -34,8 +34,15 @@ static void __add_zone(struct zone *zone, unsigned long phys_start_pfn)
 	int zone_type;
 
 	zone_type = zone - pgdat->node_zones;
+	if (!populated_zone(zone)) {
+		int ret = 0;
+		ret = init_currently_empty_zone(zone, phys_start_pfn, nr_pages);
+		if (ret < 0)
+			return ret;
+	}
 	memmap_init_zone(nr_pages, nid, zone_type, phys_start_pfn);
 	zonetable_add(zone, nid, zone_type, phys_start_pfn, nr_pages);
+	return 0;
 }
 
 extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
@@ -50,7 +57,11 @@ static int __add_section(struct zone *zone, unsigned long phys_start_pfn)
 	if (ret < 0)
 		return ret;
 
-	__add_zone(zone, phys_start_pfn);
+	ret = __add_zone(zone, phys_start_pfn);
+
+	if (ret < 0)
+		return ret;
+
 	return register_new_memory(__pfn_to_section(phys_start_pfn));
 }
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5ae75bead4df..4bc66f6b7718 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2050,8 +2050,9 @@ static __meminit void zone_pcp_init(struct zone *zone)
 			zone->name, zone->present_pages, batch);
 }
 
-static __meminit void init_currently_empty_zone(struct zone *zone,
-		unsigned long zone_start_pfn, unsigned long size)
+__meminit int init_currently_empty_zone(struct zone *zone,
+					unsigned long zone_start_pfn,
+					unsigned long size)
 {
 	struct pglist_data *pgdat = zone->zone_pgdat;
 
@@ -2063,6 +2064,8 @@ static __meminit void init_currently_empty_zone(struct zone *zone,
 	memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn);
 
 	zone_init_free_lists(pgdat, zone, zone->spanned_pages);
+
+	return 0;
 }
 
 /*
@@ -2077,6 +2080,7 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat,
 	unsigned long j;
 	int nid = pgdat->node_id;
 	unsigned long zone_start_pfn = pgdat->node_start_pfn;
+	int ret;
 
 	pgdat_resize_init(pgdat);
 	pgdat->nr_zones = 0;
@@ -2118,7 +2122,8 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat,
 			continue;
 
 		zonetable_add(zone, nid, j, zone_start_pfn, size);
-		init_currently_empty_zone(zone, zone_start_pfn, size);
+		ret = init_currently_empty_zone(zone, zone_start_pfn, size);
+		BUG_ON(ret);
 		zone_start_pfn += size;
 	}
 }
-- 
cgit v1.2.3


From fadd8fbd153c12963f8fe3c9ef7f8967f286f98b Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Date: Fri, 23 Jun 2006 02:03:13 -0700
Subject: [PATCH] support for panic at OOM

This patch adds panic_on_oom sysctl under sys.vm.

When sysctl vm.panic_on_oom = 1, the kernel panics intead of killing rogue
processes.  And if vm.panic_on_oom is 0 the kernel will do oom_kill() in
the same way as it does today.  Of course, the default value is 0 and only
root can modifies it.

In general, oom_killer works well and kill rogue processes.  So the whole
system can survive.  But there are environments where panic is preferable
rather than kill some processes.

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/sysctl/vm.txt | 13 +++++++++++++
 include/linux/sysctl.h      |  1 +
 kernel/sysctl.c             |  9 +++++++++
 mm/oom_kill.c               |  3 +++
 4 files changed, 26 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index a46c10fcddfc..2dc246af4885 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -29,6 +29,7 @@ Currently, these files are in /proc/sys/vm:
 - drop-caches
 - zone_reclaim_mode
 - zone_reclaim_interval
+- panic_on_oom
 
 ==============================================================
 
@@ -178,3 +179,15 @@ Time is set in seconds and set by default to 30 seconds.
 Reduce the interval if undesired off node allocations occur. However, too
 frequent scans will have a negative impact onoff node allocation performance.
 
+=============================================================
+
+panic_on_oom
+
+This enables or disables panic on out-of-memory feature.  If this is set to 1,
+the kernel panics when out-of-memory happens.  If this is set to 0, the kernel
+will kill some rogue process, called oom_killer.  Usually, oom_killer can kill
+rogue processes and system will survive.  If you want to panic the system
+rather than killing rogue processes, set this to 1.
+
+The default value is 0.
+
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index cee944dbdcd4..c7132029af0f 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -186,6 +186,7 @@ enum
 	VM_PERCPU_PAGELIST_FRACTION=30,/* int: fraction of pages in each percpu_pagelist */
 	VM_ZONE_RECLAIM_MODE=31, /* reclaim local zone memory before going off node */
 	VM_ZONE_RECLAIM_INTERVAL=32, /* time period to wait after reclaim failure */
+	VM_PANIC_ON_OOM=33,	/* panic at out-of-memory */
 };
 
 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 0d656e61621d..072ac446810a 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -59,6 +59,7 @@ extern int proc_nr_files(ctl_table *table, int write, struct file *filp,
 extern int C_A_D;
 extern int sysctl_overcommit_memory;
 extern int sysctl_overcommit_ratio;
+extern int sysctl_panic_on_oom;
 extern int max_threads;
 extern int sysrq_enabled;
 extern int core_uses_pid;
@@ -701,6 +702,14 @@ static ctl_table vm_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
 	},
+	{
+		.ctl_name	= VM_PANIC_ON_OOM,
+		.procname	= "panic_on_oom",
+		.data		= &sysctl_panic_on_oom,
+		.maxlen		= sizeof(sysctl_panic_on_oom),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
 	{
 		.ctl_name	= VM_OVERCOMMIT_RATIO,
 		.procname	= "overcommit_ratio",
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 042e6436c3ee..f9bb3cf32384 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -22,6 +22,7 @@
 #include <linux/jiffies.h>
 #include <linux/cpuset.h>
 
+int sysctl_panic_on_oom;
 /* #define DEBUG */
 
 /**
@@ -344,6 +345,8 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
 		break;
 
 	case CONSTRAINT_NONE:
+		if (sysctl_panic_on_oom)
+			panic("out of memory. panic_on_oom is selected\n");
 retry:
 		/*
 		 * Rambo mode: Shoot down a process and hope it solves whatever
-- 
cgit v1.2.3


From e8f03d02080b25f53cd6bba8dc3a297803f18c01 Mon Sep 17 00:00:00 2001
From: Andreas Dilger <adilger@shaw.ca>
Date: Fri, 23 Jun 2006 02:03:14 -0700
Subject: [PATCH] reserve space for swap label

Reserve space in the swap disk header for a LABEL and UUID to be specified.
 This has been possible with util-linux-2.12b (via e2fsprogs 1.36
libblkid), and is used by at least FC3 and later.  The kernel doesn't
really care about this, but the space shouldn't accidentally be used by
something else either.

Also make the on-disk structures be fixed-size types, instead of "int",
though I don't know of any architecture in use where an "int" isn't the
same size as a "__u32" (all current kernel arches have it as "unsigned
int").

Signed-off-by: Andreas Dilger <adilger@shaw.ca>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/swap.h | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index aca9bfae208f..cd28ad206dae 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -48,12 +48,14 @@ union swap_header {
 		char magic[10];			/* SWAP-SPACE or SWAPSPACE2 */
 	} magic;
 	struct {
-		char	     bootbits[1024];	/* Space for disklabel etc. */
-		unsigned int version;
-		unsigned int last_page;
-		unsigned int nr_badpages;
-		unsigned int padding[125];
-		unsigned int badpages[1];
+		char		bootbits[1024];	/* Space for disklabel etc. */
+		__u32		version;
+		__u32		last_page;
+		__u32		nr_badpages;
+		unsigned char	sws_uuid[16];
+		unsigned char	sws_volume[16];
+		__u32		padding[117];
+		__u32		badpages[1];
 	} info;
 };
 
-- 
cgit v1.2.3


From a43a8c39bbb493c9e93f6764b350de2e33e18e92 Mon Sep 17 00:00:00 2001
From: "Chen, Kenneth W" <kenneth.w.chen@intel.com>
Date: Fri, 23 Jun 2006 02:03:15 -0700
Subject: [PATCH] tightening hugetlb strict accounting

Current hugetlb strict accounting for shared mapping always assume mapping
starts at zero file offset and reserves pages between zero and size of the
file.  This assumption often reserves (or lock down) a lot more pages then
necessary if application maps at none zero file offset.  libhugetlbfs is
one example that requires proper reservation on shared mapping starts at
none zero offset.

This patch extends the reservation and hugetlb strict accounting to support
any arbitrary pair of (offset, len), resulting a much more robust and
accurate scheme.  More importantly, it won't lock down any hugetlb pages
outside file mapping.

Signed-off-by: Ken Chen <kenneth.w.chen@intel.com>
Acked-by: Adam Litke <agl@us.ibm.com>
Cc: David Gibson <david@gibson.dropbear.id.au>
Cc: William Lee Irwin III <wli@holomorphy.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/hugetlbfs/inode.c    |  21 ++--
 include/linux/hugetlb.h |   8 +-
 mm/hugetlb.c            | 282 +++++++++++++++++++++++++++---------------------
 3 files changed, 173 insertions(+), 138 deletions(-)

(limited to 'include/linux')

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 678fc72c3646..e6410d8edd0e 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -59,7 +59,6 @@ static void huge_pagevec_release(struct pagevec *pvec)
 static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 {
 	struct inode *inode = file->f_dentry->d_inode;
-	struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
 	loff_t len, vma_len;
 	int ret;
 
@@ -87,9 +86,10 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 	if (!(vma->vm_flags & VM_WRITE) && len > inode->i_size)
 		goto out;
 
-	if (vma->vm_flags & VM_MAYSHARE)
-		if (hugetlb_extend_reservation(info, len >> HPAGE_SHIFT) != 0)
-			goto out;
+	if (vma->vm_flags & VM_MAYSHARE &&
+	    hugetlb_reserve_pages(inode, vma->vm_pgoff >> (HPAGE_SHIFT-PAGE_SHIFT),
+				  len >> HPAGE_SHIFT))
+		goto out;
 
 	ret = 0;
 	hugetlb_prefault_arch_hook(vma->vm_mm);
@@ -195,12 +195,8 @@ static void truncate_hugepages(struct inode *inode, loff_t lstart)
 	const pgoff_t start = lstart >> HPAGE_SHIFT;
 	struct pagevec pvec;
 	pgoff_t next;
-	int i;
+	int i, freed = 0;
 
-	hugetlb_truncate_reservation(HUGETLBFS_I(inode),
-				     lstart >> HPAGE_SHIFT);
-	if (!mapping->nrpages)
-		return;
 	pagevec_init(&pvec, 0);
 	next = start;
 	while (1) {
@@ -221,10 +217,12 @@ static void truncate_hugepages(struct inode *inode, loff_t lstart)
 			truncate_huge_page(page);
 			unlock_page(page);
 			hugetlb_put_quota(mapping);
+			freed++;
 		}
 		huge_pagevec_release(&pvec);
 	}
 	BUG_ON(!lstart && mapping->nrpages);
+	hugetlb_unreserve_pages(inode, start, freed);
 }
 
 static void hugetlbfs_delete_inode(struct inode *inode)
@@ -366,6 +364,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid,
 		inode->i_mapping->a_ops = &hugetlbfs_aops;
 		inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info;
 		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+		INIT_LIST_HEAD(&inode->i_mapping->private_list);
 		info = HUGETLBFS_I(inode);
 		mpol_shared_policy_init(&info->policy, MPOL_DEFAULT, NULL);
 		switch (mode & S_IFMT) {
@@ -538,7 +537,6 @@ static struct inode *hugetlbfs_alloc_inode(struct super_block *sb)
 		hugetlbfs_inc_free_inodes(sbinfo);
 		return NULL;
 	}
-	p->prereserved_hpages = 0;
 	return &p->vfs_inode;
 }
 
@@ -781,8 +779,7 @@ struct file *hugetlb_zero_setup(size_t size)
 		goto out_file;
 
 	error = -ENOMEM;
-	if (hugetlb_extend_reservation(HUGETLBFS_I(inode),
-				       size >> HPAGE_SHIFT) != 0)
+	if (hugetlb_reserve_pages(inode, 0, size >> HPAGE_SHIFT))
 		goto out_inode;
 
 	d_instantiate(dentry, inode);
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 4c5e610fe442..c25a38d8f600 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -23,6 +23,8 @@ int hugetlb_report_node_meminfo(int, char *);
 unsigned long hugetlb_total_pages(void);
 int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 			unsigned long address, int write_access);
+int hugetlb_reserve_pages(struct inode *inode, long from, long to);
+void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed);
 
 extern unsigned long max_huge_pages;
 extern const unsigned long hugetlb_zero, hugetlb_infinity;
@@ -139,8 +141,6 @@ struct hugetlbfs_sb_info {
 
 struct hugetlbfs_inode_info {
 	struct shared_policy policy;
-	/* Protected by the (global) hugetlb_lock */
-	unsigned long prereserved_hpages;
 	struct inode vfs_inode;
 };
 
@@ -157,10 +157,6 @@ static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb)
 extern const struct file_operations hugetlbfs_file_operations;
 extern struct vm_operations_struct hugetlb_vm_ops;
 struct file *hugetlb_zero_setup(size_t);
-int hugetlb_extend_reservation(struct hugetlbfs_inode_info *info,
-			       unsigned long atleast_hpages);
-void hugetlb_truncate_reservation(struct hugetlbfs_inode_info *info,
-				  unsigned long atmost_hpages);
 int hugetlb_get_quota(struct address_space *mapping);
 void hugetlb_put_quota(struct address_space *mapping);
 
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 832f676ca038..df499973255f 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -22,7 +22,7 @@
 #include "internal.h"
 
 const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
-static unsigned long nr_huge_pages, free_huge_pages, reserved_huge_pages;
+static unsigned long nr_huge_pages, free_huge_pages, resv_huge_pages;
 unsigned long max_huge_pages;
 static struct list_head hugepage_freelists[MAX_NUMNODES];
 static unsigned int nr_huge_pages_node[MAX_NUMNODES];
@@ -123,39 +123,13 @@ static int alloc_fresh_huge_page(void)
 static struct page *alloc_huge_page(struct vm_area_struct *vma,
 				    unsigned long addr)
 {
-	struct inode *inode = vma->vm_file->f_dentry->d_inode;
 	struct page *page;
-	int use_reserve = 0;
-	unsigned long idx;
 
 	spin_lock(&hugetlb_lock);
-
-	if (vma->vm_flags & VM_MAYSHARE) {
-
-		/* idx = radix tree index, i.e. offset into file in
-		 * HPAGE_SIZE units */
-		idx = ((addr - vma->vm_start) >> HPAGE_SHIFT)
-			+ (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
-
-		/* The hugetlbfs specific inode info stores the number
-		 * of "guaranteed available" (huge) pages.  That is,
-		 * the first 'prereserved_hpages' pages of the inode
-		 * are either already instantiated, or have been
-		 * pre-reserved (by hugetlb_reserve_for_inode()). Here
-		 * we're in the process of instantiating the page, so
-		 * we use this to determine whether to draw from the
-		 * pre-reserved pool or the truly free pool. */
-		if (idx < HUGETLBFS_I(inode)->prereserved_hpages)
-			use_reserve = 1;
-	}
-
-	if (!use_reserve) {
-		if (free_huge_pages <= reserved_huge_pages)
-			goto fail;
-	} else {
-		BUG_ON(reserved_huge_pages == 0);
-		reserved_huge_pages--;
-	}
+	if (vma->vm_flags & VM_MAYSHARE)
+		resv_huge_pages--;
+	else if (free_huge_pages <= resv_huge_pages)
+		goto fail;
 
 	page = dequeue_huge_page(vma, addr);
 	if (!page)
@@ -165,96 +139,11 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
 	set_page_refcounted(page);
 	return page;
 
- fail:
-	WARN_ON(use_reserve); /* reserved allocations shouldn't fail */
+fail:
 	spin_unlock(&hugetlb_lock);
 	return NULL;
 }
 
-/* hugetlb_extend_reservation()
- *
- * Ensure that at least 'atleast' hugepages are, and will remain,
- * available to instantiate the first 'atleast' pages of the given
- * inode.  If the inode doesn't already have this many pages reserved
- * or instantiated, set aside some hugepages in the reserved pool to
- * satisfy later faults (or fail now if there aren't enough, rather
- * than getting the SIGBUS later).
- */
-int hugetlb_extend_reservation(struct hugetlbfs_inode_info *info,
-			       unsigned long atleast)
-{
-	struct inode *inode = &info->vfs_inode;
-	unsigned long change_in_reserve = 0;
-	int ret = 0;
-
-	spin_lock(&hugetlb_lock);
-	read_lock_irq(&inode->i_mapping->tree_lock);
-
-	if (info->prereserved_hpages >= atleast)
-		goto out;
-
-	/* Because we always call this on shared mappings, none of the
-	 * pages beyond info->prereserved_hpages can have been
-	 * instantiated, so we need to reserve all of them now. */
-	change_in_reserve = atleast - info->prereserved_hpages;
-
-	if ((reserved_huge_pages + change_in_reserve) > free_huge_pages) {
-		ret = -ENOMEM;
-		goto out;
-	}
-
-	reserved_huge_pages += change_in_reserve;
-	info->prereserved_hpages = atleast;
-
- out:
-	read_unlock_irq(&inode->i_mapping->tree_lock);
-	spin_unlock(&hugetlb_lock);
-
-	return ret;
-}
-
-/* hugetlb_truncate_reservation()
- *
- * This returns pages reserved for the given inode to the general free
- * hugepage pool.  If the inode has any pages prereserved, but not
- * instantiated, beyond offset (atmost << HPAGE_SIZE), then release
- * them.
- */
-void hugetlb_truncate_reservation(struct hugetlbfs_inode_info *info,
-				  unsigned long atmost)
-{
-	struct inode *inode = &info->vfs_inode;
-	struct address_space *mapping = inode->i_mapping;
-	unsigned long idx;
-	unsigned long change_in_reserve = 0;
-	struct page *page;
-
-	spin_lock(&hugetlb_lock);
-	read_lock_irq(&inode->i_mapping->tree_lock);
-
-	if (info->prereserved_hpages <= atmost)
-		goto out;
-
-	/* Count pages which were reserved, but not instantiated, and
-	 * which we can now release. */
-	for (idx = atmost; idx < info->prereserved_hpages; idx++) {
-		page = radix_tree_lookup(&mapping->page_tree, idx);
-		if (!page)
-			/* Pages which are already instantiated can't
-			 * be unreserved (and in fact have already
-			 * been removed from the reserved pool) */
-			change_in_reserve++;
-	}
-
-	BUG_ON(reserved_huge_pages < change_in_reserve);
-	reserved_huge_pages -= change_in_reserve;
-	info->prereserved_hpages = atmost;
-
- out:
-	read_unlock_irq(&inode->i_mapping->tree_lock);
-	spin_unlock(&hugetlb_lock);
-}
-
 static int __init hugetlb_init(void)
 {
 	unsigned long i;
@@ -334,7 +223,7 @@ static unsigned long set_max_huge_pages(unsigned long count)
 		return nr_huge_pages;
 
 	spin_lock(&hugetlb_lock);
-	count = max(count, reserved_huge_pages);
+	count = max(count, resv_huge_pages);
 	try_to_free_low(count);
 	while (count < nr_huge_pages) {
 		struct page *page = dequeue_huge_page(NULL, 0);
@@ -361,11 +250,11 @@ int hugetlb_report_meminfo(char *buf)
 	return sprintf(buf,
 			"HugePages_Total: %5lu\n"
 			"HugePages_Free:  %5lu\n"
-		        "HugePages_Rsvd:  %5lu\n"
+			"HugePages_Rsvd:  %5lu\n"
 			"Hugepagesize:    %5lu kB\n",
 			nr_huge_pages,
 			free_huge_pages,
-		        reserved_huge_pages,
+			resv_huge_pages,
 			HPAGE_SIZE/1024);
 }
 
@@ -754,3 +643,156 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
 	flush_tlb_range(vma, start, end);
 }
 
+struct file_region {
+	struct list_head link;
+	long from;
+	long to;
+};
+
+static long region_add(struct list_head *head, long f, long t)
+{
+	struct file_region *rg, *nrg, *trg;
+
+	/* Locate the region we are either in or before. */
+	list_for_each_entry(rg, head, link)
+		if (f <= rg->to)
+			break;
+
+	/* Round our left edge to the current segment if it encloses us. */
+	if (f > rg->from)
+		f = rg->from;
+
+	/* Check for and consume any regions we now overlap with. */
+	nrg = rg;
+	list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
+		if (&rg->link == head)
+			break;
+		if (rg->from > t)
+			break;
+
+		/* If this area reaches higher then extend our area to
+		 * include it completely.  If this is not the first area
+		 * which we intend to reuse, free it. */
+		if (rg->to > t)
+			t = rg->to;
+		if (rg != nrg) {
+			list_del(&rg->link);
+			kfree(rg);
+		}
+	}
+	nrg->from = f;
+	nrg->to = t;
+	return 0;
+}
+
+static long region_chg(struct list_head *head, long f, long t)
+{
+	struct file_region *rg, *nrg;
+	long chg = 0;
+
+	/* Locate the region we are before or in. */
+	list_for_each_entry(rg, head, link)
+		if (f <= rg->to)
+			break;
+
+	/* If we are below the current region then a new region is required.
+	 * Subtle, allocate a new region at the position but make it zero
+	 * size such that we can guarentee to record the reservation. */
+	if (&rg->link == head || t < rg->from) {
+		nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
+		if (nrg == 0)
+			return -ENOMEM;
+		nrg->from = f;
+		nrg->to   = f;
+		INIT_LIST_HEAD(&nrg->link);
+		list_add(&nrg->link, rg->link.prev);
+
+		return t - f;
+	}
+
+	/* Round our left edge to the current segment if it encloses us. */
+	if (f > rg->from)
+		f = rg->from;
+	chg = t - f;
+
+	/* Check for and consume any regions we now overlap with. */
+	list_for_each_entry(rg, rg->link.prev, link) {
+		if (&rg->link == head)
+			break;
+		if (rg->from > t)
+			return chg;
+
+		/* We overlap with this area, if it extends futher than
+		 * us then we must extend ourselves.  Account for its
+		 * existing reservation. */
+		if (rg->to > t) {
+			chg += rg->to - t;
+			t = rg->to;
+		}
+		chg -= rg->to - rg->from;
+	}
+	return chg;
+}
+
+static long region_truncate(struct list_head *head, long end)
+{
+	struct file_region *rg, *trg;
+	long chg = 0;
+
+	/* Locate the region we are either in or before. */
+	list_for_each_entry(rg, head, link)
+		if (end <= rg->to)
+			break;
+	if (&rg->link == head)
+		return 0;
+
+	/* If we are in the middle of a region then adjust it. */
+	if (end > rg->from) {
+		chg = rg->to - end;
+		rg->to = end;
+		rg = list_entry(rg->link.next, typeof(*rg), link);
+	}
+
+	/* Drop any remaining regions. */
+	list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
+		if (&rg->link == head)
+			break;
+		chg += rg->to - rg->from;
+		list_del(&rg->link);
+		kfree(rg);
+	}
+	return chg;
+}
+
+static int hugetlb_acct_memory(long delta)
+{
+	int ret = -ENOMEM;
+
+	spin_lock(&hugetlb_lock);
+	if ((delta + resv_huge_pages) <= free_huge_pages) {
+		resv_huge_pages += delta;
+		ret = 0;
+	}
+	spin_unlock(&hugetlb_lock);
+	return ret;
+}
+
+int hugetlb_reserve_pages(struct inode *inode, long from, long to)
+{
+	long ret, chg;
+
+	chg = region_chg(&inode->i_mapping->private_list, from, to);
+	if (chg < 0)
+		return chg;
+	ret = hugetlb_acct_memory(chg);
+	if (ret < 0)
+		return ret;
+	region_add(&inode->i_mapping->private_list, from, to);
+	return 0;
+}
+
+void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
+{
+	long chg = region_truncate(&inode->i_mapping->private_list, offset);
+	hugetlb_acct_memory(freed - chg);
+}
-- 
cgit v1.2.3


From 762834e8bf46bf41ce9034d062a7c1f8563175f3 Mon Sep 17 00:00:00 2001
From: Yasunori Goto <y-goto@jp.fujitsu.com>
Date: Fri, 23 Jun 2006 02:03:19 -0700
Subject: [PATCH] Unify pxm_to_node() and node_to_pxm()

Consolidate the various arch-specific implementations of pxm_to_node() and
node_to_pxm() into a single generic version.

Signed-off-by: Yasunori Goto <y-goto@jp.fujitsu.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: Andi Kleen <ak@muc.de>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: "Brown, Len" <len.brown@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/Kconfig               |  6 ++++++
 arch/i386/kernel/srat.c         | 19 ++--------------
 arch/ia64/hp/common/sba_iommu.c |  2 +-
 arch/ia64/kernel/acpi.c         | 24 +++++++--------------
 arch/ia64/pci/pci.c             |  2 +-
 arch/ia64/sn/kernel/setup.c     |  4 ++--
 arch/x86_64/mm/srat.c           | 33 +---------------------------
 drivers/acpi/Kconfig            |  2 +-
 drivers/acpi/numa.c             | 48 +++++++++++++++++++++++++++++++++++++++++
 include/acpi/acpi_numa.h        | 23 ++++++++++++++++++++
 include/asm-x86_64/numa.h       |  1 -
 include/linux/acpi.h            |  9 ++++++++
 12 files changed, 102 insertions(+), 71 deletions(-)
 create mode 100644 include/acpi/acpi_numa.h

(limited to 'include/linux')

diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig
index 8dfa3054f10f..15d23da2455f 100644
--- a/arch/i386/Kconfig
+++ b/arch/i386/Kconfig
@@ -173,6 +173,12 @@ config ACPI_SRAT
 	bool
 	default y
 	depends on NUMA && (X86_SUMMIT || X86_GENERICARCH)
+	select ACPI_NUMA
+
+config HAVE_ARCH_PARSE_SRAT
+       bool
+       default y
+       depends on ACPI_SRAT
 
 config X86_SUMMIT_NUMA
 	bool
diff --git a/arch/i386/kernel/srat.c b/arch/i386/kernel/srat.c
index 52b3ed5d2cb5..989c85255dbe 100644
--- a/arch/i386/kernel/srat.c
+++ b/arch/i386/kernel/srat.c
@@ -39,7 +39,6 @@
 #define NODE_ARRAY_OFFSET(x)	((x) % 8)	/* 8 bits/char */
 #define BMAP_SET(bmap, bit)	((bmap)[NODE_ARRAY_INDEX(bit)] |= 1 << NODE_ARRAY_OFFSET(bit))
 #define BMAP_TEST(bmap, bit)	((bmap)[NODE_ARRAY_INDEX(bit)] & (1 << NODE_ARRAY_OFFSET(bit)))
-#define MAX_PXM_DOMAINS		256	/* 1 byte and no promises about values */
 /* bitmap length; _PXM is at most 255 */
 #define PXM_BITMAP_LEN (MAX_PXM_DOMAINS / 8) 
 static u8 pxm_bitmap[PXM_BITMAP_LEN];	/* bitmap of proximity domains */
@@ -213,19 +212,11 @@ static __init void node_read_chunk(int nid, struct node_memory_chunk_s *memory_c
 		node_end_pfn[nid] = memory_chunk->end_pfn;
 }
 
-static u8 pxm_to_nid_map[MAX_PXM_DOMAINS];/* _PXM to logical node ID map */
-
-int pxm_to_node(int pxm)
-{
-	return pxm_to_nid_map[pxm];
-}
-
 /* Parse the ACPI Static Resource Affinity Table */
 static int __init acpi20_parse_srat(struct acpi_table_srat *sratp)
 {
 	u8 *start, *end, *p;
 	int i, j, nid;
-	u8 nid_to_pxm_map[MAX_NUMNODES];/* logical node ID to _PXM map */
 
 	start = (u8 *)(&(sratp->reserved) + 1);	/* skip header */
 	p = start;
@@ -235,10 +226,6 @@ static int __init acpi20_parse_srat(struct acpi_table_srat *sratp)
 	memset(node_memory_chunk, 0, sizeof(node_memory_chunk));
 	memset(zholes_size, 0, sizeof(zholes_size));
 
-	/* -1 in these maps means not available */
-	memset(pxm_to_nid_map, -1, sizeof(pxm_to_nid_map));
-	memset(nid_to_pxm_map, -1, sizeof(nid_to_pxm_map));
-
 	num_memory_chunks = 0;
 	while (p < end) {
 		switch (*p) {
@@ -278,9 +265,7 @@ static int __init acpi20_parse_srat(struct acpi_table_srat *sratp)
 	nodes_clear(node_online_map);
 	for (i = 0; i < MAX_PXM_DOMAINS; i++) {
 		if (BMAP_TEST(pxm_bitmap, i)) {
-			nid = num_online_nodes();
-			pxm_to_nid_map[i] = nid;
-			nid_to_pxm_map[nid] = i;
+			int nid = acpi_map_pxm_to_node(i);
 			node_set_online(nid);
 		}
 	}
@@ -288,7 +273,7 @@ static int __init acpi20_parse_srat(struct acpi_table_srat *sratp)
 
 	/* set cnode id in memory chunk structure */
 	for (i = 0; i < num_memory_chunks; i++)
-		node_memory_chunk[i].nid = pxm_to_nid_map[node_memory_chunk[i].pxm];
+		node_memory_chunk[i].nid = pxm_to_node(node_memory_chunk[i].pxm);
 
 	printk("pxm bitmap: ");
 	for (i = 0; i < sizeof(pxm_bitmap); i++) {
diff --git a/arch/ia64/hp/common/sba_iommu.c b/arch/ia64/hp/common/sba_iommu.c
index bdccd0b1eb60..3ce443e6c016 100644
--- a/arch/ia64/hp/common/sba_iommu.c
+++ b/arch/ia64/hp/common/sba_iommu.c
@@ -1958,7 +1958,7 @@ sba_map_ioc_to_node(struct ioc *ioc, acpi_handle handle)
 	if (pxm < 0)
 		return;
 
-	node = pxm_to_nid_map[pxm];
+	node = pxm_to_node(pxm);
 
 	if (node >= MAX_NUMNODES || !node_online(node))
 		return;
diff --git a/arch/ia64/kernel/acpi.c b/arch/ia64/kernel/acpi.c
index 58c93a30348c..d1c52cf67882 100644
--- a/arch/ia64/kernel/acpi.c
+++ b/arch/ia64/kernel/acpi.c
@@ -415,9 +415,6 @@ static int __initdata srat_num_cpus;	/* number of cpus */
 static u32 __devinitdata pxm_flag[PXM_FLAG_LEN];
 #define pxm_bit_set(bit)	(set_bit(bit,(void *)pxm_flag))
 #define pxm_bit_test(bit)	(test_bit(bit,(void *)pxm_flag))
-/* maps to convert between proximity domain and logical node ID */
-int __devinitdata pxm_to_nid_map[MAX_PXM_DOMAINS];
-int __initdata nid_to_pxm_map[MAX_NUMNODES];
 static struct acpi_table_slit __initdata *slit_table;
 
 static int get_processor_proximity_domain(struct acpi_table_processor_affinity *pa)
@@ -533,22 +530,17 @@ void __init acpi_numa_arch_fixup(void)
 	 * MCD - This can probably be dropped now.  No need for pxm ID to node ID
 	 * mapping with sparse node numbering iff MAX_PXM_DOMAINS <= MAX_NUMNODES.
 	 */
-	/* calculate total number of nodes in system from PXM bitmap */
-	memset(pxm_to_nid_map, -1, sizeof(pxm_to_nid_map));
-	memset(nid_to_pxm_map, -1, sizeof(nid_to_pxm_map));
 	nodes_clear(node_online_map);
 	for (i = 0; i < MAX_PXM_DOMAINS; i++) {
 		if (pxm_bit_test(i)) {
-			int nid = num_online_nodes();
-			pxm_to_nid_map[i] = nid;
-			nid_to_pxm_map[nid] = i;
+			int nid = acpi_map_pxm_to_node(i);
 			node_set_online(nid);
 		}
 	}
 
 	/* set logical node id in memory chunk structure */
 	for (i = 0; i < num_node_memblks; i++)
-		node_memblk[i].nid = pxm_to_nid_map[node_memblk[i].nid];
+		node_memblk[i].nid = pxm_to_node(node_memblk[i].nid);
 
 	/* assign memory bank numbers for each chunk on each node */
 	for_each_online_node(i) {
@@ -562,7 +554,7 @@ void __init acpi_numa_arch_fixup(void)
 
 	/* set logical node id in cpu structure */
 	for (i = 0; i < srat_num_cpus; i++)
-		node_cpuid[i].nid = pxm_to_nid_map[node_cpuid[i].nid];
+		node_cpuid[i].nid = pxm_to_node(node_cpuid[i].nid);
 
 	printk(KERN_INFO "Number of logical nodes in system = %d\n",
 	       num_online_nodes());
@@ -575,11 +567,11 @@ void __init acpi_numa_arch_fixup(void)
 	for (i = 0; i < slit_table->localities; i++) {
 		if (!pxm_bit_test(i))
 			continue;
-		node_from = pxm_to_nid_map[i];
+		node_from = pxm_to_node(i);
 		for (j = 0; j < slit_table->localities; j++) {
 			if (!pxm_bit_test(j))
 				continue;
-			node_to = pxm_to_nid_map[j];
+			node_to = pxm_to_node(j);
 			node_distance(node_from, node_to) =
 			    slit_table->entry[i * slit_table->localities + j];
 		}
@@ -785,9 +777,9 @@ int acpi_map_cpu2node(acpi_handle handle, int cpu, long physid)
 
 	/*
 	 * Assuming that the container driver would have set the proximity
-	 * domain and would have initialized pxm_to_nid_map[pxm_id] && pxm_flag
+	 * domain and would have initialized pxm_to_node(pxm_id) && pxm_flag
 	 */
-	node_cpuid[cpu].nid = (pxm_id < 0) ? 0 : pxm_to_nid_map[pxm_id];
+	node_cpuid[cpu].nid = (pxm_id < 0) ? 0 : pxm_to_node(pxm_id);
 
 	node_cpuid[cpu].phys_id = physid;
 #endif
@@ -966,7 +958,7 @@ acpi_map_iosapic(acpi_handle handle, u32 depth, void *context, void **ret)
 	if (pxm < 0)
 		return AE_OK;
 
-	node = pxm_to_nid_map[pxm];
+	node = pxm_to_node(pxm);
 
 	if (node >= MAX_NUMNODES || !node_online(node) ||
 	    cpus_empty(node_to_cpumask(node)))
diff --git a/arch/ia64/pci/pci.c b/arch/ia64/pci/pci.c
index ab829a22f8a4..cf7751b99d1c 100644
--- a/arch/ia64/pci/pci.c
+++ b/arch/ia64/pci/pci.c
@@ -352,7 +352,7 @@ pci_acpi_scan_root(struct acpi_device *device, int domain, int bus)
 	pxm = acpi_get_pxm(controller->acpi_handle);
 #ifdef CONFIG_NUMA
 	if (pxm >= 0)
-		controller->node = pxm_to_nid_map[pxm];
+		controller->node = pxm_to_node(pxm);
 #endif
 
 	acpi_walk_resources(device->handle, METHOD_NAME__CRS, count_window,
diff --git a/arch/ia64/sn/kernel/setup.c b/arch/ia64/sn/kernel/setup.c
index 30988dfbddff..93577abae36d 100644
--- a/arch/ia64/sn/kernel/setup.c
+++ b/arch/ia64/sn/kernel/setup.c
@@ -139,7 +139,7 @@ static int __init pxm_to_nasid(int pxm)
 	int i;
 	int nid;
 
-	nid = pxm_to_nid_map[pxm];
+	nid = pxm_to_node(pxm);
 	for (i = 0; i < num_node_memblks; i++) {
 		if (node_memblk[i].nid == nid) {
 			return NASID_GET(node_memblk[i].start_paddr);
@@ -704,7 +704,7 @@ void __init build_cnode_tables(void)
 	 * cnode == node for all C & M bricks.
 	 */
 	for_each_online_node(node) {
-		nasid = pxm_to_nasid(nid_to_pxm_map[node]);
+		nasid = pxm_to_nasid(node_to_pxm(node));
 		sn_cnodeid_to_nasid[node] = nasid;
 		physical_node_map[nasid] = node;
 	}
diff --git a/arch/x86_64/mm/srat.c b/arch/x86_64/mm/srat.c
index 474df22c6ed2..502fce65e96a 100644
--- a/arch/x86_64/mm/srat.c
+++ b/arch/x86_64/mm/srat.c
@@ -30,7 +30,6 @@
 static struct acpi_table_slit *acpi_slit;
 
 static nodemask_t nodes_parsed __initdata;
-static nodemask_t nodes_found __initdata;
 static struct bootnode nodes[MAX_NUMNODES] __initdata;
 static struct bootnode nodes_add[MAX_NUMNODES] __initdata;
 static int found_add_area __initdata;
@@ -38,33 +37,14 @@ int hotadd_percent __initdata = 0;
 #ifndef RESERVE_HOTADD
 #define hotadd_percent 0	/* Ignore all settings */
 #endif
-static u8 pxm2node[256] = { [0 ... 255] = 0xff };
 
 /* Too small nodes confuse the VM badly. Usually they result
    from BIOS bugs. */
 #define NODE_MIN_SIZE (4*1024*1024)
 
-static int node_to_pxm(int n);
-
-int pxm_to_node(int pxm)
-{
-	if ((unsigned)pxm >= 256)
-		return -1;
-	/* Extend 0xff to (int)-1 */
-	return (signed char)pxm2node[pxm];
-}
-
 static __init int setup_node(int pxm)
 {
-	unsigned node = pxm2node[pxm];
-	if (node == 0xff) {
-		if (nodes_weight(nodes_found) >= MAX_NUMNODES)
-			return -1;
-		node = first_unset_node(nodes_found); 
-		node_set(node, nodes_found);
-		pxm2node[pxm] = node;
-	}
-	return pxm2node[pxm];
+	return acpi_map_pxm_to_node(pxm);
 }
 
 static __init int conflicting_nodes(unsigned long start, unsigned long end)
@@ -440,17 +420,6 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
 	return 0;
 }
 
-static int node_to_pxm(int n)
-{
-       int i;
-       if (pxm2node[n] == n)
-               return n;
-       for (i = 0; i < 256; i++)
-               if (pxm2node[i] == n)
-                       return i;
-       return 0;
-}
-
 void __init srat_reserve_add_area(int nodeid)
 {
 	if (found_add_area && nodes_add[nodeid].end) {
diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig
index c24652d31bf9..230c53852231 100644
--- a/drivers/acpi/Kconfig
+++ b/drivers/acpi/Kconfig
@@ -162,7 +162,7 @@ config ACPI_THERMAL
 config ACPI_NUMA
 	bool "NUMA support"
 	depends on NUMA
-	depends on (IA64 || X86_64)
+	depends on (X86 || IA64)
 	default y if IA64_GENERIC || IA64_SGI_SN2
 
 config ACPI_ASUS
diff --git a/drivers/acpi/numa.c b/drivers/acpi/numa.c
index 64b98e82feb7..e2c1a16078c9 100644
--- a/drivers/acpi/numa.c
+++ b/drivers/acpi/numa.c
@@ -36,12 +36,60 @@
 #define _COMPONENT	ACPI_NUMA
 ACPI_MODULE_NAME("numa")
 
+static nodemask_t nodes_found_map = NODE_MASK_NONE;
+#define PXM_INVAL	-1
+#define NID_INVAL	-1
+
+/* maps to convert between proximity domain and logical node ID */
+int __cpuinitdata pxm_to_node_map[MAX_PXM_DOMAINS]
+				= { [0 ... MAX_PXM_DOMAINS - 1] = NID_INVAL };
+int __cpuinitdata node_to_pxm_map[MAX_NUMNODES]
+				= { [0 ... MAX_NUMNODES - 1] = PXM_INVAL };
+
 extern int __init acpi_table_parse_madt_family(enum acpi_table_id id,
 					       unsigned long madt_size,
 					       int entry_id,
 					       acpi_madt_entry_handler handler,
 					       unsigned int max_entries);
 
+int __cpuinit pxm_to_node(int pxm)
+{
+	if (pxm < 0)
+		return NID_INVAL;
+	return pxm_to_node_map[pxm];
+}
+
+int __cpuinit node_to_pxm(int node)
+{
+	if (node < 0)
+		return PXM_INVAL;
+	return node_to_pxm_map[node];
+}
+
+int __cpuinit acpi_map_pxm_to_node(int pxm)
+{
+	int node = pxm_to_node_map[pxm];
+
+	if (node < 0){
+		if (nodes_weight(nodes_found_map) >= MAX_NUMNODES)
+			return NID_INVAL;
+		node = first_unset_node(nodes_found_map);
+		pxm_to_node_map[pxm] = node;
+		node_to_pxm_map[node] = pxm;
+		node_set(node, nodes_found_map);
+	}
+
+	return node;
+}
+
+void __cpuinit acpi_unmap_pxm_to_node(int node)
+{
+	int pxm = node_to_pxm_map[node];
+	pxm_to_node_map[pxm] = NID_INVAL;
+	node_to_pxm_map[node] = PXM_INVAL;
+	node_clear(node, nodes_found_map);
+}
+
 void __init acpi_table_print_srat_entry(acpi_table_entry_header * header)
 {
 
diff --git a/include/acpi/acpi_numa.h b/include/acpi/acpi_numa.h
new file mode 100644
index 000000000000..1049f2a0a6db
--- /dev/null
+++ b/include/acpi/acpi_numa.h
@@ -0,0 +1,23 @@
+#ifndef __ACPI_NUMA_H
+#define __ACPI_NUMA_H
+
+#ifdef CONFIG_ACPI_NUMA
+#include <linux/kernel.h>
+
+/* Proximity bitmap length */
+#if MAX_NUMNODES > 256
+#define MAX_PXM_DOMAINS MAX_NUMNODES
+#else
+#define MAX_PXM_DOMAINS (256) /* Old pxm spec is defined 8 bit */
+#endif
+
+extern int __cpuinitdata pxm_to_node_map[MAX_PXM_DOMAINS];
+extern int __cpuinitdata node_to_pxm_map[MAX_NUMNODES];
+
+extern int __cpuinit pxm_to_node(int);
+extern int __cpuinit node_to_pxm(int);
+extern int __cpuinit acpi_map_pxm_to_node(int);
+extern void __cpuinit acpi_unmap_pxm_to_node(int);
+
+#endif				/* CONFIG_ACPI_NUMA */
+#endif				/* __ACP_NUMA_H */
diff --git a/include/asm-x86_64/numa.h b/include/asm-x86_64/numa.h
index 1cc92fe02503..933ff11ece15 100644
--- a/include/asm-x86_64/numa.h
+++ b/include/asm-x86_64/numa.h
@@ -8,7 +8,6 @@ struct bootnode {
 };
 
 extern int compute_hash_shift(struct bootnode *nodes, int numnodes);
-extern int pxm_to_node(int nid);
 
 #define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT))
 
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 1cf0b91d05bd..90d6df1551ed 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -37,6 +37,7 @@
 #include <acpi/acpi.h>
 #include <acpi/acpi_bus.h>
 #include <acpi/acpi_drivers.h>
+#include <acpi/acpi_numa.h>
 #include <asm/acpi.h>
 
 
@@ -407,10 +408,18 @@ void acpi_table_print_madt_entry (acpi_table_entry_header *madt);
 void acpi_table_print_srat_entry (acpi_table_entry_header *srat);
 
 /* the following four functions are architecture-dependent */
+#ifdef CONFIG_HAVE_ARCH_PARSE_SRAT
+#define NR_NODE_MEMBLKS MAX_NUMNODES
+#define acpi_numa_slit_init(slit) do {} while (0)
+#define acpi_numa_processor_affinity_init(pa) do {} while (0)
+#define acpi_numa_memory_affinity_init(ma) do {} while (0)
+#define acpi_numa_arch_fixup() do {} while (0)
+#else
 void acpi_numa_slit_init (struct acpi_table_slit *slit);
 void acpi_numa_processor_affinity_init (struct acpi_table_processor_affinity *pa);
 void acpi_numa_memory_affinity_init (struct acpi_table_memory_affinity *ma);
 void acpi_numa_arch_fixup(void);
+#endif
 
 #ifdef CONFIG_ACPI_HOTPLUG_CPU
 /* Arch dependent functions for cpu hotplug support */
-- 
cgit v1.2.3


From 833423143c3a7c6545e409d65febd0d92deb351b Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Fri, 23 Jun 2006 02:03:20 -0700
Subject: [PATCH] mm: introduce remap_vmalloc_range()

Add remap_vmalloc_range, vmalloc_user, and vmalloc_32_user so that drivers
can have a nice interface for remapping vmalloc memory.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/vmalloc.h |   8 ++++
 mm/vmalloc.c            | 122 +++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 128 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 1d5577b2b752..f6024ab4eff0 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -4,10 +4,13 @@
 #include <linux/spinlock.h>
 #include <asm/page.h>		/* pgprot_t */
 
+struct vm_area_struct;
+
 /* bits in vm_struct->flags */
 #define VM_IOREMAP	0x00000001	/* ioremap() and friends */
 #define VM_ALLOC	0x00000002	/* vmalloc() */
 #define VM_MAP		0x00000004	/* vmap()ed pages */
+#define VM_USERMAP	0x00000008	/* suitable for remap_vmalloc_range */
 /* bits [20..32] reserved for arch specific ioremap internals */
 
 /*
@@ -32,9 +35,11 @@ struct vm_struct {
  *	Highlevel APIs for driver use
  */
 extern void *vmalloc(unsigned long size);
+extern void *vmalloc_user(unsigned long size);
 extern void *vmalloc_node(unsigned long size, int node);
 extern void *vmalloc_exec(unsigned long size);
 extern void *vmalloc_32(unsigned long size);
+extern void *vmalloc_32_user(unsigned long size);
 extern void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot);
 extern void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask,
 				pgprot_t prot);
@@ -45,6 +50,9 @@ extern void vfree(void *addr);
 extern void *vmap(struct page **pages, unsigned int count,
 			unsigned long flags, pgprot_t prot);
 extern void vunmap(void *addr);
+
+extern int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
+							unsigned long pgoff);
  
 /*
  *	Lowlevel-APIs (not for driver use!)
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index c0504f1e34eb..35f8553f893a 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -256,6 +256,19 @@ struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags, int
 	return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END, node);
 }
 
+/* Caller must hold vmlist_lock */
+static struct vm_struct *__find_vm_area(void *addr)
+{
+	struct vm_struct *tmp;
+
+	for (tmp = vmlist; tmp != NULL; tmp = tmp->next) {
+		 if (tmp->addr == addr)
+			break;
+	}
+
+	return tmp;
+}
+
 /* Caller must hold vmlist_lock */
 struct vm_struct *__remove_vm_area(void *addr)
 {
@@ -498,10 +511,32 @@ EXPORT_SYMBOL(__vmalloc);
  */
 void *vmalloc(unsigned long size)
 {
-       return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL);
+	return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL);
 }
 EXPORT_SYMBOL(vmalloc);
 
+/**
+ *	vmalloc_user  -  allocate virtually contiguous memory which has
+ *			   been zeroed so it can be mapped to userspace without
+ *			   leaking data.
+ *
+ *	@size:		allocation size
+ */
+void *vmalloc_user(unsigned long size)
+{
+	struct vm_struct *area;
+	void *ret;
+
+	ret = __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL);
+	write_lock(&vmlist_lock);
+	area = __find_vm_area(ret);
+	area->flags |= VM_USERMAP;
+	write_unlock(&vmlist_lock);
+
+	return ret;
+}
+EXPORT_SYMBOL(vmalloc_user);
+
 /**
  *	vmalloc_node  -  allocate memory on a specific node
  *
@@ -516,7 +551,7 @@ EXPORT_SYMBOL(vmalloc);
  */
 void *vmalloc_node(unsigned long size, int node)
 {
-       return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, node);
+	return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, node);
 }
 EXPORT_SYMBOL(vmalloc_node);
 
@@ -556,6 +591,28 @@ void *vmalloc_32(unsigned long size)
 }
 EXPORT_SYMBOL(vmalloc_32);
 
+/**
+ *	vmalloc_32_user  -  allocate virtually contiguous memory (32bit
+ *			      addressable) which is zeroed so it can be
+ *			      mapped to userspace without leaking data.
+ *
+ *	@size:		allocation size
+ */
+void *vmalloc_32_user(unsigned long size)
+{
+	struct vm_struct *area;
+	void *ret;
+
+	ret = __vmalloc(size, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL);
+	write_lock(&vmlist_lock);
+	area = __find_vm_area(ret);
+	area->flags |= VM_USERMAP;
+	write_unlock(&vmlist_lock);
+
+	return ret;
+}
+EXPORT_SYMBOL(vmalloc_32_user);
+
 long vread(char *buf, char *addr, unsigned long count)
 {
 	struct vm_struct *tmp;
@@ -630,3 +687,64 @@ finished:
 	read_unlock(&vmlist_lock);
 	return buf - buf_start;
 }
+
+/**
+ *	remap_vmalloc_range  -  map vmalloc pages to userspace
+ *
+ *	@vma:		vma to cover (map full range of vma)
+ *	@addr:		vmalloc memory
+ *	@pgoff:		number of pages into addr before first page to map
+ *	@returns:	0 for success, -Exxx on failure
+ *
+ *	This function checks that addr is a valid vmalloc'ed area, and
+ *	that it is big enough to cover the vma. Will return failure if
+ *	that criteria isn't met.
+ *
+ *	Similar to remap_pfn_range (see mm/memory.c)
+ */
+int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
+						unsigned long pgoff)
+{
+	struct vm_struct *area;
+	unsigned long uaddr = vma->vm_start;
+	unsigned long usize = vma->vm_end - vma->vm_start;
+	int ret;
+
+	if ((PAGE_SIZE-1) & (unsigned long)addr)
+		return -EINVAL;
+
+	read_lock(&vmlist_lock);
+	area = __find_vm_area(addr);
+	if (!area)
+		goto out_einval_locked;
+
+	if (!(area->flags & VM_USERMAP))
+		goto out_einval_locked;
+
+	if (usize + (pgoff << PAGE_SHIFT) > area->size - PAGE_SIZE)
+		goto out_einval_locked;
+	read_unlock(&vmlist_lock);
+
+	addr += pgoff << PAGE_SHIFT;
+	do {
+		struct page *page = vmalloc_to_page(addr);
+		ret = vm_insert_page(vma, uaddr, page);
+		if (ret)
+			return ret;
+
+		uaddr += PAGE_SIZE;
+		addr += PAGE_SIZE;
+		usize -= PAGE_SIZE;
+	} while (usize > 0);
+
+	/* Prevent "things" like memory migration? VM_flags need a cleanup... */
+	vma->vm_flags |= VM_RESERVED;
+
+	return ret;
+
+out_einval_locked:
+	read_unlock(&vmlist_lock);
+	return -EINVAL;
+}
+EXPORT_SYMBOL(remap_vmalloc_range);
+
-- 
cgit v1.2.3


From 929f97276bcf7f4a95272ed08a85339b98ba210d Mon Sep 17 00:00:00 2001
From: Dean Nelson <dcn@sgi.com>
Date: Fri, 23 Jun 2006 02:03:21 -0700
Subject: [PATCH] change gen_pool allocator to not touch managed memory

Modify the gen_pool allocator (lib/genalloc.c) to utilize a bitmap scheme
instead of the buddy scheme.  The purpose of this change is to eliminate
the touching of the actual memory being allocated.

Since the change modifies the interface, a change to the uncached allocator
(arch/ia64/kernel/uncached.c) is also required.

Both Andrey Volkov and Jes Sorenson have expressed a desire that the
gen_pool allocator not write to the memory being managed. See the
following:

  http://marc.theaimsgroup.com/?l=linux-kernel&m=113518602713125&w=2
  http://marc.theaimsgroup.com/?l=linux-kernel&m=113533568827916&w=2

Signed-off-by: Dean Nelson <dcn@sgi.com>
Cc: Andrey Volkov <avolkov@varma-el.com>
Acked-by: Jes Sorensen <jes@trained-monkey.org>
Cc: "Luck, Tony" <tony.luck@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/ia64/kernel/uncached.c     | 200 +++++++++++++++---------------
 arch/ia64/sn/kernel/sn2/cache.c |  15 ++-
 include/linux/genalloc.h        |  35 +++---
 lib/genalloc.c                  | 263 ++++++++++++++++++----------------------
 4 files changed, 252 insertions(+), 261 deletions(-)

(limited to 'include/linux')

diff --git a/arch/ia64/kernel/uncached.c b/arch/ia64/kernel/uncached.c
index fcd2bad0286f..5f03b9e524dd 100644
--- a/arch/ia64/kernel/uncached.c
+++ b/arch/ia64/kernel/uncached.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2001-2005 Silicon Graphics, Inc.  All rights reserved.
+ * Copyright (C) 2001-2006 Silicon Graphics, Inc.  All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of version 2 of the GNU General Public License
@@ -29,15 +29,8 @@
 #include <asm/tlbflush.h>
 #include <asm/sn/arch.h>
 
-#define DEBUG	0
 
-#if DEBUG
-#define dprintk			printk
-#else
-#define dprintk(x...)		do { } while (0)
-#endif
-
-void __init efi_memmap_walk_uc (efi_freemem_callback_t callback);
+extern void __init efi_memmap_walk_uc(efi_freemem_callback_t, void *);
 
 #define MAX_UNCACHED_GRANULES	5
 static int allocated_granules;
@@ -60,6 +53,7 @@ static void uncached_ipi_visibility(void *data)
 static void uncached_ipi_mc_drain(void *data)
 {
 	int status;
+
 	status = ia64_pal_mc_drain();
 	if (status)
 		printk(KERN_WARNING "ia64_pal_mc_drain() failed with %i on "
@@ -67,30 +61,35 @@ static void uncached_ipi_mc_drain(void *data)
 }
 
 
-static unsigned long
-uncached_get_new_chunk(struct gen_pool *poolp)
+/*
+ * Add a new chunk of uncached memory pages to the specified pool.
+ *
+ * @pool: pool to add new chunk of uncached memory to
+ * @nid: node id of node to allocate memory from, or -1
+ *
+ * This is accomplished by first allocating a granule of cached memory pages
+ * and then converting them to uncached memory pages.
+ */
+static int uncached_add_chunk(struct gen_pool *pool, int nid)
 {
 	struct page *page;
-	void *tmp;
 	int status, i;
-	unsigned long addr, node;
+	unsigned long c_addr, uc_addr;
 
 	if (allocated_granules >= MAX_UNCACHED_GRANULES)
-		return 0;
+		return -1;
+
+	/* attempt to allocate a granule's worth of cached memory pages */
 
-	node = poolp->private;
-	page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO,
+	page = alloc_pages_node(nid, GFP_KERNEL | __GFP_ZERO,
 				IA64_GRANULE_SHIFT-PAGE_SHIFT);
+	if (!page)
+		return -1;
 
-	dprintk(KERN_INFO "get_new_chunk page %p, addr %lx\n",
-		page, (unsigned long)(page-vmem_map) << PAGE_SHIFT);
+	/* convert the memory pages from cached to uncached */
 
-	/*
-	 * Do magic if no mem on local node! XXX
-	 */
-	if (!page)
-		return 0;
-	tmp = page_address(page);
+	c_addr = (unsigned long)page_address(page);
+	uc_addr = c_addr - PAGE_OFFSET + __IA64_UNCACHED_OFFSET;
 
 	/*
 	 * There's a small race here where it's possible for someone to
@@ -100,76 +99,90 @@ uncached_get_new_chunk(struct gen_pool *poolp)
 	for (i = 0; i < (IA64_GRANULE_SIZE / PAGE_SIZE); i++)
 		SetPageUncached(&page[i]);
 
-	flush_tlb_kernel_range(tmp, tmp + IA64_GRANULE_SIZE);
+	flush_tlb_kernel_range(uc_addr, uc_adddr + IA64_GRANULE_SIZE);
 
 	status = ia64_pal_prefetch_visibility(PAL_VISIBILITY_PHYSICAL);
-
-	dprintk(KERN_INFO "pal_prefetch_visibility() returns %i on cpu %i\n",
-		status, raw_smp_processor_id());
-
 	if (!status) {
 		status = smp_call_function(uncached_ipi_visibility, NULL, 0, 1);
 		if (status)
-			printk(KERN_WARNING "smp_call_function failed for "
-			       "uncached_ipi_visibility! (%i)\n", status);
+			goto failed;
 	}
 
+	preempt_disable();
+
 	if (ia64_platform_is("sn2"))
-		sn_flush_all_caches((unsigned long)tmp, IA64_GRANULE_SIZE);
+		sn_flush_all_caches(uc_addr, IA64_GRANULE_SIZE);
 	else
-		flush_icache_range((unsigned long)tmp,
-				   (unsigned long)tmp+IA64_GRANULE_SIZE);
+		flush_icache_range(uc_addr, uc_addr + IA64_GRANULE_SIZE);
+
+	/* flush the just introduced uncached translation from the TLB */
+	local_flush_tlb_all();
+
+	preempt_enable();
 
 	ia64_pal_mc_drain();
 	status = smp_call_function(uncached_ipi_mc_drain, NULL, 0, 1);
 	if (status)
-		printk(KERN_WARNING "smp_call_function failed for "
-		       "uncached_ipi_mc_drain! (%i)\n", status);
+		goto failed;
 
-	addr = (unsigned long)tmp - PAGE_OFFSET + __IA64_UNCACHED_OFFSET;
+	/*
+	 * The chunk of memory pages has been converted to uncached so now we
+	 * can add it to the pool.
+	 */
+	status = gen_pool_add(pool, uc_addr, IA64_GRANULE_SIZE, nid);
+	if (status)
+		goto failed;
 
 	allocated_granules++;
-	return addr;
+	return 0;
+
+	/* failed to convert or add the chunk so give it back to the kernel */
+failed:
+	for (i = 0; i < (IA64_GRANULE_SIZE / PAGE_SIZE); i++)
+		ClearPageUncached(&page[i]);
+
+	free_pages(c_addr, IA64_GRANULE_SHIFT-PAGE_SHIFT);
+	return -1;
 }
 
 
 /*
  * uncached_alloc_page
  *
+ * @starting_nid: node id of node to start with, or -1
+ *
  * Allocate 1 uncached page. Allocates on the requested node. If no
  * uncached pages are available on the requested node, roundrobin starting
- * with higher nodes.
+ * with the next higher node.
  */
-unsigned long
-uncached_alloc_page(int nid)
+unsigned long uncached_alloc_page(int starting_nid)
 {
-	unsigned long maddr;
+	unsigned long uc_addr;
+	struct gen_pool *pool;
+	int nid;
 
-	maddr = gen_pool_alloc(uncached_pool[nid], PAGE_SIZE);
+	if (unlikely(starting_nid >= MAX_NUMNODES))
+		return 0;
 
-	dprintk(KERN_DEBUG "uncached_alloc_page returns %lx on node %i\n",
-		maddr, nid);
+	if (starting_nid < 0)
+		starting_nid = numa_node_id();
+	nid = starting_nid;
 
-	/*
-	 * If no memory is availble on our local node, try the
-	 * remaining nodes in the system.
-	 */
-	if (!maddr) {
-		int i;
-
-		for (i = MAX_NUMNODES - 1; i >= 0; i--) {
-			if (i == nid || !node_online(i))
-				continue;
-			maddr = gen_pool_alloc(uncached_pool[i], PAGE_SIZE);
-			dprintk(KERN_DEBUG "uncached_alloc_page alternate search "
-				"returns %lx on node %i\n", maddr, i);
-			if (maddr) {
-				break;
-			}
-		}
-	}
+	do {
+		if (!node_online(nid))
+			continue;
+		pool = uncached_pool[nid];
+		if (pool == NULL)
+			continue;
+		do {
+			uc_addr = gen_pool_alloc(pool, PAGE_SIZE);
+			if (uc_addr != 0)
+				return uc_addr;
+		} while (uncached_add_chunk(pool, nid) == 0);
+
+	} while ((nid = (nid + 1) % MAX_NUMNODES) != starting_nid);
 
-	return maddr;
+	return 0;
 }
 EXPORT_SYMBOL(uncached_alloc_page);
 
@@ -177,21 +190,22 @@ EXPORT_SYMBOL(uncached_alloc_page);
 /*
  * uncached_free_page
  *
+ * @uc_addr: uncached address of page to free
+ *
  * Free a single uncached page.
  */
-void
-uncached_free_page(unsigned long maddr)
+void uncached_free_page(unsigned long uc_addr)
 {
-	int node;
-
-	node = paddr_to_nid(maddr - __IA64_UNCACHED_OFFSET);
+	int nid = paddr_to_nid(uc_addr - __IA64_UNCACHED_OFFSET);
+	struct gen_pool *pool = uncached_pool[nid];
 
-	dprintk(KERN_DEBUG "uncached_free_page(%lx) on node %i\n", maddr, node);
+	if (unlikely(pool == NULL))
+		return;
 
-	if ((maddr & (0XFUL << 60)) != __IA64_UNCACHED_OFFSET)
-		panic("uncached_free_page invalid address %lx\n", maddr);
+	if ((uc_addr & (0XFUL << 60)) != __IA64_UNCACHED_OFFSET)
+		panic("uncached_free_page invalid address %lx\n", uc_addr);
 
-	gen_pool_free(uncached_pool[node], maddr, PAGE_SIZE);
+	gen_pool_free(pool, uc_addr, PAGE_SIZE);
 }
 EXPORT_SYMBOL(uncached_free_page);
 
@@ -199,43 +213,39 @@ EXPORT_SYMBOL(uncached_free_page);
 /*
  * uncached_build_memmap,
  *
+ * @uc_start: uncached starting address of a chunk of uncached memory
+ * @uc_end: uncached ending address of a chunk of uncached memory
+ * @arg: ignored, (NULL argument passed in on call to efi_memmap_walk_uc())
+ *
  * Called at boot time to build a map of pages that can be used for
  * memory special operations.
  */
-static int __init
-uncached_build_memmap(unsigned long start, unsigned long end, void *arg)
+static int __init uncached_build_memmap(unsigned long uc_start,
+					unsigned long uc_end, void *arg)
 {
-	long length = end - start;
-	int node;
-
-	dprintk(KERN_ERR "uncached_build_memmap(%lx %lx)\n", start, end);
+	int nid = paddr_to_nid(uc_start - __IA64_UNCACHED_OFFSET);
+	struct gen_pool *pool = uncached_pool[nid];
+	size_t size = uc_end - uc_start;
 
 	touch_softlockup_watchdog();
-	memset((char *)start, 0, length);
 
-	node = paddr_to_nid(start - __IA64_UNCACHED_OFFSET);
-
-	for (; start < end ; start += PAGE_SIZE) {
-		dprintk(KERN_INFO "sticking %lx into the pool!\n", start);
-		gen_pool_free(uncached_pool[node], start, PAGE_SIZE);
+	if (pool != NULL) {
+		memset((char *)uc_start, 0, size);
+		(void) gen_pool_add(pool, uc_start, size, nid);
 	}
-
 	return 0;
 }
 
 
-static int __init uncached_init(void) {
-	int i;
+static int __init uncached_init(void)
+{
+	int nid;
 
-	for (i = 0; i < MAX_NUMNODES; i++) {
-		if (!node_online(i))
-			continue;
-		uncached_pool[i] = gen_pool_create(0, IA64_GRANULE_SHIFT,
-						   &uncached_get_new_chunk, i);
+	for_each_online_node(nid) {
+		uncached_pool[nid] = gen_pool_create(PAGE_SHIFT, nid);
 	}
 
-	efi_memmap_walk_uc(uncached_build_memmap);
-
+	efi_memmap_walk_uc(uncached_build_memmap, NULL);
 	return 0;
 }
 
diff --git a/arch/ia64/sn/kernel/sn2/cache.c b/arch/ia64/sn/kernel/sn2/cache.c
index bc3cfa17cd0f..2862cb33026d 100644
--- a/arch/ia64/sn/kernel/sn2/cache.c
+++ b/arch/ia64/sn/kernel/sn2/cache.c
@@ -3,11 +3,12 @@
  * License.  See the file "COPYING" in the main directory of this archive
  * for more details.
  * 
- * Copyright (C) 2001-2003 Silicon Graphics, Inc. All rights reserved.
+ * Copyright (C) 2001-2003, 2006 Silicon Graphics, Inc. All rights reserved.
  *
  */
 #include <linux/module.h>
 #include <asm/pgalloc.h>
+#include <asm/sn/arch.h>
 
 /**
  * sn_flush_all_caches - flush a range of address from all caches (incl. L4)
@@ -17,18 +18,24 @@
  * Flush a range of addresses from all caches including L4. 
  * All addresses fully or partially contained within 
  * @flush_addr to @flush_addr + @bytes are flushed
- * from the all caches.
+ * from all caches.
  */
 void
 sn_flush_all_caches(long flush_addr, long bytes)
 {
-	flush_icache_range(flush_addr, flush_addr+bytes);
+	unsigned long addr = flush_addr;
+
+	/* SHub1 requires a cached address */
+	if (is_shub1() && (addr & RGN_BITS) == RGN_BASE(RGN_UNCACHED))
+		addr = (addr - RGN_BASE(RGN_UNCACHED)) + RGN_BASE(RGN_KERNEL);
+
+	flush_icache_range(addr, addr + bytes);
 	/*
 	 * The last call may have returned before the caches
 	 * were actually flushed, so we call it again to make
 	 * sure.
 	 */
-	flush_icache_range(flush_addr, flush_addr+bytes);
+	flush_icache_range(addr, addr + bytes);
 	mb();
 }
 EXPORT_SYMBOL(sn_flush_all_caches);
diff --git a/include/linux/genalloc.h b/include/linux/genalloc.h
index 7fd0576a4454..690c42803d2e 100644
--- a/include/linux/genalloc.h
+++ b/include/linux/genalloc.h
@@ -4,37 +4,32 @@
  * Uses for this includes on-device special memory, uncached memory
  * etc.
  *
- * This code is based on the buddy allocator found in the sym53c8xx_2
- * driver, adapted for general purpose use.
- *
  * This source code is licensed under the GNU General Public License,
  * Version 2.  See the file COPYING for more details.
  */
 
-#include <linux/spinlock.h>
 
-#define ALLOC_MIN_SHIFT		5 /* 32 bytes minimum */
 /*
- *  Link between free memory chunks of a given size.
+ *  General purpose special memory pool descriptor.
  */
-struct gen_pool_link {
-	struct gen_pool_link *next;
+struct gen_pool {
+	rwlock_t lock;
+	struct list_head chunks;	/* list of chunks in this pool */
+	int min_alloc_order;		/* minimum allocation order */
 };
 
 /*
- *  Memory pool descriptor.
+ *  General purpose special memory pool chunk descriptor.
  */
-struct gen_pool {
+struct gen_pool_chunk {
 	spinlock_t lock;
-	unsigned long (*get_new_chunk)(struct gen_pool *);
-	struct gen_pool *next;
-	struct gen_pool_link *h;
-	unsigned long private;
-	int max_chunk_shift;
+	struct list_head next_chunk;	/* next chunk in pool */
+	unsigned long start_addr;	/* starting address of memory chunk */
+	unsigned long end_addr;		/* ending address of memory chunk */
+	unsigned long bits[0];		/* bitmap for allocating memory chunk */
 };
 
-unsigned long gen_pool_alloc(struct gen_pool *poolp, int size);
-void gen_pool_free(struct gen_pool *mp, unsigned long ptr, int size);
-struct gen_pool *gen_pool_create(int nr_chunks, int max_chunk_shift,
-				 unsigned long (*fp)(struct gen_pool *),
-				 unsigned long data);
+extern struct gen_pool *gen_pool_create(int, int);
+extern int gen_pool_add(struct gen_pool *, unsigned long, size_t, int);
+extern unsigned long gen_pool_alloc(struct gen_pool *, size_t);
+extern void gen_pool_free(struct gen_pool *, unsigned long, size_t);
diff --git a/lib/genalloc.c b/lib/genalloc.c
index 9ce0a6a3b85a..71338b48e889 100644
--- a/lib/genalloc.c
+++ b/lib/genalloc.c
@@ -4,10 +4,6 @@
  * Uses for this includes on-device special memory, uncached memory
  * etc.
  *
- * This code is based on the buddy allocator found in the sym53c8xx_2
- * driver Copyright (C) 1999-2001  Gerard Roudier <groudier@free.fr>,
- * and adapted for general purpose use.
- *
  * Copyright 2005 (C) Jes Sorensen <jes@trained-monkey.org>
  *
  * This source code is licensed under the GNU General Public License,
@@ -15,172 +11,155 @@
  */
 
 #include <linux/module.h>
-#include <linux/stddef.h>
-#include <linux/kernel.h>
-#include <linux/string.h>
-#include <linux/slab.h>
-#include <linux/init.h>
-#include <linux/mm.h>
-#include <linux/spinlock.h>
 #include <linux/genalloc.h>
 
-#include <asm/page.h>
-
 
-struct gen_pool *gen_pool_create(int nr_chunks, int max_chunk_shift,
-				 unsigned long (*fp)(struct gen_pool *),
-				 unsigned long data)
+/*
+ * Create a new special memory pool.
+ *
+ * @min_alloc_order: log base 2 of number of bytes each bitmap bit represents
+ * @nid: node id of the node the pool structure should be allocated on, or -1
+ */
+struct gen_pool *gen_pool_create(int min_alloc_order, int nid)
 {
-	struct gen_pool *poolp;
-	unsigned long tmp;
-	int i;
-
-	/*
-	 * This is really an arbitrary limit, +10 is enough for
-	 * IA64_GRANULE_SHIFT, aka 16MB. If anyone needs a large limit
-	 * this can be increased without problems.
-	 */
-	if ((max_chunk_shift > (PAGE_SHIFT + 10)) ||
-	    ((max_chunk_shift < ALLOC_MIN_SHIFT) && max_chunk_shift))
-		return NULL;
-
-	if (!max_chunk_shift)
-		max_chunk_shift = PAGE_SHIFT;
-
-	poolp = kmalloc(sizeof(struct gen_pool), GFP_KERNEL);
-	if (!poolp)
-		return NULL;
-	memset(poolp, 0, sizeof(struct gen_pool));
-	poolp->h = kmalloc(sizeof(struct gen_pool_link) *
-			   (max_chunk_shift - ALLOC_MIN_SHIFT + 1),
-			   GFP_KERNEL);
-	if (!poolp->h) {
-		printk(KERN_WARNING "gen_pool_alloc() failed to allocate\n");
-		kfree(poolp);
-		return NULL;
-	}
-	memset(poolp->h, 0, sizeof(struct gen_pool_link) *
-	       (max_chunk_shift - ALLOC_MIN_SHIFT + 1));
-
-	spin_lock_init(&poolp->lock);
-	poolp->get_new_chunk = fp;
-	poolp->max_chunk_shift = max_chunk_shift;
-	poolp->private = data;
-
-	for (i = 0; i < nr_chunks; i++) {
-		tmp = poolp->get_new_chunk(poolp);
-		printk(KERN_INFO "allocated %lx\n", tmp);
-		if (!tmp)
-			break;
-		gen_pool_free(poolp, tmp, (1 << poolp->max_chunk_shift));
-	}
+	struct gen_pool *pool;
 
-	return poolp;
+	pool = kmalloc_node(sizeof(struct gen_pool), GFP_KERNEL, nid);
+	if (pool != NULL) {
+		rwlock_init(&pool->lock);
+		INIT_LIST_HEAD(&pool->chunks);
+		pool->min_alloc_order = min_alloc_order;
+	}
+	return pool;
 }
 EXPORT_SYMBOL(gen_pool_create);
 
 
 /*
- *  Simple power of two buddy-like generic allocator.
- *  Provides naturally aligned memory chunks.
+ * Add a new chunk of memory to the specified pool.
+ *
+ * @pool: pool to add new memory chunk to
+ * @addr: starting address of memory chunk to add to pool
+ * @size: size in bytes of the memory chunk to add to pool
+ * @nid: node id of the node the chunk structure and bitmap should be
+ *       allocated on, or -1
  */
-unsigned long gen_pool_alloc(struct gen_pool *poolp, int size)
+int gen_pool_add(struct gen_pool *pool, unsigned long addr, size_t size,
+		 int nid)
 {
-	int j, i, s, max_chunk_size;
-	unsigned long a, flags;
-	struct gen_pool_link *h = poolp->h;
+	struct gen_pool_chunk *chunk;
+	int nbits = size >> pool->min_alloc_order;
+	int nbytes = sizeof(struct gen_pool_chunk) +
+				(nbits + BITS_PER_BYTE - 1) / BITS_PER_BYTE;
 
-	max_chunk_size = 1 << poolp->max_chunk_shift;
+	chunk = kmalloc_node(nbytes, GFP_KERNEL, nid);
+	if (unlikely(chunk == NULL))
+		return -1;
 
-	if (size > max_chunk_size)
-		return 0;
+	memset(chunk, 0, nbytes);
+	spin_lock_init(&chunk->lock);
+	chunk->start_addr = addr;
+	chunk->end_addr = addr + size;
 
-	size = max(size, 1 << ALLOC_MIN_SHIFT);
-	i = fls(size - 1);
-	s = 1 << i;
-	j = i -= ALLOC_MIN_SHIFT;
-
-	spin_lock_irqsave(&poolp->lock, flags);
-	while (!h[j].next) {
-		if (s == max_chunk_size) {
-			struct gen_pool_link *ptr;
-			spin_unlock_irqrestore(&poolp->lock, flags);
-			ptr = (struct gen_pool_link *)poolp->get_new_chunk(poolp);
-			spin_lock_irqsave(&poolp->lock, flags);
-			h[j].next = ptr;
-			if (h[j].next)
-				h[j].next->next = NULL;
-			break;
-		}
-		j++;
-		s <<= 1;
-	}
-	a = (unsigned long) h[j].next;
-	if (a) {
-		h[j].next = h[j].next->next;
-		/*
-		 * This should be split into a seperate function doing
-		 * the chunk split in order to support custom
-		 * handling memory not physically accessible by host
-		 */
-		while (j > i) {
-			j -= 1;
-			s >>= 1;
-			h[j].next = (struct gen_pool_link *) (a + s);
-			h[j].next->next = NULL;
-		}
-	}
-	spin_unlock_irqrestore(&poolp->lock, flags);
-	return a;
+	write_lock(&pool->lock);
+	list_add(&chunk->next_chunk, &pool->chunks);
+	write_unlock(&pool->lock);
+
+	return 0;
 }
-EXPORT_SYMBOL(gen_pool_alloc);
+EXPORT_SYMBOL(gen_pool_add);
 
 
 /*
- *  Counter-part of the generic allocator.
+ * Allocate the requested number of bytes from the specified pool.
+ * Uses a first-fit algorithm.
+ *
+ * @pool: pool to allocate from
+ * @size: number of bytes to allocate from the pool
  */
-void gen_pool_free(struct gen_pool *poolp, unsigned long ptr, int size)
+unsigned long gen_pool_alloc(struct gen_pool *pool, size_t size)
 {
-	struct gen_pool_link *q;
-	struct gen_pool_link *h = poolp->h;
-	unsigned long a, b, flags;
-	int i, s, max_chunk_size;
-
-	max_chunk_size = 1 << poolp->max_chunk_shift;
+	struct list_head *_chunk;
+	struct gen_pool_chunk *chunk;
+	unsigned long addr, flags;
+	int order = pool->min_alloc_order;
+	int nbits, bit, start_bit, end_bit;
 
-	if (size > max_chunk_size)
-		return;
-
-	size = max(size, 1 << ALLOC_MIN_SHIFT);
-	i = fls(size - 1);
-	s = 1 << i;
-	i -= ALLOC_MIN_SHIFT;
-
-	a = ptr;
+	if (size == 0)
+		return 0;
 
-	spin_lock_irqsave(&poolp->lock, flags);
-	while (1) {
-		if (s == max_chunk_size) {
-			((struct gen_pool_link *)a)->next = h[i].next;
-			h[i].next = (struct gen_pool_link *)a;
-			break;
+	nbits = (size + (1UL << order) - 1) >> order;
+
+	read_lock(&pool->lock);
+	list_for_each(_chunk, &pool->chunks) {
+		chunk = list_entry(_chunk, struct gen_pool_chunk, next_chunk);
+
+		end_bit = (chunk->end_addr - chunk->start_addr) >> order;
+		end_bit -= nbits + 1;
+
+		spin_lock_irqsave(&chunk->lock, flags);
+		bit = -1;
+		while (bit + 1 < end_bit) {
+			bit = find_next_zero_bit(chunk->bits, end_bit, bit + 1);
+			if (bit >= end_bit)
+				break;
+
+			start_bit = bit;
+			if (nbits > 1) {
+				bit = find_next_bit(chunk->bits, bit + nbits,
+							bit + 1);
+				if (bit - start_bit < nbits)
+					continue;
+			}
+
+			addr = chunk->start_addr +
+					    ((unsigned long)start_bit << order);
+			while (nbits--)
+				__set_bit(start_bit++, &chunk->bits);
+			spin_unlock_irqrestore(&chunk->lock, flags);
+			read_unlock(&pool->lock);
+			return addr;
 		}
-		b = a ^ s;
-		q = &h[i];
+		spin_unlock_irqrestore(&chunk->lock, flags);
+	}
+	read_unlock(&pool->lock);
+	return 0;
+}
+EXPORT_SYMBOL(gen_pool_alloc);
 
-		while (q->next && q->next != (struct gen_pool_link *)b)
-			q = q->next;
 
-		if (!q->next) {
-			((struct gen_pool_link *)a)->next = h[i].next;
-			h[i].next = (struct gen_pool_link *)a;
+/*
+ * Free the specified memory back to the specified pool.
+ *
+ * @pool: pool to free to
+ * @addr: starting address of memory to free back to pool
+ * @size: size in bytes of memory to free
+ */
+void gen_pool_free(struct gen_pool *pool, unsigned long addr, size_t size)
+{
+	struct list_head *_chunk;
+	struct gen_pool_chunk *chunk;
+	unsigned long flags;
+	int order = pool->min_alloc_order;
+	int bit, nbits;
+
+	nbits = (size + (1UL << order) - 1) >> order;
+
+	read_lock(&pool->lock);
+	list_for_each(_chunk, &pool->chunks) {
+		chunk = list_entry(_chunk, struct gen_pool_chunk, next_chunk);
+
+		if (addr >= chunk->start_addr && addr < chunk->end_addr) {
+			BUG_ON(addr + size > chunk->end_addr);
+			spin_lock_irqsave(&chunk->lock, flags);
+			bit = (addr - chunk->start_addr) >> order;
+			while (nbits--)
+				__clear_bit(bit++, &chunk->bits);
+			spin_unlock_irqrestore(&chunk->lock, flags);
 			break;
 		}
-		q->next = q->next->next;
-		a = a & b;
-		s <<= 1;
-		i++;
 	}
-	spin_unlock_irqrestore(&poolp->lock, flags);
+	BUG_ON(nbits > 0);
+	read_unlock(&pool->lock);
 }
 EXPORT_SYMBOL(gen_pool_free);
-- 
cgit v1.2.3


From 612d6c19db2fd0dc97b0fa370613ecd4a305ffc3 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Fri, 23 Jun 2006 02:03:22 -0700
Subject: [PATCH] radix-tree: direct data

The ability to have height 0 radix trees (a direct pointer to the data item
rather than going through a full node->slot) quietly disappeared with
old-2.6-bkcvs commit ffee171812d51652f9ba284302d9e5c5cc14bdfd.  On 64-bit
machines this causes nearly 600 bytes to be used for every <= 4K file in
pagecache.

Re-introduce this feature, root tags stored in spare ->gfp_mask bits.

Simplify radix_tree_delete's complex tag clearing arrangement (which would
become even more complex) by just falling back to tag clearing functions
(the pagecache radix-tree never uses this path anyway, so the icache
savings will mean it's actually a speedup).

On my 4GB G5, this saves 8MB RAM per kernel kernel source+object tree in
pagecache.

Pagecache lookup, insertion, and removal speed for small files will also be
improved.

This makes RCU radix tree harder, but it's worth it.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/radix-tree.h |   5 +-
 lib/radix-tree.c           | 192 ++++++++++++++++++++++++++-------------------
 2 files changed, 114 insertions(+), 83 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index dd83cca28001..9158a68140c9 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -23,6 +23,9 @@
 #include <linux/preempt.h>
 #include <linux/types.h>
 
+#define RADIX_TREE_MAX_TAGS 2
+
+/* root tags are stored in gfp_mask, shifted by __GFP_BITS_SHIFT */
 struct radix_tree_root {
 	unsigned int		height;
 	gfp_t			gfp_mask;
@@ -45,8 +48,6 @@ do {									\
 	(root)->rnode = NULL;						\
 } while (0)
 
-#define RADIX_TREE_MAX_TAGS 2
-
 int radix_tree_insert(struct radix_tree_root *, unsigned long, void *);
 void *radix_tree_lookup(struct radix_tree_root *, unsigned long);
 void **radix_tree_lookup_slot(struct radix_tree_root *, unsigned long);
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index 7097bb239e40..35a2d93b3528 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -74,6 +74,11 @@ struct radix_tree_preload {
 };
 DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { 0, };
 
+static inline gfp_t root_gfp_mask(struct radix_tree_root *root)
+{
+	return root->gfp_mask & __GFP_BITS_MASK;
+}
+
 /*
  * This assumes that the caller has performed appropriate preallocation, and
  * that the caller has pinned this thread of control to the current CPU.
@@ -82,9 +87,10 @@ static struct radix_tree_node *
 radix_tree_node_alloc(struct radix_tree_root *root)
 {
 	struct radix_tree_node *ret;
+	gfp_t gfp_mask = root_gfp_mask(root);
 
-	ret = kmem_cache_alloc(radix_tree_node_cachep, root->gfp_mask);
-	if (ret == NULL && !(root->gfp_mask & __GFP_WAIT)) {
+	ret = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask);
+	if (ret == NULL && !(gfp_mask & __GFP_WAIT)) {
 		struct radix_tree_preload *rtp;
 
 		rtp = &__get_cpu_var(radix_tree_preloads);
@@ -152,6 +158,27 @@ static inline int tag_get(struct radix_tree_node *node, unsigned int tag,
 	return test_bit(offset, node->tags[tag]);
 }
 
+static inline void root_tag_set(struct radix_tree_root *root, unsigned int tag)
+{
+	root->gfp_mask |= (1 << (tag + __GFP_BITS_SHIFT));
+}
+
+
+static inline void root_tag_clear(struct radix_tree_root *root, unsigned int tag)
+{
+	root->gfp_mask &= ~(1 << (tag + __GFP_BITS_SHIFT));
+}
+
+static inline void root_tag_clear_all(struct radix_tree_root *root)
+{
+	root->gfp_mask &= __GFP_BITS_MASK;
+}
+
+static inline int root_tag_get(struct radix_tree_root *root, unsigned int tag)
+{
+	return root->gfp_mask & (1 << (tag + __GFP_BITS_SHIFT));
+}
+
 /*
  * Returns 1 if any slot in the node has this tag set.
  * Otherwise returns 0.
@@ -182,7 +209,6 @@ static int radix_tree_extend(struct radix_tree_root *root, unsigned long index)
 {
 	struct radix_tree_node *node;
 	unsigned int height;
-	char tags[RADIX_TREE_MAX_TAGS];
 	int tag;
 
 	/* Figure out what the height should be.  */
@@ -195,16 +221,6 @@ static int radix_tree_extend(struct radix_tree_root *root, unsigned long index)
 		goto out;
 	}
 
-	/*
-	 * Prepare the tag status of the top-level node for propagation
-	 * into the newly-pushed top-level node(s)
-	 */
-	for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
-		tags[tag] = 0;
-		if (any_tag_set(root->rnode, tag))
-			tags[tag] = 1;
-	}
-
 	do {
 		if (!(node = radix_tree_node_alloc(root)))
 			return -ENOMEM;
@@ -214,7 +230,7 @@ static int radix_tree_extend(struct radix_tree_root *root, unsigned long index)
 
 		/* Propagate the aggregated tag info into the new root */
 		for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
-			if (tags[tag])
+			if (root_tag_get(root, tag))
 				tag_set(node, tag, 0);
 		}
 
@@ -243,8 +259,7 @@ int radix_tree_insert(struct radix_tree_root *root,
 	int error;
 
 	/* Make sure the tree is high enough.  */
-	if ((!index && !root->rnode) ||
-			index > radix_tree_maxindex(root->height)) {
+	if (index > radix_tree_maxindex(root->height)) {
 		error = radix_tree_extend(root, index);
 		if (error)
 			return error;
@@ -255,7 +270,7 @@ int radix_tree_insert(struct radix_tree_root *root,
 	shift = (height-1) * RADIX_TREE_MAP_SHIFT;
 
 	offset = 0;			/* uninitialised var warning */
-	do {
+	while (height > 0) {
 		if (slot == NULL) {
 			/* Have to add a child node.  */
 			if (!(slot = radix_tree_node_alloc(root)))
@@ -273,16 +288,21 @@ int radix_tree_insert(struct radix_tree_root *root,
 		slot = node->slots[offset];
 		shift -= RADIX_TREE_MAP_SHIFT;
 		height--;
-	} while (height > 0);
+	}
 
 	if (slot != NULL)
 		return -EEXIST;
 
-	BUG_ON(!node);
-	node->count++;
-	node->slots[offset] = item;
-	BUG_ON(tag_get(node, 0, offset));
-	BUG_ON(tag_get(node, 1, offset));
+	if (node) {
+		node->count++;
+		node->slots[offset] = item;
+		BUG_ON(tag_get(node, 0, offset));
+		BUG_ON(tag_get(node, 1, offset));
+	} else {
+		root->rnode = item;
+		BUG_ON(root_tag_get(root, 0));
+		BUG_ON(root_tag_get(root, 1));
+	}
 
 	return 0;
 }
@@ -295,9 +315,13 @@ static inline void **__lookup_slot(struct radix_tree_root *root,
 	struct radix_tree_node **slot;
 
 	height = root->height;
+
 	if (index > radix_tree_maxindex(height))
 		return NULL;
 
+	if (height == 0 && root->rnode)
+		return (void **)&root->rnode;
+
 	shift = (height-1) * RADIX_TREE_MAP_SHIFT;
 	slot = &root->rnode;
 
@@ -368,8 +392,8 @@ void *radix_tree_tag_set(struct radix_tree_root *root,
 	if (index > radix_tree_maxindex(height))
 		return NULL;
 
-	shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
 	slot = root->rnode;
+	shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
 
 	while (height > 0) {
 		int offset;
@@ -383,6 +407,10 @@ void *radix_tree_tag_set(struct radix_tree_root *root,
 		height--;
 	}
 
+	/* set the root's tag bit */
+	if (slot && !root_tag_get(root, tag))
+		root_tag_set(root, tag);
+
 	return slot;
 }
 EXPORT_SYMBOL(radix_tree_tag_set);
@@ -405,9 +433,8 @@ void *radix_tree_tag_clear(struct radix_tree_root *root,
 			unsigned long index, unsigned int tag)
 {
 	struct radix_tree_path path[RADIX_TREE_MAX_PATH], *pathp = path;
-	struct radix_tree_node *slot;
+	struct radix_tree_node *slot = NULL;
 	unsigned int height, shift;
-	void *ret = NULL;
 
 	height = root->height;
 	if (index > radix_tree_maxindex(height))
@@ -432,20 +459,24 @@ void *radix_tree_tag_clear(struct radix_tree_root *root,
 		height--;
 	}
 
-	ret = slot;
-	if (ret == NULL)
+	if (slot == NULL)
 		goto out;
 
-	do {
+	while (pathp->node) {
 		if (!tag_get(pathp->node, tag, pathp->offset))
 			goto out;
 		tag_clear(pathp->node, tag, pathp->offset);
 		if (any_tag_set(pathp->node, tag))
 			goto out;
 		pathp--;
-	} while (pathp->node);
+	}
+
+	/* clear the root's tag bit */
+	if (root_tag_get(root, tag))
+		root_tag_clear(root, tag);
+
 out:
-	return ret;
+	return slot;
 }
 EXPORT_SYMBOL(radix_tree_tag_clear);
 
@@ -458,9 +489,8 @@ EXPORT_SYMBOL(radix_tree_tag_clear);
  *
  * Return values:
  *
- *  0: tag not present
- *  1: tag present, set
- * -1: tag present, unset
+ *  0: tag not present or not set
+ *  1: tag set
  */
 int radix_tree_tag_get(struct radix_tree_root *root,
 			unsigned long index, unsigned int tag)
@@ -473,6 +503,13 @@ int radix_tree_tag_get(struct radix_tree_root *root,
 	if (index > radix_tree_maxindex(height))
 		return 0;
 
+	/* check the root's tag bit */
+	if (!root_tag_get(root, tag))
+		return 0;
+
+	if (height == 0)
+		return 1;
+
 	shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
 	slot = root->rnode;
 
@@ -494,7 +531,7 @@ int radix_tree_tag_get(struct radix_tree_root *root,
 			int ret = tag_get(slot, tag, offset);
 
 			BUG_ON(ret && saw_unset_tag);
-			return ret ? 1 : -1;
+			return ret;
 		}
 		slot = slot->slots[offset];
 		shift -= RADIX_TREE_MAP_SHIFT;
@@ -514,8 +551,11 @@ __lookup(struct radix_tree_root *root, void **results, unsigned long index,
 	unsigned long i;
 
 	height = root->height;
-	if (height == 0)
+	if (height == 0) {
+		if (root->rnode && index == 0)
+			results[nr_found++] = root->rnode;
 		goto out;
+	}
 
 	shift = (height-1) * RADIX_TREE_MAP_SHIFT;
 	slot = root->rnode;
@@ -603,10 +643,16 @@ __lookup_tag(struct radix_tree_root *root, void **results, unsigned long index,
 	unsigned int height = root->height;
 	struct radix_tree_node *slot;
 
+	if (height == 0) {
+		if (root->rnode && index == 0)
+			results[nr_found++] = root->rnode;
+		goto out;
+	}
+
 	shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
 	slot = root->rnode;
 
-	while (height > 0) {
+	do {
 		unsigned long i = (index >> shift) & RADIX_TREE_MAP_MASK;
 
 		for ( ; i < RADIX_TREE_MAP_SIZE; i++) {
@@ -637,7 +683,7 @@ __lookup_tag(struct radix_tree_root *root, void **results, unsigned long index,
 		}
 		shift -= RADIX_TREE_MAP_SHIFT;
 		slot = slot->slots[i];
-	}
+	} while (height > 0);
 out:
 	*next_index = index;
 	return nr_found;
@@ -665,6 +711,10 @@ radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results,
 	unsigned long cur_index = first_index;
 	unsigned int ret = 0;
 
+	/* check the root's tag bit */
+	if (!root_tag_get(root, tag))
+		return 0;
+
 	while (ret < max_items) {
 		unsigned int nr_found;
 		unsigned long next_index;	/* Index of next search */
@@ -689,7 +739,7 @@ EXPORT_SYMBOL(radix_tree_gang_lookup_tag);
 static inline void radix_tree_shrink(struct radix_tree_root *root)
 {
 	/* try to shrink tree height */
-	while (root->height > 1 &&
+	while (root->height > 0 &&
 			root->rnode->count == 1 &&
 			root->rnode->slots[0]) {
 		struct radix_tree_node *to_free = root->rnode;
@@ -717,12 +767,8 @@ static inline void radix_tree_shrink(struct radix_tree_root *root)
 void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
 {
 	struct radix_tree_path path[RADIX_TREE_MAX_PATH], *pathp = path;
-	struct radix_tree_path *orig_pathp;
-	struct radix_tree_node *slot;
+	struct radix_tree_node *slot = NULL;
 	unsigned int height, shift;
-	void *ret = NULL;
-	char tags[RADIX_TREE_MAX_TAGS];
-	int nr_cleared_tags;
 	int tag;
 	int offset;
 
@@ -730,11 +776,17 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
 	if (index > radix_tree_maxindex(height))
 		goto out;
 
+	slot = root->rnode;
+	if (height == 0 && root->rnode) {
+		root_tag_clear_all(root);
+		root->rnode = NULL;
+		goto out;
+	}
+
 	shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
 	pathp->node = NULL;
-	slot = root->rnode;
 
-	for ( ; height > 0; height--) {
+	do {
 		if (slot == NULL)
 			goto out;
 
@@ -744,44 +796,22 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
 		pathp->node = slot;
 		slot = slot->slots[offset];
 		shift -= RADIX_TREE_MAP_SHIFT;
-	}
+		height--;
+	} while (height > 0);
 
-	ret = slot;
-	if (ret == NULL)
+	if (slot == NULL)
 		goto out;
 
-	orig_pathp = pathp;
-
 	/*
 	 * Clear all tags associated with the just-deleted item
 	 */
-	nr_cleared_tags = 0;
 	for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
-		tags[tag] = 1;
-		if (tag_get(pathp->node, tag, pathp->offset)) {
-			tag_clear(pathp->node, tag, pathp->offset);
-			if (!any_tag_set(pathp->node, tag)) {
-				tags[tag] = 0;
-				nr_cleared_tags++;
-			}
-		}
-	}
-
-	for (pathp--; nr_cleared_tags && pathp->node; pathp--) {
-		for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
-			if (tags[tag])
-				continue;
-
-			tag_clear(pathp->node, tag, pathp->offset);
-			if (any_tag_set(pathp->node, tag)) {
-				tags[tag] = 1;
-				nr_cleared_tags--;
-			}
-		}
+		if (tag_get(pathp->node, tag, pathp->offset))
+			radix_tree_tag_clear(root, index, tag);
 	}
 
 	/* Now free the nodes we do not need anymore */
-	for (pathp = orig_pathp; pathp->node; pathp--) {
+	while (pathp->node) {
 		pathp->node->slots[pathp->offset] = NULL;
 		pathp->node->count--;
 
@@ -793,11 +823,15 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
 
 		/* Node with zero slots in use so free it */
 		radix_tree_node_free(pathp->node);
+
+		pathp--;
 	}
-	root->rnode = NULL;
+	root_tag_clear_all(root);
 	root->height = 0;
+	root->rnode = NULL;
+
 out:
-	return ret;
+	return slot;
 }
 EXPORT_SYMBOL(radix_tree_delete);
 
@@ -808,11 +842,7 @@ EXPORT_SYMBOL(radix_tree_delete);
  */
 int radix_tree_tagged(struct radix_tree_root *root, unsigned int tag)
 {
-  	struct radix_tree_node *rnode;
-  	rnode = root->rnode;
-  	if (!rnode)
-  		return 0;
-	return any_tag_set(rnode, tag);
+	return root_tag_get(root, tag);
 }
 EXPORT_SYMBOL(radix_tree_tagged);
 
-- 
cgit v1.2.3


From 111ebb6e6f7bd7de6d722c5848e95621f43700d9 Mon Sep 17 00:00:00 2001
From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Date: Fri, 23 Jun 2006 02:03:26 -0700
Subject: [PATCH] writeback: fix range handling

When a writeback_control's `start' and `end' fields are used to
indicate a one-byte-range starting at file offset zero, the required
values of .start=0,.end=0 mean that the ->writepages() implementation
has no way of telling that it is being asked to perform a range
request.  Because we're currently overloading (start == 0 && end == 0)
to mean "this is not a write-a-range request".

To make all this sane, the patch changes range of writeback_control.

So caller does: If it is calling ->writepages() to write pages, it
sets range (range_start/end or range_cyclic) always.

And if range_cyclic is true, ->writepages() thinks the range is
cyclic, otherwise it just uses range_start and range_end.

This patch does,

    - Add LLONG_MAX, LLONG_MIN, ULLONG_MAX to include/linux/kernel.h
      -1 is usually ok for range_end (type is long long). But, if someone did,

		range_end += val;		range_end is "val - 1"
		u64val = range_end >> bits;	u64val is "~(0ULL)"

      or something, they are wrong. So, this adds LLONG_MAX to avoid nasty
      things, and uses LLONG_MAX for range_end.

    - All callers of ->writepages() sets range_start/end or range_cyclic.

    - Fix updates of ->writeback_index. It seems already bit strange.
      If it starts at 0 and ended by check of nr_to_write, this last
      index may reduce chance to scan end of file.  So, this updates
      ->writeback_index only if range_cyclic is true or whole-file is
      scanned.

Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Cc: Nathan Scott <nathans@sgi.com>
Cc: Anton Altaparmakov <aia21@cantab.net>
Cc: Steven French <sfrench@us.ibm.com>
Cc: "Vladimir V. Saveliev" <vs@namesys.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/cifs/file.c            | 24 +++++++++++-------------
 fs/fs-writeback.c         |  4 ++++
 fs/mpage.c                | 22 ++++++++++------------
 fs/sync.c                 |  2 +-
 include/linux/kernel.h    |  3 +++
 include/linux/writeback.h |  5 +++--
 mm/filemap.c              |  6 +++---
 mm/page-writeback.c       |  3 +++
 mm/vmscan.c               |  2 ++
 9 files changed, 40 insertions(+), 31 deletions(-)

(limited to 'include/linux')

diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index e2b4ce1dad66..487ea8b3baaa 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -1079,9 +1079,9 @@ static int cifs_writepages(struct address_space *mapping,
 	unsigned int bytes_written;
 	struct cifs_sb_info *cifs_sb;
 	int done = 0;
-	pgoff_t end = -1;
+	pgoff_t end;
 	pgoff_t index;
-	int is_range = 0;
+ 	int range_whole = 0;
 	struct kvec iov[32];
 	int len;
 	int n_iov = 0;
@@ -1122,16 +1122,14 @@ static int cifs_writepages(struct address_space *mapping,
 	xid = GetXid();
 
 	pagevec_init(&pvec, 0);
-	if (wbc->sync_mode == WB_SYNC_NONE)
+	if (wbc->range_cyclic) {
 		index = mapping->writeback_index; /* Start from prev offset */
-	else {
-		index = 0;
-		scanned = 1;
-	}
-	if (wbc->start || wbc->end) {
-		index = wbc->start >> PAGE_CACHE_SHIFT;
-		end = wbc->end >> PAGE_CACHE_SHIFT;
-		is_range = 1;
+		end = -1;
+	} else {
+		index = wbc->range_start >> PAGE_CACHE_SHIFT;
+		end = wbc->range_end >> PAGE_CACHE_SHIFT;
+		if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
+			range_whole = 1;
 		scanned = 1;
 	}
 retry:
@@ -1167,7 +1165,7 @@ retry:
 				break;
 			}
 
-			if (unlikely(is_range) && (page->index > end)) {
+			if (!wbc->range_cyclic && page->index > end) {
 				done = 1;
 				unlock_page(page);
 				break;
@@ -1271,7 +1269,7 @@ retry:
 		index = 0;
 		goto retry;
 	}
-	if (!is_range)
+	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
 		mapping->writeback_index = index;
 
 	FreeXid(xid);
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index f3fbe2d030f4..6db95cf3aaa2 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -461,6 +461,8 @@ void sync_inodes_sb(struct super_block *sb, int wait)
 {
 	struct writeback_control wbc = {
 		.sync_mode	= wait ? WB_SYNC_ALL : WB_SYNC_HOLD,
+		.range_start	= 0,
+		.range_end	= LLONG_MAX,
 	};
 	unsigned long nr_dirty = read_page_state(nr_dirty);
 	unsigned long nr_unstable = read_page_state(nr_unstable);
@@ -559,6 +561,8 @@ int write_inode_now(struct inode *inode, int sync)
 	struct writeback_control wbc = {
 		.nr_to_write = LONG_MAX,
 		.sync_mode = WB_SYNC_ALL,
+		.range_start = 0,
+		.range_end = LLONG_MAX,
 	};
 
 	if (!mapping_cap_writeback_dirty(inode->i_mapping))
diff --git a/fs/mpage.c b/fs/mpage.c
index 9bf2eb30e6f4..1e4598247d0b 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -707,9 +707,9 @@ mpage_writepages(struct address_space *mapping,
 	struct pagevec pvec;
 	int nr_pages;
 	pgoff_t index;
-	pgoff_t end = -1;		/* Inclusive */
+	pgoff_t end;		/* Inclusive */
 	int scanned = 0;
-	int is_range = 0;
+	int range_whole = 0;
 
 	if (wbc->nonblocking && bdi_write_congested(bdi)) {
 		wbc->encountered_congestion = 1;
@@ -721,16 +721,14 @@ mpage_writepages(struct address_space *mapping,
 		writepage = mapping->a_ops->writepage;
 
 	pagevec_init(&pvec, 0);
-	if (wbc->sync_mode == WB_SYNC_NONE) {
+	if (wbc->range_cyclic) {
 		index = mapping->writeback_index; /* Start from prev offset */
+		end = -1;
 	} else {
-		index = 0;			  /* whole-file sweep */
-		scanned = 1;
-	}
-	if (wbc->start || wbc->end) {
-		index = wbc->start >> PAGE_CACHE_SHIFT;
-		end = wbc->end >> PAGE_CACHE_SHIFT;
-		is_range = 1;
+		index = wbc->range_start >> PAGE_CACHE_SHIFT;
+		end = wbc->range_end >> PAGE_CACHE_SHIFT;
+		if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
+			range_whole = 1;
 		scanned = 1;
 	}
 retry:
@@ -759,7 +757,7 @@ retry:
 				continue;
 			}
 
-			if (unlikely(is_range) && page->index > end) {
+			if (!wbc->range_cyclic && page->index > end) {
 				done = 1;
 				unlock_page(page);
 				continue;
@@ -810,7 +808,7 @@ retry:
 		index = 0;
 		goto retry;
 	}
-	if (!is_range)
+	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
 		mapping->writeback_index = index;
 	if (bio)
 		mpage_bio_submit(WRITE, bio);
diff --git a/fs/sync.c b/fs/sync.c
index aab5ffe77e9f..955aef04da28 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -100,7 +100,7 @@ asmlinkage long sys_sync_file_range(int fd, loff_t offset, loff_t nbytes,
 	}
 
 	if (nbytes == 0)
-		endbyte = -1;
+		endbyte = LLONG_MAX;
 	else
 		endbyte--;		/* inclusive */
 
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index f4fc576ed4c4..25fccd859fbf 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -24,6 +24,9 @@ extern const char linux_banner[];
 #define LONG_MAX	((long)(~0UL>>1))
 #define LONG_MIN	(-LONG_MAX - 1)
 #define ULONG_MAX	(~0UL)
+#define LLONG_MAX	((long long)(~0ULL>>1))
+#define LLONG_MIN	(-LLONG_MAX - 1)
+#define ULLONG_MAX	(~0ULL)
 
 #define STACK_MAGIC	0xdeadbeef
 
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 56f92fcbe94a..9e38b566d0e7 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -50,14 +50,15 @@ struct writeback_control {
 	 * a hint that the filesystem need only write out the pages inside that
 	 * byterange.  The byte at `end' is included in the writeout request.
 	 */
-	loff_t start;
-	loff_t end;
+	loff_t range_start;
+	loff_t range_end;
 
 	unsigned nonblocking:1;		/* Don't get stuck on request queues */
 	unsigned encountered_congestion:1; /* An output: a queue is full */
 	unsigned for_kupdate:1;		/* A kupdate writeback */
 	unsigned for_reclaim:1;		/* Invoked from the page allocator */
 	unsigned for_writepages:1;	/* This is a writepages() call */
+	unsigned range_cyclic:1;	/* range_start is cyclic */
 };
 
 /*
diff --git a/mm/filemap.c b/mm/filemap.c
index fd57442186cb..3342067ca436 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -190,8 +190,8 @@ int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
 	struct writeback_control wbc = {
 		.sync_mode = sync_mode,
 		.nr_to_write = mapping->nrpages * 2,
-		.start = start,
-		.end = end,
+		.range_start = start,
+		.range_end = end,
 	};
 
 	if (!mapping_cap_writeback_dirty(mapping))
@@ -204,7 +204,7 @@ int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
 static inline int __filemap_fdatawrite(struct address_space *mapping,
 	int sync_mode)
 {
-	return __filemap_fdatawrite_range(mapping, 0, 0, sync_mode);
+	return __filemap_fdatawrite_range(mapping, 0, LLONG_MAX, sync_mode);
 }
 
 int filemap_fdatawrite(struct address_space *mapping)
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 75d7f48b79bb..8ccf6f1b1473 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -204,6 +204,7 @@ static void balance_dirty_pages(struct address_space *mapping)
 			.sync_mode	= WB_SYNC_NONE,
 			.older_than_this = NULL,
 			.nr_to_write	= write_chunk,
+			.range_cyclic	= 1,
 		};
 
 		get_dirty_limits(&wbs, &background_thresh,
@@ -331,6 +332,7 @@ static void background_writeout(unsigned long _min_pages)
 		.older_than_this = NULL,
 		.nr_to_write	= 0,
 		.nonblocking	= 1,
+		.range_cyclic	= 1,
 	};
 
 	for ( ; ; ) {
@@ -407,6 +409,7 @@ static void wb_kupdate(unsigned long arg)
 		.nr_to_write	= 0,
 		.nonblocking	= 1,
 		.for_kupdate	= 1,
+		.range_cyclic	= 1,
 	};
 
 	sync_supers();
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 46be8a02280e..bc5d4f43036c 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -339,6 +339,8 @@ pageout_t pageout(struct page *page, struct address_space *mapping)
 		struct writeback_control wbc = {
 			.sync_mode = WB_SYNC_NONE,
 			.nr_to_write = SWAP_CLUSTER_MAX,
+			.range_start = 0,
+			.range_end = LLONG_MAX,
 			.nonblocking = 1,
 			.for_reclaim = 1,
 		};
-- 
cgit v1.2.3


From e7340f73307abed9283d0a07570d06e228c205dd Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Fri, 23 Jun 2006 02:03:29 -0700
Subject: [PATCH] page migration cleanup: remove useless definitions

Remove the export for migrate_page_remove_references() and migrate_page_copy()
that are unlikely to be used directly by filesystems implementing migration.
The export was useful when buffer_migrate_page() lived in fs/buffer.c but it
has now been moved to migrate.c in the migration reorg.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/migrate.h | 2 --
 mm/migrate.c            | 6 ++----
 2 files changed, 2 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 6789c4940c9c..e8d3b08cc354 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -7,8 +7,6 @@
 extern int isolate_lru_page(struct page *p, struct list_head *pagelist);
 extern int putback_lru_pages(struct list_head *l);
 extern int migrate_page(struct page *, struct page *);
-extern void migrate_page_copy(struct page *, struct page *);
-extern int migrate_page_remove_references(struct page *, struct page *, int);
 extern int migrate_pages(struct list_head *l, struct list_head *t,
 		struct list_head *moved, struct list_head *failed);
 extern int migrate_pages_to(struct list_head *pagelist,
diff --git a/mm/migrate.c b/mm/migrate.c
index 49e71ddb6792..be3f141e53a4 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -169,7 +169,7 @@ retry:
  * Remove references for a page and establish the new page with the correct
  * basic settings to be able to stop accesses to the page.
  */
-int migrate_page_remove_references(struct page *newpage,
+static int migrate_page_remove_references(struct page *newpage,
 				struct page *page, int nr_refs)
 {
 	struct address_space *mapping = page_mapping(page);
@@ -246,12 +246,11 @@ int migrate_page_remove_references(struct page *newpage,
 
 	return 0;
 }
-EXPORT_SYMBOL(migrate_page_remove_references);
 
 /*
  * Copy the page to its new location
  */
-void migrate_page_copy(struct page *newpage, struct page *page)
+static void migrate_page_copy(struct page *newpage, struct page *page)
 {
 	copy_highpage(newpage, page);
 
@@ -286,7 +285,6 @@ void migrate_page_copy(struct page *newpage, struct page *page)
 	if (PageWriteback(newpage))
 		end_page_writeback(newpage);
 }
-EXPORT_SYMBOL(migrate_page_copy);
 
 /************************************************************
  *                    Migration functions
-- 
cgit v1.2.3


From 2d1db3b1170db4e8bf0531dd636742269c2cf579 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Fri, 23 Jun 2006 02:03:33 -0700
Subject: [PATCH] page migration cleanup: pass "mapping" to migration functions

Change handling of address spaces.

Pass a pointer to the address space in which the page is migrated to all
migration function.  This avoids repeatedly having to retrieve the address
space pointer from the page and checking it for validity.  The old page
mapping will change once migration has gone to a certain step, so it is less
confusing to have the pointer always available.

Move the setting of the mapping and index for the new page into
migrate_pages().

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/fs.h      |  6 +++--
 include/linux/migrate.h |  6 +++--
 mm/migrate.c            | 70 ++++++++++++++++++++++++-------------------------
 3 files changed, 42 insertions(+), 40 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index c823a3815e24..e917403f4d58 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -376,7 +376,8 @@ struct address_space_operations {
 	struct page* (*get_xip_page)(struct address_space *, sector_t,
 			int);
 	/* migrate the contents of a page to the specified target */
-	int (*migratepage) (struct page *, struct page *);
+	int (*migratepage) (struct address_space *,
+			struct page *, struct page *);
 };
 
 struct backing_dev_info;
@@ -1772,7 +1773,8 @@ extern void simple_release_fs(struct vfsmount **mount, int *count);
 extern ssize_t simple_read_from_buffer(void __user *, size_t, loff_t *, const void *, size_t);
 
 #ifdef CONFIG_MIGRATION
-extern int buffer_migrate_page(struct page *, struct page *);
+extern int buffer_migrate_page(struct address_space *,
+				struct page *, struct page *);
 #else
 #define buffer_migrate_page NULL
 #endif
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index e8d3b08cc354..287c47b5e5df 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -6,12 +6,14 @@
 #ifdef CONFIG_MIGRATION
 extern int isolate_lru_page(struct page *p, struct list_head *pagelist);
 extern int putback_lru_pages(struct list_head *l);
-extern int migrate_page(struct page *, struct page *);
+extern int migrate_page(struct address_space *,
+			struct page *, struct page *);
 extern int migrate_pages(struct list_head *l, struct list_head *t,
 		struct list_head *moved, struct list_head *failed);
 extern int migrate_pages_to(struct list_head *pagelist,
 			struct vm_area_struct *vma, int dest);
-extern int fail_migrate_page(struct page *, struct page *);
+extern int fail_migrate_page(struct address_space *,
+			struct page *, struct page *);
 
 extern int migrate_prep(void);
 
diff --git a/mm/migrate.c b/mm/migrate.c
index 8095c607a494..f65e69d94527 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -173,15 +173,11 @@ retry:
  * 2 for pages with a mapping
  * 3 for pages with a mapping and PagePrivate set.
  */
-static int migrate_page_move_mapping(struct page *newpage,
-				struct page *page)
+static int migrate_page_move_mapping(struct address_space *mapping,
+		struct page *newpage, struct page *page)
 {
-	struct address_space *mapping = page_mapping(page);
 	struct page **radix_pointer;
 
-	if (!mapping)
-		return -EAGAIN;
-
 	write_lock_irq(&mapping->tree_lock);
 
 	radix_pointer = (struct page **)radix_tree_lookup_slot(
@@ -197,15 +193,8 @@ static int migrate_page_move_mapping(struct page *newpage,
 
 	/*
 	 * Now we know that no one else is looking at the page.
-	 *
-	 * Certain minimal information about a page must be available
-	 * in order for other subsystems to properly handle the page if they
-	 * find it through the radix tree update before we are finished
-	 * copying the page.
 	 */
 	get_page(newpage);
-	newpage->index = page->index;
-	newpage->mapping = page->mapping;
 	if (PageSwapCache(page)) {
 		SetPageSwapCache(newpage);
 		set_page_private(newpage, page_private(page));
@@ -262,7 +251,8 @@ static void migrate_page_copy(struct page *newpage, struct page *page)
  ***********************************************************/
 
 /* Always fail migration. Used for mappings that are not movable */
-int fail_migrate_page(struct page *newpage, struct page *page)
+int fail_migrate_page(struct address_space *mapping,
+			struct page *newpage, struct page *page)
 {
 	return -EIO;
 }
@@ -274,13 +264,14 @@ EXPORT_SYMBOL(fail_migrate_page);
  *
  * Pages are locked upon entry and exit.
  */
-int migrate_page(struct page *newpage, struct page *page)
+int migrate_page(struct address_space *mapping,
+		struct page *newpage, struct page *page)
 {
 	int rc;
 
 	BUG_ON(PageWriteback(page));	/* Writeback must be complete */
 
-	rc = migrate_page_move_mapping(newpage, page);
+	rc = migrate_page_move_mapping(mapping, newpage, page);
 
 	if (rc)
 		return rc;
@@ -305,21 +296,18 @@ EXPORT_SYMBOL(migrate_page);
  * if the underlying filesystem guarantees that no other references to "page"
  * exist.
  */
-int buffer_migrate_page(struct page *newpage, struct page *page)
+int buffer_migrate_page(struct address_space *mapping,
+		struct page *newpage, struct page *page)
 {
-	struct address_space *mapping = page->mapping;
 	struct buffer_head *bh, *head;
 	int rc;
 
-	if (!mapping)
-		return -EAGAIN;
-
 	if (!page_has_buffers(page))
-		return migrate_page(newpage, page);
+		return migrate_page(mapping, newpage, page);
 
 	head = page_buffers(page);
 
-	rc = migrate_page_move_mapping(newpage, page);
+	rc = migrate_page_move_mapping(mapping, newpage, page);
 
 	if (rc)
 		return rc;
@@ -448,9 +436,6 @@ redo:
 			goto next;
 		}
 
-		newpage = lru_to_page(to);
-		lock_page(newpage);
-
 		/*
 		 * Establish swap ptes for anonymous pages or destroy pte
 		 * maps for files.
@@ -473,11 +458,18 @@ redo:
 		rc = -EPERM;
 		if (try_to_unmap(page, 1) == SWAP_FAIL)
 			/* A vma has VM_LOCKED set -> permanent failure */
-			goto unlock_both;
+			goto unlock_page;
 
 		rc = -EAGAIN;
 		if (page_mapped(page))
-			goto unlock_both;
+			goto unlock_page;
+
+		newpage = lru_to_page(to);
+		lock_page(newpage);
+		/* Prepare mapping for the new page.*/
+		newpage->index = page->index;
+		newpage->mapping = page->mapping;
+
 		/*
 		 * Pages are properly locked and writeback is complete.
 		 * Try to migrate the page.
@@ -494,7 +486,8 @@ redo:
 			 * own migration function. This is the most common
 			 * path for page migration.
 			 */
-			rc = mapping->a_ops->migratepage(newpage, page);
+			rc = mapping->a_ops->migratepage(mapping,
+							newpage, page);
 			goto unlock_both;
                 }
 
@@ -524,7 +517,7 @@ redo:
 		 */
 		if (!page_has_buffers(page) ||
 		    try_to_release_page(page, GFP_KERNEL)) {
-			rc = migrate_page(newpage, page);
+			rc = migrate_page(mapping, newpage, page);
 			goto unlock_both;
 		}
 
@@ -553,12 +546,17 @@ unlock_page:
 		unlock_page(page);
 
 next:
-		if (rc == -EAGAIN) {
-			retry++;
-		} else if (rc) {
-			/* Permanent failure */
-			list_move(&page->lru, failed);
-			nr_failed++;
+		if (rc) {
+			if (newpage)
+				newpage->mapping = NULL;
+
+			if (rc == -EAGAIN)
+				retry++;
+			else {
+				/* Permanent failure */
+				list_move(&page->lru, failed);
+				nr_failed++;
+			}
 		} else {
 			if (newpage) {
 				/* Successful migration. Return page to LRU */
-- 
cgit v1.2.3


From 0697212a411c1dae03c27845f2de2f3adb32c331 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Fri, 23 Jun 2006 02:03:35 -0700
Subject: [PATCH] Swapless page migration: add R/W migration entries

Implement read/write migration ptes

We take the upper two swapfiles for the two types of migration ptes and define
a series of macros in swapops.h.

The VM is modified to handle the migration entries.  migration entries can
only be encountered when the page they are pointing to is locked.  This limits
the number of places one has to fix.  We also check in copy_pte_range and in
mprotect_pte_range() for migration ptes.

We check for migration ptes in do_swap_cache and call a function that will
then wait on the page lock.  This allows us to effectively stop all accesses
to apge.

Migration entries are created by try_to_unmap if called for migration and
removed by local functions in migrate.c

From: Hugh Dickins <hugh@veritas.com>

  Several times while testing swapless page migration (I've no NUMA, just
  hacking it up to migrate recklessly while running load), I've hit the
  BUG_ON(!PageLocked(p)) in migration_entry_to_page.

  This comes from an orphaned migration entry, unrelated to the current
  correctly locked migration, but hit by remove_anon_migration_ptes as it
  checks an address in each vma of the anon_vma list.

  Such an orphan may be left behind if an earlier migration raced with fork:
  copy_one_pte can duplicate a migration entry from parent to child, after
  remove_anon_migration_ptes has checked the child vma, but before it has
  removed it from the parent vma.  (If the process were later to fault on this
  orphaned entry, it would hit the same BUG from migration_entry_wait.)

  This could be fixed by locking anon_vma in copy_one_pte, but we'd rather
  not.  There's no such problem with file pages, because vma_prio_tree_add
  adds child vma after parent vma, and the page table locking at each end is
  enough to serialize.  Follow that example with anon_vma: add new vmas to the
  tail instead of the head.

  (There's no corresponding problem when inserting migration entries,
  because a missed pte will leave the page count and mapcount high, which is
  allowed for.  And there's no corresponding problem when migrating via swap,
  because a leftover swap entry will be correctly faulted.  But the swapless
  method has no refcounting of its entries.)

From: Ingo Molnar <mingo@elte.hu>

  pte_unmap_unlock() takes the pte pointer as an argument.

From: Hugh Dickins <hugh@veritas.com>

  Several times while testing swapless page migration, gcc has tried to exec
  a pointer instead of a string: smells like COW mappings are not being
  properly write-protected on fork.

  The protection in copy_one_pte looks very convincing, until at last you
  realize that the second arg to make_migration_entry is a boolean "write",
  and SWP_MIGRATION_READ is 30.

  Anyway, it's better done like in change_pte_range, using
  is_write_migration_entry and make_migration_entry_read.

From: Hugh Dickins <hugh@veritas.com>

  Remove unnecessary obfuscation from sys_swapon's range check on swap type,
  which blew up causing memory corruption once swapless migration made
  MAX_SWAPFILES no longer 2 ^ MAX_SWAPFILES_SHIFT.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Acked-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Christoph Lameter <clameter@engr.sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
From: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/swap.h    |   7 +++
 include/linux/swapops.h |  53 ++++++++++++++++++++
 mm/memory.c             |  18 ++++++-
 mm/migrate.c            | 128 +++++++++++++++++++++++++++++++++++++++++++++++-
 mm/mprotect.c           |  23 +++++++--
 mm/rmap.c               |  38 ++++++++------
 mm/swapfile.c           |  20 +++-----
 7 files changed, 255 insertions(+), 32 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index cd28ad206dae..7cee73ef4f15 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -28,7 +28,14 @@ static inline int current_is_kswapd(void)
  * the type/offset into the pte as 5/27 as well.
  */
 #define MAX_SWAPFILES_SHIFT	5
+#ifndef CONFIG_MIGRATION
 #define MAX_SWAPFILES		(1 << MAX_SWAPFILES_SHIFT)
+#else
+/* Use last two entries for page migration swap entries */
+#define MAX_SWAPFILES		((1 << MAX_SWAPFILES_SHIFT)-2)
+#define SWP_MIGRATION_READ	MAX_SWAPFILES
+#define SWP_MIGRATION_WRITE	(MAX_SWAPFILES + 1)
+#endif
 
 /*
  * Magic header for a swap area. The first part of the union is
diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index 87b9d14c710d..ec639aa3a1d3 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -67,3 +67,56 @@ static inline pte_t swp_entry_to_pte(swp_entry_t entry)
 	BUG_ON(pte_file(__swp_entry_to_pte(arch_entry)));
 	return __swp_entry_to_pte(arch_entry);
 }
+
+#ifdef CONFIG_MIGRATION
+static inline swp_entry_t make_migration_entry(struct page *page, int write)
+{
+	BUG_ON(!PageLocked(page));
+	return swp_entry(write ? SWP_MIGRATION_WRITE : SWP_MIGRATION_READ,
+			page_to_pfn(page));
+}
+
+static inline int is_migration_entry(swp_entry_t entry)
+{
+	return unlikely(swp_type(entry) == SWP_MIGRATION_READ ||
+			swp_type(entry) == SWP_MIGRATION_WRITE);
+}
+
+static inline int is_write_migration_entry(swp_entry_t entry)
+{
+	return unlikely(swp_type(entry) == SWP_MIGRATION_WRITE);
+}
+
+static inline struct page *migration_entry_to_page(swp_entry_t entry)
+{
+	struct page *p = pfn_to_page(swp_offset(entry));
+	/*
+	 * Any use of migration entries may only occur while the
+	 * corresponding page is locked
+	 */
+	BUG_ON(!PageLocked(p));
+	return p;
+}
+
+static inline void make_migration_entry_read(swp_entry_t *entry)
+{
+	*entry = swp_entry(SWP_MIGRATION_READ, swp_offset(*entry));
+}
+
+extern void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
+					unsigned long address);
+#else
+
+#define make_migration_entry(page, write) swp_entry(0, 0)
+#define is_migration_entry(swp) 0
+#define migration_entry_to_page(swp) NULL
+static inline void make_migration_entry_read(swp_entry_t *entryp) { }
+static inline void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
+					 unsigned long address) { }
+static inline int is_write_migration_entry(swp_entry_t entry)
+{
+	return 0;
+}
+
+#endif
+
diff --git a/mm/memory.c b/mm/memory.c
index 7e3683fd4f3c..11673c5d2c20 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -434,7 +434,9 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	/* pte contains position in swap or file, so copy. */
 	if (unlikely(!pte_present(pte))) {
 		if (!pte_file(pte)) {
-			swap_duplicate(pte_to_swp_entry(pte));
+			swp_entry_t entry = pte_to_swp_entry(pte);
+
+			swap_duplicate(entry);
 			/* make sure dst_mm is on swapoff's mmlist. */
 			if (unlikely(list_empty(&dst_mm->mmlist))) {
 				spin_lock(&mmlist_lock);
@@ -443,6 +445,16 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 						 &src_mm->mmlist);
 				spin_unlock(&mmlist_lock);
 			}
+			if (is_write_migration_entry(entry) &&
+					is_cow_mapping(vm_flags)) {
+				/*
+				 * COW mappings require pages in both parent
+				 * and child to be set to read.
+				 */
+				make_migration_entry_read(&entry);
+				pte = swp_entry_to_pte(entry);
+				set_pte_at(src_mm, addr, src_pte, pte);
+			}
 		}
 		goto out_set_pte;
 	}
@@ -1879,6 +1891,10 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		goto out;
 
 	entry = pte_to_swp_entry(orig_pte);
+	if (is_migration_entry(entry)) {
+		migration_entry_wait(mm, pmd, address);
+		goto out;
+	}
 	page = lookup_swap_cache(entry);
 	if (!page) {
  		swapin_readahead(entry, address, vma);
diff --git a/mm/migrate.c b/mm/migrate.c
index 5a340f4ca212..0a011e421bb4 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -15,6 +15,7 @@
 #include <linux/migrate.h>
 #include <linux/module.h>
 #include <linux/swap.h>
+#include <linux/swapops.h>
 #include <linux/pagemap.h>
 #include <linux/buffer_head.h>
 #include <linux/mm_inline.h>
@@ -23,7 +24,6 @@
 #include <linux/topology.h>
 #include <linux/cpu.h>
 #include <linux/cpuset.h>
-#include <linux/swapops.h>
 
 #include "internal.h"
 
@@ -119,6 +119,132 @@ int putback_lru_pages(struct list_head *l)
 	return count;
 }
 
+static inline int is_swap_pte(pte_t pte)
+{
+	return !pte_none(pte) && !pte_present(pte) && !pte_file(pte);
+}
+
+/*
+ * Restore a potential migration pte to a working pte entry
+ */
+static void remove_migration_pte(struct vm_area_struct *vma, unsigned long addr,
+		struct page *old, struct page *new)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	swp_entry_t entry;
+ 	pgd_t *pgd;
+ 	pud_t *pud;
+ 	pmd_t *pmd;
+	pte_t *ptep, pte;
+ 	spinlock_t *ptl;
+
+ 	pgd = pgd_offset(mm, addr);
+	if (!pgd_present(*pgd))
+                return;
+
+	pud = pud_offset(pgd, addr);
+	if (!pud_present(*pud))
+                return;
+
+	pmd = pmd_offset(pud, addr);
+	if (!pmd_present(*pmd))
+		return;
+
+	ptep = pte_offset_map(pmd, addr);
+
+	if (!is_swap_pte(*ptep)) {
+		pte_unmap(ptep);
+ 		return;
+ 	}
+
+ 	ptl = pte_lockptr(mm, pmd);
+ 	spin_lock(ptl);
+	pte = *ptep;
+	if (!is_swap_pte(pte))
+		goto out;
+
+	entry = pte_to_swp_entry(pte);
+
+	if (!is_migration_entry(entry) || migration_entry_to_page(entry) != old)
+		goto out;
+
+	inc_mm_counter(mm, anon_rss);
+	get_page(new);
+	pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
+	if (is_write_migration_entry(entry))
+		pte = pte_mkwrite(pte);
+	set_pte_at(mm, addr, ptep, pte);
+	page_add_anon_rmap(new, vma, addr);
+out:
+	pte_unmap_unlock(ptep, ptl);
+}
+
+/*
+ * Get rid of all migration entries and replace them by
+ * references to the indicated page.
+ *
+ * Must hold mmap_sem lock on at least one of the vmas containing
+ * the page so that the anon_vma cannot vanish.
+ */
+static void remove_migration_ptes(struct page *old, struct page *new)
+{
+	struct anon_vma *anon_vma;
+	struct vm_area_struct *vma;
+	unsigned long mapping;
+
+	mapping = (unsigned long)new->mapping;
+
+	if (!mapping || (mapping & PAGE_MAPPING_ANON) == 0)
+		return;
+
+	/*
+	 * We hold the mmap_sem lock. So no need to call page_lock_anon_vma.
+	 */
+	anon_vma = (struct anon_vma *) (mapping - PAGE_MAPPING_ANON);
+	spin_lock(&anon_vma->lock);
+
+	list_for_each_entry(vma, &anon_vma->head, anon_vma_node)
+		remove_migration_pte(vma, page_address_in_vma(new, vma),
+					old, new);
+
+	spin_unlock(&anon_vma->lock);
+}
+
+/*
+ * Something used the pte of a page under migration. We need to
+ * get to the page and wait until migration is finished.
+ * When we return from this function the fault will be retried.
+ *
+ * This function is called from do_swap_page().
+ */
+void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
+				unsigned long address)
+{
+	pte_t *ptep, pte;
+	spinlock_t *ptl;
+	swp_entry_t entry;
+	struct page *page;
+
+	ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
+	pte = *ptep;
+	if (!is_swap_pte(pte))
+		goto out;
+
+	entry = pte_to_swp_entry(pte);
+	if (!is_migration_entry(entry))
+		goto out;
+
+	page = migration_entry_to_page(entry);
+
+	get_page(page);
+	pte_unmap_unlock(ptep, ptl);
+	wait_on_page_locked(page);
+	put_page(page);
+	return;
+out:
+	pte_unmap_unlock(ptep, ptl);
+}
+
 /*
  * swapout a single page
  * page is locked upon entry, unlocked on exit
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 5faf01ad3ef8..14f93e62270f 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -19,7 +19,8 @@
 #include <linux/mempolicy.h>
 #include <linux/personality.h>
 #include <linux/syscalls.h>
-
+#include <linux/swap.h>
+#include <linux/swapops.h>
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
 #include <asm/cacheflush.h>
@@ -28,12 +29,13 @@
 static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
 		unsigned long addr, unsigned long end, pgprot_t newprot)
 {
-	pte_t *pte;
+	pte_t *pte, oldpte;
 	spinlock_t *ptl;
 
 	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
 	do {
-		if (pte_present(*pte)) {
+		oldpte = *pte;
+		if (pte_present(oldpte)) {
 			pte_t ptent;
 
 			/* Avoid an SMP race with hardware updated dirty/clean
@@ -43,7 +45,22 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
 			ptent = pte_modify(ptep_get_and_clear(mm, addr, pte), newprot);
 			set_pte_at(mm, addr, pte, ptent);
 			lazy_mmu_prot_update(ptent);
+#ifdef CONFIG_MIGRATION
+		} else if (!pte_file(oldpte)) {
+			swp_entry_t entry = pte_to_swp_entry(oldpte);
+
+			if (is_write_migration_entry(entry)) {
+				/*
+				 * A protection check is difficult so
+				 * just be safe and disable write
+				 */
+				make_migration_entry_read(&entry);
+				set_pte_at(mm, addr, pte,
+					swp_entry_to_pte(entry));
+			}
+#endif
 		}
+
 	} while (pte++, addr += PAGE_SIZE, addr != end);
 	pte_unmap_unlock(pte - 1, ptl);
 }
diff --git a/mm/rmap.c b/mm/rmap.c
index 10806b7af40c..3b8ce86daa3a 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -103,7 +103,7 @@ int anon_vma_prepare(struct vm_area_struct *vma)
 		spin_lock(&mm->page_table_lock);
 		if (likely(!vma->anon_vma)) {
 			vma->anon_vma = anon_vma;
-			list_add(&vma->anon_vma_node, &anon_vma->head);
+			list_add_tail(&vma->anon_vma_node, &anon_vma->head);
 			allocated = NULL;
 		}
 		spin_unlock(&mm->page_table_lock);
@@ -127,7 +127,7 @@ void __anon_vma_link(struct vm_area_struct *vma)
 	struct anon_vma *anon_vma = vma->anon_vma;
 
 	if (anon_vma) {
-		list_add(&vma->anon_vma_node, &anon_vma->head);
+		list_add_tail(&vma->anon_vma_node, &anon_vma->head);
 		validate_anon_vma(vma);
 	}
 }
@@ -138,7 +138,7 @@ void anon_vma_link(struct vm_area_struct *vma)
 
 	if (anon_vma) {
 		spin_lock(&anon_vma->lock);
-		list_add(&vma->anon_vma_node, &anon_vma->head);
+		list_add_tail(&vma->anon_vma_node, &anon_vma->head);
 		validate_anon_vma(vma);
 		spin_unlock(&anon_vma->lock);
 	}
@@ -620,17 +620,27 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 
 	if (PageAnon(page)) {
 		swp_entry_t entry = { .val = page_private(page) };
-		/*
-		 * Store the swap location in the pte.
-		 * See handle_pte_fault() ...
-		 */
-		BUG_ON(!PageSwapCache(page));
-		swap_duplicate(entry);
-		if (list_empty(&mm->mmlist)) {
-			spin_lock(&mmlist_lock);
-			if (list_empty(&mm->mmlist))
-				list_add(&mm->mmlist, &init_mm.mmlist);
-			spin_unlock(&mmlist_lock);
+
+		if (PageSwapCache(page)) {
+			/*
+			 * Store the swap location in the pte.
+			 * See handle_pte_fault() ...
+			 */
+			swap_duplicate(entry);
+			if (list_empty(&mm->mmlist)) {
+				spin_lock(&mmlist_lock);
+				if (list_empty(&mm->mmlist))
+					list_add(&mm->mmlist, &init_mm.mmlist);
+				spin_unlock(&mmlist_lock);
+			}
+		} else {
+			/*
+			 * Store the pfn of the page in a special migration
+			 * pte. do_swap_page() will wait until the migration
+			 * pte is removed and then restart fault handling.
+			 */
+			BUG_ON(!migration);
+			entry = make_migration_entry(page, pte_write(pteval));
 		}
 		set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
 		BUG_ON(pte_file(*pte));
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 47a6812f5f8c..e3b1362372c2 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -395,6 +395,9 @@ void free_swap_and_cache(swp_entry_t entry)
 	struct swap_info_struct * p;
 	struct page *page = NULL;
 
+	if (is_migration_entry(entry))
+		return;
+
 	p = swap_info_get(entry);
 	if (p) {
 		if (swap_entry_free(p, swp_offset(entry)) == 1) {
@@ -1400,19 +1403,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
 		if (!(p->flags & SWP_USED))
 			break;
 	error = -EPERM;
-	/*
-	 * Test if adding another swap device is possible. There are
-	 * two limiting factors: 1) the number of bits for the swap
-	 * type swp_entry_t definition and 2) the number of bits for
-	 * the swap type in the swap ptes as defined by the different
-	 * architectures. To honor both limitations a swap entry
-	 * with swap offset 0 and swap type ~0UL is created, encoded
-	 * to a swap pte, decoded to a swp_entry_t again and finally
-	 * the swap type part is extracted. This will mask all bits
-	 * from the initial ~0UL that can't be encoded in either the
-	 * swp_entry_t or the architecture definition of a swap pte.
-	 */
-	if (type > swp_type(pte_to_swp_entry(swp_entry_to_pte(swp_entry(~0UL,0))))) {
+	if (type >= MAX_SWAPFILES) {
 		spin_unlock(&swap_lock);
 		goto out;
 	}
@@ -1702,6 +1693,9 @@ int swap_duplicate(swp_entry_t entry)
 	unsigned long offset, type;
 	int result = 0;
 
+	if (is_migration_entry(entry))
+		return 1;
+
 	type = swp_type(entry);
 	if (type >= nr_swapfiles)
 		goto bad_file;
-- 
cgit v1.2.3


From d75a0fcda2cfc71b50e16dc89e0c32c57d427e85 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Fri, 23 Jun 2006 02:03:36 -0700
Subject: [PATCH] Swapless page migration: rip out swap based logic

Rip the page migration logic out.

Remove all code that has to do with swapping during page migration.

This also guts the ability to migrate pages to swap.  No one used that so lets
let it go for good.

Page migration should be a bit broken after this patch.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/rmap.h |  1 -
 mm/migrate.c         | 75 +++-------------------------------------------------
 mm/rmap.c            | 38 --------------------------
 mm/swapfile.c        |  9 -------
 4 files changed, 3 insertions(+), 120 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 2d4c81a220db..bf97b0900014 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -91,7 +91,6 @@ static inline void page_dup_rmap(struct page *page)
  */
 int page_referenced(struct page *, int is_locked);
 int try_to_unmap(struct page *, int ignore_refs);
-void remove_from_swap(struct page *page);
 
 /*
  * Called from mm/filemap_xip.c to unmap empty zero page
diff --git a/mm/migrate.c b/mm/migrate.c
index 0a011e421bb4..81721a061d50 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -70,10 +70,6 @@ int isolate_lru_page(struct page *page, struct list_head *pagelist)
  */
 int migrate_prep(void)
 {
-	/* Must have swap device for migration */
-	if (nr_swap_pages <= 0)
-		return -ENODEV;
-
 	/*
 	 * Clear the LRU lists so pages can be isolated.
 	 * Note that pages may be moved off the LRU after we have
@@ -245,52 +241,6 @@ out:
 	pte_unmap_unlock(ptep, ptl);
 }
 
-/*
- * swapout a single page
- * page is locked upon entry, unlocked on exit
- */
-static int swap_page(struct page *page)
-{
-	struct address_space *mapping = page_mapping(page);
-
-	if (page_mapped(page) && mapping)
-		if (try_to_unmap(page, 1) != SWAP_SUCCESS)
-			goto unlock_retry;
-
-	if (PageDirty(page)) {
-		/* Page is dirty, try to write it out here */
-		switch(pageout(page, mapping)) {
-		case PAGE_KEEP:
-		case PAGE_ACTIVATE:
-			goto unlock_retry;
-
-		case PAGE_SUCCESS:
-			goto retry;
-
-		case PAGE_CLEAN:
-			; /* try to free the page below */
-		}
-	}
-
-	if (PagePrivate(page)) {
-		if (!try_to_release_page(page, GFP_KERNEL) ||
-		    (!mapping && page_count(page) == 1))
-			goto unlock_retry;
-	}
-
-	if (remove_mapping(mapping, page)) {
-		/* Success */
-		unlock_page(page);
-		return 0;
-	}
-
-unlock_retry:
-	unlock_page(page);
-
-retry:
-	return -EAGAIN;
-}
-
 /*
  * Replace the page in the mapping.
  *
@@ -517,8 +467,7 @@ static int fallback_migrate_page(struct address_space *mapping,
  * Two lists are passed to this function. The first list
  * contains the pages isolated from the LRU to be migrated.
  * The second list contains new pages that the pages isolated
- * can be moved to. If the second list is NULL then all
- * pages are swapped out.
+ * can be moved to.
  *
  * The function returns after 10 attempts or if no pages
  * are movable anymore because to has become empty
@@ -574,29 +523,11 @@ redo:
 		 * Only wait on writeback if we have already done a pass where
 		 * we we may have triggered writeouts for lots of pages.
 		 */
-		if (pass > 0) {
+		if (pass > 0)
 			wait_on_page_writeback(page);
-		} else {
+		else
 			if (PageWriteback(page))
 				goto unlock_page;
-		}
-
-		/*
-		 * Anonymous pages must have swap cache references otherwise
-		 * the information contained in the page maps cannot be
-		 * preserved.
-		 */
-		if (PageAnon(page) && !PageSwapCache(page)) {
-			if (!add_to_swap(page, GFP_KERNEL)) {
-				rc = -ENOMEM;
-				goto unlock_page;
-			}
-		}
-
-		if (!to) {
-			rc = swap_page(page);
-			goto next;
-		}
 
 		/*
 		 * Establish swap ptes for anonymous pages or destroy pte
diff --git a/mm/rmap.c b/mm/rmap.c
index 3b8ce86daa3a..a53a10b93ecf 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -205,44 +205,6 @@ out:
 	return anon_vma;
 }
 
-#ifdef CONFIG_MIGRATION
-/*
- * Remove an anonymous page from swap replacing the swap pte's
- * through real pte's pointing to valid pages and then releasing
- * the page from the swap cache.
- *
- * Must hold page lock on page and mmap_sem of one vma that contains
- * the page.
- */
-void remove_from_swap(struct page *page)
-{
-	struct anon_vma *anon_vma;
-	struct vm_area_struct *vma;
-	unsigned long mapping;
-
-	if (!PageSwapCache(page))
-		return;
-
-	mapping = (unsigned long)page->mapping;
-
-	if (!mapping || (mapping & PAGE_MAPPING_ANON) == 0)
-		return;
-
-	/*
-	 * We hold the mmap_sem lock. So no need to call page_lock_anon_vma.
-	 */
-	anon_vma = (struct anon_vma *) (mapping - PAGE_MAPPING_ANON);
-	spin_lock(&anon_vma->lock);
-
-	list_for_each_entry(vma, &anon_vma->head, anon_vma_node)
-		remove_vma_swap(vma, page);
-
-	spin_unlock(&anon_vma->lock);
-	delete_from_swap_cache(page);
-}
-EXPORT_SYMBOL(remove_from_swap);
-#endif
-
 /*
  * At what user virtual address is page expected in vma?
  */
diff --git a/mm/swapfile.c b/mm/swapfile.c
index e3b1362372c2..fbceed67a075 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -618,15 +618,6 @@ static int unuse_mm(struct mm_struct *mm,
 	return 0;
 }
 
-#ifdef CONFIG_MIGRATION
-int remove_vma_swap(struct vm_area_struct *vma, struct page *page)
-{
-	swp_entry_t entry = { .val = page_private(page) };
-
-	return unuse_vma(vma, entry, page);
-}
-#endif
-
 /*
  * Scan swap_map from current position to next entry still in use.
  * Recycle to start on reaching the end, returning 0 when empty.
-- 
cgit v1.2.3


From 04e62a29bf157ce1edd168f2b71b533c80d13628 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Fri, 23 Jun 2006 02:03:38 -0700
Subject: [PATCH] More page migration: use migration entries for file pages

This implements the use of migration entries to preserve ptes of file backed
pages during migration.  Processes can therefore be migrated back and forth
without loosing their connection to pagecache pages.

Note that we implement the migration entries only for linear mappings.
Nonlinear mappings still require the unmapping of the ptes for migration.

And another writepage() ugliness shows up.  writepage() can drop the page
lock.  Therefore we have to remove migration ptes before calling writepages()
in order to avoid having migration entries point to unlocked pages.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/swap.h |  15 ------
 mm/migrate.c         | 127 ++++++++++++++++++++++++++++++++++++++++-----------
 mm/rmap.c            |  11 +++++
 mm/vmscan.c          |  14 +++++-
 4 files changed, 124 insertions(+), 43 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 7cee73ef4f15..1cf234e8df55 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -186,20 +186,6 @@ extern unsigned long shrink_all_memory(unsigned long nr_pages);
 extern int vm_swappiness;
 extern int remove_mapping(struct address_space *mapping, struct page *page);
 
-/* possible outcome of pageout() */
-typedef enum {
-	/* failed to write page out, page is locked */
-	PAGE_KEEP,
-	/* move page to the active list, page is locked */
-	PAGE_ACTIVATE,
-	/* page has been sent to the disk successfully, page is unlocked */
-	PAGE_SUCCESS,
-	/* page is clean and locked */
-	PAGE_CLEAN,
-} pageout_t;
-
-extern pageout_t pageout(struct page *page, struct address_space *mapping);
-
 #ifdef CONFIG_NUMA
 extern int zone_reclaim_mode;
 extern int zone_reclaim_interval;
@@ -259,7 +245,6 @@ extern int remove_exclusive_swap_page(struct page *);
 struct backing_dev_info;
 
 extern spinlock_t swap_lock;
-extern int remove_vma_swap(struct vm_area_struct *vma, struct page *page);
 
 /* linux/mm/thrash.c */
 extern struct mm_struct * swap_token_mm;
diff --git a/mm/migrate.c b/mm/migrate.c
index 96b9546e69e0..b5000d463893 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -24,6 +24,7 @@
 #include <linux/topology.h>
 #include <linux/cpu.h>
 #include <linux/cpuset.h>
+#include <linux/writeback.h>
 
 #include "internal.h"
 
@@ -123,7 +124,7 @@ static inline int is_swap_pte(pte_t pte)
 /*
  * Restore a potential migration pte to a working pte entry
  */
-static void remove_migration_pte(struct vm_area_struct *vma, unsigned long addr,
+static void remove_migration_pte(struct vm_area_struct *vma,
 		struct page *old, struct page *new)
 {
 	struct mm_struct *mm = vma->vm_mm;
@@ -133,6 +134,10 @@ static void remove_migration_pte(struct vm_area_struct *vma, unsigned long addr,
  	pmd_t *pmd;
 	pte_t *ptep, pte;
  	spinlock_t *ptl;
+	unsigned long addr = page_address_in_vma(new, vma);
+
+	if (addr == -EFAULT)
+		return;
 
  	pgd = pgd_offset(mm, addr);
 	if (!pgd_present(*pgd))
@@ -169,19 +174,47 @@ static void remove_migration_pte(struct vm_area_struct *vma, unsigned long addr,
 	if (is_write_migration_entry(entry))
 		pte = pte_mkwrite(pte);
 	set_pte_at(mm, addr, ptep, pte);
-	page_add_anon_rmap(new, vma, addr);
+
+	if (PageAnon(new))
+		page_add_anon_rmap(new, vma, addr);
+	else
+		page_add_file_rmap(new);
+
+	/* No need to invalidate - it was non-present before */
+	update_mmu_cache(vma, addr, pte);
+	lazy_mmu_prot_update(pte);
+
 out:
 	pte_unmap_unlock(ptep, ptl);
 }
 
 /*
- * Get rid of all migration entries and replace them by
- * references to the indicated page.
- *
+ * Note that remove_file_migration_ptes will only work on regular mappings,
+ * Nonlinear mappings do not use migration entries.
+ */
+static void remove_file_migration_ptes(struct page *old, struct page *new)
+{
+	struct vm_area_struct *vma;
+	struct address_space *mapping = page_mapping(new);
+	struct prio_tree_iter iter;
+	pgoff_t pgoff = new->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+
+	if (!mapping)
+		return;
+
+	spin_lock(&mapping->i_mmap_lock);
+
+	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff)
+		remove_migration_pte(vma, old, new);
+
+	spin_unlock(&mapping->i_mmap_lock);
+}
+
+/*
  * Must hold mmap_sem lock on at least one of the vmas containing
  * the page so that the anon_vma cannot vanish.
  */
-static void remove_migration_ptes(struct page *old, struct page *new)
+static void remove_anon_migration_ptes(struct page *old, struct page *new)
 {
 	struct anon_vma *anon_vma;
 	struct vm_area_struct *vma;
@@ -199,12 +232,23 @@ static void remove_migration_ptes(struct page *old, struct page *new)
 	spin_lock(&anon_vma->lock);
 
 	list_for_each_entry(vma, &anon_vma->head, anon_vma_node)
-		remove_migration_pte(vma, page_address_in_vma(new, vma),
-					old, new);
+		remove_migration_pte(vma, old, new);
 
 	spin_unlock(&anon_vma->lock);
 }
 
+/*
+ * Get rid of all migration entries and replace them by
+ * references to the indicated page.
+ */
+static void remove_migration_ptes(struct page *old, struct page *new)
+{
+	if (PageAnon(new))
+		remove_anon_migration_ptes(old, new);
+	else
+		remove_file_migration_ptes(old, new);
+}
+
 /*
  * Something used the pte of a page under migration. We need to
  * get to the page and wait until migration is finished.
@@ -424,30 +468,59 @@ int buffer_migrate_page(struct address_space *mapping,
 }
 EXPORT_SYMBOL(buffer_migrate_page);
 
-static int fallback_migrate_page(struct address_space *mapping,
-	struct page *newpage, struct page *page)
+/*
+ * Writeback a page to clean the dirty state
+ */
+static int writeout(struct address_space *mapping, struct page *page)
 {
+	struct writeback_control wbc = {
+		.sync_mode = WB_SYNC_NONE,
+		.nr_to_write = 1,
+		.range_start = 0,
+		.range_end = LLONG_MAX,
+		.nonblocking = 1,
+		.for_reclaim = 1
+	};
+	int rc;
+
+	if (!mapping->a_ops->writepage)
+		/* No write method for the address space */
+		return -EINVAL;
+
+	if (!clear_page_dirty_for_io(page))
+		/* Someone else already triggered a write */
+		return -EAGAIN;
+
 	/*
-	 * Default handling if a filesystem does not provide
-	 * a migration function. We can only migrate clean
-	 * pages so try to write out any dirty pages first.
+	 * A dirty page may imply that the underlying filesystem has
+	 * the page on some queue. So the page must be clean for
+	 * migration. Writeout may mean we loose the lock and the
+	 * page state is no longer what we checked for earlier.
+	 * At this point we know that the migration attempt cannot
+	 * be successful.
 	 */
-	if (PageDirty(page)) {
-		switch (pageout(page, mapping)) {
-		case PAGE_KEEP:
-		case PAGE_ACTIVATE:
-			return -EAGAIN;
+	remove_migration_ptes(page, page);
 
-		case PAGE_SUCCESS:
-			/* Relock since we lost the lock */
-			lock_page(page);
-			/* Must retry since page state may have changed */
-			return -EAGAIN;
+	rc = mapping->a_ops->writepage(page, &wbc);
+	if (rc < 0)
+		/* I/O Error writing */
+		return -EIO;
 
-		case PAGE_CLEAN:
-			; /* try to migrate the page below */
-		}
-	}
+	if (rc != AOP_WRITEPAGE_ACTIVATE)
+		/* unlocked. Relock */
+		lock_page(page);
+
+	return -EAGAIN;
+}
+
+/*
+ * Default handling if a filesystem does not provide a migration function.
+ */
+static int fallback_migrate_page(struct address_space *mapping,
+	struct page *newpage, struct page *page)
+{
+	if (PageDirty(page))
+		return writeout(mapping, page);
 
 	/*
 	 * Buffers may be managed in a filesystem specific way.
diff --git a/mm/rmap.c b/mm/rmap.c
index 05d6d73a692d..882a85826bb2 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -596,6 +596,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 				spin_unlock(&mmlist_lock);
 			}
 			dec_mm_counter(mm, anon_rss);
+#ifdef CONFIG_MIGRATION
 		} else {
 			/*
 			 * Store the pfn of the page in a special migration
@@ -604,12 +605,22 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 			 */
 			BUG_ON(!migration);
 			entry = make_migration_entry(page, pte_write(pteval));
+#endif
 		}
 		set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
 		BUG_ON(pte_file(*pte));
 	} else
+#ifdef CONFIG_MIGRATION
+	if (migration) {
+		/* Establish migration entry for a file page */
+		swp_entry_t entry;
+		entry = make_migration_entry(page, pte_write(pteval));
+		set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
+	} else
+#endif
 		dec_mm_counter(mm, file_rss);
 
+
 	page_remove_rmap(page);
 	page_cache_release(page);
 
diff --git a/mm/vmscan.c b/mm/vmscan.c
index bc5d4f43036c..71a02e295037 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -290,11 +290,23 @@ static void handle_write_error(struct address_space *mapping,
 	unlock_page(page);
 }
 
+/* possible outcome of pageout() */
+typedef enum {
+	/* failed to write page out, page is locked */
+	PAGE_KEEP,
+	/* move page to the active list, page is locked */
+	PAGE_ACTIVATE,
+	/* page has been sent to the disk successfully, page is unlocked */
+	PAGE_SUCCESS,
+	/* page is clean and locked */
+	PAGE_CLEAN,
+} pageout_t;
+
 /*
  * pageout is called by shrink_page_list() for each dirty page.
  * Calls ->writepage().
  */
-pageout_t pageout(struct page *page, struct address_space *mapping)
+static pageout_t pageout(struct page *page, struct address_space *mapping)
 {
 	/*
 	 * If the page is dirty, only perform writeback if that write
-- 
cgit v1.2.3


From 30c253e6da655d73eb8bfe2adca9b8f4d82fb81e Mon Sep 17 00:00:00 2001
From: Andy Whitcroft <apw@shadowen.org>
Date: Fri, 23 Jun 2006 02:03:41 -0700
Subject: [PATCH] sparsemem: record nid during memory present

Record the node id as we mark sections for instantiation.  Use this nid
during instantiation to direct allocations.

Signed-off-by: Andy Whitcroft <apw@shadowen.org>
Cc: Mike Kravetz <kravetz@us.ibm.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Bob Picco <bob.picco@hp.com>
Cc: Jack Steiner <steiner@sgi.com>
Cc: Yasunori Goto <y-goto@jp.fujitsu.com>
Cc: Martin Bligh <mbligh@google.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mmzone.h |  5 +++++
 mm/sparse.c            | 22 ++++++++++++++++++++--
 2 files changed, 25 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index e82fc1a52cd0..d6120fa69116 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -509,6 +509,10 @@ struct mem_section {
 	 * pages.  However, it is stored with some other magic.
 	 * (see sparse.c::sparse_init_one_section())
 	 *
+	 * Additionally during early boot we encode node id of
+	 * the location of the section here to guide allocation.
+	 * (see sparse.c::memory_present())
+	 *
 	 * Making it a UL at least makes someone do a cast
 	 * before using it wrong.
 	 */
@@ -548,6 +552,7 @@ extern int __section_nr(struct mem_section* ms);
 #define SECTION_HAS_MEM_MAP	(1UL<<1)
 #define SECTION_MAP_LAST_BIT	(1UL<<2)
 #define SECTION_MAP_MASK	(~(SECTION_MAP_LAST_BIT-1))
+#define SECTION_NID_SHIFT	2
 
 static inline struct page *__section_mem_map_addr(struct mem_section *section)
 {
diff --git a/mm/sparse.c b/mm/sparse.c
index 100040c0dfb6..e0a3fe48aa37 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -99,6 +99,22 @@ int __section_nr(struct mem_section* ms)
 	return (root_nr * SECTIONS_PER_ROOT) + (ms - root);
 }
 
+/*
+ * During early boot, before section_mem_map is used for an actual
+ * mem_map, we use section_mem_map to store the section's NUMA
+ * node.  This keeps us from having to use another data structure.  The
+ * node information is cleared just before we store the real mem_map.
+ */
+static inline unsigned long sparse_encode_early_nid(int nid)
+{
+	return (nid << SECTION_NID_SHIFT);
+}
+
+static inline int sparse_early_nid(struct mem_section *section)
+{
+	return (section->section_mem_map >> SECTION_NID_SHIFT);
+}
+
 /* Record a memory area against a node. */
 void memory_present(int nid, unsigned long start, unsigned long end)
 {
@@ -113,7 +129,8 @@ void memory_present(int nid, unsigned long start, unsigned long end)
 
 		ms = __nr_to_section(section);
 		if (!ms->section_mem_map)
-			ms->section_mem_map = SECTION_MARKED_PRESENT;
+			ms->section_mem_map = sparse_encode_early_nid(nid) |
+							SECTION_MARKED_PRESENT;
 	}
 }
 
@@ -164,6 +181,7 @@ static int sparse_init_one_section(struct mem_section *ms,
 	if (!valid_section(ms))
 		return -EINVAL;
 
+	ms->section_mem_map &= ~SECTION_MAP_MASK;
 	ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum);
 
 	return 1;
@@ -172,8 +190,8 @@ static int sparse_init_one_section(struct mem_section *ms,
 static struct page *sparse_early_mem_map_alloc(unsigned long pnum)
 {
 	struct page *map;
-	int nid = early_pfn_to_nid(section_nr_to_pfn(pnum));
 	struct mem_section *ms = __nr_to_section(pnum);
+	int nid = sparse_early_nid(ms);
 
 	map = alloc_remap(nid, sizeof(struct page) * PAGES_PER_SECTION);
 	if (map)
-- 
cgit v1.2.3


From bd96b9eb7cfd6ab24ba244360a09980a720874d2 Mon Sep 17 00:00:00 2001
From: Con Kolivas <kernel@kolivas.org>
Date: Fri, 23 Jun 2006 02:03:42 -0700
Subject: [PATCH] mm: fix swap unused warning
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

If CONFIG_SWAP is not defined we get:

mm/vmscan.c: In function âremove_mappingâ:
mm/vmscan.c:387: warning: unused variable âswapâ

Convert defines in swap.h into blank inline functions to fix this warning
and be consistent.

Signed-off-by: Con Kolivas <kernel@kolivas.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/swap.h | 64 +++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 53 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 1cf234e8df55..f1a827a972e0 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -282,18 +282,60 @@ static inline void disable_swap_token(void)
 #define free_pages_and_swap_cache(pages, nr) \
 	release_pages((pages), (nr), 0);
 
-#define show_swap_cache_info()			/*NOTHING*/
-#define free_swap_and_cache(swp)		/*NOTHING*/
-#define swap_duplicate(swp)			/*NOTHING*/
-#define swap_free(swp)				/*NOTHING*/
-#define read_swap_cache_async(swp,vma,addr)	NULL
-#define lookup_swap_cache(swp)			NULL
-#define valid_swaphandles(swp, off)		0
+static inline void show_swap_cache_info(void)
+{
+}
+
+static inline void free_swap_and_cache(swp_entry_t swp)
+{
+}
+
+static inline int swap_duplicate(swp_entry_t swp)
+{
+	return 0;
+}
+
+static inline void swap_free(swp_entry_t swp)
+{
+}
+
+static inline struct page *read_swap_cache_async(swp_entry_t swp,
+			struct vm_area_struct *vma, unsigned long addr)
+{
+	return NULL;
+}
+
+static inline struct page *lookup_swap_cache(swp_entry_t swp)
+{
+	return NULL;
+}
+
+static inline int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
+{
+	return 0;
+}
+
 #define can_share_swap_page(p)			(page_mapcount(p) == 1)
-#define move_to_swap_cache(p, swp)		1
-#define move_from_swap_cache(p, i, m)		1
-#define __delete_from_swap_cache(p)		/*NOTHING*/
-#define delete_from_swap_cache(p)		/*NOTHING*/
+
+static inline int move_to_swap_cache(struct page *page, swp_entry_t entry)
+{
+	return 1;
+}
+
+static inline int move_from_swap_cache(struct page *page, unsigned long index,
+					struct address_space *mapping)
+{
+	return 1;
+}
+
+static inline void __delete_from_swap_cache(struct page *page)
+{
+}
+
+static inline void delete_from_swap_cache(struct page *page)
+{
+}
+
 #define swap_token_default_timeout		0
 
 static inline int remove_exclusive_swap_page(struct page *p)
-- 
cgit v1.2.3


From 9637a5efd4fbe36164c5ce7f6a0ee68b2bf22b7f Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 23 Jun 2006 02:03:43 -0700
Subject: [PATCH] add page_mkwrite() vm_operations method

Add a new VMA operation to notify a filesystem or other driver about the
MMU generating a fault because userspace attempted to write to a page
mapped through a read-only PTE.

This facility permits the filesystem or driver to:

 (*) Implement storage allocation/reservation on attempted write, and so to
     deal with problems such as ENOSPC more gracefully (perhaps by generating
     SIGBUS).

 (*) Delay making the page writable until the contents have been written to a
     backing cache. This is useful for NFS/AFS when using FS-Cache/CacheFS.
     It permits the filesystem to have some guarantee about the state of the
     cache.

 (*) Account and limit number of dirty pages. This is one piece of the puzzle
     needed to make shared writable mapping work safely in FUSE.

Needed by cachefs (Or is it cachefiles?  Or fscache? <head spins>).

At least four other groups have stated an interest in it or a desire to use
the functionality it provides: FUSE, OCFS2, NTFS and JFFS2.  Also, things like
EXT3 really ought to use it to deal with the case of shared-writable mmap
encountering ENOSPC before we permit the page to be dirtied.

From: Peter Zijlstra <a.p.zijlstra@chello.nl>

  get_user_pages(.write=1, .force=1) can generate COW hits on read-only
  shared mappings, this patch traps those as mkpage_write candidates and fails
  to handle them the old way.

Signed-off-by: David Howells <dhowells@redhat.com>
Cc: Miklos Szeredi <miklos@szeredi.hu>
Cc: Joel Becker <Joel.Becker@oracle.com>
Cc: Mark Fasheh <mark.fasheh@oracle.com>
Cc: Anton Altaparmakov <aia21@cantab.net>
Cc: David Woodhouse <dwmw2@infradead.org>
Cc: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mm.h |   4 +++
 mm/memory.c        | 100 ++++++++++++++++++++++++++++++++++++++++-------------
 mm/mmap.c          |  12 +++++--
 mm/mprotect.c      |  11 ++++--
 4 files changed, 99 insertions(+), 28 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 697c6bf248c2..3b09444121d9 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -199,6 +199,10 @@ struct vm_operations_struct {
 	void (*close)(struct vm_area_struct * area);
 	struct page * (*nopage)(struct vm_area_struct * area, unsigned long address, int *type);
 	int (*populate)(struct vm_area_struct * area, unsigned long address, unsigned long len, pgprot_t prot, unsigned long pgoff, int nonblock);
+
+	/* notification that a previously read-only page is about to become
+	 * writable, if an error is returned it will cause a SIGBUS */
+	int (*page_mkwrite)(struct vm_area_struct *vma, struct page *page);
 #ifdef CONFIG_NUMA
 	int (*set_policy)(struct vm_area_struct *vma, struct mempolicy *new);
 	struct mempolicy *(*get_policy)(struct vm_area_struct *vma,
diff --git a/mm/memory.c b/mm/memory.c
index 11673c5d2c20..247b5c312b9b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1457,25 +1457,60 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 {
 	struct page *old_page, *new_page;
 	pte_t entry;
-	int ret = VM_FAULT_MINOR;
+	int reuse, ret = VM_FAULT_MINOR;
 
 	old_page = vm_normal_page(vma, address, orig_pte);
 	if (!old_page)
 		goto gotten;
 
-	if (PageAnon(old_page) && !TestSetPageLocked(old_page)) {
-		int reuse = can_share_swap_page(old_page);
-		unlock_page(old_page);
-		if (reuse) {
-			flush_cache_page(vma, address, pte_pfn(orig_pte));
-			entry = pte_mkyoung(orig_pte);
-			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
-			ptep_set_access_flags(vma, address, page_table, entry, 1);
-			update_mmu_cache(vma, address, entry);
-			lazy_mmu_prot_update(entry);
-			ret |= VM_FAULT_WRITE;
-			goto unlock;
+	if (unlikely((vma->vm_flags & (VM_SHARED|VM_WRITE)) ==
+				(VM_SHARED|VM_WRITE))) {
+		if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
+			/*
+			 * Notify the address space that the page is about to
+			 * become writable so that it can prohibit this or wait
+			 * for the page to get into an appropriate state.
+			 *
+			 * We do this without the lock held, so that it can
+			 * sleep if it needs to.
+			 */
+			page_cache_get(old_page);
+			pte_unmap_unlock(page_table, ptl);
+
+			if (vma->vm_ops->page_mkwrite(vma, old_page) < 0)
+				goto unwritable_page;
+
+			page_cache_release(old_page);
+
+			/*
+			 * Since we dropped the lock we need to revalidate
+			 * the PTE as someone else may have changed it.  If
+			 * they did, we just return, as we can count on the
+			 * MMU to tell us if they didn't also make it writable.
+			 */
+			page_table = pte_offset_map_lock(mm, pmd, address,
+							 &ptl);
+			if (!pte_same(*page_table, orig_pte))
+				goto unlock;
 		}
+
+		reuse = 1;
+	} else if (PageAnon(old_page) && !TestSetPageLocked(old_page)) {
+		reuse = can_share_swap_page(old_page);
+		unlock_page(old_page);
+	} else {
+		reuse = 0;
+	}
+
+	if (reuse) {
+		flush_cache_page(vma, address, pte_pfn(orig_pte));
+		entry = pte_mkyoung(orig_pte);
+		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+		ptep_set_access_flags(vma, address, page_table, entry, 1);
+		update_mmu_cache(vma, address, entry);
+		lazy_mmu_prot_update(entry);
+		ret |= VM_FAULT_WRITE;
+		goto unlock;
 	}
 
 	/*
@@ -1535,6 +1570,10 @@ oom:
 	if (old_page)
 		page_cache_release(old_page);
 	return VM_FAULT_OOM;
+
+unwritable_page:
+	page_cache_release(old_page);
+	return VM_FAULT_SIGBUS;
 }
 
 /*
@@ -2083,18 +2122,31 @@ retry:
 	/*
 	 * Should we do an early C-O-W break?
 	 */
-	if (write_access && !(vma->vm_flags & VM_SHARED)) {
-		struct page *page;
+	if (write_access) {
+		if (!(vma->vm_flags & VM_SHARED)) {
+			struct page *page;
 
-		if (unlikely(anon_vma_prepare(vma)))
-			goto oom;
-		page = alloc_page_vma(GFP_HIGHUSER, vma, address);
-		if (!page)
-			goto oom;
-		copy_user_highpage(page, new_page, address);
-		page_cache_release(new_page);
-		new_page = page;
-		anon = 1;
+			if (unlikely(anon_vma_prepare(vma)))
+				goto oom;
+			page = alloc_page_vma(GFP_HIGHUSER, vma, address);
+			if (!page)
+				goto oom;
+			copy_user_highpage(page, new_page, address);
+			page_cache_release(new_page);
+			new_page = page;
+			anon = 1;
+
+		} else {
+			/* if the page will be shareable, see if the backing
+			 * address space wants to know that the page is about
+			 * to become writable */
+			if (vma->vm_ops->page_mkwrite &&
+			    vma->vm_ops->page_mkwrite(vma, new_page) < 0
+			    ) {
+				page_cache_release(new_page);
+				return VM_FAULT_SIGBUS;
+			}
+		}
 	}
 
 	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
diff --git a/mm/mmap.c b/mm/mmap.c
index e6ee12344b13..6446c6134b04 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1065,7 +1065,8 @@ munmap_back:
 	vma->vm_start = addr;
 	vma->vm_end = addr + len;
 	vma->vm_flags = vm_flags;
-	vma->vm_page_prot = protection_map[vm_flags & 0x0f];
+	vma->vm_page_prot = protection_map[vm_flags &
+				(VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)];
 	vma->vm_pgoff = pgoff;
 
 	if (file) {
@@ -1089,6 +1090,12 @@ munmap_back:
 			goto free_vma;
 	}
 
+	/* Don't make the VMA automatically writable if it's shared, but the
+	 * backer wishes to know when pages are first written to */
+	if (vma->vm_ops && vma->vm_ops->page_mkwrite)
+		vma->vm_page_prot =
+			protection_map[vm_flags & (VM_READ|VM_WRITE|VM_EXEC)];
+
 	/* We set VM_ACCOUNT in a shared mapping's vm_flags, to inform
 	 * shmem_zero_setup (perhaps called through /dev/zero's ->mmap)
 	 * that memory reservation must be checked; but that reservation
@@ -1921,7 +1928,8 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
 	vma->vm_end = addr + len;
 	vma->vm_pgoff = pgoff;
 	vma->vm_flags = flags;
-	vma->vm_page_prot = protection_map[flags & 0x0f];
+	vma->vm_page_prot = protection_map[flags &
+				(VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)];
 	vma_link(mm, vma, prev, rb_link, rb_parent);
 out:
 	mm->total_vm += len >> PAGE_SHIFT;
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 14f93e62270f..638edabaff71 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -123,6 +123,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
 	unsigned long oldflags = vma->vm_flags;
 	long nrpages = (end - start) >> PAGE_SHIFT;
 	unsigned long charged = 0;
+	unsigned int mask;
 	pgprot_t newprot;
 	pgoff_t pgoff;
 	int error;
@@ -149,8 +150,6 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
 		}
 	}
 
-	newprot = protection_map[newflags & 0xf];
-
 	/*
 	 * First try to merge with previous and/or next vma.
 	 */
@@ -177,6 +176,14 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
 	}
 
 success:
+	/* Don't make the VMA automatically writable if it's shared, but the
+	 * backer wishes to know when pages are first written to */
+	mask = VM_READ|VM_WRITE|VM_EXEC|VM_SHARED;
+	if (vma->vm_ops && vma->vm_ops->page_mkwrite)
+		mask &= ~VM_SHARED;
+
+	newprot = protection_map[newflags & mask];
+
 	/*
 	 * vm_flags and vm_page_prot are protected by the mmap_sem
 	 * held in write mode.
-- 
cgit v1.2.3


From bd1e22b8e0a90f9a91e4c27db14ca15773659bf7 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Fri, 23 Jun 2006 02:03:47 -0700
Subject: [PATCH] initialise total_memory() earlier

Initialise total_memory earlier in boot.  Because if for some reason we run
page reclaim early in boot, we don't want total_memory to be zero when we use
it as a divisor.

And rename total_memory to vm_total_pages to avoid naming clashes with
architectures.

Cc: Yasunori Goto <y-goto@jp.fujitsu.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Martin Bligh <mbligh@google.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/swap.h | 1 +
 mm/page_alloc.c      | 6 +++---
 mm/vmscan.c          | 5 ++---
 3 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index f1a827a972e0..dc3f3aa0c83e 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -185,6 +185,7 @@ extern unsigned long try_to_free_pages(struct zone **, gfp_t);
 extern unsigned long shrink_all_memory(unsigned long nr_pages);
 extern int vm_swappiness;
 extern int remove_mapping(struct address_space *mapping, struct page *page);
+extern long vm_total_pages;
 
 #ifdef CONFIG_NUMA
 extern int zone_reclaim_mode;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5af33186a25f..71a0b2a23f5b 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1725,9 +1725,9 @@ void __meminit build_all_zonelists(void)
 		stop_machine_run(__build_all_zonelists, NULL, NR_CPUS);
 		/* cpuset refresh routine should be here */
 	}
-
-	printk("Built %i zonelists\n", num_online_nodes());
-
+	vm_total_pages = nr_free_pagecache_pages();
+	printk("Built %i zonelists.  Total pages: %ld\n",
+			num_online_nodes(), vm_total_pages);
 }
 
 /*
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 71a02e295037..72babac71dea 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -110,7 +110,7 @@ struct shrinker {
  * From 0 .. 100.  Higher means more swappy.
  */
 int vm_swappiness = 60;
-static long total_memory;
+long vm_total_pages;	/* The total number of pages which the VM controls */
 
 static LIST_HEAD(shrinker_list);
 static DECLARE_RWSEM(shrinker_rwsem);
@@ -743,7 +743,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 		 * how much memory
 		 * is mapped.
 		 */
-		mapped_ratio = (sc->nr_mapped * 100) / total_memory;
+		mapped_ratio = (sc->nr_mapped * 100) / vm_total_pages;
 
 		/*
 		 * Now decide how much we really want to unmap some pages.  The
@@ -1482,7 +1482,6 @@ static int __init kswapd_init(void)
 		pgdat->kswapd = find_task_by_pid(pid);
 		read_unlock(&tasklist_lock);
 	}
-	total_memory = nr_free_pagecache_pages();
 	hotcpu_notifier(cpu_callback, 0);
 	return 0;
 }
-- 
cgit v1.2.3


From 800590f523bf3bde9fa6c8e4d6763e4bf6a2c8ec Mon Sep 17 00:00:00 2001
From: Paul Drynoff <pauldrynoff@gmail.com>
Date: Fri, 23 Jun 2006 02:03:48 -0700
Subject: [PATCH] slab: kmalloc, kzalloc comments cleanup and fix

- Move comments for kmalloc to right place, currently it near __do_kmalloc

- Comments for kzalloc

- More detailed comments for kmalloc

- Appearance of "kmalloc" and "kzalloc" man pages after "make mandocs"

[rdunlap@xenotime.net: simplification]
Signed-off-by: Paul Drynoff <pauldrynoff@gmail.com>
Acked-by: Randy Dunlap <rdunlap@xenotime.net>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Manfred Spraul <manfred@colorfullife.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/DocBook/kernel-api.tmpl |  1 +
 include/linux/slab.h                  | 50 +++++++++++++++++++++++++++++++++++
 mm/slab.c                             | 20 ++------------
 3 files changed, 53 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/DocBook/kernel-api.tmpl b/Documentation/DocBook/kernel-api.tmpl
index ca02e04a906c..6dab3dd36995 100644
--- a/Documentation/DocBook/kernel-api.tmpl
+++ b/Documentation/DocBook/kernel-api.tmpl
@@ -117,6 +117,7 @@ X!Ilib/string.c
   <chapter id="mm">
      <title>Memory Management in Linux</title>
      <sect1><title>The Slab Cache</title>
+!Iinclude/linux/slab.h
 !Emm/slab.c
      </sect1>
      <sect1><title>User Space Memory Access</title>
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 9dc93163e065..45ad55b70d1c 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -86,6 +86,51 @@ extern void *__kmalloc_track_caller(size_t, gfp_t, void*);
     __kmalloc_track_caller(size, flags, __builtin_return_address(0))
 #endif
 
+/**
+ * kmalloc - allocate memory
+ * @size: how many bytes of memory are required.
+ * @flags: the type of memory to allocate.
+ *
+ * kmalloc is the normal method of allocating memory
+ * in the kernel.
+ *
+ * The @flags argument may be one of:
+ *
+ * %GFP_USER - Allocate memory on behalf of user.  May sleep.
+ *
+ * %GFP_KERNEL - Allocate normal kernel ram.  May sleep.
+ *
+ * %GFP_ATOMIC - Allocation will not sleep.
+ *   For example, use this inside interrupt handlers.
+ *
+ * %GFP_HIGHUSER - Allocate pages from high memory.
+ *
+ * %GFP_NOIO - Do not do any I/O at all while trying to get memory.
+ *
+ * %GFP_NOFS - Do not make any fs calls while trying to get memory.
+ *
+ * Also it is possible to set different flags by OR'ing
+ * in one or more of the following additional @flags:
+ *
+ * %__GFP_COLD - Request cache-cold pages instead of
+ *   trying to return cache-warm pages.
+ *
+ * %__GFP_DMA - Request memory from the DMA-capable zone.
+ *
+ * %__GFP_HIGH - This allocation has high priority and may use emergency pools.
+ *
+ * %__GFP_HIGHMEM - Allocated memory may be from highmem.
+ *
+ * %__GFP_NOFAIL - Indicate that this allocation is in no way allowed to fail
+ *   (think twice before using).
+ *
+ * %__GFP_NORETRY - If memory is not immediately available,
+ *   then give up at once.
+ *
+ * %__GFP_NOWARN - If allocation fails, don't issue any warnings.
+ *
+ * %__GFP_REPEAT - If allocation fails initially, try once more before failing.
+ */
 static inline void *kmalloc(size_t size, gfp_t flags)
 {
 	if (__builtin_constant_p(size)) {
@@ -111,6 +156,11 @@ found:
 
 extern void *__kzalloc(size_t, gfp_t);
 
+/**
+ * kzalloc - allocate memory. The memory is set to zero.
+ * @size: how many bytes of memory are required.
+ * @flags: the type of memory to allocate (see kmalloc).
+ */
 static inline void *kzalloc(size_t size, gfp_t flags)
 {
 	if (__builtin_constant_p(size)) {
diff --git a/mm/slab.c b/mm/slab.c
index 664c3a10acf2..98ac20bc0de9 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3277,26 +3277,10 @@ EXPORT_SYMBOL(kmalloc_node);
 #endif
 
 /**
- * kmalloc - allocate memory
+ * __do_kmalloc - allocate memory
  * @size: how many bytes of memory are required.
- * @flags: the type of memory to allocate.
+ * @flags: the type of memory to allocate (see kmalloc).
  * @caller: function caller for debug tracking of the caller
- *
- * kmalloc is the normal method of allocating memory
- * in the kernel.
- *
- * The @flags argument may be one of:
- *
- * %GFP_USER - Allocate memory on behalf of user.  May sleep.
- *
- * %GFP_KERNEL - Allocate normal kernel ram.  May sleep.
- *
- * %GFP_ATOMIC - Allocation will not sleep.  Use inside interrupt handlers.
- *
- * Additionally, the %GFP_DMA flag may be set to indicate the memory
- * must be suitable for DMA.  This can mean different things on different
- * platforms.  For example, on i386, it means that the memory must come
- * from the first 16MB.
  */
 static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
 					  void *caller)
-- 
cgit v1.2.3


From aaa994b300a172afafab47938804836b923e5ef7 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Fri, 23 Jun 2006 02:03:52 -0700
Subject: [PATCH] page migration: handle freeing of pages in migrate_pages()

Do not leave pages on the lists passed to migrate_pages().  Seems that we will
not need any postprocessing of pages.  This will simplify the handling of
pages by the callers of migrate_pages().

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: Jes Sorensen <jes@trained-monkey.org>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: Andi Kleen <ak@muc.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/migrate.h |  7 +++----
 mm/mempolicy.c          |  8 +-------
 mm/migrate.c            | 48 +++++++++++++++++++++++-------------------------
 3 files changed, 27 insertions(+), 36 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 287c47b5e5df..83af25949fa9 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -8,8 +8,7 @@ extern int isolate_lru_page(struct page *p, struct list_head *pagelist);
 extern int putback_lru_pages(struct list_head *l);
 extern int migrate_page(struct address_space *,
 			struct page *, struct page *);
-extern int migrate_pages(struct list_head *l, struct list_head *t,
-		struct list_head *moved, struct list_head *failed);
+extern int migrate_pages(struct list_head *l, struct list_head *t);
 extern int migrate_pages_to(struct list_head *pagelist,
 			struct vm_area_struct *vma, int dest);
 extern int fail_migrate_page(struct address_space *,
@@ -22,8 +21,8 @@ extern int migrate_prep(void);
 static inline int isolate_lru_page(struct page *p, struct list_head *list)
 					{ return -ENOSYS; }
 static inline int putback_lru_pages(struct list_head *l) { return 0; }
-static inline int migrate_pages(struct list_head *l, struct list_head *t,
-	struct list_head *moved, struct list_head *failed) { return -ENOSYS; }
+static inline int migrate_pages(struct list_head *l, struct list_head *t)
+					{ return -ENOSYS; }
 
 static inline int migrate_pages_to(struct list_head *pagelist,
 			struct vm_area_struct *vma, int dest) { return 0; }
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 8778f58880c4..244f3f130e4a 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -603,11 +603,8 @@ int migrate_to_node(struct mm_struct *mm, int source, int dest, int flags)
 	check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
 			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
 
-	if (!list_empty(&pagelist)) {
+	if (!list_empty(&pagelist))
 		err = migrate_pages_to(&pagelist, NULL, dest);
-		if (!list_empty(&pagelist))
-			putback_lru_pages(&pagelist);
-	}
 	return err;
 }
 
@@ -773,9 +770,6 @@ long do_mbind(unsigned long start, unsigned long len,
 			err = -EIO;
 	}
 
-	if (!list_empty(&pagelist))
-		putback_lru_pages(&pagelist);
-
 	up_write(&mm->mmap_sem);
 	mpol_free(new);
 	return err;
diff --git a/mm/migrate.c b/mm/migrate.c
index 09038163bfec..d3a1810a4c9f 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -624,6 +624,15 @@ unlock:
 	unlock_page(page);
 ret:
 	if (rc != -EAGAIN) {
+ 		/*
+ 		 * A page that has been migrated has all references
+ 		 * removed and will be freed. A page that has not been
+ 		 * migrated will have kepts its references and be
+ 		 * restored.
+ 		 */
+ 		list_del(&page->lru);
+ 		move_to_lru(page);
+
 		list_del(&newpage->lru);
 		move_to_lru(newpage);
 	}
@@ -640,12 +649,12 @@ ret:
  *
  * The function returns after 10 attempts or if no pages
  * are movable anymore because to has become empty
- * or no retryable pages exist anymore.
+ * or no retryable pages exist anymore. All pages will be
+ * retruned to the LRU or freed.
  *
- * Return: Number of pages not migrated when "to" ran empty.
+ * Return: Number of pages not migrated.
  */
-int migrate_pages(struct list_head *from, struct list_head *to,
-		  struct list_head *moved, struct list_head *failed)
+int migrate_pages(struct list_head *from, struct list_head *to)
 {
 	int retry = 1;
 	int nr_failed = 0;
@@ -675,11 +684,9 @@ int migrate_pages(struct list_head *from, struct list_head *to,
 				retry++;
 				break;
 			case 0:
-				list_move(&page->lru, moved);
 				break;
 			default:
 				/* Permanent failure */
-				list_move(&page->lru, failed);
 				nr_failed++;
 				break;
 			}
@@ -689,6 +696,7 @@ int migrate_pages(struct list_head *from, struct list_head *to,
 	if (!swapwrite)
 		current->flags &= ~PF_SWAPWRITE;
 
+	putback_lru_pages(from);
 	return nr_failed + retry;
 }
 
@@ -702,11 +710,10 @@ int migrate_pages_to(struct list_head *pagelist,
 			struct vm_area_struct *vma, int dest)
 {
 	LIST_HEAD(newlist);
-	LIST_HEAD(moved);
-	LIST_HEAD(failed);
 	int err = 0;
 	unsigned long offset = 0;
 	int nr_pages;
+	int nr_failed = 0;
 	struct page *page;
 	struct list_head *p;
 
@@ -740,26 +747,17 @@ redo:
 		if (nr_pages > MIGRATE_CHUNK_SIZE)
 			break;
 	}
-	err = migrate_pages(pagelist, &newlist, &moved, &failed);
+	err = migrate_pages(pagelist, &newlist);
 
-	putback_lru_pages(&moved);	/* Call release pages instead ?? */
-
-	if (err >= 0 && list_empty(&newlist) && !list_empty(pagelist))
-		goto redo;
-out:
-	/* Return leftover allocated pages */
-	while (!list_empty(&newlist)) {
-		page = list_entry(newlist.next, struct page, lru);
-		list_del(&page->lru);
-		__free_page(page);
+	if (err >= 0) {
+		nr_failed += err;
+		if (list_empty(&newlist) && !list_empty(pagelist))
+			goto redo;
 	}
-	list_splice(&failed, pagelist);
-	if (err < 0)
-		return err;
+out:
 
 	/* Calculate number of leftover pages */
-	nr_pages = 0;
 	list_for_each(p, pagelist)
-		nr_pages++;
-	return nr_pages;
+		nr_failed++;
+	return nr_failed;
 }
-- 
cgit v1.2.3


From 95a402c3847cc16f4ba03013cd01404fa0f14c2e Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Fri, 23 Jun 2006 02:03:53 -0700
Subject: [PATCH] page migration: use allocator function for migrate_pages()

Instead of passing a list of new pages, pass a function to allocate a new
page.  This allows the correct placement of MPOL_INTERLEAVE pages during page
migration.  It also further simplifies the callers of migrate pages.
migrate_pages() becomes similar to migrate_pages_to() so drop
migrate_pages_to().  The batching of new page allocations becomes unnecessary.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: Jes Sorensen <jes@trained-monkey.org>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: Andi Kleen <ak@muc.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/migrate.h |  11 ++---
 mm/mempolicy.c          |  23 +++++++++-
 mm/migrate.c            | 115 ++++++++++++++----------------------------------
 3 files changed, 59 insertions(+), 90 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 83af25949fa9..5b95d6568dc4 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -3,14 +3,15 @@
 
 #include <linux/mm.h>
 
+typedef struct page *new_page_t(struct page *, unsigned long private);
+
 #ifdef CONFIG_MIGRATION
 extern int isolate_lru_page(struct page *p, struct list_head *pagelist);
 extern int putback_lru_pages(struct list_head *l);
 extern int migrate_page(struct address_space *,
 			struct page *, struct page *);
-extern int migrate_pages(struct list_head *l, struct list_head *t);
-extern int migrate_pages_to(struct list_head *pagelist,
-			struct vm_area_struct *vma, int dest);
+extern int migrate_pages(struct list_head *l, new_page_t x, unsigned long);
+
 extern int fail_migrate_page(struct address_space *,
 			struct page *, struct page *);
 
@@ -21,8 +22,8 @@ extern int migrate_prep(void);
 static inline int isolate_lru_page(struct page *p, struct list_head *list)
 					{ return -ENOSYS; }
 static inline int putback_lru_pages(struct list_head *l) { return 0; }
-static inline int migrate_pages(struct list_head *l, struct list_head *t)
-					{ return -ENOSYS; }
+static inline int migrate_pages(struct list_head *l, new_page_t x,
+		unsigned long private) { return -ENOSYS; }
 
 static inline int migrate_pages_to(struct list_head *pagelist,
 			struct vm_area_struct *vma, int dest) { return 0; }
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 244f3f130e4a..f432642e9e66 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -87,6 +87,7 @@
 #include <linux/seq_file.h>
 #include <linux/proc_fs.h>
 #include <linux/migrate.h>
+#include <linux/rmap.h>
 
 #include <asm/tlbflush.h>
 #include <asm/uaccess.h>
@@ -587,6 +588,11 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist,
 		isolate_lru_page(page, pagelist);
 }
 
+static struct page *new_node_page(struct page *page, unsigned long node)
+{
+	return alloc_pages_node(node, GFP_HIGHUSER, 0);
+}
+
 /*
  * Migrate pages from one node to a target node.
  * Returns error or the number of pages not migrated.
@@ -604,7 +610,8 @@ int migrate_to_node(struct mm_struct *mm, int source, int dest, int flags)
 			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
 
 	if (!list_empty(&pagelist))
-		err = migrate_pages_to(&pagelist, NULL, dest);
+		err = migrate_pages(&pagelist, new_node_page, dest);
+
 	return err;
 }
 
@@ -691,6 +698,12 @@ int do_migrate_pages(struct mm_struct *mm,
 
 }
 
+static struct page *new_vma_page(struct page *page, unsigned long private)
+{
+	struct vm_area_struct *vma = (struct vm_area_struct *)private;
+
+	return alloc_page_vma(GFP_HIGHUSER, vma, page_address_in_vma(page, vma));
+}
 #else
 
 static void migrate_page_add(struct page *page, struct list_head *pagelist,
@@ -703,6 +716,11 @@ int do_migrate_pages(struct mm_struct *mm,
 {
 	return -ENOSYS;
 }
+
+static struct page *new_vma_page(struct page *page, unsigned long private)
+{
+	return NULL;
+}
 #endif
 
 long do_mbind(unsigned long start, unsigned long len,
@@ -764,7 +782,8 @@ long do_mbind(unsigned long start, unsigned long len,
 		err = mbind_range(vma, start, end, new);
 
 		if (!list_empty(&pagelist))
-			nr_failed = migrate_pages_to(&pagelist, vma, -1);
+			nr_failed = migrate_pages(&pagelist, new_vma_page,
+						(unsigned long)vma);
 
 		if (!err && nr_failed && (flags & MPOL_MF_STRICT))
 			err = -EIO;
diff --git a/mm/migrate.c b/mm/migrate.c
index d3a1810a4c9f..251a8d158257 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -28,9 +28,6 @@
 
 #include "internal.h"
 
-/* The maximum number of pages to take off the LRU for migration */
-#define MIGRATE_CHUNK_SIZE 256
-
 #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
 
 /*
@@ -587,18 +584,23 @@ static int move_to_new_page(struct page *newpage, struct page *page)
  * Obtain the lock on page, remove all ptes and migrate the page
  * to the newly allocated page in newpage.
  */
-static int unmap_and_move(struct page *newpage, struct page *page, int force)
+static int unmap_and_move(new_page_t get_new_page, unsigned long private,
+			struct page *page, int force)
 {
 	int rc = 0;
+	struct page *newpage = get_new_page(page, private);
+
+	if (!newpage)
+		return -ENOMEM;
 
 	if (page_count(page) == 1)
 		/* page was freed from under us. So we are done. */
-		goto ret;
+		goto move_newpage;
 
 	rc = -EAGAIN;
 	if (TestSetPageLocked(page)) {
 		if (!force)
-			goto ret;
+			goto move_newpage;
 		lock_page(page);
 	}
 
@@ -622,7 +624,7 @@ static int unmap_and_move(struct page *newpage, struct page *page, int force)
 		remove_migration_ptes(page, page);
 unlock:
 	unlock_page(page);
-ret:
+
 	if (rc != -EAGAIN) {
  		/*
  		 * A page that has been migrated has all references
@@ -632,29 +634,33 @@ ret:
  		 */
  		list_del(&page->lru);
  		move_to_lru(page);
-
-		list_del(&newpage->lru);
-		move_to_lru(newpage);
 	}
+
+move_newpage:
+	/*
+	 * Move the new page to the LRU. If migration was not successful
+	 * then this will free the page.
+	 */
+	move_to_lru(newpage);
 	return rc;
 }
 
 /*
  * migrate_pages
  *
- * Two lists are passed to this function. The first list
- * contains the pages isolated from the LRU to be migrated.
- * The second list contains new pages that the isolated pages
- * can be moved to.
+ * The function takes one list of pages to migrate and a function
+ * that determines from the page to be migrated and the private data
+ * the target of the move and allocates the page.
  *
  * The function returns after 10 attempts or if no pages
  * are movable anymore because to has become empty
  * or no retryable pages exist anymore. All pages will be
  * retruned to the LRU or freed.
  *
- * Return: Number of pages not migrated.
+ * Return: Number of pages not migrated or error code.
  */
-int migrate_pages(struct list_head *from, struct list_head *to)
+int migrate_pages(struct list_head *from,
+		new_page_t get_new_page, unsigned long private)
 {
 	int retry = 1;
 	int nr_failed = 0;
@@ -671,15 +677,14 @@ int migrate_pages(struct list_head *from, struct list_head *to)
 		retry = 0;
 
 		list_for_each_entry_safe(page, page2, from, lru) {
-
-			if (list_empty(to))
-				break;
-
 			cond_resched();
 
-			rc = unmap_and_move(lru_to_page(to), page, pass > 2);
+			rc = unmap_and_move(get_new_page, private,
+						page, pass > 2);
 
 			switch(rc) {
+			case -ENOMEM:
+				goto out;
 			case -EAGAIN:
 				retry++;
 				break;
@@ -692,72 +697,16 @@ int migrate_pages(struct list_head *from, struct list_head *to)
 			}
 		}
 	}
-
+	rc = 0;
+out:
 	if (!swapwrite)
 		current->flags &= ~PF_SWAPWRITE;
 
 	putback_lru_pages(from);
-	return nr_failed + retry;
-}
 
-/*
- * Migrate the list 'pagelist' of pages to a certain destination.
- *
- * Specify destination with either non-NULL vma or dest_node >= 0
- * Return the number of pages not migrated or error code
- */
-int migrate_pages_to(struct list_head *pagelist,
-			struct vm_area_struct *vma, int dest)
-{
-	LIST_HEAD(newlist);
-	int err = 0;
-	unsigned long offset = 0;
-	int nr_pages;
-	int nr_failed = 0;
-	struct page *page;
-	struct list_head *p;
-
-redo:
-	nr_pages = 0;
-	list_for_each(p, pagelist) {
-		if (vma) {
-			/*
-			 * The address passed to alloc_page_vma is used to
-			 * generate the proper interleave behavior. We fake
-			 * the address here by an increasing offset in order
-			 * to get the proper distribution of pages.
-			 *
-			 * No decision has been made as to which page
-			 * a certain old page is moved to so we cannot
-			 * specify the correct address.
-			 */
-			page = alloc_page_vma(GFP_HIGHUSER, vma,
-					offset + vma->vm_start);
-			offset += PAGE_SIZE;
-		}
-		else
-			page = alloc_pages_node(dest, GFP_HIGHUSER, 0);
-
-		if (!page) {
-			err = -ENOMEM;
-			goto out;
-		}
-		list_add_tail(&page->lru, &newlist);
-		nr_pages++;
-		if (nr_pages > MIGRATE_CHUNK_SIZE)
-			break;
-	}
-	err = migrate_pages(pagelist, &newlist);
-
-	if (err >= 0) {
-		nr_failed += err;
-		if (list_empty(&newlist) && !list_empty(pagelist))
-			goto redo;
-	}
-out:
+	if (rc)
+		return rc;
 
-	/* Calculate number of leftover pages */
-	list_for_each(p, pagelist)
-		nr_failed++;
-	return nr_failed;
+	return nr_failed + retry;
 }
+
-- 
cgit v1.2.3


From 742755a1d8ce2b548428f7aacf1758b4bba50080 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Fri, 23 Jun 2006 02:03:55 -0700
Subject: [PATCH] page migration: sys_move_pages(): support moving of
 individual pages

move_pages() is used to move individual pages of a process. The function can
be used to determine the location of pages and to move them onto the desired
node. move_pages() returns status information for each page.

long move_pages(pid, number_of_pages_to_move,
		addresses_of_pages[],
		nodes[] or NULL,
		status[],
		flags);

The addresses of pages is an array of void * pointing to the
pages to be moved.

The nodes array contains the node numbers that the pages should be moved
to. If a NULL is passed instead of an array then no pages are moved but
the status array is updated. The status request may be used to determine
the page state before issuing another move_pages() to move pages.

The status array will contain the state of all individual page migration
attempts when the function terminates. The status array is only valid if
move_pages() completed successfullly.

Possible page states in status[]:

0..MAX_NUMNODES	The page is now on the indicated node.

-ENOENT		Page is not present

-EACCES		Page is mapped by multiple processes and can only
		be moved if MPOL_MF_MOVE_ALL is specified.

-EPERM		The page has been mlocked by a process/driver and
		cannot be moved.

-EBUSY		Page is busy and cannot be moved. Try again later.

-EFAULT		Invalid address (no VMA or zero page).

-ENOMEM		Unable to allocate memory on target node.

-EIO		Unable to write back page. The page must be written
		back in order to move it since the page is dirty and the
		filesystem does not provide a migration function that
		would allow the moving of dirty pages.

-EINVAL		A dirty page cannot be moved. The filesystem does not provide
		a migration function and has no ability to write back pages.

The flags parameter indicates what types of pages to move:

MPOL_MF_MOVE	Move pages that are only mapped by the process.

MPOL_MF_MOVE_ALL Also move pages that are mapped by multiple processes.
		Requires sufficient capabilities.

Possible return codes from move_pages()

-ENOENT		No pages found that would require moving. All pages
		are either already on the target node, not present, had an
		invalid address or could not be moved because they were
		mapped by multiple processes.

-EINVAL		Flags other than MPOL_MF_MOVE(_ALL) specified or an attempt
		to migrate pages in a kernel thread.

-EPERM		MPOL_MF_MOVE_ALL specified without sufficient priviledges.
		or an attempt to move a process belonging to another user.

-EACCES		One of the target nodes is not allowed by the current cpuset.

-ENODEV		One of the target nodes is not online.

-ESRCH		Process does not exist.

-E2BIG		Too many pages to move.

-ENOMEM		Not enough memory to allocate control array.

-EFAULT		Parameters could not be accessed.

A test program for move_pages() may be found with the patches
on ftp.kernel.org:/pub/linux/kernel/people/christoph/pmig/patches-2.6.17-rc4-mm3

From: Christoph Lameter <clameter@sgi.com>

  Detailed results for sys_move_pages()

  Pass a pointer to an integer to get_new_page() that may be used to
  indicate where the completion status of a migration operation should be
  placed.  This allows sys_move_pags() to report back exactly what happened to
  each page.

  Wish there would be a better way to do this. Looks a bit hacky.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: Jes Sorensen <jes@trained-monkey.org>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: Andi Kleen <ak@muc.de>
Cc: Michael Kerrisk <mtk-manpages@gmx.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/vm/page_migration |  29 ++---
 arch/ia64/kernel/entry.S        |   2 +-
 include/asm-ia64/unistd.h       |   2 +-
 include/linux/migrate.h         |   2 +-
 include/linux/syscalls.h        |   5 +
 kernel/sys_ni.c                 |   1 +
 mm/mempolicy.c                  |   4 +-
 mm/migrate.c                    | 268 +++++++++++++++++++++++++++++++++++++++-
 8 files changed, 288 insertions(+), 25 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/vm/page_migration b/Documentation/vm/page_migration
index 0a5d5fb18854..99f89aa10169 100644
--- a/Documentation/vm/page_migration
+++ b/Documentation/vm/page_migration
@@ -26,8 +26,13 @@ a process are located. See also the numa_maps manpage in the numactl package.
 Manual migration is useful if for example the scheduler has relocated
 a process to a processor on a distant node. A batch scheduler or an
 administrator may detect the situation and move the pages of the process
-nearer to the new processor. At some point in the future we may have
-some mechanism in the scheduler that will automatically move the pages.
+nearer to the new processor. The kernel itself does only provide
+manual page migration support. Automatic page migration may be implemented
+through user space processes that move pages. A special function call
+"move_pages" allows the moving of individual pages within a process.
+A NUMA profiler may f.e. obtain a log showing frequent off node
+accesses and may use the result to move pages to more advantageous
+locations.
 
 Larger installations usually partition the system using cpusets into
 sections of nodes. Paul Jackson has equipped cpusets with the ability to
@@ -62,22 +67,14 @@ A. In kernel use of migrate_pages()
    It also prevents the swapper or other scans to encounter
    the page.
 
-2. Generate a list of newly allocates pages. These pages will contain the
-   contents of the pages from the first list after page migration is
-   complete.
+2. We need to have a function of type new_page_t that can be
+   passed to migrate_pages(). This function should figure out
+   how to allocate the correct new page given the old page.
 
 3. The migrate_pages() function is called which attempts
-   to do the migration. It returns the moved pages in the
-   list specified as the third parameter and the failed
-   migrations in the fourth parameter. When the function
-   returns the first list will contain the pages that could still be retried.
-
-4. The leftover pages of various types are returned
-   to the LRU using putback_to_lru_pages() or otherwise
-   disposed of. The pages will still have the refcount as
-   increased by isolate_lru_pages() if putback_to_lru_pages() is not
-   used! The kernel may want to handle the various cases of failures in
-   different ways.
+   to do the migration. It will call the function to allocate
+   the new page for each page that is considered for
+   moving.
 
 B. How migrate_pages() works
 ----------------------------
diff --git a/arch/ia64/kernel/entry.S b/arch/ia64/kernel/entry.S
index bcb80ca5cf40..32c999f58d12 100644
--- a/arch/ia64/kernel/entry.S
+++ b/arch/ia64/kernel/entry.S
@@ -1584,7 +1584,7 @@ sys_call_table:
 	data8 sys_keyctl
 	data8 sys_ioprio_set
 	data8 sys_ioprio_get			// 1275
-	data8 sys_ni_syscall
+	data8 sys_move_pages
 	data8 sys_inotify_init
 	data8 sys_inotify_add_watch
 	data8 sys_inotify_rm_watch
diff --git a/include/asm-ia64/unistd.h b/include/asm-ia64/unistd.h
index 632f2eedf72c..bb0eb727dcd0 100644
--- a/include/asm-ia64/unistd.h
+++ b/include/asm-ia64/unistd.h
@@ -265,7 +265,7 @@
 #define __NR_keyctl			1273
 #define __NR_ioprio_set			1274
 #define __NR_ioprio_get			1275
-/* 1276 is available for reuse (was briefly sys_set_zone_reclaim) */
+#define __NR_move_pages			1276
 #define __NR_inotify_init		1277
 #define __NR_inotify_add_watch		1278
 #define __NR_inotify_rm_watch		1279
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 5b95d6568dc4..5dba23a1c0d0 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -3,7 +3,7 @@
 
 #include <linux/mm.h>
 
-typedef struct page *new_page_t(struct page *, unsigned long private);
+typedef struct page *new_page_t(struct page *, unsigned long private, int **);
 
 #ifdef CONFIG_MIGRATION
 extern int isolate_lru_page(struct page *p, struct list_head *pagelist);
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index bd67a4413df7..7e3f23490918 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -516,6 +516,11 @@ asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
 asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
 				const unsigned long __user *from,
 				const unsigned long __user *to);
+asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages,
+				const void __user * __user *pages,
+				const int __user *nodes,
+				int __user *status,
+				int flags);
 asmlinkage long sys_mbind(unsigned long start, unsigned long len,
 				unsigned long mode,
 				unsigned long __user *nmask,
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 5433195040f1..597229749dec 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -87,6 +87,7 @@ cond_syscall(sys_inotify_init);
 cond_syscall(sys_inotify_add_watch);
 cond_syscall(sys_inotify_rm_watch);
 cond_syscall(sys_migrate_pages);
+cond_syscall(sys_move_pages);
 cond_syscall(sys_chown16);
 cond_syscall(sys_fchown16);
 cond_syscall(sys_getegid16);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index f432642e9e66..05b84acf0bb3 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -588,7 +588,7 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist,
 		isolate_lru_page(page, pagelist);
 }
 
-static struct page *new_node_page(struct page *page, unsigned long node)
+static struct page *new_node_page(struct page *page, unsigned long node, int **x)
 {
 	return alloc_pages_node(node, GFP_HIGHUSER, 0);
 }
@@ -698,7 +698,7 @@ int do_migrate_pages(struct mm_struct *mm,
 
 }
 
-static struct page *new_vma_page(struct page *page, unsigned long private)
+static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
 {
 	struct vm_area_struct *vma = (struct vm_area_struct *)private;
 
diff --git a/mm/migrate.c b/mm/migrate.c
index 251a8d158257..033a12f4c949 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -25,6 +25,8 @@
 #include <linux/cpu.h>
 #include <linux/cpuset.h>
 #include <linux/writeback.h>
+#include <linux/mempolicy.h>
+#include <linux/vmalloc.h>
 
 #include "internal.h"
 
@@ -62,9 +64,8 @@ int isolate_lru_page(struct page *page, struct list_head *pagelist)
 }
 
 /*
- * migrate_prep() needs to be called after we have compiled the list of pages
- * to be migrated using isolate_lru_page() but before we begin a series of calls
- * to migrate_pages().
+ * migrate_prep() needs to be called before we start compiling a list of pages
+ * to be migrated using isolate_lru_page().
  */
 int migrate_prep(void)
 {
@@ -588,7 +589,8 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
 			struct page *page, int force)
 {
 	int rc = 0;
-	struct page *newpage = get_new_page(page, private);
+	int *result = NULL;
+	struct page *newpage = get_new_page(page, private, &result);
 
 	if (!newpage)
 		return -ENOMEM;
@@ -642,6 +644,12 @@ move_newpage:
 	 * then this will free the page.
 	 */
 	move_to_lru(newpage);
+	if (result) {
+		if (rc)
+			*result = rc;
+		else
+			*result = page_to_nid(newpage);
+	}
 	return rc;
 }
 
@@ -710,3 +718,255 @@ out:
 	return nr_failed + retry;
 }
 
+#ifdef CONFIG_NUMA
+/*
+ * Move a list of individual pages
+ */
+struct page_to_node {
+	unsigned long addr;
+	struct page *page;
+	int node;
+	int status;
+};
+
+static struct page *new_page_node(struct page *p, unsigned long private,
+		int **result)
+{
+	struct page_to_node *pm = (struct page_to_node *)private;
+
+	while (pm->node != MAX_NUMNODES && pm->page != p)
+		pm++;
+
+	if (pm->node == MAX_NUMNODES)
+		return NULL;
+
+	*result = &pm->status;
+
+	return alloc_pages_node(pm->node, GFP_HIGHUSER, 0);
+}
+
+/*
+ * Move a set of pages as indicated in the pm array. The addr
+ * field must be set to the virtual address of the page to be moved
+ * and the node number must contain a valid target node.
+ */
+static int do_move_pages(struct mm_struct *mm, struct page_to_node *pm,
+				int migrate_all)
+{
+	int err;
+	struct page_to_node *pp;
+	LIST_HEAD(pagelist);
+
+	down_read(&mm->mmap_sem);
+
+	/*
+	 * Build a list of pages to migrate
+	 */
+	migrate_prep();
+	for (pp = pm; pp->node != MAX_NUMNODES; pp++) {
+		struct vm_area_struct *vma;
+		struct page *page;
+
+		/*
+		 * A valid page pointer that will not match any of the
+		 * pages that will be moved.
+		 */
+		pp->page = ZERO_PAGE(0);
+
+		err = -EFAULT;
+		vma = find_vma(mm, pp->addr);
+		if (!vma)
+			goto set_status;
+
+		page = follow_page(vma, pp->addr, FOLL_GET);
+		err = -ENOENT;
+		if (!page)
+			goto set_status;
+
+		if (PageReserved(page))		/* Check for zero page */
+			goto put_and_set;
+
+		pp->page = page;
+		err = page_to_nid(page);
+
+		if (err == pp->node)
+			/*
+			 * Node already in the right place
+			 */
+			goto put_and_set;
+
+		err = -EACCES;
+		if (page_mapcount(page) > 1 &&
+				!migrate_all)
+			goto put_and_set;
+
+		err = isolate_lru_page(page, &pagelist);
+put_and_set:
+		/*
+		 * Either remove the duplicate refcount from
+		 * isolate_lru_page() or drop the page ref if it was
+		 * not isolated.
+		 */
+		put_page(page);
+set_status:
+		pp->status = err;
+	}
+
+	if (!list_empty(&pagelist))
+		err = migrate_pages(&pagelist, new_page_node,
+				(unsigned long)pm);
+	else
+		err = -ENOENT;
+
+	up_read(&mm->mmap_sem);
+	return err;
+}
+
+/*
+ * Determine the nodes of a list of pages. The addr in the pm array
+ * must have been set to the virtual address of which we want to determine
+ * the node number.
+ */
+static int do_pages_stat(struct mm_struct *mm, struct page_to_node *pm)
+{
+	down_read(&mm->mmap_sem);
+
+	for ( ; pm->node != MAX_NUMNODES; pm++) {
+		struct vm_area_struct *vma;
+		struct page *page;
+		int err;
+
+		err = -EFAULT;
+		vma = find_vma(mm, pm->addr);
+		if (!vma)
+			goto set_status;
+
+		page = follow_page(vma, pm->addr, 0);
+		err = -ENOENT;
+		/* Use PageReserved to check for zero page */
+		if (!page || PageReserved(page))
+			goto set_status;
+
+		err = page_to_nid(page);
+set_status:
+		pm->status = err;
+	}
+
+	up_read(&mm->mmap_sem);
+	return 0;
+}
+
+/*
+ * Move a list of pages in the address space of the currently executing
+ * process.
+ */
+asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages,
+			const void __user * __user *pages,
+			const int __user *nodes,
+			int __user *status, int flags)
+{
+	int err = 0;
+	int i;
+	struct task_struct *task;
+	nodemask_t task_nodes;
+	struct mm_struct *mm;
+	struct page_to_node *pm = NULL;
+
+	/* Check flags */
+	if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL))
+		return -EINVAL;
+
+	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
+		return -EPERM;
+
+	/* Find the mm_struct */
+	read_lock(&tasklist_lock);
+	task = pid ? find_task_by_pid(pid) : current;
+	if (!task) {
+		read_unlock(&tasklist_lock);
+		return -ESRCH;
+	}
+	mm = get_task_mm(task);
+	read_unlock(&tasklist_lock);
+
+	if (!mm)
+		return -EINVAL;
+
+	/*
+	 * Check if this process has the right to modify the specified
+	 * process. The right exists if the process has administrative
+	 * capabilities, superuser privileges or the same
+	 * userid as the target process.
+	 */
+	if ((current->euid != task->suid) && (current->euid != task->uid) &&
+	    (current->uid != task->suid) && (current->uid != task->uid) &&
+	    !capable(CAP_SYS_NICE)) {
+		err = -EPERM;
+		goto out2;
+	}
+
+	task_nodes = cpuset_mems_allowed(task);
+
+	/* Limit nr_pages so that the multiplication may not overflow */
+	if (nr_pages >= ULONG_MAX / sizeof(struct page_to_node) - 1) {
+		err = -E2BIG;
+		goto out2;
+	}
+
+	pm = vmalloc((nr_pages + 1) * sizeof(struct page_to_node));
+	if (!pm) {
+		err = -ENOMEM;
+		goto out2;
+	}
+
+	/*
+	 * Get parameters from user space and initialize the pm
+	 * array. Return various errors if the user did something wrong.
+	 */
+	for (i = 0; i < nr_pages; i++) {
+		const void *p;
+
+		err = -EFAULT;
+		if (get_user(p, pages + i))
+			goto out;
+
+		pm[i].addr = (unsigned long)p;
+		if (nodes) {
+			int node;
+
+			if (get_user(node, nodes + i))
+				goto out;
+
+			err = -ENODEV;
+			if (!node_online(node))
+				goto out;
+
+			err = -EACCES;
+			if (!node_isset(node, task_nodes))
+				goto out;
+
+			pm[i].node = node;
+		}
+	}
+	/* End marker */
+	pm[nr_pages].node = MAX_NUMNODES;
+
+	if (nodes)
+		err = do_move_pages(mm, pm, flags & MPOL_MF_MOVE_ALL);
+	else
+		err = do_pages_stat(mm, pm);
+
+	if (err >= 0)
+		/* Return status information */
+		for (i = 0; i < nr_pages; i++)
+			if (put_user(pm[i].status, status + i))
+				err = -EFAULT;
+
+out:
+	vfree(pm);
+out2:
+	mmput(mm);
+	return err;
+}
+#endif
+
-- 
cgit v1.2.3


From 1b2db9fb7adc4d67d9ce7d16ce79c41ee84730fe Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Fri, 23 Jun 2006 02:03:56 -0700
Subject: [PATCH] sys_move_pages: 32bit support (i386, x86_64)

sys_move_pages() support for 32bit (i386 plus x86_64 compat layer)

Add support for move_pages() on i386 and also add the compat functions
necessary to run 32 bit binaries on x86_64.

Add compat_sys_move_pages to the x86_64 32bit binary layer.  Note that it is
not up to date so I added the missing pieces.  Not sure if this is done the
right way.

[akpm@osdl.org: compile fix]
Signed-off-by: Christoph Lameter <clameter@sgi.com>
Cc: Andi Kleen <ak@muc.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/syscall_table.S |  1 +
 arch/x86_64/ia32/ia32entry.S     |  1 +
 include/asm-i386/unistd.h        |  3 ++-
 include/linux/syscalls.h         |  5 +++++
 kernel/compat.c                  | 23 +++++++++++++++++++++++
 kernel/sys_ni.c                  |  1 +
 6 files changed, 33 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/arch/i386/kernel/syscall_table.S b/arch/i386/kernel/syscall_table.S
index af56987f69b0..dd63d4775398 100644
--- a/arch/i386/kernel/syscall_table.S
+++ b/arch/i386/kernel/syscall_table.S
@@ -316,3 +316,4 @@ ENTRY(sys_call_table)
 	.long sys_sync_file_range
 	.long sys_tee			/* 315 */
 	.long sys_vmsplice
+	.long sys_move_pages
diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S
index 5a92fed2d1d5..4ec594ab1a98 100644
--- a/arch/x86_64/ia32/ia32entry.S
+++ b/arch/x86_64/ia32/ia32entry.S
@@ -696,4 +696,5 @@ ia32_sys_call_table:
 	.quad sys_sync_file_range
 	.quad sys_tee
 	.quad compat_sys_vmsplice
+	.quad compat_sys_move_pages
 ia32_syscall_end:		
diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h
index de2ccc149e34..fc1c8ddae149 100644
--- a/include/asm-i386/unistd.h
+++ b/include/asm-i386/unistd.h
@@ -322,10 +322,11 @@
 #define __NR_sync_file_range	314
 #define __NR_tee		315
 #define __NR_vmsplice		316
+#define __NR_move_pages		317
 
 #ifdef __KERNEL__
 
-#define NR_syscalls 317
+#define NR_syscalls 318
 
 /*
  * user-visible error numbers are in the range -1 - -128: see
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 7e3f23490918..e42738c69166 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -521,6 +521,11 @@ asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages,
 				const int __user *nodes,
 				int __user *status,
 				int flags);
+asmlinkage long compat_sys_move_pages(pid_t pid, unsigned long nr_page,
+				void __user *pages,
+				const int __user *nodes,
+				int __user *status,
+				int flags);
 asmlinkage long sys_mbind(unsigned long start, unsigned long len,
 				unsigned long mode,
 				unsigned long __user *nmask,
diff --git a/kernel/compat.c b/kernel/compat.c
index c1601a84f8d8..ccea93e28954 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -21,6 +21,7 @@
 #include <linux/unistd.h>
 #include <linux/security.h>
 #include <linux/timex.h>
+#include <linux/migrate.h>
 
 #include <asm/uaccess.h>
 
@@ -934,3 +935,25 @@ asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp)
 
 	return ret;
 }
+
+#ifdef CONFIG_NUMA
+asmlinkage long compat_sys_move_pages(pid_t pid, unsigned long nr_pages,
+		void __user *pages32,
+		const int __user *nodes,
+		int __user *status,
+		int flags)
+{
+	const void __user * __user *pages;
+	int i;
+
+	pages = compat_alloc_user_space(nr_pages * sizeof(void *));
+	for (i = 0; i < nr_pages; i++) {
+		compat_uptr_t p;
+
+		if (get_user(p, (compat_uptr_t *)(pages32 + i)) ||
+			put_user(compat_ptr(p), pages + i))
+			return -EFAULT;
+	}
+	return sys_move_pages(pid, nr_pages, pages, nodes, status, flags);
+}
+#endif
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 597229749dec..6991bece67e8 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -133,3 +133,4 @@ cond_syscall(sys_mincore);
 cond_syscall(sys_madvise);
 cond_syscall(sys_mremap);
 cond_syscall(sys_remap_file_pages);
+cond_syscall(compat_sys_move_pages);
-- 
cgit v1.2.3


From 9216dfad4fc97ab639ef0885efc713f3d7a20d5b Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Fri, 23 Jun 2006 02:03:57 -0700
Subject: [PATCH] move_pages: fix 32 -> 64 bit compat function

The definition of the third parameter is a pointer to an array of virtual
addresses which give us some trouble.  The existing code calculated the
wrong address in the array since I used void to avoid having to specify a
type.

I now use the correct type "compat_uptr_t __user *" in the definition of
the function in kernel/compat.c.

However, I used __u32 in syscalls.h.  Would have to include compat.h there
in order to provide the same definition which would generate an ugly
include situation.

On both ia64 and x86_64 compat_uptr_t is u32. So this works although
parameter declarations differ.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/syscalls.h | 2 +-
 kernel/compat.c          | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index e42738c69166..33785b79d548 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -522,7 +522,7 @@ asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages,
 				int __user *status,
 				int flags);
 asmlinkage long compat_sys_move_pages(pid_t pid, unsigned long nr_page,
-				void __user *pages,
+				__u32 __user *pages,
 				const int __user *nodes,
 				int __user *status,
 				int flags);
diff --git a/kernel/compat.c b/kernel/compat.c
index ccea93e28954..2f672332430f 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -938,7 +938,7 @@ asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp)
 
 #ifdef CONFIG_NUMA
 asmlinkage long compat_sys_move_pages(pid_t pid, unsigned long nr_pages,
-		void __user *pages32,
+		compat_uptr_t __user *pages32,
 		const int __user *nodes,
 		int __user *status,
 		int flags)
@@ -950,7 +950,7 @@ asmlinkage long compat_sys_move_pages(pid_t pid, unsigned long nr_pages,
 	for (i = 0; i < nr_pages; i++) {
 		compat_uptr_t p;
 
-		if (get_user(p, (compat_uptr_t *)(pages32 + i)) ||
+		if (get_user(p, pages32 + i) ||
 			put_user(compat_ptr(p), pages + i))
 			return -EFAULT;
 	}
-- 
cgit v1.2.3


From 03e68060636e05989ea94bcb671ab633948f328c Mon Sep 17 00:00:00 2001
From: James Morris <jmorris@namei.org>
Date: Fri, 23 Jun 2006 02:03:58 -0700
Subject: [PATCH] lsm: add task_setioprio hook

Implement an LSM hook for setting a task's IO priority, similar to the hook
for setting a tasks's nice value.

A previous version of this LSM hook was included in an older version of
multiadm by Jan Engelhardt, although I don't recall it being submitted
upstream.

Also included is the corresponding SELinux hook, which re-uses the setsched
permission in the proccess class.

Signed-off-by: James Morris <jmorris@namei.org>
Acked-by:  Stephen Smalley <sds@tycho.nsa.gov>
Cc: Jan Engelhardt <jengelh@linux01.gwdg.de>
Cc: Chris Wright <chrisw@sous-sol.org>
Cc: Jens Axboe <axboe@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/ioprio.c              |  6 ++++++
 include/linux/security.h | 16 ++++++++++++++++
 security/dummy.c         |  6 ++++++
 security/selinux/hooks.c |  6 ++++++
 4 files changed, 34 insertions(+)

(limited to 'include/linux')

diff --git a/fs/ioprio.c b/fs/ioprio.c
index ca77008146c0..7fa76ed53c10 100644
--- a/fs/ioprio.c
+++ b/fs/ioprio.c
@@ -24,15 +24,21 @@
 #include <linux/blkdev.h>
 #include <linux/capability.h>
 #include <linux/syscalls.h>
+#include <linux/security.h>
 
 static int set_task_ioprio(struct task_struct *task, int ioprio)
 {
+	int err;
 	struct io_context *ioc;
 
 	if (task->uid != current->euid &&
 	    task->uid != current->uid && !capable(CAP_SYS_NICE))
 		return -EPERM;
 
+	err = security_task_setioprio(task, ioprio);
+	if (err)
+		return err;
+
 	task_lock(task);
 
 	task->ioprio = ioprio;
diff --git a/include/linux/security.h b/include/linux/security.h
index 383c320fc834..65b32a0c6207 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -577,6 +577,11 @@ struct swap_info_struct;
  *	@p contains the task_struct of process.
  *	@nice contains the new nice value.
  *	Return 0 if permission is granted.
+ * @task_setioprio
+ *	Check permission before setting the ioprio value of @p to @ioprio.
+ *	@p contains the task_struct of process.
+ *	@ioprio contains the new ioprio value
+ *	Return 0 if permission is granted.
  * @task_setrlimit:
  *	Check permission before setting the resource limits of the current
  *	process for @resource to @new_rlim.  The old resource limit values can
@@ -1210,6 +1215,7 @@ struct security_operations {
 	int (*task_getsid) (struct task_struct * p);
 	int (*task_setgroups) (struct group_info *group_info);
 	int (*task_setnice) (struct task_struct * p, int nice);
+	int (*task_setioprio) (struct task_struct * p, int ioprio);
 	int (*task_setrlimit) (unsigned int resource, struct rlimit * new_rlim);
 	int (*task_setscheduler) (struct task_struct * p, int policy,
 				  struct sched_param * lp);
@@ -1836,6 +1842,11 @@ static inline int security_task_setnice (struct task_struct *p, int nice)
 	return security_ops->task_setnice (p, nice);
 }
 
+static inline int security_task_setioprio (struct task_struct *p, int ioprio)
+{
+	return security_ops->task_setioprio (p, ioprio);
+}
+
 static inline int security_task_setrlimit (unsigned int resource,
 					   struct rlimit *new_rlim)
 {
@@ -2478,6 +2489,11 @@ static inline int security_task_setnice (struct task_struct *p, int nice)
 	return 0;
 }
 
+static inline int security_task_setioprio (struct task_struct *p, int ioprio)
+{
+	return 0;
+}
+
 static inline int security_task_setrlimit (unsigned int resource,
 					   struct rlimit *new_rlim)
 {
diff --git a/security/dummy.c b/security/dummy.c
index c98d553984ec..879a98523b1b 100644
--- a/security/dummy.c
+++ b/security/dummy.c
@@ -516,6 +516,11 @@ static int dummy_task_setnice (struct task_struct *p, int nice)
 	return 0;
 }
 
+static int dummy_task_setioprio (struct task_struct *p, int ioprio)
+{
+	return 0;
+}
+
 static int dummy_task_setrlimit (unsigned int resource, struct rlimit *new_rlim)
 {
 	return 0;
@@ -972,6 +977,7 @@ void security_fixup_ops (struct security_operations *ops)
 	set_to_dummy_if_null(ops, task_getsid);
 	set_to_dummy_if_null(ops, task_setgroups);
 	set_to_dummy_if_null(ops, task_setnice);
+	set_to_dummy_if_null(ops, task_setioprio);
 	set_to_dummy_if_null(ops, task_setrlimit);
 	set_to_dummy_if_null(ops, task_setscheduler);
 	set_to_dummy_if_null(ops, task_getscheduler);
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 093efba4d9b6..9dcf298921d4 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -2645,6 +2645,11 @@ static int selinux_task_setnice(struct task_struct *p, int nice)
 	return task_has_perm(current,p, PROCESS__SETSCHED);
 }
 
+static int selinux_task_setioprio(struct task_struct *p, int ioprio)
+{
+	return task_has_perm(current, p, PROCESS__SETSCHED);
+}
+
 static int selinux_task_setrlimit(unsigned int resource, struct rlimit *new_rlim)
 {
 	struct rlimit *old_rlim = current->signal->rlim + resource;
@@ -4383,6 +4388,7 @@ static struct security_operations selinux_ops = {
 	.task_getsid =		        selinux_task_getsid,
 	.task_setgroups =		selinux_task_setgroups,
 	.task_setnice =			selinux_task_setnice,
+	.task_setioprio =		selinux_task_setioprio,
 	.task_setrlimit =		selinux_task_setrlimit,
 	.task_setscheduler =		selinux_task_setscheduler,
 	.task_getscheduler =		selinux_task_getscheduler,
-- 
cgit v1.2.3


From 35601547baf92d984b6e59cf3583649da04baea5 Mon Sep 17 00:00:00 2001
From: David Quigley <dpquigl@tycho.nsa.gov>
Date: Fri, 23 Jun 2006 02:04:01 -0700
Subject: [PATCH] SELinux: add task_movememory hook

This patch adds new security hook, task_movememory, to be called when memory
owened by a task is to be moved (e.g.  when migrating pages to a this hook is
identical to the setscheduler implementation, but a separate hook introduced
to allow this check to be specialized in the future if necessary.

Since the last posting, the hook has been renamed following feedback from
Christoph Lameter.

Signed-off-by: David Quigley <dpquigl@tycho.nsa.gov>
Acked-by:  Stephen Smalley <sds@tycho.nsa.gov>
Signed-off-by: James Morris <jmorris@namei.org>
Cc: Christoph Lameter <clameter@sgi.com>
Cc: Andi Kleen <ak@muc.de>
Acked-by: Chris Wright <chrisw@sous-sol.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/security.h | 15 +++++++++++++++
 security/dummy.c         |  6 ++++++
 security/selinux/hooks.c |  6 ++++++
 3 files changed, 27 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/security.h b/include/linux/security.h
index 65b32a0c6207..d2c17bd91a29 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -601,6 +601,10 @@ struct swap_info_struct;
  *	@p.
  *	@p contains the task_struct for process.
  *	Return 0 if permission is granted.
+ * @task_movememory
+ *	Check permission before moving memory owned by process @p.
+ *	@p contains the task_struct for process.
+ *	Return 0 if permission is granted.
  * @task_kill:
  *	Check permission before sending signal @sig to @p.  @info can be NULL,
  *	the constant 1, or a pointer to a siginfo structure.  If @info is 1 or
@@ -1220,6 +1224,7 @@ struct security_operations {
 	int (*task_setscheduler) (struct task_struct * p, int policy,
 				  struct sched_param * lp);
 	int (*task_getscheduler) (struct task_struct * p);
+	int (*task_movememory) (struct task_struct * p);
 	int (*task_kill) (struct task_struct * p,
 			  struct siginfo * info, int sig);
 	int (*task_wait) (struct task_struct * p);
@@ -1865,6 +1870,11 @@ static inline int security_task_getscheduler (struct task_struct *p)
 	return security_ops->task_getscheduler (p);
 }
 
+static inline int security_task_movememory (struct task_struct *p)
+{
+	return security_ops->task_movememory (p);
+}
+
 static inline int security_task_kill (struct task_struct *p,
 				      struct siginfo *info, int sig)
 {
@@ -2512,6 +2522,11 @@ static inline int security_task_getscheduler (struct task_struct *p)
 	return 0;
 }
 
+static inline int security_task_movememory (struct task_struct *p)
+{
+	return 0;
+}
+
 static inline int security_task_kill (struct task_struct *p,
 				      struct siginfo *info, int sig)
 {
diff --git a/security/dummy.c b/security/dummy.c
index 879a98523b1b..c3c5493581e2 100644
--- a/security/dummy.c
+++ b/security/dummy.c
@@ -537,6 +537,11 @@ static int dummy_task_getscheduler (struct task_struct *p)
 	return 0;
 }
 
+static int dummy_task_movememory (struct task_struct *p)
+{
+	return 0;
+}
+
 static int dummy_task_wait (struct task_struct *p)
 {
 	return 0;
@@ -981,6 +986,7 @@ void security_fixup_ops (struct security_operations *ops)
 	set_to_dummy_if_null(ops, task_setrlimit);
 	set_to_dummy_if_null(ops, task_setscheduler);
 	set_to_dummy_if_null(ops, task_getscheduler);
+	set_to_dummy_if_null(ops, task_movememory);
 	set_to_dummy_if_null(ops, task_wait);
 	set_to_dummy_if_null(ops, task_kill);
 	set_to_dummy_if_null(ops, task_prctl);
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 9dcf298921d4..79c16e31c884 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -2679,6 +2679,11 @@ static int selinux_task_getscheduler(struct task_struct *p)
 	return task_has_perm(current, p, PROCESS__GETSCHED);
 }
 
+static int selinux_task_movememory(struct task_struct *p)
+{
+	return task_has_perm(current, p, PROCESS__SETSCHED);
+}
+
 static int selinux_task_kill(struct task_struct *p, struct siginfo *info, int sig)
 {
 	u32 perm;
@@ -4392,6 +4397,7 @@ static struct security_operations selinux_ops = {
 	.task_setrlimit =		selinux_task_setrlimit,
 	.task_setscheduler =		selinux_task_setscheduler,
 	.task_getscheduler =		selinux_task_getscheduler,
+	.task_movememory =		selinux_task_movememory,
 	.task_kill =			selinux_task_kill,
 	.task_wait =			selinux_task_wait,
 	.task_prctl =			selinux_task_prctl,
-- 
cgit v1.2.3


From c22ce143d15eb288543fe9873e1c5ac1c01b69a1 Mon Sep 17 00:00:00 2001
From: Hiro Yoshioka <hyoshiok@miraclelinux.com>
Date: Fri, 23 Jun 2006 02:04:16 -0700
Subject: [PATCH] x86: cache pollution aware __copy_from_user_ll()

Use the x86 cache-bypassing copy instructions for copy_from_user().

Some performance data are

Total of GLOBAL_POWER_EVENTS (CPU cycle samples)

2.6.12.4.orig    1921587
2.6.12.4.nt      1599424
1599424/1921587=83.23% (16.77% reduction)

BSQ_CACHE_REFERENCE (L3 cache miss)
2.6.12.4.orig      57427
2.6.12.4.nt        20858
20858/57427=36.32% (63.7% reduction)

L3 cache miss reduction of __copy_from_user_ll
samples  %
37408    65.1412  vmlinux                  __copy_from_user_ll
23        0.1103  vmlinux                  __copy_user_zeroing_intel_nocache
23/37408=0.061% (99.94% reduction)

Top 5 of 2.6.12.4.nt
Counted GLOBAL_POWER_EVENTS events (time during which processor is not stopped) with a unit mask of 0x01 (mandatory) count 100000
samples  %        app name                 symbol name
128392    8.0274  vmlinux                  __copy_user_zeroing_intel_nocache
64206     4.0143  vmlinux                  journal_add_journal_head
59746     3.7355  vmlinux                  do_get_write_access
47674     2.9807  vmlinux                  journal_put_journal_head
46021     2.8774  vmlinux                  journal_dirty_metadata
pattern9-0-cpu4-0-09011728/summary.out

Counted BSQ_CACHE_REFERENCE events (cache references seen by the bus unit) with a unit mask of 0x3f (multiple flags) count 3000
samples  %        app name                 symbol name
69755     4.2861  vmlinux                  __copy_user_zeroing_intel_nocache
55685     3.4215  vmlinux                  journal_add_journal_head
52371     3.2179  vmlinux                  __find_get_block
45504     2.7960  vmlinux                  journal_put_journal_head
36005     2.2123  vmlinux                  journal_stop
pattern9-0-cpu4-0-09011744/summary.out

Counted BSQ_CACHE_REFERENCE events (cache references seen by the bus unit) with a unit mask of 0x200 (read 3rd level cache miss) count 3000
samples  %        app name                 symbol name
1147      5.4994  vmlinux                  journal_add_journal_head
881       4.2240  vmlinux                  journal_dirty_data
872       4.1809  vmlinux                  blk_rq_map_sg
734       3.5192  vmlinux                  journal_commit_transaction
617       2.9582  vmlinux                  radix_tree_delete
pattern9-0-cpu4-0-09011731/summary.out

iozone results are

original 2.6.12.4 CPU time = 207.768 sec
cache aware       CPU time = 184.783 sec
(three times run)
184.783/207.768=88.94% (11.06% reduction)

original:
pattern9-0-cpu4-0-08191720/iozone.out:  CPU Utilization: Wall time   45.997    CPU time   64.527    CPU utilization 140.28 %
pattern9-0-cpu4-0-08191741/iozone.out:  CPU Utilization: Wall time   46.878    CPU time   71.933    CPU utilization 153.45 %
pattern9-0-cpu4-0-08191743/iozone.out:  CPU Utilization: Wall time   45.152    CPU time   71.308    CPU utilization 157.93 %

cache awre:
pattern9-0-cpu4-0-09011728/iozone.out:  CPU Utilization: Wall time   44.842    CPU time   62.465    CPU utilization 139.30 %
pattern9-0-cpu4-0-09011731/iozone.out:  CPU Utilization: Wall time   44.718    CPU time   59.273    CPU utilization 132.55 %
pattern9-0-cpu4-0-09011744/iozone.out:  CPU Utilization: Wall time   44.367    CPU time   63.045    CPU utilization 142.10 %

Signed-off-by: Hiro Yoshioka <hyoshiok@miraclelinux.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/lib/usercopy.c   | 137 ++++++++++++++++++++++++++++++++++++++++++---
 include/asm-i386/uaccess.h |  33 +++++++++++
 include/linux/uaccess.h    |  22 ++++++++
 mm/filemap.c               |   4 +-
 mm/filemap.h               |   6 +-
 5 files changed, 189 insertions(+), 13 deletions(-)
 create mode 100644 include/linux/uaccess.h

(limited to 'include/linux')

diff --git a/arch/i386/lib/usercopy.c b/arch/i386/lib/usercopy.c
index 4cf981d70f45..6979297ce278 100644
--- a/arch/i386/lib/usercopy.c
+++ b/arch/i386/lib/usercopy.c
@@ -425,15 +425,121 @@ __copy_user_zeroing_intel(void *to, const void __user *from, unsigned long size)
 		       : "eax", "edx", "memory");
 	return size;
 }
+
+/*
+ * Non Temporal Hint version of __copy_user_zeroing_intel.  It is cache aware.
+ * hyoshiok@miraclelinux.com
+ */
+
+static unsigned long __copy_user_zeroing_intel_nocache(void *to,
+				const void __user *from, unsigned long size)
+{
+        int d0, d1;
+
+	__asm__ __volatile__(
+	       "        .align 2,0x90\n"
+	       "0:      movl 32(%4), %%eax\n"
+	       "        cmpl $67, %0\n"
+	       "        jbe 2f\n"
+	       "1:      movl 64(%4), %%eax\n"
+	       "        .align 2,0x90\n"
+	       "2:      movl 0(%4), %%eax\n"
+	       "21:     movl 4(%4), %%edx\n"
+	       "        movnti %%eax, 0(%3)\n"
+	       "        movnti %%edx, 4(%3)\n"
+	       "3:      movl 8(%4), %%eax\n"
+	       "31:     movl 12(%4),%%edx\n"
+	       "        movnti %%eax, 8(%3)\n"
+	       "        movnti %%edx, 12(%3)\n"
+	       "4:      movl 16(%4), %%eax\n"
+	       "41:     movl 20(%4), %%edx\n"
+	       "        movnti %%eax, 16(%3)\n"
+	       "        movnti %%edx, 20(%3)\n"
+	       "10:     movl 24(%4), %%eax\n"
+	       "51:     movl 28(%4), %%edx\n"
+	       "        movnti %%eax, 24(%3)\n"
+	       "        movnti %%edx, 28(%3)\n"
+	       "11:     movl 32(%4), %%eax\n"
+	       "61:     movl 36(%4), %%edx\n"
+	       "        movnti %%eax, 32(%3)\n"
+	       "        movnti %%edx, 36(%3)\n"
+	       "12:     movl 40(%4), %%eax\n"
+	       "71:     movl 44(%4), %%edx\n"
+	       "        movnti %%eax, 40(%3)\n"
+	       "        movnti %%edx, 44(%3)\n"
+	       "13:     movl 48(%4), %%eax\n"
+	       "81:     movl 52(%4), %%edx\n"
+	       "        movnti %%eax, 48(%3)\n"
+	       "        movnti %%edx, 52(%3)\n"
+	       "14:     movl 56(%4), %%eax\n"
+	       "91:     movl 60(%4), %%edx\n"
+	       "        movnti %%eax, 56(%3)\n"
+	       "        movnti %%edx, 60(%3)\n"
+	       "        addl $-64, %0\n"
+	       "        addl $64, %4\n"
+	       "        addl $64, %3\n"
+	       "        cmpl $63, %0\n"
+	       "        ja  0b\n"
+	       "        sfence \n"
+	       "5:      movl  %0, %%eax\n"
+	       "        shrl  $2, %0\n"
+	       "        andl $3, %%eax\n"
+	       "        cld\n"
+	       "6:      rep; movsl\n"
+	       "        movl %%eax,%0\n"
+	       "7:      rep; movsb\n"
+	       "8:\n"
+	       ".section .fixup,\"ax\"\n"
+	       "9:      lea 0(%%eax,%0,4),%0\n"
+	       "16:     pushl %0\n"
+	       "        pushl %%eax\n"
+	       "        xorl %%eax,%%eax\n"
+	       "        rep; stosb\n"
+	       "        popl %%eax\n"
+	       "        popl %0\n"
+	       "        jmp 8b\n"
+	       ".previous\n"
+	       ".section __ex_table,\"a\"\n"
+	       "	.align 4\n"
+	       "	.long 0b,16b\n"
+	       "	.long 1b,16b\n"
+	       "	.long 2b,16b\n"
+	       "	.long 21b,16b\n"
+	       "	.long 3b,16b\n"
+	       "	.long 31b,16b\n"
+	       "	.long 4b,16b\n"
+	       "	.long 41b,16b\n"
+	       "	.long 10b,16b\n"
+	       "	.long 51b,16b\n"
+	       "	.long 11b,16b\n"
+	       "	.long 61b,16b\n"
+	       "	.long 12b,16b\n"
+	       "	.long 71b,16b\n"
+	       "	.long 13b,16b\n"
+	       "	.long 81b,16b\n"
+	       "	.long 14b,16b\n"
+	       "	.long 91b,16b\n"
+	       "	.long 6b,9b\n"
+	       "        .long 7b,16b\n"
+	       ".previous"
+	       : "=&c"(size), "=&D" (d0), "=&S" (d1)
+	       :  "1"(to), "2"(from), "0"(size)
+	       : "eax", "edx", "memory");
+	return size;
+}
+
 #else
+
 /*
  * Leave these declared but undefined.  They should not be any references to
  * them
  */
-unsigned long
-__copy_user_zeroing_intel(void *to, const void __user *from, unsigned long size);
-unsigned long
-__copy_user_intel(void __user *to, const void *from, unsigned long size);
+unsigned long __copy_user_zeroing_intel(void *to, const void __user *from,
+					unsigned long size);
+unsigned long __copy_user_intel(void __user *to, const void *from,
+					unsigned long size);
+unsigned long __copy_user_zeroing_intel_nocache(void *to,
+				const void __user *from, unsigned long size);
 #endif /* CONFIG_X86_INTEL_USERCOPY */
 
 /* Generic arbitrary sized copy.  */
@@ -515,8 +621,8 @@ do {									\
 		: "memory");						\
 } while (0)
 
-
-unsigned long __copy_to_user_ll(void __user *to, const void *from, unsigned long n)
+unsigned long __copy_to_user_ll(void __user *to, const void *from,
+				unsigned long n)
 {
 	BUG_ON((long) n < 0);
 #ifndef CONFIG_X86_WP_WORKS_OK
@@ -576,8 +682,8 @@ survive:
 }
 EXPORT_SYMBOL(__copy_to_user_ll);
 
-unsigned long
-__copy_from_user_ll(void *to, const void __user *from, unsigned long n)
+unsigned long __copy_from_user_ll(void *to, const void __user *from,
+					unsigned long n)
 {
 	BUG_ON((long)n < 0);
 	if (movsl_is_ok(to, from, n))
@@ -588,6 +694,21 @@ __copy_from_user_ll(void *to, const void __user *from, unsigned long n)
 }
 EXPORT_SYMBOL(__copy_from_user_ll);
 
+unsigned long __copy_from_user_ll_nocache(void *to, const void __user *from,
+					unsigned long n)
+{
+	BUG_ON((long)n < 0);
+#ifdef CONFIG_X86_INTEL_USERCOPY
+	if ( n > 64 && cpu_has_xmm2)
+                n = __copy_user_zeroing_intel_nocache(to, from, n);
+	else
+		__copy_user_zeroing(to, from, n);
+#else
+        __copy_user_zeroing(to, from, n);
+#endif
+	return n;
+}
+
 /**
  * copy_to_user: - Copy a block of data into user space.
  * @to:   Destination address, in user space.
diff --git a/include/asm-i386/uaccess.h b/include/asm-i386/uaccess.h
index 1ec65523ea5e..82af28a943ab 100644
--- a/include/asm-i386/uaccess.h
+++ b/include/asm-i386/uaccess.h
@@ -390,6 +390,8 @@ unsigned long __must_check __copy_to_user_ll(void __user *to,
 				const void *from, unsigned long n);
 unsigned long __must_check __copy_from_user_ll(void *to,
 				const void __user *from, unsigned long n);
+unsigned long __must_check __copy_from_user_ll_nocache(void *to,
+				const void __user *from, unsigned long n);
 
 /*
  * Here we special-case 1, 2 and 4-byte copy_*_user invocations.  On a fault
@@ -478,12 +480,43 @@ __copy_from_user_inatomic(void *to, const void __user *from, unsigned long n)
 	return __copy_from_user_ll(to, from, n);
 }
 
+#define ARCH_HAS_NOCACHE_UACCESS
+
+static __always_inline unsigned long __copy_from_user_inatomic_nocache(void *to,
+				const void __user *from, unsigned long n)
+{
+	if (__builtin_constant_p(n)) {
+		unsigned long ret;
+
+		switch (n) {
+		case 1:
+			__get_user_size(*(u8 *)to, from, 1, ret, 1);
+			return ret;
+		case 2:
+			__get_user_size(*(u16 *)to, from, 2, ret, 2);
+			return ret;
+		case 4:
+			__get_user_size(*(u32 *)to, from, 4, ret, 4);
+			return ret;
+		}
+	}
+	return __copy_from_user_ll_nocache(to, from, n);
+}
+
 static __always_inline unsigned long
 __copy_from_user(void *to, const void __user *from, unsigned long n)
 {
        might_sleep();
        return __copy_from_user_inatomic(to, from, n);
 }
+
+static __always_inline unsigned long
+__copy_from_user_nocache(void *to, const void __user *from, unsigned long n)
+{
+       might_sleep();
+       return __copy_from_user_inatomic_nocache(to, from, n);
+}
+
 unsigned long __must_check copy_to_user(void __user *to,
 				const void *from, unsigned long n);
 unsigned long __must_check copy_from_user(void *to,
diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
new file mode 100644
index 000000000000..391e7ed1eb3f
--- /dev/null
+++ b/include/linux/uaccess.h
@@ -0,0 +1,22 @@
+#ifndef __LINUX_UACCESS_H__
+#define __LINUX_UACCESS_H__
+
+#include <asm/uaccess.h>
+
+#ifndef ARCH_HAS_NOCACHE_UACCESS
+
+static inline unsigned long __copy_from_user_inatomic_nocache(void *to,
+				const void __user *from, unsigned long n)
+{
+	return __copy_from_user_inatomic(to, from, n);
+}
+
+static inline unsigned long __copy_from_user_nocache(void *to,
+				const void __user *from, unsigned long n)
+{
+	return __copy_from_user(to, from, n);
+}
+
+#endif		/* ARCH_HAS_NOCACHE_UACCESS */
+
+#endif		/* __LINUX_UACCESS_H__ */
diff --git a/mm/filemap.c b/mm/filemap.c
index 368678c2d531..807a463fd5ed 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -14,6 +14,7 @@
 #include <linux/slab.h>
 #include <linux/compiler.h>
 #include <linux/fs.h>
+#include <linux/uaccess.h>
 #include <linux/aio.h>
 #include <linux/capability.h>
 #include <linux/kernel_stat.h>
@@ -38,7 +39,6 @@
  */
 #include <linux/buffer_head.h> /* for generic_osync_inode */
 
-#include <asm/uaccess.h>
 #include <asm/mman.h>
 
 static ssize_t
@@ -1902,7 +1902,7 @@ __filemap_copy_from_user_iovec(char *vaddr,
 		int copy = min(bytes, iov->iov_len - base);
 
 		base = 0;
-		left = __copy_from_user_inatomic(vaddr, buf, copy);
+		left = __copy_from_user_inatomic_nocache(vaddr, buf, copy);
 		copied += copy;
 		bytes -= copy;
 		vaddr += copy;
diff --git a/mm/filemap.h b/mm/filemap.h
index 13793ba0ce17..5683cde22055 100644
--- a/mm/filemap.h
+++ b/mm/filemap.h
@@ -13,7 +13,7 @@
 #include <linux/highmem.h>
 #include <linux/uio.h>
 #include <linux/config.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
 
 size_t
 __filemap_copy_from_user_iovec(char *vaddr,
@@ -34,13 +34,13 @@ filemap_copy_from_user(struct page *page, unsigned long offset,
 	int left;
 
 	kaddr = kmap_atomic(page, KM_USER0);
-	left = __copy_from_user_inatomic(kaddr + offset, buf, bytes);
+	left = __copy_from_user_inatomic_nocache(kaddr + offset, buf, bytes);
 	kunmap_atomic(kaddr, KM_USER0);
 
 	if (left != 0) {
 		/* Do it the slow way */
 		kaddr = kmap(page);
-		left = __copy_from_user(kaddr + offset, buf, bytes);
+		left = __copy_from_user_nocache(kaddr + offset, buf, bytes);
 		kunmap(page);
 	}
 	return bytes - left;
-- 
cgit v1.2.3


From 1b61b910e99059abdd54c93aa70e84e076e33d16 Mon Sep 17 00:00:00 2001
From: Zhang Yanmin <yanmin.zhang@intel.com>
Date: Fri, 23 Jun 2006 02:04:22 -0700
Subject: [PATCH] x86: kernel irq balance doesn't work

On i386, kernel irq balance doesn't work.

1) In function do_irq_balance, after kernel finds the min_loaded cpu but
   before calling set_pending_irq to really pin the selected_irq to the
   target cpu, kernel does a cpus_and with irq_affinity[selected_irq].
   Later on, when the irq is acked, kernel would calls
   move_native_irq=>desc->handler->set_affinity to change the irq affinity.
    However, every function pointed by
   hw_interrupt_type->set_affinity(unsigned int irq, cpumask_t cpumask)
   always changes irq_affinity[irq] to cpumask.  Next time when recalling
   do_irq_balance, it has to do cpu_ands again with
   irq_affinity[selected_irq], but irq_affinity[selected_irq] already
   becomes one cpu selected by the first irq balance.

2) Function balance_irq in file arch/i386/kernel/io_apic.c has the same
   issue.

[akpm@osdl.org: cleanups]
Signed-off-by: Zhang Yanmin <yanmin.zhang@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/io_apic.c | 31 ++++++++++++++++++++-----------
 include/linux/irq.h        |  8 ++++++++
 kernel/irq/proc.c          |  3 +++
 3 files changed, 31 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c
index d70f2ade5cde..a62df3e764c5 100644
--- a/arch/i386/kernel/io_apic.c
+++ b/arch/i386/kernel/io_apic.c
@@ -267,7 +267,7 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask)
 # include <linux/slab.h>		/* kmalloc() */
 # include <linux/timer.h>	/* time_after() */
  
-# ifdef CONFIG_BALANCED_IRQ_DEBUG
+#ifdef CONFIG_BALANCED_IRQ_DEBUG
 #  define TDprintk(x...) do { printk("<%ld:%s:%d>: ", jiffies, __FILE__, __LINE__); printk(x); } while (0)
 #  define Dprintk(x...) do { TDprintk(x); } while (0)
 # else
@@ -275,10 +275,15 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask)
 #  define Dprintk(x...) 
 # endif
 
-
 #define IRQBALANCE_CHECK_ARCH -999
-static int irqbalance_disabled = IRQBALANCE_CHECK_ARCH;
-static int physical_balance = 0;
+#define MAX_BALANCED_IRQ_INTERVAL	(5*HZ)
+#define MIN_BALANCED_IRQ_INTERVAL	(HZ/2)
+#define BALANCED_IRQ_MORE_DELTA		(HZ/10)
+#define BALANCED_IRQ_LESS_DELTA		(HZ)
+
+static int irqbalance_disabled __read_mostly = IRQBALANCE_CHECK_ARCH;
+static int physical_balance __read_mostly;
+static long balanced_irq_interval __read_mostly = MAX_BALANCED_IRQ_INTERVAL;
 
 static struct irq_cpu_info {
 	unsigned long * last_irq;
@@ -297,12 +302,14 @@ static struct irq_cpu_info {
 
 #define CPU_TO_PACKAGEINDEX(i) (first_cpu(cpu_sibling_map[i]))
 
-#define MAX_BALANCED_IRQ_INTERVAL	(5*HZ)
-#define MIN_BALANCED_IRQ_INTERVAL	(HZ/2)
-#define BALANCED_IRQ_MORE_DELTA		(HZ/10)
-#define BALANCED_IRQ_LESS_DELTA		(HZ)
+static cpumask_t balance_irq_affinity[NR_IRQS] = {
+	[0 ... NR_IRQS-1] = CPU_MASK_ALL
+};
 
-static long balanced_irq_interval = MAX_BALANCED_IRQ_INTERVAL;
+void set_balance_irq_affinity(unsigned int irq, cpumask_t mask)
+{
+	balance_irq_affinity[irq] = mask;
+}
 
 static unsigned long move(int curr_cpu, cpumask_t allowed_mask,
 			unsigned long now, int direction)
@@ -340,7 +347,7 @@ static inline void balance_irq(int cpu, int irq)
 	if (irqbalance_disabled)
 		return; 
 
-	cpus_and(allowed_mask, cpu_online_map, irq_affinity[irq]);
+	cpus_and(allowed_mask, cpu_online_map, balance_irq_affinity[irq]);
 	new_cpu = move(cpu, allowed_mask, now, 1);
 	if (cpu != new_cpu) {
 		set_pending_irq(irq, cpumask_of_cpu(new_cpu));
@@ -529,7 +536,9 @@ tryanotherirq:
 		}
 	}
 
-	cpus_and(allowed_mask, cpu_online_map, irq_affinity[selected_irq]);
+	cpus_and(allowed_mask,
+		cpu_online_map,
+		balance_irq_affinity[selected_irq]);
 	target_cpu_mask = cpumask_of_cpu(min_loaded);
 	cpus_and(tmp, target_cpu_mask, allowed_mask);
 
diff --git a/include/linux/irq.h b/include/linux/irq.h
index 42c9cd562860..e8a07e75e4fb 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -164,6 +164,14 @@ static inline void set_irq_info(int irq, cpumask_t mask)
 
 #endif // CONFIG_SMP
 
+#ifdef CONFIG_IRQBALANCE
+extern void set_balance_irq_affinity(unsigned int irq, cpumask_t mask);
+#else
+static inline void set_balance_irq_affinity(unsigned int irq, cpumask_t mask)
+{
+}
+#endif
+
 extern int no_irq_affinity;
 extern int noirqdebug_setup(char *str);
 
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index d03b5eef8ce0..afacd6f585fa 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -24,6 +24,8 @@ static struct proc_dir_entry *smp_affinity_entry[NR_IRQS];
 #ifdef CONFIG_GENERIC_PENDING_IRQ
 void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val)
 {
+	set_balance_irq_affinity(irq, mask_val);
+
 	/*
 	 * Save these away for later use. Re-progam when the
 	 * interrupt is pending
@@ -33,6 +35,7 @@ void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val)
 #else
 void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val)
 {
+	set_balance_irq_affinity(irq, mask_val);
 	irq_affinity[irq] = mask_val;
 	irq_desc[irq].handler->set_affinity(irq, mask_val);
 }
-- 
cgit v1.2.3


From ce4ab0012b32c1a4a1d6e934aeb73bf3151c48d9 Mon Sep 17 00:00:00 2001
From: Shaohua Li <shaohua.li@intel.com>
Date: Fri, 23 Jun 2006 02:04:44 -0700
Subject: [PATCH] swsusp: add architecture special saveable pages support

1. Add architecture specific pages save/restore support.  Next two patches
   will use this to save/restore 'ACPI NVS' pages.

2. Allow reserved pages 'nosave'.  This could avoid save/restore BIOS
   reserved pages.

Signed-off-by: Shaohua Li <shaohua.li@intel.com>
Cc: Pavel Machek <pavel@ucw.cz>
Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
Cc: Nigel Cunningham <nigel@suspend2.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/suspend.h |   1 +
 kernel/power/power.h    |   4 ++
 kernel/power/snapshot.c | 110 +++++++++++++++++++++++++++++++++++++++++++++++-
 kernel/power/swsusp.c   |  18 ++------
 4 files changed, 118 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/suspend.h b/include/linux/suspend.h
index 96e31aa64cc7..e82cb10fb3ea 100644
--- a/include/linux/suspend.h
+++ b/include/linux/suspend.h
@@ -71,6 +71,7 @@ struct saved_context;
 void __save_processor_state(struct saved_context *ctxt);
 void __restore_processor_state(struct saved_context *ctxt);
 unsigned long get_safe_page(gfp_t gfp_mask);
+int swsusp_add_arch_pages(unsigned long start, unsigned long end);
 
 /*
  * XXX: We try to keep some more pages free so that I/O operations succeed
diff --git a/kernel/power/power.h b/kernel/power/power.h
index f06f12f21767..c81f0ed3eeba 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -105,6 +105,10 @@ extern struct bitmap_page *alloc_bitmap(unsigned int nr_bits);
 extern unsigned long alloc_swap_page(int swap, struct bitmap_page *bitmap);
 extern void free_all_swap_pages(int swap, struct bitmap_page *bitmap);
 
+extern unsigned int count_special_pages(void);
+extern int save_special_mem(void);
+extern int restore_special_mem(void);
+
 extern int swsusp_check(void);
 extern int swsusp_shrink_memory(void);
 extern void swsusp_free(void);
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 3eeedbb13b78..7f511d89c667 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -39,6 +39,88 @@ static unsigned int nr_copy_pages;
 static unsigned int nr_meta_pages;
 static unsigned long *buffer;
 
+struct arch_saveable_page {
+	unsigned long start;
+	unsigned long end;
+	char *data;
+	struct arch_saveable_page *next;
+};
+static struct arch_saveable_page *arch_pages;
+
+int swsusp_add_arch_pages(unsigned long start, unsigned long end)
+{
+	struct arch_saveable_page *tmp;
+
+	while (start < end) {
+		tmp = kzalloc(sizeof(struct arch_saveable_page), GFP_KERNEL);
+		if (!tmp)
+			return -ENOMEM;
+		tmp->start = start;
+		tmp->end = ((start >> PAGE_SHIFT) + 1) << PAGE_SHIFT;
+		if (tmp->end > end)
+			tmp->end = end;
+		tmp->next = arch_pages;
+		start = tmp->end;
+		arch_pages = tmp;
+	}
+	return 0;
+}
+
+static unsigned int count_arch_pages(void)
+{
+	unsigned int count = 0;
+	struct arch_saveable_page *tmp = arch_pages;
+	while (tmp) {
+		count++;
+		tmp = tmp->next;
+	}
+	return count;
+}
+
+static int save_arch_mem(void)
+{
+	char *kaddr;
+	struct arch_saveable_page *tmp = arch_pages;
+	int offset;
+
+	pr_debug("swsusp: Saving arch specific memory");
+	while (tmp) {
+		tmp->data = (char *)__get_free_page(GFP_ATOMIC);
+		if (!tmp->data)
+			return -ENOMEM;
+		offset = tmp->start - (tmp->start & PAGE_MASK);
+		/* arch pages might haven't a 'struct page' */
+		kaddr = kmap_atomic_pfn(tmp->start >> PAGE_SHIFT, KM_USER0);
+		memcpy(tmp->data + offset, kaddr + offset,
+			tmp->end - tmp->start);
+		kunmap_atomic(kaddr, KM_USER0);
+
+		tmp = tmp->next;
+	}
+	return 0;
+}
+
+static int restore_arch_mem(void)
+{
+	char *kaddr;
+	struct arch_saveable_page *tmp = arch_pages;
+	int offset;
+
+	while (tmp) {
+		if (!tmp->data)
+			continue;
+		offset = tmp->start - (tmp->start & PAGE_MASK);
+		kaddr = kmap_atomic_pfn(tmp->start >> PAGE_SHIFT, KM_USER0);
+		memcpy(kaddr + offset, tmp->data + offset,
+			tmp->end - tmp->start);
+		kunmap_atomic(kaddr, KM_USER0);
+		free_page((long)tmp->data);
+		tmp->data = NULL;
+		tmp = tmp->next;
+	}
+	return 0;
+}
+
 #ifdef CONFIG_HIGHMEM
 unsigned int count_highmem_pages(void)
 {
@@ -150,8 +232,35 @@ int restore_highmem(void)
 	}
 	return 0;
 }
+#else
+static unsigned int count_highmem_pages(void) {return 0;}
+static int save_highmem(void) {return 0;}
+static int restore_highmem(void) {return 0;}
 #endif
 
+unsigned int count_special_pages(void)
+{
+	return count_arch_pages() + count_highmem_pages();
+}
+
+int save_special_mem(void)
+{
+	int ret;
+	ret = save_arch_mem();
+	if (!ret)
+		ret = save_highmem();
+	return ret;
+}
+
+int restore_special_mem(void)
+{
+	int ret;
+	ret = restore_arch_mem();
+	if (!ret)
+		ret = restore_highmem();
+	return ret;
+}
+
 static int pfn_is_nosave(unsigned long pfn)
 {
 	unsigned long nosave_begin_pfn = __pa(&__nosave_begin) >> PAGE_SHIFT;
@@ -177,7 +286,6 @@ static int saveable(struct zone *zone, unsigned long *zone_pfn)
 		return 0;
 
 	page = pfn_to_page(pfn);
-	BUG_ON(PageReserved(page) && PageNosave(page));
 	if (PageNosave(page))
 		return 0;
 	if (PageReserved(page) && pfn_is_nosave(pfn))
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index f9238faf76e4..78b6e71b0813 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -62,16 +62,6 @@ unsigned long image_size = 500 * 1024 * 1024;
 
 int in_suspend __nosavedata = 0;
 
-#ifdef CONFIG_HIGHMEM
-unsigned int count_highmem_pages(void);
-int save_highmem(void);
-int restore_highmem(void);
-#else
-static int save_highmem(void) { return 0; }
-static int restore_highmem(void) { return 0; }
-static unsigned int count_highmem_pages(void) { return 0; }
-#endif
-
 /**
  *	The following functions are used for tracing the allocated
  *	swap pages, so that they can be freed in case of an error.
@@ -192,7 +182,7 @@ int swsusp_shrink_memory(void)
 
 	printk("Shrinking memory...  ");
 	do {
-		size = 2 * count_highmem_pages();
+		size = 2 * count_special_pages();
 		size += size / 50 + count_data_pages();
 		size += (size + PBES_PER_PAGE - 1) / PBES_PER_PAGE +
 			PAGES_FOR_IO;
@@ -234,7 +224,7 @@ int swsusp_suspend(void)
 		goto Enable_irqs;
 	}
 
-	if ((error = save_highmem())) {
+	if ((error = save_special_mem())) {
 		printk(KERN_ERR "swsusp: Not enough free pages for highmem\n");
 		goto Restore_highmem;
 	}
@@ -245,7 +235,7 @@ int swsusp_suspend(void)
 	/* Restore control flow magically appears here */
 	restore_processor_state();
 Restore_highmem:
-	restore_highmem();
+	restore_special_mem();
 	device_power_up();
 Enable_irqs:
 	local_irq_enable();
@@ -271,7 +261,7 @@ int swsusp_resume(void)
 	 */
 	swsusp_free();
 	restore_processor_state();
-	restore_highmem();
+	restore_special_mem();
 	touch_softlockup_watchdog();
 	device_power_up();
 	local_irq_enable();
-- 
cgit v1.2.3


From 98317f1271e7fd472983b013c76df6cc15fbef22 Mon Sep 17 00:00:00 2001
From: Roman Zippel <zippel@linux-m68k.org>
Date: Fri, 23 Jun 2006 02:04:54 -0700
Subject: [PATCH] m68k: Remove some unused definitions in zorro.h

These definitions have long been superseded by asm-offsets.h

Signed-off-by: Roman Zippel <zippel@linux-m68k.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/zorro.h | 42 ------------------------------------------
 1 file changed, 42 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/zorro.h b/include/linux/zorro.h
index 2f135cf6eef1..913bfc226dda 100644
--- a/include/linux/zorro.h
+++ b/include/linux/zorro.h
@@ -11,8 +11,6 @@
 #ifndef _LINUX_ZORRO_H
 #define _LINUX_ZORRO_H
 
-#ifndef __ASSEMBLY__
-
 #include <linux/device.h>
 
 
@@ -112,45 +110,6 @@ struct ConfigDev {
     __u32		cd_Unused[4];	/* for whatever the driver wants */
 } __attribute__ ((packed));
 
-#else /* __ASSEMBLY__ */
-
-LN_Succ		= 0
-LN_Pred		= LN_Succ+4
-LN_Type		= LN_Pred+4
-LN_Pri		= LN_Type+1
-LN_Name		= LN_Pri+1
-LN_sizeof	= LN_Name+4
-
-ER_Type		= 0
-ER_Product	= ER_Type+1
-ER_Flags	= ER_Product+1
-ER_Reserved03	= ER_Flags+1
-ER_Manufacturer	= ER_Reserved03+1
-ER_SerialNumber	= ER_Manufacturer+2
-ER_InitDiagVec	= ER_SerialNumber+4
-ER_Reserved0c	= ER_InitDiagVec+2
-ER_Reserved0d	= ER_Reserved0c+1
-ER_Reserved0e	= ER_Reserved0d+1
-ER_Reserved0f	= ER_Reserved0e+1
-ER_sizeof	= ER_Reserved0f+1
-
-CD_Node		= 0
-CD_Flags	= CD_Node+LN_sizeof
-CD_Pad		= CD_Flags+1
-CD_Rom		= CD_Pad+1
-CD_BoardAddr	= CD_Rom+ER_sizeof
-CD_BoardSize	= CD_BoardAddr+4
-CD_SlotAddr	= CD_BoardSize+4
-CD_SlotSize	= CD_SlotAddr+2
-CD_Driver	= CD_SlotSize+2
-CD_NextCD	= CD_Driver+4
-CD_Unused	= CD_NextCD+4
-CD_sizeof	= CD_Unused+(4*4)
-
-#endif /* __ASSEMBLY__ */
-
-#ifndef __ASSEMBLY__
-
 #define ZORRO_NUM_AUTO		16
 
 #ifdef __KERNEL__
@@ -290,7 +249,6 @@ extern DECLARE_BITMAP(zorro_unused_z2ram, 128);
 #define Z2RAM_CHUNKSHIFT	(16)
 
 
-#endif /* !__ASSEMBLY__ */
 #endif /* __KERNEL__ */
 
 #endif /* _LINUX_ZORRO_H */
-- 
cgit v1.2.3


From c330dda908b5a46469a997eea90b66f2f9f02b34 Mon Sep 17 00:00:00 2001
From: Jeff Moyer <jmoyer@redhat.com>
Date: Fri, 23 Jun 2006 02:05:07 -0700
Subject: [PATCH] Add a sysfs file to determine if a kexec kernel is loaded

Create two files in /sys/kernel, kexec_loaded and kexec_crash_loaded.  Each
file contains a simple boolean value indicating whether the relevant kernel
has been loaded into memory.  The motivation for this is geared around
support.

Signed-off-by: Jeff Moyer <jmoyer@redhat.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/kexec.h |  1 +
 kernel/kexec.c        |  6 +++---
 kernel/ksysfs.c       | 19 +++++++++++++++++++
 3 files changed, 23 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index cfb3410e32b1..6427949ddf99 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -106,6 +106,7 @@ extern struct page *kimage_alloc_control_pages(struct kimage *image,
 extern void crash_kexec(struct pt_regs *);
 int kexec_should_crash(struct task_struct *);
 extern struct kimage *kexec_image;
+extern struct kimage *kexec_crash_image;
 
 #define KEXEC_ON_CRASH  0x00000001
 #define KEXEC_ARCH_MASK 0xffff0000
diff --git a/kernel/kexec.c b/kernel/kexec.c
index bf39d28e4c0e..58f0f382597c 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -902,14 +902,14 @@ static int kimage_load_segment(struct kimage *image,
  * kexec does not sync, or unmount filesystems so if you need
  * that to happen you need to do that yourself.
  */
-struct kimage *kexec_image = NULL;
-static struct kimage *kexec_crash_image = NULL;
+struct kimage *kexec_image;
+struct kimage *kexec_crash_image;
 /*
  * A home grown binary mutex.
  * Nothing can wait so this mutex is safe to use
  * in interrupt context :)
  */
-static int kexec_lock = 0;
+static int kexec_lock;
 
 asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments,
 				struct kexec_segment __user *segments,
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index f119e098e67b..9e28478a17a5 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -14,6 +14,7 @@
 #include <linux/sysfs.h>
 #include <linux/module.h>
 #include <linux/init.h>
+#include <linux/kexec.h>
 
 #define KERNEL_ATTR_RO(_name) \
 static struct subsys_attribute _name##_attr = __ATTR_RO(_name)
@@ -48,6 +49,20 @@ static ssize_t uevent_helper_store(struct subsystem *subsys, const char *page, s
 KERNEL_ATTR_RW(uevent_helper);
 #endif
 
+#ifdef CONFIG_KEXEC
+static ssize_t kexec_loaded_show(struct subsystem *subsys, char *page)
+{
+	return sprintf(page, "%d\n", !!kexec_image);
+}
+KERNEL_ATTR_RO(kexec_loaded);
+
+static ssize_t kexec_crash_loaded_show(struct subsystem *subsys, char *page)
+{
+	return sprintf(page, "%d\n", !!kexec_crash_image);
+}
+KERNEL_ATTR_RO(kexec_crash_loaded);
+#endif /* CONFIG_KEXEC */
+
 decl_subsys(kernel, NULL, NULL);
 EXPORT_SYMBOL_GPL(kernel_subsys);
 
@@ -55,6 +70,10 @@ static struct attribute * kernel_attrs[] = {
 #if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET)
 	&uevent_seqnum_attr.attr,
 	&uevent_helper_attr.attr,
+#endif
+#ifdef CONFIG_KEXEC
+	&kexec_loaded_attr.attr,
+	&kexec_crash_loaded_attr.attr,
 #endif
 	NULL
 };
-- 
cgit v1.2.3


From 090d2b185d8680fc26a2eaf4245d4171dcf4baf1 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Fri, 23 Jun 2006 02:05:08 -0700
Subject: [PATCH] read_mapping_page for address space

Add read_mapping_page() which is used for callers that pass
mapping->a_ops->readpage as the filler for read_cache_page.  This removes
some duplication from filesystem code.

Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/afs/dir.c            |  4 +---
 fs/afs/mntpt.c          | 11 ++---------
 fs/cramfs/inode.c       |  4 +---
 fs/ext2/dir.c           |  3 +--
 fs/freevxfs/vxfs_subr.c |  3 +--
 fs/hfs/bnode.c          |  2 +-
 fs/hfs/btree.c          |  2 +-
 fs/hfsplus/bitmap.c     | 15 +++++++--------
 fs/hfsplus/bnode.c      |  2 +-
 fs/hfsplus/btree.c      |  2 +-
 fs/jfs/jfs_metapage.c   |  5 ++---
 fs/minix/dir.c          |  3 +--
 fs/namei.c              |  3 +--
 fs/ntfs/aops.h          |  3 +--
 fs/ntfs/attrib.c        |  6 ++----
 fs/ntfs/file.c          |  3 +--
 fs/ocfs2/symlink.c      |  3 +--
 fs/partitions/check.c   |  4 ++--
 fs/reiserfs/xattr.c     |  3 +--
 fs/sysv/dir.c           |  3 +--
 include/linux/pagemap.h |  7 +++++++
 mm/swapfile.c           |  3 +--
 22 files changed, 38 insertions(+), 56 deletions(-)

(limited to 'include/linux')

diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index a6dff6a4f204..2fc99877cb0d 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -185,9 +185,7 @@ static struct page *afs_dir_get_page(struct inode *dir, unsigned long index)
 
 	_enter("{%lu},%lu", dir->i_ino, index);
 
-	page = read_cache_page(dir->i_mapping,index,
-			       (filler_t *) dir->i_mapping->a_ops->readpage,
-			       NULL);
+	page = read_mapping_page(dir->i_mapping, index, NULL);
 	if (!IS_ERR(page)) {
 		wait_on_page_locked(page);
 		kmap(page);
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index 4e6eeb59b83c..b5cf9e1205ad 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -63,7 +63,6 @@ unsigned long afs_mntpt_expiry_timeout = 20;
 int afs_mntpt_check_symlink(struct afs_vnode *vnode)
 {
 	struct page *page;
-	filler_t *filler;
 	size_t size;
 	char *buf;
 	int ret;
@@ -71,10 +70,7 @@ int afs_mntpt_check_symlink(struct afs_vnode *vnode)
 	_enter("{%u,%u}", vnode->fid.vnode, vnode->fid.unique);
 
 	/* read the contents of the symlink into the pagecache */
-	filler = (filler_t *) AFS_VNODE_TO_I(vnode)->i_mapping->a_ops->readpage;
-
-	page = read_cache_page(AFS_VNODE_TO_I(vnode)->i_mapping, 0,
-			       filler, NULL);
+	page = read_mapping_page(AFS_VNODE_TO_I(vnode)->i_mapping, 0, NULL);
 	if (IS_ERR(page)) {
 		ret = PTR_ERR(page);
 		goto out;
@@ -160,7 +156,6 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
 	struct page *page = NULL;
 	size_t size;
 	char *buf, *devname = NULL, *options = NULL;
-	filler_t *filler;
 	int ret;
 
 	kenter("{%s}", mntpt->d_name.name);
@@ -182,9 +177,7 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
 		goto error;
 
 	/* read the contents of the AFS special symlink */
-	filler = (filler_t *)mntpt->d_inode->i_mapping->a_ops->readpage;
-
-	page = read_cache_page(mntpt->d_inode->i_mapping, 0, filler, NULL);
+	page = read_mapping_page(mntpt->d_inode->i_mapping, 0, NULL);
 	if (IS_ERR(page)) {
 		ret = PTR_ERR(page);
 		goto error;
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index 8a9d5d3b3262..c45d73860803 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -181,9 +181,7 @@ static void *cramfs_read(struct super_block *sb, unsigned int offset, unsigned i
 		struct page *page = NULL;
 
 		if (blocknr + i < devsize) {
-			page = read_cache_page(mapping, blocknr + i,
-				(filler_t *)mapping->a_ops->readpage,
-				NULL);
+			page = read_mapping_page(mapping, blocknr + i, NULL);
 			/* synchronous error? */
 			if (IS_ERR(page))
 				page = NULL;
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index d672aa9f4061..3c1c9aaaca6b 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -159,8 +159,7 @@ fail:
 static struct page * ext2_get_page(struct inode *dir, unsigned long n)
 {
 	struct address_space *mapping = dir->i_mapping;
-	struct page *page = read_cache_page(mapping, n,
-				(filler_t*)mapping->a_ops->readpage, NULL);
+	struct page *page = read_mapping_page(mapping, n, NULL);
 	if (!IS_ERR(page)) {
 		wait_on_page_locked(page);
 		kmap(page);
diff --git a/fs/freevxfs/vxfs_subr.c b/fs/freevxfs/vxfs_subr.c
index 50aae77651b2..c1be118fc067 100644
--- a/fs/freevxfs/vxfs_subr.c
+++ b/fs/freevxfs/vxfs_subr.c
@@ -71,8 +71,7 @@ vxfs_get_page(struct address_space *mapping, u_long n)
 {
 	struct page *			pp;
 
-	pp = read_cache_page(mapping, n,
-			(filler_t*)mapping->a_ops->readpage, NULL);
+	pp = read_mapping_page(mapping, n, NULL);
 
 	if (!IS_ERR(pp)) {
 		wait_on_page_locked(pp);
diff --git a/fs/hfs/bnode.c b/fs/hfs/bnode.c
index 1e44dcfe49c4..13231dd5ce66 100644
--- a/fs/hfs/bnode.c
+++ b/fs/hfs/bnode.c
@@ -280,7 +280,7 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid)
 	block = off >> PAGE_CACHE_SHIFT;
 	node->page_offset = off & ~PAGE_CACHE_MASK;
 	for (i = 0; i < tree->pages_per_bnode; i++) {
-		page = read_cache_page(mapping, block++, (filler_t *)mapping->a_ops->readpage, NULL);
+		page = read_mapping_page(mapping, block++, NULL);
 		if (IS_ERR(page))
 			goto fail;
 		if (PageError(page)) {
diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c
index d20131ce4b95..400357994319 100644
--- a/fs/hfs/btree.c
+++ b/fs/hfs/btree.c
@@ -59,7 +59,7 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke
 	unlock_new_inode(tree->inode);
 
 	mapping = tree->inode->i_mapping;
-	page = read_cache_page(mapping, 0, (filler_t *)mapping->a_ops->readpage, NULL);
+	page = read_mapping_page(mapping, 0, NULL);
 	if (IS_ERR(page))
 		goto free_tree;
 
diff --git a/fs/hfsplus/bitmap.c b/fs/hfsplus/bitmap.c
index 9fb51632303c..d128a25b74d2 100644
--- a/fs/hfsplus/bitmap.c
+++ b/fs/hfsplus/bitmap.c
@@ -31,8 +31,7 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size, u32 offset, u32 *ma
 	dprint(DBG_BITMAP, "block_allocate: %u,%u,%u\n", size, offset, len);
 	mutex_lock(&HFSPLUS_SB(sb).alloc_file->i_mutex);
 	mapping = HFSPLUS_SB(sb).alloc_file->i_mapping;
-	page = read_cache_page(mapping, offset / PAGE_CACHE_BITS,
-			       (filler_t *)mapping->a_ops->readpage, NULL);
+	page = read_mapping_page(mapping, offset / PAGE_CACHE_BITS, NULL);
 	pptr = kmap(page);
 	curr = pptr + (offset & (PAGE_CACHE_BITS - 1)) / 32;
 	i = offset % 32;
@@ -72,8 +71,8 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size, u32 offset, u32 *ma
 		offset += PAGE_CACHE_BITS;
 		if (offset >= size)
 			break;
-		page = read_cache_page(mapping, offset / PAGE_CACHE_BITS,
-				       (filler_t *)mapping->a_ops->readpage, NULL);
+		page = read_mapping_page(mapping, offset / PAGE_CACHE_BITS,
+					 NULL);
 		curr = pptr = kmap(page);
 		if ((size ^ offset) / PAGE_CACHE_BITS)
 			end = pptr + PAGE_CACHE_BITS / 32;
@@ -119,8 +118,8 @@ found:
 		set_page_dirty(page);
 		kunmap(page);
 		offset += PAGE_CACHE_BITS;
-		page = read_cache_page(mapping, offset / PAGE_CACHE_BITS,
-				       (filler_t *)mapping->a_ops->readpage, NULL);
+		page = read_mapping_page(mapping, offset / PAGE_CACHE_BITS,
+					 NULL);
 		pptr = kmap(page);
 		curr = pptr;
 		end = pptr + PAGE_CACHE_BITS / 32;
@@ -167,7 +166,7 @@ int hfsplus_block_free(struct super_block *sb, u32 offset, u32 count)
 	mutex_lock(&HFSPLUS_SB(sb).alloc_file->i_mutex);
 	mapping = HFSPLUS_SB(sb).alloc_file->i_mapping;
 	pnr = offset / PAGE_CACHE_BITS;
-	page = read_cache_page(mapping, pnr, (filler_t *)mapping->a_ops->readpage, NULL);
+	page = read_mapping_page(mapping, pnr, NULL);
 	pptr = kmap(page);
 	curr = pptr + (offset & (PAGE_CACHE_BITS - 1)) / 32;
 	end = pptr + PAGE_CACHE_BITS / 32;
@@ -199,7 +198,7 @@ int hfsplus_block_free(struct super_block *sb, u32 offset, u32 count)
 			break;
 		set_page_dirty(page);
 		kunmap(page);
-		page = read_cache_page(mapping, ++pnr, (filler_t *)mapping->a_ops->readpage, NULL);
+		page = read_mapping_page(mapping, ++pnr, NULL);
 		pptr = kmap(page);
 		curr = pptr;
 		end = pptr + PAGE_CACHE_BITS / 32;
diff --git a/fs/hfsplus/bnode.c b/fs/hfsplus/bnode.c
index 746abc9ecf70..77bf434da679 100644
--- a/fs/hfsplus/bnode.c
+++ b/fs/hfsplus/bnode.c
@@ -440,7 +440,7 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid)
 	block = off >> PAGE_CACHE_SHIFT;
 	node->page_offset = off & ~PAGE_CACHE_MASK;
 	for (i = 0; i < tree->pages_per_bnode; block++, i++) {
-		page = read_cache_page(mapping, block, (filler_t *)mapping->a_ops->readpage, NULL);
+		page = read_mapping_page(mapping, block, NULL);
 		if (IS_ERR(page))
 			goto fail;
 		if (PageError(page)) {
diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c
index effa8991999c..cfc852fdd1b5 100644
--- a/fs/hfsplus/btree.c
+++ b/fs/hfsplus/btree.c
@@ -38,7 +38,7 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id)
 		goto free_tree;
 
 	mapping = tree->inode->i_mapping;
-	page = read_cache_page(mapping, 0, (filler_t *)mapping->a_ops->readpage, NULL);
+	page = read_mapping_page(mapping, 0, NULL);
 	if (IS_ERR(page))
 		goto free_tree;
 
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index 2b220dd6b4e7..7f6e88039700 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -632,10 +632,9 @@ struct metapage *__get_metapage(struct inode *inode, unsigned long lblock,
 		}
 		SetPageUptodate(page);
 	} else {
-		page = read_cache_page(mapping, page_index,
-			    (filler_t *)mapping->a_ops->readpage, NULL);
+		page = read_mapping_page(mapping, page_index, NULL);
 		if (IS_ERR(page) || !PageUptodate(page)) {
-			jfs_err("read_cache_page failed!");
+			jfs_err("read_mapping_page failed!");
 			return NULL;
 		}
 		lock_page(page);
diff --git a/fs/minix/dir.c b/fs/minix/dir.c
index 69224d1fe043..2b0a389d1987 100644
--- a/fs/minix/dir.c
+++ b/fs/minix/dir.c
@@ -60,8 +60,7 @@ static int dir_commit_chunk(struct page *page, unsigned from, unsigned to)
 static struct page * dir_get_page(struct inode *dir, unsigned long n)
 {
 	struct address_space *mapping = dir->i_mapping;
-	struct page *page = read_cache_page(mapping, n,
-				(filler_t*)mapping->a_ops->readpage, NULL);
+	struct page *page = read_mapping_page(mapping, n, NULL);
 	if (!IS_ERR(page)) {
 		wait_on_page_locked(page);
 		kmap(page);
diff --git a/fs/namei.c b/fs/namei.c
index 184fe4acf824..bb4a3e40e432 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2577,8 +2577,7 @@ static char *page_getlink(struct dentry * dentry, struct page **ppage)
 {
 	struct page * page;
 	struct address_space *mapping = dentry->d_inode->i_mapping;
-	page = read_cache_page(mapping, 0, (filler_t *)mapping->a_ops->readpage,
-				NULL);
+	page = read_mapping_page(mapping, 0, NULL);
 	if (IS_ERR(page))
 		goto sync_fail;
 	wait_on_page_locked(page);
diff --git a/fs/ntfs/aops.h b/fs/ntfs/aops.h
index 3b74e66ca2ff..325ce261a107 100644
--- a/fs/ntfs/aops.h
+++ b/fs/ntfs/aops.h
@@ -86,8 +86,7 @@ static inline void ntfs_unmap_page(struct page *page)
 static inline struct page *ntfs_map_page(struct address_space *mapping,
 		unsigned long index)
 {
-	struct page *page = read_cache_page(mapping, index,
-			(filler_t*)mapping->a_ops->readpage, NULL);
+	struct page *page = read_mapping_page(mapping, index, NULL);
 
 	if (!IS_ERR(page)) {
 		wait_on_page_locked(page);
diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c
index 1663f5c3c6aa..6708e1d68a9e 100644
--- a/fs/ntfs/attrib.c
+++ b/fs/ntfs/attrib.c
@@ -2529,8 +2529,7 @@ int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt, const u8 val)
 	end >>= PAGE_CACHE_SHIFT;
 	/* If there is a first partial page, need to do it the slow way. */
 	if (start_ofs) {
-		page = read_cache_page(mapping, idx,
-				(filler_t*)mapping->a_ops->readpage, NULL);
+		page = read_mapping_page(mapping, idx, NULL);
 		if (IS_ERR(page)) {
 			ntfs_error(vol->sb, "Failed to read first partial "
 					"page (sync error, index 0x%lx).", idx);
@@ -2600,8 +2599,7 @@ int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt, const u8 val)
 	}
 	/* If there is a last partial page, need to do it the slow way. */
 	if (end_ofs) {
-		page = read_cache_page(mapping, idx,
-				(filler_t*)mapping->a_ops->readpage, NULL);
+		page = read_mapping_page(mapping, idx, NULL);
 		if (IS_ERR(page)) {
 			ntfs_error(vol->sb, "Failed to read last partial page "
 					"(sync error, index 0x%lx).", idx);
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 36e1e136bb0c..88292f9e4b9b 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -231,8 +231,7 @@ do_non_resident_extend:
 		 * Read the page.  If the page is not present, this will zero
 		 * the uninitialized regions for us.
 		 */
-		page = read_cache_page(mapping, index,
-				(filler_t*)mapping->a_ops->readpage, NULL);
+		page = read_mapping_page(mapping, index, NULL);
 		if (IS_ERR(page)) {
 			err = PTR_ERR(page);
 			goto init_err_out;
diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c
index f6986bd79e75..0c8a1294ec96 100644
--- a/fs/ocfs2/symlink.c
+++ b/fs/ocfs2/symlink.c
@@ -64,8 +64,7 @@ static char *ocfs2_page_getlink(struct dentry * dentry,
 {
 	struct page * page;
 	struct address_space *mapping = dentry->d_inode->i_mapping;
-	page = read_cache_page(mapping, 0,
-			       (filler_t *)mapping->a_ops->readpage, NULL);
+	page = read_mapping_page(mapping, 0, NULL);
 	if (IS_ERR(page))
 		goto sync_fail;
 	wait_on_page_locked(page);
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 8851b81e7c5a..cd885b23cb5c 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -499,8 +499,8 @@ unsigned char *read_dev_sector(struct block_device *bdev, sector_t n, Sector *p)
 	struct address_space *mapping = bdev->bd_inode->i_mapping;
 	struct page *page;
 
-	page = read_cache_page(mapping, (pgoff_t)(n >> (PAGE_CACHE_SHIFT-9)),
-			(filler_t *)mapping->a_ops->readpage, NULL);
+	page = read_mapping_page(mapping, (pgoff_t)(n >> (PAGE_CACHE_SHIFT-9)),
+				 NULL);
 	if (!IS_ERR(page)) {
 		wait_on_page_locked(page);
 		if (!PageUptodate(page))
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index ffb79c48c5bf..39fedaa88a0c 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -452,8 +452,7 @@ static struct page *reiserfs_get_page(struct inode *dir, unsigned long n)
 	/* We can deadlock if we try to free dentries,
 	   and an unlink/rmdir has just occured - GFP_NOFS avoids this */
 	mapping_set_gfp_mask(mapping, GFP_NOFS);
-	page = read_cache_page(mapping, n,
-			       (filler_t *) mapping->a_ops->readpage, NULL);
+	page = read_mapping_page(mapping, n, NULL);
 	if (!IS_ERR(page)) {
 		wait_on_page_locked(page);
 		kmap(page);
diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c
index d7074341ee87..f2bef962d309 100644
--- a/fs/sysv/dir.c
+++ b/fs/sysv/dir.c
@@ -53,8 +53,7 @@ static int dir_commit_chunk(struct page *page, unsigned from, unsigned to)
 static struct page * dir_get_page(struct inode *dir, unsigned long n)
 {
 	struct address_space *mapping = dir->i_mapping;
-	struct page *page = read_cache_page(mapping, n,
-				(filler_t*)mapping->a_ops->readpage, NULL);
+	struct page *page = read_mapping_page(mapping, n, NULL);
 	if (!IS_ERR(page)) {
 		wait_on_page_locked(page);
 		kmap(page);
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 7a1af574dedf..1245df7141aa 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -99,6 +99,13 @@ extern struct page * read_cache_page(struct address_space *mapping,
 extern int read_cache_pages(struct address_space *mapping,
 		struct list_head *pages, filler_t *filler, void *data);
 
+static inline struct page *read_mapping_page(struct address_space *mapping,
+					     unsigned long index, void *data)
+{
+	filler_t *filler = (filler_t *)mapping->a_ops->readpage;
+	return read_cache_page(mapping, index, filler, data);
+}
+
 int add_to_page_cache(struct page *page, struct address_space *mapping,
 				unsigned long index, gfp_t gfp_mask);
 int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
diff --git a/mm/swapfile.c b/mm/swapfile.c
index f2824c3c31b4..cc367f7e75d8 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1477,8 +1477,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
 		error = -EINVAL;
 		goto bad_swap;
 	}
-	page = read_cache_page(mapping, 0,
-			(filler_t *)mapping->a_ops->readpage, swap_file);
+	page = read_mapping_page(mapping, 0, swap_file);
 	if (IS_ERR(page)) {
 		error = PTR_ERR(page);
 		goto bad_swap;
-- 
cgit v1.2.3


From 75e1fcc0b18df0a65ab113198e9dc0e98999a08c Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <miklos@szeredi.hu>
Date: Fri, 23 Jun 2006 02:05:12 -0700
Subject: [PATCH] vfs: add lock owner argument to flush operation

Pass the POSIX lock owner ID to the flush operation.

This is useful for filesystems which don't want to store any locking state
in inode->i_flock but want to handle locking/unlocking POSIX locks
internally.  FUSE is one such filesystem but I think it possible that some
network filesystems would need this also.

Also add a flag to indicate that a POSIX locking request was generated by
close(), so filesystems using the above feature won't send an extra locking
request in this case.

Signed-off-by: Miklos Szeredi <miklos@szeredi.hu>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/ia64/kernel/perfmon.c | 3 +--
 drivers/input/evdev.c      | 2 +-
 drivers/scsi/osst.c        | 2 +-
 drivers/scsi/st.c          | 2 +-
 fs/cifs/cifsfs.h           | 2 +-
 fs/cifs/file.c             | 2 +-
 fs/coda/file.c             | 2 +-
 fs/fuse/file.c             | 2 +-
 fs/locks.c                 | 2 +-
 fs/nfs/file.c              | 4 ++--
 fs/open.c                  | 2 +-
 include/linux/coda_linux.h | 2 +-
 include/linux/fs.h         | 3 ++-
 ipc/mqueue.c               | 2 +-
 14 files changed, 16 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c
index 2359e2809f50..6d7bc8ff7b3a 100644
--- a/arch/ia64/kernel/perfmon.c
+++ b/arch/ia64/kernel/perfmon.c
@@ -532,7 +532,6 @@ static ctl_table pfm_sysctl_root[] = {
 static struct ctl_table_header *pfm_sysctl_header;
 
 static int pfm_context_unload(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs);
-static int pfm_flush(struct file *filp);
 
 #define pfm_get_cpu_var(v)		__ia64_per_cpu_var(v)
 #define pfm_get_cpu_data(a,b)		per_cpu(a, b)
@@ -1774,7 +1773,7 @@ pfm_syswide_cleanup_other_cpu(pfm_context_t *ctx)
  * When caller is self-monitoring, the context is unloaded.
  */
 static int
-pfm_flush(struct file *filp)
+pfm_flush(struct file *filp, fl_owner_t id)
 {
 	pfm_context_t *ctx;
 	struct task_struct *task;
diff --git a/drivers/input/evdev.c b/drivers/input/evdev.c
index ba325f16d077..5f561fce32d8 100644
--- a/drivers/input/evdev.c
+++ b/drivers/input/evdev.c
@@ -82,7 +82,7 @@ static int evdev_fasync(int fd, struct file *file, int on)
 	return retval < 0 ? retval : 0;
 }
 
-static int evdev_flush(struct file * file)
+static int evdev_flush(struct file * file, fl_owner_t id)
 {
 	struct evdev_list *list = file->private_data;
 	if (!list->evdev->exist) return -ENODEV;
diff --git a/drivers/scsi/osst.c b/drivers/scsi/osst.c
index ce0ba3a174f9..4a2fed350d4e 100644
--- a/drivers/scsi/osst.c
+++ b/drivers/scsi/osst.c
@@ -4724,7 +4724,7 @@ err_out:
 
 
 /* Flush the tape buffer before close */
-static int os_scsi_tape_flush(struct file * filp)
+static int os_scsi_tape_flush(struct file * filp, fl_owner_t id)
 {
 	int		      result = 0, result2;
 	struct osst_tape    * STp    = filp->private_data;
diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c
index ad87d73f88ee..1272dd249af3 100644
--- a/drivers/scsi/st.c
+++ b/drivers/scsi/st.c
@@ -1193,7 +1193,7 @@ static int st_open(struct inode *inode, struct file *filp)
 
 
 /* Flush the tape buffer before close */
-static int st_flush(struct file *filp)
+static int st_flush(struct file *filp, fl_owner_t id)
 {
 	int result = 0, result2;
 	unsigned char cmd[MAX_COMMAND_SIZE];
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index c98755dca868..d56c0577c710 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -74,7 +74,7 @@ extern ssize_t cifs_user_write(struct file *file, const char __user *write_data,
 			 size_t write_size, loff_t * poffset);
 extern int cifs_lock(struct file *, int, struct file_lock *);
 extern int cifs_fsync(struct file *, struct dentry *, int);
-extern int cifs_flush(struct file *);
+extern int cifs_flush(struct file *, fl_owner_t id);
 extern int cifs_file_mmap(struct file * , struct vm_area_struct *);
 extern const struct file_operations cifs_dir_ops;
 extern int cifs_dir_open(struct inode *inode, struct file *file);
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 487ea8b3baaa..b4a18c1cab0a 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -1417,7 +1417,7 @@ int cifs_fsync(struct file *file, struct dentry *dentry, int datasync)
  * As file closes, flush all cached write data for this inode checking
  * for write behind errors.
  */
-int cifs_flush(struct file *file)
+int cifs_flush(struct file *file, fl_owner_t id)
 {
 	struct inode * inode = file->f_dentry->d_inode;
 	int rc = 0;
diff --git a/fs/coda/file.c b/fs/coda/file.c
index 7c2642431fa5..cc66c681bd11 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -164,7 +164,7 @@ int coda_open(struct inode *coda_inode, struct file *coda_file)
 	return 0;
 }
 
-int coda_flush(struct file *coda_file)
+int coda_flush(struct file *coda_file, fl_owner_t id)
 {
 	unsigned short flags = coda_file->f_flags & ~O_EXCL;
 	unsigned short coda_flags = coda_flags_to_cflags(flags);
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index fc342cf7c2cc..087f3b734f40 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -169,7 +169,7 @@ static int fuse_release(struct inode *inode, struct file *file)
 	return fuse_release_common(inode, file, 0);
 }
 
-static int fuse_flush(struct file *file)
+static int fuse_flush(struct file *file, fl_owner_t id)
 {
 	struct inode *inode = file->f_dentry->d_inode;
 	struct fuse_conn *fc = get_fuse_conn(inode);
diff --git a/fs/locks.c b/fs/locks.c
index e588e1c265f7..f8a634ac1121 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1907,7 +1907,7 @@ void locks_remove_posix(struct file *filp, fl_owner_t owner)
 		return;
 
 	lock.fl_type = F_UNLCK;
-	lock.fl_flags = FL_POSIX;
+	lock.fl_flags = FL_POSIX | FL_CLOSE;
 	lock.fl_start = 0;
 	lock.fl_end = OFFSET_MAX;
 	lock.fl_owner = owner;
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index fade02c15e6e..fa05c027ea11 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -43,7 +43,7 @@ static int  nfs_file_mmap(struct file *, struct vm_area_struct *);
 static ssize_t nfs_file_sendfile(struct file *, loff_t *, size_t, read_actor_t, void *);
 static ssize_t nfs_file_read(struct kiocb *, char __user *, size_t, loff_t);
 static ssize_t nfs_file_write(struct kiocb *, const char __user *, size_t, loff_t);
-static int  nfs_file_flush(struct file *);
+static int  nfs_file_flush(struct file *, fl_owner_t id);
 static int  nfs_fsync(struct file *, struct dentry *dentry, int datasync);
 static int nfs_check_flags(int flags);
 static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl);
@@ -188,7 +188,7 @@ static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin)
  *
  */
 static int
-nfs_file_flush(struct file *file)
+nfs_file_flush(struct file *file, fl_owner_t id)
 {
 	struct nfs_open_context *ctx = (struct nfs_open_context *)file->private_data;
 	struct inode	*inode = file->f_dentry->d_inode;
diff --git a/fs/open.c b/fs/open.c
index a37ff861108f..5fb16e5267dc 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -1152,7 +1152,7 @@ int filp_close(struct file *filp, fl_owner_t id)
 	}
 
 	if (filp->f_op && filp->f_op->flush)
-		retval = filp->f_op->flush(filp);
+		retval = filp->f_op->flush(filp, id);
 
 	dnotify_flush(filp, id);
 	locks_remove_posix(filp, id);
diff --git a/include/linux/coda_linux.h b/include/linux/coda_linux.h
index b3ecf8f71d97..7b5c5df5cb69 100644
--- a/include/linux/coda_linux.h
+++ b/include/linux/coda_linux.h
@@ -36,7 +36,7 @@ extern const struct file_operations coda_ioctl_operations;
 
 /* operations shared over more than one file */
 int coda_open(struct inode *i, struct file *f);
-int coda_flush(struct file *f);
+int coda_flush(struct file *f, fl_owner_t id);
 int coda_release(struct inode *i, struct file *f);
 int coda_permission(struct inode *inode, int mask, struct nameidata *nd);
 int coda_revalidate_inode(struct dentry *);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index e917403f4d58..56d8bf0d0a77 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -683,6 +683,7 @@ extern spinlock_t files_lock;
 #define FL_FLOCK	2
 #define FL_ACCESS	8	/* not trying to lock, just looking */
 #define FL_LEASE	32	/* lease held on this file */
+#define FL_CLOSE	64	/* unlock on close */
 #define FL_SLEEP	128	/* A blocking lock */
 
 /*
@@ -1025,7 +1026,7 @@ struct file_operations {
 	long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
 	int (*mmap) (struct file *, struct vm_area_struct *);
 	int (*open) (struct inode *, struct file *);
-	int (*flush) (struct file *);
+	int (*flush) (struct file *, fl_owner_t id);
 	int (*release) (struct inode *, struct file *);
 	int (*fsync) (struct file *, struct dentry *, int datasync);
 	int (*aio_fsync) (struct kiocb *, int datasync);
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index 0a2a24b6ebe4..02e6f6798972 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -359,7 +359,7 @@ static ssize_t mqueue_read_file(struct file *filp, char __user *u_data,
 	return count;
 }
 
-static int mqueue_flush_file(struct file *filp)
+static int mqueue_flush_file(struct file *filp, fl_owner_t id)
 {
 	struct mqueue_inode_info *info = MQUEUE_I(filp->f_dentry->d_inode);
 
-- 
cgit v1.2.3


From b0904e147f7cbe4be3b4dae49ddccd627bb66f16 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Fri, 23 Jun 2006 02:05:13 -0700
Subject: [PATCH] fs/locks.c: make posix_locks_deadlock() static

We can now make posix_locks_deadlock() static.

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/locks.c         | 4 +---
 include/linux/fs.h | 1 -
 2 files changed, 1 insertion(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/fs/locks.c b/fs/locks.c
index f8a634ac1121..1ad29c9b6252 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -703,7 +703,7 @@ EXPORT_SYMBOL(posix_test_lock);
  * from a broken NFS client. But broken NFS clients have a lot more to
  * worry about than proper deadlock detection anyway... --okir
  */
-int posix_locks_deadlock(struct file_lock *caller_fl,
+static int posix_locks_deadlock(struct file_lock *caller_fl,
 				struct file_lock *block_fl)
 {
 	struct list_head *tmp;
@@ -722,8 +722,6 @@ next_task:
 	return 0;
 }
 
-EXPORT_SYMBOL(posix_locks_deadlock);
-
 /* Try to create a FLOCK lock on filp. We always insert new FLOCK locks
  * at the head of the list, but that's secret knowledge known only to
  * flock_lock_file and posix_lock_file.
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 56d8bf0d0a77..dba4cbd157ee 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -776,7 +776,6 @@ extern int posix_lock_file_conf(struct file *, struct file_lock *, struct file_l
 extern int posix_lock_file(struct file *, struct file_lock *);
 extern int posix_lock_file_wait(struct file *, struct file_lock *);
 extern int posix_unblock_lock(struct file *, struct file_lock *);
-extern int posix_locks_deadlock(struct file_lock *, struct file_lock *);
 extern int flock_lock_file_wait(struct file *filp, struct file_lock *fl);
 extern int __break_lease(struct inode *inode, unsigned int flags);
 extern void lease_get_mtime(struct inode *, struct timespec *time);
-- 
cgit v1.2.3


From 8d27e9084b372441dc8c9cf361a965ee58032767 Mon Sep 17 00:00:00 2001
From: Xose Vazquez Perez <xose.vazquez@gmail.com>
Date: Fri, 23 Jun 2006 02:05:13 -0700
Subject: [PATCH] module.h: updated comments with a new license

"Dual MIT/GPL" is also accepted (kernel/module.c), so updated comments.

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/module.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/module.h b/include/linux/module.h
index c2d89e037af0..2d366098eab5 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -105,6 +105,8 @@ extern struct module __this_module;
  *	"GPL and additional rights"	[GNU Public License v2 rights and more]
  *	"Dual BSD/GPL"			[GNU Public License v2
  *					 or BSD license choice]
+ *	"Dual MIT/GPL"			[GNU Public License v2
+ *					 or MIT license choice]
  *	"Dual MPL/GPL"			[GNU Public License v2
  *					 or Mozilla license choice]
  *
-- 
cgit v1.2.3


From 260ea1013283d8acbb451459ed1ca560c1445c20 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 23 Jun 2006 02:05:18 -0700
Subject: [PATCH] ptrace: document the locking rules

After a lot of reading the code and thinking about how it behaves I have
managed to figure out what the current ptrace locking rules are.  The
current code is in much better that it appears at first glance.  The
troublesome code paths are actually the code paths that violate the current
rules.

ptrace uses simple exclusive access as it's locking.  You can only touch
task->ptrace if the task is stopped and you are the ptracer, or if the task
is running and are the task itself.

Very simple, very easy to maintain.  It just needs to be documented so
people know not to touch ptrace from elsewhere.

Currently we do have a few pieces of code that are in violation of this
rule.  Particularly the core dump code, and ptrace_attach.  But so far the
code looks fixable.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/ptrace.h | 4 ++++
 include/linux/sched.h  | 2 +-
 2 files changed, 5 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h
index 0d36750fc0f1..ee918bc6e18c 100644
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -51,6 +51,10 @@
 #ifdef __KERNEL__
 /*
  * Ptrace flags
+ *
+ * The owner ship rules for task->ptrace which holds the ptrace
+ * flags is simple.  When a task is running it owns it's task->ptrace
+ * flags.  When the a task is stopped the ptracer owns task->ptrace.
  */
 
 #define PT_PTRACED	0x00000001
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 267f15257040..a9d23c7d1b25 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1225,7 +1225,7 @@ static inline int thread_group_empty(task_t *p)
 		(thread_group_leader(p) && !thread_group_empty(p))
 
 /*
- * Protects ->fs, ->files, ->mm, ->ptrace, ->group_info, ->comm, keyring
+ * Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring
  * subscriptions and synchronises with wait4().  Also used in procfs.  Also
  * pins the final release of task.io_context.  Also protects ->cpuset.
  *
-- 
cgit v1.2.3


From 0216bfcffe424a5473daa4da47440881b36c1f41 Mon Sep 17 00:00:00 2001
From: Mingming Cao <cmm@us.ibm.com>
Date: Fri, 23 Jun 2006 02:05:41 -0700
Subject: [PATCH] percpu counter data type changes to suppport more than 2**31
 ext3 free blocks counter

The percpu counter data type are changed in this set of patches to support
more users like ext3 who need more than 32 bit to store the free blocks
total in the filesystem.

- Generic perpcu counters data type changes.  The size of the global counter
  and local counter were explictly specified using s64 and s32.  The global
  counter is changed from long to s64, while the local counter is changed from
  long to s32, so we could avoid doing 64 bit update in most cases.

- Users of the percpu counters are updated to make use of the new
  percpu_counter_init() routine now taking an additional parameter to allow
  users to pass the initial value of the global counter.

Signed-off-by: Mingming Cao <cmm@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/ext2/super.c                | 25 +++++++++++++------------
 fs/ext3/super.c                | 36 +++++++++++++++++++-----------------
 fs/file_table.c                |  2 +-
 include/linux/percpu_counter.h | 38 ++++++++++++++++++++------------------
 lib/percpu_counter.c           | 10 +++++-----
 5 files changed, 58 insertions(+), 53 deletions(-)

(limited to 'include/linux')

diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index a6c4d6e02324..ee4ba759581e 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -834,9 +834,6 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
 		printk ("EXT2-fs: not enough memory\n");
 		goto failed_mount;
 	}
-	percpu_counter_init(&sbi->s_freeblocks_counter);
-	percpu_counter_init(&sbi->s_freeinodes_counter);
-	percpu_counter_init(&sbi->s_dirs_counter);
 	bgl_lock_init(&sbi->s_blockgroup_lock);
 	sbi->s_debts = kmalloc(sbi->s_groups_count * sizeof(*sbi->s_debts),
 			       GFP_KERNEL);
@@ -863,6 +860,13 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
 	sbi->s_gdb_count = db_count;
 	get_random_bytes(&sbi->s_next_generation, sizeof(u32));
 	spin_lock_init(&sbi->s_next_gen_lock);
+
+	percpu_counter_init(&sbi->s_freeblocks_counter,
+				ext2_count_free_blocks(sb));
+	percpu_counter_init(&sbi->s_freeinodes_counter,
+				ext2_count_free_inodes(sb));
+	percpu_counter_init(&sbi->s_dirs_counter,
+				ext2_count_dirs(sb));
 	/*
 	 * set up enough so that it can read an inode
 	 */
@@ -874,24 +878,18 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
 	if (!sb->s_root) {
 		iput(root);
 		printk(KERN_ERR "EXT2-fs: get root inode failed\n");
-		goto failed_mount2;
+		goto failed_mount3;
 	}
 	if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
 		dput(sb->s_root);
 		sb->s_root = NULL;
 		printk(KERN_ERR "EXT2-fs: corrupt root inode, run e2fsck\n");
-		goto failed_mount2;
+		goto failed_mount3;
 	}
 	if (EXT2_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL))
 		ext2_warning(sb, __FUNCTION__,
 			"mounting ext3 filesystem as ext2");
 	ext2_setup_super (sb, es, sb->s_flags & MS_RDONLY);
-	percpu_counter_mod(&sbi->s_freeblocks_counter,
-				ext2_count_free_blocks(sb));
-	percpu_counter_mod(&sbi->s_freeinodes_counter,
-				ext2_count_free_inodes(sb));
-	percpu_counter_mod(&sbi->s_dirs_counter,
-				ext2_count_dirs(sb));
 	return 0;
 
 cantfind_ext2:
@@ -899,7 +897,10 @@ cantfind_ext2:
 		printk("VFS: Can't find an ext2 filesystem on dev %s.\n",
 		       sb->s_id);
 	goto failed_mount;
-
+failed_mount3:
+	percpu_counter_destroy(&sbi->s_freeblocks_counter);
+	percpu_counter_destroy(&sbi->s_freeinodes_counter);
+	percpu_counter_destroy(&sbi->s_dirs_counter);
 failed_mount2:
 	for (i = 0; i < db_count; i++)
 		brelse(sbi->s_group_desc[i]);
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 1a198b3985c9..a60cc6ec130f 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -1580,9 +1580,6 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 		goto failed_mount;
 	}
 
-	percpu_counter_init(&sbi->s_freeblocks_counter);
-	percpu_counter_init(&sbi->s_freeinodes_counter);
-	percpu_counter_init(&sbi->s_dirs_counter);
 	bgl_lock_init(&sbi->s_blockgroup_lock);
 
 	for (i = 0; i < db_count; i++) {
@@ -1602,6 +1599,14 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 	sbi->s_gdb_count = db_count;
 	get_random_bytes(&sbi->s_next_generation, sizeof(u32));
 	spin_lock_init(&sbi->s_next_gen_lock);
+
+	percpu_counter_init(&sbi->s_freeblocks_counter,
+		ext3_count_free_blocks(sb));
+	percpu_counter_init(&sbi->s_freeinodes_counter,
+		ext3_count_free_inodes(sb));
+	percpu_counter_init(&sbi->s_dirs_counter,
+		ext3_count_dirs(sb));
+
 	/* per fileystem reservation list head & lock */
 	spin_lock_init(&sbi->s_rsv_window_lock);
 	sbi->s_rsv_window_root = RB_ROOT;
@@ -1640,16 +1645,16 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 	if (!test_opt(sb, NOLOAD) &&
 	    EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL)) {
 		if (ext3_load_journal(sb, es, journal_devnum))
-			goto failed_mount2;
+			goto failed_mount3;
 	} else if (journal_inum) {
 		if (ext3_create_journal(sb, es, journal_inum))
-			goto failed_mount2;
+			goto failed_mount3;
 	} else {
 		if (!silent)
 			printk (KERN_ERR
 				"ext3: No journal on filesystem on %s\n",
 				sb->s_id);
-		goto failed_mount2;
+		goto failed_mount3;
 	}
 
 	/* We have now updated the journal if required, so we can
@@ -1672,7 +1677,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 		    (sbi->s_journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE)) {
 			printk(KERN_ERR "EXT3-fs: Journal does not support "
 			       "requested data journaling mode\n");
-			goto failed_mount3;
+			goto failed_mount4;
 		}
 	default:
 		break;
@@ -1695,13 +1700,13 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 	if (!sb->s_root) {
 		printk(KERN_ERR "EXT3-fs: get root inode failed\n");
 		iput(root);
-		goto failed_mount3;
+		goto failed_mount4;
 	}
 	if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
 		dput(sb->s_root);
 		sb->s_root = NULL;
 		printk(KERN_ERR "EXT3-fs: corrupt root inode, run e2fsck\n");
-		goto failed_mount3;
+		goto failed_mount4;
 	}
 
 	ext3_setup_super (sb, es, sb->s_flags & MS_RDONLY);
@@ -1724,13 +1729,6 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 		test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? "ordered":
 		"writeback");
 
-	percpu_counter_mod(&sbi->s_freeblocks_counter,
-		ext3_count_free_blocks(sb));
-	percpu_counter_mod(&sbi->s_freeinodes_counter,
-		ext3_count_free_inodes(sb));
-	percpu_counter_mod(&sbi->s_dirs_counter,
-		ext3_count_dirs(sb));
-
 	lock_kernel();
 	return 0;
 
@@ -1740,8 +1738,12 @@ cantfind_ext3:
 		       sb->s_id);
 	goto failed_mount;
 
-failed_mount3:
+failed_mount4:
 	journal_destroy(sbi->s_journal);
+failed_mount3:
+	percpu_counter_destroy(&sbi->s_freeblocks_counter);
+	percpu_counter_destroy(&sbi->s_freeinodes_counter);
+	percpu_counter_destroy(&sbi->s_dirs_counter);
 failed_mount2:
 	for (i = 0; i < db_count; i++)
 		brelse(sbi->s_group_desc[i]);
diff --git a/fs/file_table.c b/fs/file_table.c
index bcea1998b4de..506d5307108d 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -300,5 +300,5 @@ void __init files_init(unsigned long mempages)
 	if (files_stat.max_files < NR_FILE)
 		files_stat.max_files = NR_FILE;
 	files_defer_init();
-	percpu_counter_init(&nr_files);
+	percpu_counter_init(&nr_files, 0);
 } 
diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h
index 66b5de404f22..f5aa593ccf32 100644
--- a/include/linux/percpu_counter.h
+++ b/include/linux/percpu_counter.h
@@ -10,13 +10,14 @@
 #include <linux/smp.h>
 #include <linux/threads.h>
 #include <linux/percpu.h>
+#include <linux/types.h>
 
 #ifdef CONFIG_SMP
 
 struct percpu_counter {
 	spinlock_t lock;
-	long count;
-	long *counters;
+	s64 count;
+	s32 *counters;
 };
 
 #if NR_CPUS >= 16
@@ -25,11 +26,11 @@ struct percpu_counter {
 #define FBC_BATCH	(NR_CPUS*4)
 #endif
 
-static inline void percpu_counter_init(struct percpu_counter *fbc)
+static inline void percpu_counter_init(struct percpu_counter *fbc, s64 amount)
 {
 	spin_lock_init(&fbc->lock);
-	fbc->count = 0;
-	fbc->counters = alloc_percpu(long);
+	fbc->count = amount;
+	fbc->counters = alloc_percpu(s32);
 }
 
 static inline void percpu_counter_destroy(struct percpu_counter *fbc)
@@ -37,10 +38,10 @@ static inline void percpu_counter_destroy(struct percpu_counter *fbc)
 	free_percpu(fbc->counters);
 }
 
-void percpu_counter_mod(struct percpu_counter *fbc, long amount);
-long percpu_counter_sum(struct percpu_counter *fbc);
+void percpu_counter_mod(struct percpu_counter *fbc, s32 amount);
+s64 percpu_counter_sum(struct percpu_counter *fbc);
 
-static inline long percpu_counter_read(struct percpu_counter *fbc)
+static inline s64 percpu_counter_read(struct percpu_counter *fbc)
 {
 	return fbc->count;
 }
@@ -48,13 +49,14 @@ static inline long percpu_counter_read(struct percpu_counter *fbc)
 /*
  * It is possible for the percpu_counter_read() to return a small negative
  * number for some counter which should never be negative.
+ *
  */
-static inline long percpu_counter_read_positive(struct percpu_counter *fbc)
+static inline s64 percpu_counter_read_positive(struct percpu_counter *fbc)
 {
-	long ret = fbc->count;
+	s64 ret = fbc->count;
 
 	barrier();		/* Prevent reloads of fbc->count */
-	if (ret > 0)
+	if (ret >= 0)
 		return ret;
 	return 1;
 }
@@ -62,12 +64,12 @@ static inline long percpu_counter_read_positive(struct percpu_counter *fbc)
 #else
 
 struct percpu_counter {
-	long count;
+	s64 count;
 };
 
-static inline void percpu_counter_init(struct percpu_counter *fbc)
+static inline void percpu_counter_init(struct percpu_counter *fbc, s64 amount)
 {
-	fbc->count = 0;
+	fbc->count = amount;
 }
 
 static inline void percpu_counter_destroy(struct percpu_counter *fbc)
@@ -75,24 +77,24 @@ static inline void percpu_counter_destroy(struct percpu_counter *fbc)
 }
 
 static inline void
-percpu_counter_mod(struct percpu_counter *fbc, long amount)
+percpu_counter_mod(struct percpu_counter *fbc, s32 amount)
 {
 	preempt_disable();
 	fbc->count += amount;
 	preempt_enable();
 }
 
-static inline long percpu_counter_read(struct percpu_counter *fbc)
+static inline s64 percpu_counter_read(struct percpu_counter *fbc)
 {
 	return fbc->count;
 }
 
-static inline long percpu_counter_read_positive(struct percpu_counter *fbc)
+static inline s64 percpu_counter_read_positive(struct percpu_counter *fbc)
 {
 	return fbc->count;
 }
 
-static inline long percpu_counter_sum(struct percpu_counter *fbc)
+static inline s64 percpu_counter_sum(struct percpu_counter *fbc)
 {
 	return percpu_counter_read_positive(fbc);
 }
diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c
index 7a87003f8e8f..850449080e1c 100644
--- a/lib/percpu_counter.c
+++ b/lib/percpu_counter.c
@@ -5,10 +5,10 @@
 #include <linux/percpu_counter.h>
 #include <linux/module.h>
 
-void percpu_counter_mod(struct percpu_counter *fbc, long amount)
+void percpu_counter_mod(struct percpu_counter *fbc, s32 amount)
 {
 	long count;
-	long *pcount;
+	s32 *pcount;
 	int cpu = get_cpu();
 
 	pcount = per_cpu_ptr(fbc->counters, cpu);
@@ -29,15 +29,15 @@ EXPORT_SYMBOL(percpu_counter_mod);
  * Add up all the per-cpu counts, return the result.  This is a more accurate
  * but much slower version of percpu_counter_read_positive()
  */
-long percpu_counter_sum(struct percpu_counter *fbc)
+s64 percpu_counter_sum(struct percpu_counter *fbc)
 {
-	long ret;
+	s64 ret;
 	int cpu;
 
 	spin_lock(&fbc->lock);
 	ret = fbc->count;
 	for_each_possible_cpu(cpu) {
-		long *pcount = per_cpu_ptr(fbc->counters, cpu);
+		s32 *pcount = per_cpu_ptr(fbc->counters, cpu);
 		ret += *pcount;
 	}
 	spin_unlock(&fbc->lock);
-- 
cgit v1.2.3


From 368a5fa1f28589e6b54588a139ea872d5b4b1914 Mon Sep 17 00:00:00 2001
From: Hua Zhong <hzhong@gmail.com>
Date: Fri, 23 Jun 2006 02:05:42 -0700
Subject: [PATCH] remove unlikely() in might_sleep_if()

The likely() profiling tools show that __alloc_page() causes a lot of misses:

!       132    119193 __alloc_pages():mm/page_alloc.c@937

Because most __alloc_page() calls are not atomic.

Signed-off-by: Hua Zhong <hzhong@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/kernel.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 25fccd859fbf..8c21aaa248b4 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -78,7 +78,7 @@ extern int cond_resched(void);
 # define might_sleep() do { might_resched(); } while (0)
 #endif
 
-#define might_sleep_if(cond) do { if (unlikely(cond)) might_sleep(); } while (0)
+#define might_sleep_if(cond) do { if (cond) might_sleep(); } while (0)
 
 #define abs(x) ({				\
 		int __x = (x);			\
-- 
cgit v1.2.3


From 1d31a4ea8cf7a2afc7299d1d3d8732ca54a5934e Mon Sep 17 00:00:00 2001
From: Matt Helsley <matthltc@us.ibm.com>
Date: Fri, 23 Jun 2006 02:05:42 -0700
Subject: [PATCH] Process Events - Header Cleanup

Move connector header include to precisely where it's needed.

Remove unused time.h header file as well.  This was leftover from previous
iterations of the process events patches.

Signed-off-by: Matt Helsley <matthltc@us.ibm.com>
Cc: Guillaume Thouvenin <guillaume.thouvenin@bull.net>
Cc: Nguyen Anh Quynh <aquynh@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/connector/cn_proc.c | 1 +
 include/linux/cn_proc.h     | 2 --
 2 files changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/connector/cn_proc.c b/drivers/connector/cn_proc.c
index 4b4d7db1ff7b..498aa37bca22 100644
--- a/drivers/connector/cn_proc.c
+++ b/drivers/connector/cn_proc.c
@@ -26,6 +26,7 @@
 #include <linux/kernel.h>
 #include <linux/ktime.h>
 #include <linux/init.h>
+#include <linux/connector.h>
 #include <asm/atomic.h>
 
 #include <linux/cn_proc.h>
diff --git a/include/linux/cn_proc.h b/include/linux/cn_proc.h
index 1417de935057..1e3459a14e20 100644
--- a/include/linux/cn_proc.h
+++ b/include/linux/cn_proc.h
@@ -26,8 +26,6 @@
 #define CN_PROC_H
 
 #include <linux/types.h>
-#include <linux/time.h>
-#include <linux/connector.h>
 
 /*
  * Userspace sends this enum to register with the kernel that it is listening
-- 
cgit v1.2.3


From 3fa2164d03fb7af17fcfe4f8800dd754fbd99ff7 Mon Sep 17 00:00:00 2001
From: Matt Helsley <matthltc@us.ibm.com>
Date: Fri, 23 Jun 2006 02:05:44 -0700
Subject: [PATCH] Process Events: License Change

Change the license on the process event structure passed between kernel and
userspace.

Signed-off-by: Matt Helsley <matthltc@us.ibm.com>
Acked-by: Guillaume Thouvenin <guillaume.thouvenin@bull.net>
Acked-by: Nguyen Anh Quynh <aquynh@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/cn_proc.h | 19 ++++++-------------
 1 file changed, 6 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cn_proc.h b/include/linux/cn_proc.h
index 1e3459a14e20..dbb7769009be 100644
--- a/include/linux/cn_proc.h
+++ b/include/linux/cn_proc.h
@@ -3,23 +3,16 @@
  *
  * Copyright (C) Matt Helsley, IBM Corp. 2005
  * Based on cn_fork.h by Nguyen Anh Quynh and Guillaume Thouvenin
- * Original copyright notice follows:
  * Copyright (C) 2005 Nguyen Anh Quynh <aquynh@gmail.com>
  * Copyright (C) 2005 Guillaume Thouvenin <guillaume.thouvenin@bull.net>
  *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
  *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  */
 
 #ifndef CN_PROC_H
-- 
cgit v1.2.3


From 481fad483487ea967fe20bbc9e565d787f7bf20f Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Fri, 23 Jun 2006 02:05:44 -0700
Subject: [PATCH] strstrip() API

Add a new strstrip() function to lib/string.c for removing leading and
trailing whitespace from a string.

Cc: Michael Holzheu <holzheu@de.ibm.com>
Acked-by: Ingo Oeser <ioe-lkml@rameria.de>
Acked-by: Joern Engel <joern@wohnheim.fh-wedel.de>
Cc: Corey Minyard <minyard@acm.org>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Acked-by: Michael Holzheu <HOLZHEU@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/string.h |  1 +
 lib/string.c           | 30 ++++++++++++++++++++++++++++++
 2 files changed, 31 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/string.h b/include/linux/string.h
index c61306da8c52..e4c755860316 100644
--- a/include/linux/string.h
+++ b/include/linux/string.h
@@ -56,6 +56,7 @@ extern char * strnchr(const char *, size_t, int);
 #ifndef __HAVE_ARCH_STRRCHR
 extern char * strrchr(const char *,int);
 #endif
+extern char * strstrip(char *);
 #ifndef __HAVE_ARCH_STRSTR
 extern char * strstr(const char *,const char *);
 #endif
diff --git a/lib/string.c b/lib/string.c
index 064f6315b1c3..63077267367e 100644
--- a/lib/string.c
+++ b/lib/string.c
@@ -301,6 +301,36 @@ char *strnchr(const char *s, size_t count, int c)
 EXPORT_SYMBOL(strnchr);
 #endif
 
+/**
+ * strstrip - Removes leading and trailing whitespace from @s.
+ * @s: The string to be stripped.
+ *
+ * Note that the first trailing whitespace is replaced with a %NUL-terminator
+ * in the given string @s. Returns a pointer to the first non-whitespace
+ * character in @s.
+ */
+char *strstrip(char *s)
+{
+	size_t size;
+	char *end;
+
+	size = strlen(s);
+
+	if (!size)
+		return s;
+
+	end = s + size - 1;
+	while (end != s && isspace(*end))
+		end--;
+	*(end + 1) = '\0';
+
+	while (*s && isspace(*s))
+		s++;
+
+	return s;
+}
+EXPORT_SYMBOL(strstrip);
+
 #ifndef __HAVE_ARCH_STRLEN
 /**
  * strlen - Find the length of a string
-- 
cgit v1.2.3


From d83015b8f62ee3fcd338f6f009051ed57f77a531 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@us.ibm.com>
Date: Fri, 23 Jun 2006 02:05:51 -0700
Subject: [PATCH] Make RCU API inaccessible to non-GPL Linux kernel modules

Remove synchronize_kernel() (deprecated 2-APR-2005 in
http://lkml.org/lkml/2005/4/3/11) and makes the RCU API inaccessible to
non-GPL Linux kernel modules (as was announced more than one year ago in
http://lkml.org/lkml/2005/4/3/8).  Tested on x86 and ppc64.

Signed-off-by: "Paul E. McKenney" <paulmck@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/RCU/whatisRCU.txt            |  1 -
 Documentation/feature-removal-schedule.txt | 15 ---------------
 include/linux/rcupdate.h                   |  3 +--
 kernel/rcupdate.c                          | 13 ++-----------
 4 files changed, 3 insertions(+), 29 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.txt
index 07cb93b82ba9..6e459420ee9f 100644
--- a/Documentation/RCU/whatisRCU.txt
+++ b/Documentation/RCU/whatisRCU.txt
@@ -790,7 +790,6 @@ RCU pointer update:
 
 RCU grace period:
 
-	synchronize_kernel (deprecated)
 	synchronize_net
 	synchronize_sched
 	synchronize_rcu
diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt
index f7293297f326..027285d0c26c 100644
--- a/Documentation/feature-removal-schedule.txt
+++ b/Documentation/feature-removal-schedule.txt
@@ -33,21 +33,6 @@ Who:	Adrian Bunk <bunk@stusta.de>
 
 ---------------------------
 
-What:	RCU API moves to EXPORT_SYMBOL_GPL
-When:	April 2006
-Files:	include/linux/rcupdate.h, kernel/rcupdate.c
-Why:	Outside of Linux, the only implementations of anything even
-	vaguely resembling RCU that I am aware of are in DYNIX/ptx,
-	VM/XA, Tornado, and K42.  I do not expect anyone to port binary
-	drivers or kernel modules from any of these, since the first two
-	are owned by IBM and the last two are open-source research OSes.
-	So these will move to GPL after a grace period to allow
-	people, who might be using implementations that I am not aware
-	of, to adjust to this upcoming change.
-Who:	Paul E. McKenney <paulmck@us.ibm.com>
-
----------------------------
-
 What:	raw1394: requests of type RAW1394_REQ_ISO_SEND, RAW1394_REQ_ISO_LISTEN
 When:	November 2006
 Why:	Deprecated in favour of the new ioctl-based rawiso interface, which is
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 970284f571a6..6312758393b6 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -246,7 +246,7 @@ extern int rcu_needs_cpu(int cpu);
  * softirq handlers will have completed, since in some kernels, these
  * handlers can run in process context, and can block.
  *
- * This primitive provides the guarantees made by the (deprecated)
+ * This primitive provides the guarantees made by the (now removed)
  * synchronize_kernel() API.  In contrast, synchronize_rcu() only
  * guarantees that rcu_read_lock() sections will have completed.
  * In "classic RCU", these two guarantees happen to be one and
@@ -264,7 +264,6 @@ extern void FASTCALL(call_rcu(struct rcu_head *head,
 				void (*func)(struct rcu_head *head)));
 extern void FASTCALL(call_rcu_bh(struct rcu_head *head,
 				void (*func)(struct rcu_head *head)));
-extern __deprecated_for_modules void synchronize_kernel(void);
 extern void synchronize_rcu(void);
 void synchronize_idle(void);
 extern void rcu_barrier(void);
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 2058f88c7bbb..20e9710fc21c 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -612,14 +612,6 @@ void synchronize_rcu(void)
 	wait_for_completion(&rcu.completion);
 }
 
-/*
- * Deprecated, use synchronize_rcu() or synchronize_sched() instead.
- */
-void synchronize_kernel(void)
-{
-	synchronize_rcu();
-}
-
 module_param(blimit, int, 0);
 module_param(qhimark, int, 0);
 module_param(qlowmark, int, 0);
@@ -627,7 +619,6 @@ module_param(qlowmark, int, 0);
 module_param(rsinterval, int, 0);
 #endif
 EXPORT_SYMBOL_GPL(rcu_batches_completed);
-EXPORT_SYMBOL_GPL_FUTURE(call_rcu);	/* WARNING: GPL-only in April 2006. */
-EXPORT_SYMBOL_GPL_FUTURE(call_rcu_bh);	/* WARNING: GPL-only in April 2006. */
+EXPORT_SYMBOL_GPL(call_rcu);
+EXPORT_SYMBOL_GPL(call_rcu_bh);
 EXPORT_SYMBOL_GPL(synchronize_rcu);
-EXPORT_SYMBOL_GPL_FUTURE(synchronize_kernel); /* WARNING: GPL-only in April 2006. */
-- 
cgit v1.2.3


From f5befceb5cfecba49fdf61f8e0eb4d453200eac9 Mon Sep 17 00:00:00 2001
From: Brent Casavant <bcasavan@sgi.com>
Date: Fri, 23 Jun 2006 02:05:52 -0700
Subject: [PATCH] SGI IOC4: Detect IO card variant

There are three different IO cards which an SGI IOC4 controller may find
itself on.  One of these variants does not bring out the IDE and serial
signals, so we need to disable attaching the corresponding IOC4 subdrivers
to such cards.

Cleans up message clutter emitted during device probing.

Signed-off-by: Brent Casavant <bcasavan@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/ide/pci/sgiioc4.c    |  6 +++++
 drivers/serial/ioc4_serial.c |  9 +++++++
 drivers/sn/ioc4.c            | 64 ++++++++++++++++++++++++++++++++++++++++----
 include/linux/ioc4.h         |  5 ++++
 4 files changed, 79 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ide/pci/sgiioc4.c b/drivers/ide/pci/sgiioc4.c
index 27c9eb989a9a..e125032bb403 100644
--- a/drivers/ide/pci/sgiioc4.c
+++ b/drivers/ide/pci/sgiioc4.c
@@ -723,6 +723,12 @@ static ide_pci_device_t sgiioc4_chipsets[] __devinitdata = {
 int
 ioc4_ide_attach_one(struct ioc4_driver_data *idd)
 {
+	/* PCI-RT does not bring out IDE connection.
+	 * Do not attach to this particular IOC4.
+	 */
+	if (idd->idd_variant == IOC4_VARIANT_PCI_RT)
+		return 0;
+
 	return pci_init_sgiioc4(idd->idd_pdev,
 				&sgiioc4_chipsets[idd->idd_pci_id->driver_data]);
 }
diff --git a/drivers/serial/ioc4_serial.c b/drivers/serial/ioc4_serial.c
index c620209d7b9a..717e47bbd784 100644
--- a/drivers/serial/ioc4_serial.c
+++ b/drivers/serial/ioc4_serial.c
@@ -2646,7 +2646,10 @@ static int ioc4_serial_remove_one(struct ioc4_driver_data *idd)
 	struct ioc4_port *port;
 	struct ioc4_soft *soft;
 
+	/* If serial driver did not attach, don't try to detach */
 	control = idd->idd_serial_data;
+	if (!control)
+		return 0;
 
 	for (port_num = 0; port_num < IOC4_NUM_SERIAL_PORTS; port_num++) {
 		for (port_type = UART_PORT_MIN;
@@ -2778,6 +2781,12 @@ ioc4_serial_attach_one(struct ioc4_driver_data *idd)
 	DPRINT_CONFIG(("%s (0x%p, 0x%p)\n", __FUNCTION__, idd->idd_pdev,
 							idd->idd_pci_id));
 
+	/* PCI-RT does not bring out serial connections.
+	 * Do not attach to this particular IOC4.
+	 */
+	if (idd->idd_variant == IOC4_VARIANT_PCI_RT)
+		return 0;
+
 	/* request serial registers */
 	tmp_addr1 = idd->idd_bar0 + IOC4_SERIAL_OFFSET;
 
diff --git a/drivers/sn/ioc4.c b/drivers/sn/ioc4.c
index cdeff909403e..8256a97eb508 100644
--- a/drivers/sn/ioc4.c
+++ b/drivers/sn/ioc4.c
@@ -160,9 +160,6 @@ ioc4_clock_calibrate(struct ioc4_driver_data *idd)
 	writel(0, &idd->idd_misc_regs->int_out.raw);
 	mmiowb();
 
-	printk(KERN_INFO
-	       "%s: Calibrating PCI bus speed "
-	       "for pci_dev %s ... ", __FUNCTION__, pci_name(idd->idd_pdev));
 	/* Set up square wave */
 	int_out.raw = 0;
 	int_out.fields.count = IOC4_CALIBRATE_COUNT;
@@ -206,11 +203,16 @@ ioc4_clock_calibrate(struct ioc4_driver_data *idd)
 	/* Bounds check the result. */
 	if (period > IOC4_CALIBRATE_LOW_LIMIT ||
 	    period < IOC4_CALIBRATE_HIGH_LIMIT) {
-		printk("failed. Assuming PCI clock ticks are %d ns.\n",
+		printk(KERN_INFO
+		       "IOC4 %s: Clock calibration failed.  Assuming"
+		       "PCI clock is %d ns.\n",
+		       pci_name(idd->idd_pdev),
 		       IOC4_CALIBRATE_DEFAULT / IOC4_EXTINT_COUNT_DIVISOR);
 		period = IOC4_CALIBRATE_DEFAULT;
 	} else {
-		printk("succeeded. PCI clock ticks are %ld ns.\n",
+		printk(KERN_DEBUG
+		       "IOC4 %s: PCI clock is %ld ns.\n",
+		       pci_name(idd->idd_pdev),
 		       period / IOC4_EXTINT_COUNT_DIVISOR);
 	}
 
@@ -222,6 +224,51 @@ ioc4_clock_calibrate(struct ioc4_driver_data *idd)
 	idd->count_period = period;
 }
 
+/* There are three variants of IOC4 cards: IO9, IO10, and PCI-RT.
+ * Each brings out different combinations of IOC4 signals, thus.
+ * the IOC4 subdrivers need to know to which we're attached.
+ *
+ * We look for the presence of a SCSI (IO9) or SATA (IO10) controller
+ * on the same PCI bus at slot number 3 to differentiate IO9 from IO10.
+ * If neither is present, it's a PCI-RT.
+ */
+static unsigned int
+ioc4_variant(struct ioc4_driver_data *idd)
+{
+	struct pci_dev *pdev = NULL;
+	int found = 0;
+
+	/* IO9: Look for a QLogic ISP 12160 at the same bus and slot 3. */
+	do {
+		pdev = pci_get_device(PCI_VENDOR_ID_QLOGIC,
+				      PCI_DEVICE_ID_QLOGIC_ISP12160, pdev);
+		if (pdev &&
+		    idd->idd_pdev->bus->number == pdev->bus->number &&
+		    3 == PCI_SLOT(pdev->devfn))
+			found = 1;
+		pci_dev_put(pdev);
+	} while (pdev && !found);
+	if (NULL != pdev)
+		return IOC4_VARIANT_IO9;
+
+	/* IO10: Look for a Vitesse VSC 7174 at the same bus and slot 3. */
+	pdev = NULL;
+	do {
+		pdev = pci_get_device(PCI_VENDOR_ID_VITESSE,
+				      PCI_DEVICE_ID_VITESSE_VSC7174, pdev);
+		if (pdev &&
+		    idd->idd_pdev->bus->number == pdev->bus->number &&
+		    3 == PCI_SLOT(pdev->devfn))
+			found = 1;
+		pci_dev_put(pdev);
+	} while (pdev && !found);
+	if (NULL != pdev)
+		return IOC4_VARIANT_IO10;
+
+	/* PCI-RT: No SCSI/SATA controller will be present */
+	return IOC4_VARIANT_PCI_RT;
+}
+
 /* Adds a new instance of an IOC4 card */
 static int
 ioc4_probe(struct pci_dev *pdev, const struct pci_device_id *pci_id)
@@ -286,6 +333,13 @@ ioc4_probe(struct pci_dev *pdev, const struct pci_device_id *pci_id)
 
 	/* Failsafe portion of per-IOC4 initialization */
 
+	/* Detect card variant */
+	idd->idd_variant = ioc4_variant(idd);
+	printk(KERN_INFO "IOC4 %s: %s card detected.\n", pci_name(pdev),
+	       idd->idd_variant == IOC4_VARIANT_IO9 ? "IO9" :
+	       idd->idd_variant == IOC4_VARIANT_PCI_RT ? "PCI-RT" :
+	       idd->idd_variant == IOC4_VARIANT_IO10 ? "IO10" : "unknown");
+
 	/* Initialize IOC4 */
 	pci_read_config_dword(idd->idd_pdev, PCI_COMMAND, &pcmd);
 	pci_write_config_dword(idd->idd_pdev, PCI_COMMAND,
diff --git a/include/linux/ioc4.h b/include/linux/ioc4.h
index 3dd18b785ebd..de73a3289cc2 100644
--- a/include/linux/ioc4.h
+++ b/include/linux/ioc4.h
@@ -147,6 +147,10 @@ struct ioc4_misc_regs {
 #define IOC4_GPCR_EDGE_6 0x40
 #define IOC4_GPCR_EDGE_7 0x80
 
+#define IOC4_VARIANT_IO9	0x0900
+#define IOC4_VARIANT_PCI_RT	0x0901
+#define IOC4_VARIANT_IO10	0x1000
+
 /* One of these per IOC4 */
 struct ioc4_driver_data {
 	struct list_head idd_list;
@@ -156,6 +160,7 @@ struct ioc4_driver_data {
 	struct __iomem ioc4_misc_regs *idd_misc_regs;
 	unsigned long count_period;
 	void *idd_serial_data;
+	unsigned int idd_variant;
 };
 
 /* One per submodule */
-- 
cgit v1.2.3


From 54e73770357142e297c916c7865f5fca7499f69c Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Fri, 23 Jun 2006 02:05:54 -0700
Subject: [PATCH] list: introduce list_replace() helper

list_replace() is similar to list_replace_rcu(), but unlike
list_replace_rcu() it

	could be used when list_empty(old) == 1

	doesn't use barriers

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/list.h | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/list.h b/include/linux/list.h
index 76f05718342c..a02642e4710a 100644
--- a/include/linux/list.h
+++ b/include/linux/list.h
@@ -197,12 +197,35 @@ static inline void list_del_rcu(struct list_head *entry)
 	entry->prev = LIST_POISON2;
 }
 
+/**
+ * list_replace - replace old entry by new one
+ * @old : the element to be replaced
+ * @new : the new element to insert
+ * Note: if 'old' was empty, it will be overwritten.
+ */
+static inline void list_replace(struct list_head *old,
+				struct list_head *new)
+{
+	new->next = old->next;
+	new->next->prev = new;
+	new->prev = old->prev;
+	new->prev->next = new;
+}
+
+static inline void list_replace_init(struct list_head *old,
+					struct list_head *new)
+{
+	list_replace(old, new);
+	INIT_LIST_HEAD(old);
+}
+
 /*
  * list_replace_rcu - replace old entry by new one
  * @old : the element to be replaced
  * @new : the new element to insert
  *
  * The old entry will be replaced with the new entry atomically.
+ * Note: 'old' should not be empty.
  */
 static inline void list_replace_rcu(struct list_head *old,
 				struct list_head *new)
-- 
cgit v1.2.3


From 908dcecda1d18803b5823f30e6c47d2882dc0cf1 Mon Sep 17 00:00:00 2001
From: Jan Beulich <jbeulich@novell.com>
Date: Fri, 23 Jun 2006 02:06:00 -0700
Subject: [PATCH] adjust handle_IRR_event() return type

Correct the return type of handle_IRQ_event() (inconsistency noticed during
Xen development), and remove redundant declarations.  The return type
adjustment required breaking out the definition of irqreturn_t into a
separate header, in order to satisfy current include order dependencies.

Signed-off-by: Jan Beulich <jbeulich@novell.com>

Cc: Richard Henderson <rth@twiddle.net>
Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Cc: Russell King <rmk@arm.linux.org.uk>
Cc: Ian Molton <spyro@f2s.com>
Cc: Mikael Starvik <starvik@axis.com>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Cc: Hirokazu Takata <takata.hirokazu@renesas.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: William Lee Irwin III <wli@holomorphy.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Miles Bader <uclinux-v850@lsi.nec.co.jp>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Roman Zippel <zippel@linux-m68k.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-alpha/irq.h     |  4 ----
 include/asm-arm/irq.h       |  4 ----
 include/asm-arm26/irq.h     |  4 ----
 include/asm-h8300/irq.h     |  4 ----
 include/asm-m68k/irq.h      |  4 ----
 include/asm-m68knommu/irq.h |  4 ----
 include/asm-s390/irq.h      |  4 ----
 include/asm-sparc/irq.h     |  4 ----
 include/asm-v850/irq.h      |  2 --
 include/linux/interrupt.h   | 21 +--------------------
 include/linux/irq.h         |  3 ++-
 include/linux/irqreturn.h   | 25 +++++++++++++++++++++++++
 kernel/irq/handle.c         |  5 +++--
 13 files changed, 31 insertions(+), 57 deletions(-)
 create mode 100644 include/linux/irqreturn.h

(limited to 'include/linux')

diff --git a/include/asm-alpha/irq.h b/include/asm-alpha/irq.h
index f6de033718a0..917b9fe372cf 100644
--- a/include/asm-alpha/irq.h
+++ b/include/asm-alpha/irq.h
@@ -92,8 +92,4 @@ extern void enable_irq(unsigned int);
 struct pt_regs;
 extern void (*perf_irq)(unsigned long, struct pt_regs *);
 
-struct irqaction;
-int handle_IRQ_event(unsigned int, struct pt_regs *, struct irqaction *);
-
-
 #endif /* _ALPHA_IRQ_H */
diff --git a/include/asm-arm/irq.h b/include/asm-arm/irq.h
index 60b5105c9c93..66e67e60bc56 100644
--- a/include/asm-arm/irq.h
+++ b/include/asm-arm/irq.h
@@ -47,10 +47,6 @@ void disable_irq_wake(unsigned int irq);
 void enable_irq_wake(unsigned int irq);
 int setup_irq(unsigned int, struct irqaction *);
 
-struct irqaction;
-struct pt_regs;
-int handle_IRQ_event(unsigned int, struct pt_regs *, struct irqaction *);
-
 extern void migrate_irqs(void);
 #endif
 
diff --git a/include/asm-arm26/irq.h b/include/asm-arm26/irq.h
index 06bd5a543d13..9aaac87efba9 100644
--- a/include/asm-arm26/irq.h
+++ b/include/asm-arm26/irq.h
@@ -44,9 +44,5 @@ extern void enable_irq(unsigned int);
 
 int set_irq_type(unsigned int irq, unsigned int type);
 
-int setup_irq(unsigned int, struct irqaction *);
-struct pt_regs;
-int handle_IRQ_event(unsigned int, struct pt_regs *, struct irqaction *);
-
 #endif
 
diff --git a/include/asm-h8300/irq.h b/include/asm-h8300/irq.h
index 73065f5bda0e..42a3ac424a9e 100644
--- a/include/asm-h8300/irq.h
+++ b/include/asm-h8300/irq.h
@@ -63,8 +63,4 @@ extern void enable_irq(unsigned int);
 extern void disable_irq(unsigned int);
 #define disable_irq_nosync(x)	disable_irq(x)
 
-struct irqaction;
-struct pt_regs;
-int handle_IRQ_event(unsigned int, struct pt_regs *, struct irqaction *);
-
 #endif /* _H8300_IRQ_H_ */
diff --git a/include/asm-m68k/irq.h b/include/asm-m68k/irq.h
index b4f48b2a6a57..9727ca9d9f26 100644
--- a/include/asm-m68k/irq.h
+++ b/include/asm-m68k/irq.h
@@ -130,8 +130,4 @@ extern volatile unsigned int num_spurious;
  */
 extern irq_node_t *new_irq_node(void);
 
-struct irqaction;
-struct pt_regs;
-int handle_IRQ_event(unsigned int, struct pt_regs *, struct irqaction *);
-
 #endif /* _M68K_IRQ_H_ */
diff --git a/include/asm-m68knommu/irq.h b/include/asm-m68knommu/irq.h
index 2b408842a30e..c5247516fcfe 100644
--- a/include/asm-m68knommu/irq.h
+++ b/include/asm-m68knommu/irq.h
@@ -87,8 +87,4 @@ extern void (*mach_disable_irq)(unsigned int);
 #define disable_irq(x)	do { } while (0)
 #define disable_irq_nosync(x)	disable_irq(x)
 
-struct irqaction;
-struct pt_regs;
-int handle_IRQ_event(unsigned int, struct pt_regs *, struct irqaction *);
-
 #endif /* _M68K_IRQ_H_ */
diff --git a/include/asm-s390/irq.h b/include/asm-s390/irq.h
index 916a1aa0b073..bd1a721f7aa2 100644
--- a/include/asm-s390/irq.h
+++ b/include/asm-s390/irq.h
@@ -21,10 +21,6 @@ enum interruption_class {
 
 #define touch_nmi_watchdog() do { } while(0)
 
-struct irqaction;
-struct pt_regs;
-int handle_IRQ_event(unsigned int, struct pt_regs *, struct irqaction *);
-
 #endif /* __KERNEL__ */
 #endif
 
diff --git a/include/asm-sparc/irq.h b/include/asm-sparc/irq.h
index f2d64537e29d..3141ddfea97d 100644
--- a/include/asm-sparc/irq.h
+++ b/include/asm-sparc/irq.h
@@ -181,8 +181,4 @@ extern struct sun4m_intregs *sun4m_interrupts;
 #define SUN4M_INT_SBUS(x)	(1 << (x+7))
 #define SUN4M_INT_VME(x)	(1 << (x))
 
-struct irqaction;
-struct pt_regs;
-int handle_IRQ_event(unsigned int, struct pt_regs *, struct irqaction *);
-
 #endif
diff --git a/include/asm-v850/irq.h b/include/asm-v850/irq.h
index 44431152b36d..1bf096db8f4c 100644
--- a/include/asm-v850/irq.h
+++ b/include/asm-v850/irq.h
@@ -62,8 +62,6 @@ extern void disable_irq (unsigned int irq);
 /* Disable an irq without waiting. */
 extern void disable_irq_nosync (unsigned int irq);
 
-extern int handle_IRQ_event(unsigned int, struct pt_regs *, struct irqaction *);
-
 #endif /* !__ASSEMBLY__ */
 
 #endif /* __V850_IRQ_H__ */
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 9e0fefd7884a..70741e170114 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -7,32 +7,13 @@
 #include <linux/bitops.h>
 #include <linux/preempt.h>
 #include <linux/cpumask.h>
+#include <linux/irqreturn.h>
 #include <linux/hardirq.h>
 #include <linux/sched.h>
 #include <asm/atomic.h>
 #include <asm/ptrace.h>
 #include <asm/system.h>
 
-/*
- * For 2.4.x compatibility, 2.4.x can use
- *
- *	typedef void irqreturn_t;
- *	#define IRQ_NONE
- *	#define IRQ_HANDLED
- *	#define IRQ_RETVAL(x)
- *
- * To mix old-style and new-style irq handler returns.
- *
- * IRQ_NONE means we didn't handle it.
- * IRQ_HANDLED means that we did have a valid interrupt and handled it.
- * IRQ_RETVAL(x) selects on the two depending on x being non-zero (for handled)
- */
-typedef int irqreturn_t;
-
-#define IRQ_NONE	(0)
-#define IRQ_HANDLED	(1)
-#define IRQ_RETVAL(x)	((x) != 0)
-
 struct irqaction {
 	irqreturn_t (*handler)(int, void *, struct pt_regs *);
 	unsigned long flags;
diff --git a/include/linux/irq.h b/include/linux/irq.h
index e8a07e75e4fb..676e00dfb21a 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -17,6 +17,7 @@
 #include <linux/cache.h>
 #include <linux/spinlock.h>
 #include <linux/cpumask.h>
+#include <linux/irqreturn.h>
 
 #include <asm/irq.h>
 #include <asm/ptrace.h>
@@ -175,7 +176,7 @@ static inline void set_balance_irq_affinity(unsigned int irq, cpumask_t mask)
 extern int no_irq_affinity;
 extern int noirqdebug_setup(char *str);
 
-extern fastcall int handle_IRQ_event(unsigned int irq, struct pt_regs *regs,
+extern fastcall irqreturn_t handle_IRQ_event(unsigned int irq, struct pt_regs *regs,
 					struct irqaction *action);
 extern fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs);
 extern void note_interrupt(unsigned int irq, irq_desc_t *desc,
diff --git a/include/linux/irqreturn.h b/include/linux/irqreturn.h
new file mode 100644
index 000000000000..881883c2009d
--- /dev/null
+++ b/include/linux/irqreturn.h
@@ -0,0 +1,25 @@
+/* irqreturn.h */
+#ifndef _LINUX_IRQRETURN_H
+#define _LINUX_IRQRETURN_H
+
+/*
+ * For 2.4.x compatibility, 2.4.x can use
+ *
+ *	typedef void irqreturn_t;
+ *	#define IRQ_NONE
+ *	#define IRQ_HANDLED
+ *	#define IRQ_RETVAL(x)
+ *
+ * To mix old-style and new-style irq handler returns.
+ *
+ * IRQ_NONE means we didn't handle it.
+ * IRQ_HANDLED means that we did have a valid interrupt and handled it.
+ * IRQ_RETVAL(x) selects on the two depending on x being non-zero (for handled)
+ */
+typedef int irqreturn_t;
+
+#define IRQ_NONE	(0)
+#define IRQ_HANDLED	(1)
+#define IRQ_RETVAL(x)	((x) != 0)
+
+#endif
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 51df337b37db..0f6530117105 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -76,10 +76,11 @@ irqreturn_t no_action(int cpl, void *dev_id, struct pt_regs *regs)
 /*
  * Have got an event to handle:
  */
-fastcall int handle_IRQ_event(unsigned int irq, struct pt_regs *regs,
+fastcall irqreturn_t handle_IRQ_event(unsigned int irq, struct pt_regs *regs,
 				struct irqaction *action)
 {
-	int ret, retval = 0, status = 0;
+	irqreturn_t ret, retval = IRQ_NONE;
+	unsigned int status = 0;
 
 	if (!(action->flags & SA_INTERRUPT))
 		local_irq_enable();
-- 
cgit v1.2.3


From 78ce89c92bc6eaf5933b5664bff64253a7103bd7 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Fri, 23 Jun 2006 02:06:05 -0700
Subject: [PATCH] JBD: split checkpoint lists

Split the checkpoint list of the transaction into two lists.  In the first
list we keep the buffers that need to be submitted for IO.  In the second
list are kept buffers that were already submitted and we just have to wait
for the IO to complete.  This should simplify a handling of checkpoint
lists a bit and can eventually be also a performance gain.

Signed-off-by: Jan Kara <jack@suse.cz>
Cc: Mark Fasheh <mark.fasheh@oracle.com>
Cc: "Stephen C. Tweedie" <sct@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/jbd/checkpoint.c | 419 ++++++++++++++++++++++++++++++----------------------
 include/linux/jbd.h |   8 +-
 2 files changed, 246 insertions(+), 181 deletions(-)

(limited to 'include/linux')

diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c
index 3f5102b069db..47678a26c13b 100644
--- a/fs/jbd/checkpoint.c
+++ b/fs/jbd/checkpoint.c
@@ -24,29 +24,67 @@
 #include <linux/slab.h>
 
 /*
- * Unlink a buffer from a transaction.
+ * Unlink a buffer from a transaction checkpoint list.
  *
  * Called with j_list_lock held.
  */
-
-static inline void __buffer_unlink(struct journal_head *jh)
+static inline void __buffer_unlink_first(struct journal_head *jh)
 {
-	transaction_t *transaction;
-
-	transaction = jh->b_cp_transaction;
-	jh->b_cp_transaction = NULL;
+	transaction_t *transaction = jh->b_cp_transaction;
 
 	jh->b_cpnext->b_cpprev = jh->b_cpprev;
 	jh->b_cpprev->b_cpnext = jh->b_cpnext;
-	if (transaction->t_checkpoint_list == jh)
+	if (transaction->t_checkpoint_list == jh) {
 		transaction->t_checkpoint_list = jh->b_cpnext;
-	if (transaction->t_checkpoint_list == jh)
-		transaction->t_checkpoint_list = NULL;
+		if (transaction->t_checkpoint_list == jh)
+			transaction->t_checkpoint_list = NULL;
+	}
+}
+
+/*
+ * Unlink a buffer from a transaction checkpoint(io) list.
+ *
+ * Called with j_list_lock held.
+ */
+static inline void __buffer_unlink(struct journal_head *jh)
+{
+	transaction_t *transaction = jh->b_cp_transaction;
+
+	__buffer_unlink_first(jh);
+	if (transaction->t_checkpoint_io_list == jh) {
+		transaction->t_checkpoint_io_list = jh->b_cpnext;
+		if (transaction->t_checkpoint_io_list == jh)
+			transaction->t_checkpoint_io_list = NULL;
+	}
+}
+
+/*
+ * Move a buffer from the checkpoint list to the checkpoint io list
+ *
+ * Called with j_list_lock held
+ */
+static inline void __buffer_relink_io(struct journal_head *jh)
+{
+	transaction_t *transaction = jh->b_cp_transaction;
+
+	__buffer_unlink_first(jh);
+
+	if (!transaction->t_checkpoint_io_list) {
+		jh->b_cpnext = jh->b_cpprev = jh;
+	} else {
+		jh->b_cpnext = transaction->t_checkpoint_io_list;
+		jh->b_cpprev = transaction->t_checkpoint_io_list->b_cpprev;
+		jh->b_cpprev->b_cpnext = jh;
+		jh->b_cpnext->b_cpprev = jh;
+	}
+	transaction->t_checkpoint_io_list = jh;
 }
 
 /*
  * Try to release a checkpointed buffer from its transaction.
- * Returns 1 if we released it.
+ * Returns 1 if we released it and 2 if we also released the
+ * whole transaction.
+ *
  * Requires j_list_lock
  * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
  */
@@ -57,12 +95,11 @@ static int __try_to_free_cp_buf(struct journal_head *jh)
 
 	if (jh->b_jlist == BJ_None && !buffer_locked(bh) && !buffer_dirty(bh)) {
 		JBUFFER_TRACE(jh, "remove from checkpoint list");
-		__journal_remove_checkpoint(jh);
+		ret = __journal_remove_checkpoint(jh) + 1;
 		jbd_unlock_bh_state(bh);
 		journal_remove_journal_head(bh);
 		BUFFER_TRACE(bh, "release");
 		__brelse(bh);
-		ret = 1;
 	} else {
 		jbd_unlock_bh_state(bh);
 	}
@@ -117,83 +154,54 @@ static void jbd_sync_bh(journal_t *journal, struct buffer_head *bh)
 }
 
 /*
- * Clean up a transaction's checkpoint list.
- *
- * We wait for any pending IO to complete and make sure any clean
- * buffers are removed from the transaction.
- *
- * Return 1 if we performed any actions which might have destroyed the
- * checkpoint.  (journal_remove_checkpoint() deletes the transaction when
- * the last checkpoint buffer is cleansed)
+ * Clean up transaction's list of buffers submitted for io.
+ * We wait for any pending IO to complete and remove any clean
+ * buffers. Note that we take the buffers in the opposite ordering
+ * from the one in which they were submitted for IO.
  *
  * Called with j_list_lock held.
  */
-static int __cleanup_transaction(journal_t *journal, transaction_t *transaction)
+static void __wait_cp_io(journal_t *journal, transaction_t *transaction)
 {
-	struct journal_head *jh, *next_jh, *last_jh;
+	struct journal_head *jh;
 	struct buffer_head *bh;
-	int ret = 0;
-
-	assert_spin_locked(&journal->j_list_lock);
-	jh = transaction->t_checkpoint_list;
-	if (!jh)
-		return 0;
-
-	last_jh = jh->b_cpprev;
-	next_jh = jh;
-	do {
-		jh = next_jh;
+	tid_t this_tid;
+	int released = 0;
+
+	this_tid = transaction->t_tid;
+restart:
+	/* Did somebody clean up the transaction in the meanwhile? */
+	if (journal->j_checkpoint_transactions != transaction ||
+			transaction->t_tid != this_tid)
+		return;
+	while (!released && transaction->t_checkpoint_io_list) {
+		jh = transaction->t_checkpoint_io_list;
 		bh = jh2bh(jh);
+		if (!jbd_trylock_bh_state(bh)) {
+			jbd_sync_bh(journal, bh);
+			spin_lock(&journal->j_list_lock);
+			goto restart;
+		}
 		if (buffer_locked(bh)) {
 			atomic_inc(&bh->b_count);
 			spin_unlock(&journal->j_list_lock);
+			jbd_unlock_bh_state(bh);
 			wait_on_buffer(bh);
 			/* the journal_head may have gone by now */
 			BUFFER_TRACE(bh, "brelse");
 			__brelse(bh);
-			goto out_return_1;
+			spin_lock(&journal->j_list_lock);
+			goto restart;
 		}
-
 		/*
-		 * This is foul
+		 * Now in whatever state the buffer currently is, we know that
+		 * it has been written out and so we can drop it from the list
 		 */
-		if (!jbd_trylock_bh_state(bh)) {
-			jbd_sync_bh(journal, bh);
-			goto out_return_1;
-		}
-
-		if (jh->b_transaction != NULL) {
-			transaction_t *t = jh->b_transaction;
-			tid_t tid = t->t_tid;
-
-			spin_unlock(&journal->j_list_lock);
-			jbd_unlock_bh_state(bh);
-			log_start_commit(journal, tid);
-			log_wait_commit(journal, tid);
-			goto out_return_1;
-		}
-
-		/*
-		 * AKPM: I think the buffer_jbddirty test is redundant - it
-		 * shouldn't have NULL b_transaction?
-		 */
-		next_jh = jh->b_cpnext;
-		if (!buffer_dirty(bh) && !buffer_jbddirty(bh)) {
-			BUFFER_TRACE(bh, "remove from checkpoint");
-			__journal_remove_checkpoint(jh);
-			jbd_unlock_bh_state(bh);
-			journal_remove_journal_head(bh);
-			__brelse(bh);
-			ret = 1;
-		} else {
-			jbd_unlock_bh_state(bh);
-		}
-	} while (jh != last_jh);
-
-	return ret;
-out_return_1:
-	spin_lock(&journal->j_list_lock);
-	return 1;
+		released = __journal_remove_checkpoint(jh);
+		jbd_unlock_bh_state(bh);
+		journal_remove_journal_head(bh);
+		__brelse(bh);
+	}
 }
 
 #define NR_BATCH	64
@@ -203,9 +211,7 @@ __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count)
 {
 	int i;
 
-	spin_unlock(&journal->j_list_lock);
 	ll_rw_block(SWRITE, *batch_count, bhs);
-	spin_lock(&journal->j_list_lock);
 	for (i = 0; i < *batch_count; i++) {
 		struct buffer_head *bh = bhs[i];
 		clear_buffer_jwrite(bh);
@@ -221,19 +227,43 @@ __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count)
  * Return 1 if something happened which requires us to abort the current
  * scan of the checkpoint list.  
  *
- * Called with j_list_lock held.
+ * Called with j_list_lock held and drops it if 1 is returned
  * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
  */
-static int __flush_buffer(journal_t *journal, struct journal_head *jh,
-			struct buffer_head **bhs, int *batch_count,
-			int *drop_count)
+static int __process_buffer(journal_t *journal, struct journal_head *jh,
+			struct buffer_head **bhs, int *batch_count)
 {
 	struct buffer_head *bh = jh2bh(jh);
 	int ret = 0;
 
-	if (buffer_dirty(bh) && !buffer_locked(bh) && jh->b_jlist == BJ_None) {
-		J_ASSERT_JH(jh, jh->b_transaction == NULL);
+	if (buffer_locked(bh)) {
+		atomic_inc(&bh->b_count);
+		spin_unlock(&journal->j_list_lock);
+		jbd_unlock_bh_state(bh);
+		wait_on_buffer(bh);
+		/* the journal_head may have gone by now */
+		BUFFER_TRACE(bh, "brelse");
+		__brelse(bh);
+		ret = 1;
+	} else if (jh->b_transaction != NULL) {
+		transaction_t *t = jh->b_transaction;
+		tid_t tid = t->t_tid;
 
+		spin_unlock(&journal->j_list_lock);
+		jbd_unlock_bh_state(bh);
+		log_start_commit(journal, tid);
+		log_wait_commit(journal, tid);
+		ret = 1;
+	} else if (!buffer_dirty(bh)) {
+		J_ASSERT_JH(jh, !buffer_jbddirty(bh));
+		BUFFER_TRACE(bh, "remove from checkpoint");
+		__journal_remove_checkpoint(jh);
+		spin_unlock(&journal->j_list_lock);
+		jbd_unlock_bh_state(bh);
+		journal_remove_journal_head(bh);
+		__brelse(bh);
+		ret = 1;
+	} else {
 		/*
 		 * Important: we are about to write the buffer, and
 		 * possibly block, while still holding the journal lock.
@@ -246,45 +276,30 @@ static int __flush_buffer(journal_t *journal, struct journal_head *jh,
 		J_ASSERT_BH(bh, !buffer_jwrite(bh));
 		set_buffer_jwrite(bh);
 		bhs[*batch_count] = bh;
+		__buffer_relink_io(jh);
 		jbd_unlock_bh_state(bh);
 		(*batch_count)++;
 		if (*batch_count == NR_BATCH) {
+			spin_unlock(&journal->j_list_lock);
 			__flush_batch(journal, bhs, batch_count);
 			ret = 1;
 		}
-	} else {
-		int last_buffer = 0;
-		if (jh->b_cpnext == jh) {
-			/* We may be about to drop the transaction.  Tell the
-			 * caller that the lists have changed.
-			 */
-			last_buffer = 1;
-		}
-		if (__try_to_free_cp_buf(jh)) {
-			(*drop_count)++;
-			ret = last_buffer;
-		}
 	}
 	return ret;
 }
 
 /*
- * Perform an actual checkpoint.  We don't write out only enough to
- * satisfy the current blocked requests: rather we submit a reasonably
- * sized chunk of the outstanding data to disk at once for
- * efficiency.  __log_wait_for_space() will retry if we didn't free enough.
+ * Perform an actual checkpoint. We take the first transaction on the
+ * list of transactions to be checkpointed and send all its buffers
+ * to disk. We submit larger chunks of data at once.
  * 
- * However, we _do_ take into account the amount requested so that once
- * the IO has been queued, we can return as soon as enough of it has
- * completed to disk.
- *
  * The journal should be locked before calling this function.
  */
 int log_do_checkpoint(journal_t *journal)
 {
+	transaction_t *transaction;
+	tid_t this_tid;
 	int result;
-	int batch_count = 0;
-	struct buffer_head *bhs[NR_BATCH];
 
 	jbd_debug(1, "Start checkpoint\n");
 
@@ -299,79 +314,68 @@ int log_do_checkpoint(journal_t *journal)
 		return result;
 
 	/*
-	 * OK, we need to start writing disk blocks.  Try to free up a
-	 * quarter of the log in a single checkpoint if we can.
+	 * OK, we need to start writing disk blocks.  Take one transaction
+	 * and write it.
 	 */
+	spin_lock(&journal->j_list_lock);
+	if (!journal->j_checkpoint_transactions)
+		goto out;
+	transaction = journal->j_checkpoint_transactions;
+	this_tid = transaction->t_tid;
+restart:
 	/*
-	 * AKPM: check this code.  I had a feeling a while back that it
-	 * degenerates into a busy loop at unmount time.
+	 * If someone cleaned up this transaction while we slept, we're
+	 * done (maybe it's a new transaction, but it fell at the same
+	 * address).
 	 */
-	spin_lock(&journal->j_list_lock);
-	while (journal->j_checkpoint_transactions) {
-		transaction_t *transaction;
-		struct journal_head *jh, *last_jh, *next_jh;
-		int drop_count = 0;
-		int cleanup_ret, retry = 0;
-		tid_t this_tid;
-
-		transaction = journal->j_checkpoint_transactions;
-		this_tid = transaction->t_tid;
-		jh = transaction->t_checkpoint_list;
-		last_jh = jh->b_cpprev;
-		next_jh = jh;
-		do {
+	if (journal->j_checkpoint_transactions == transaction &&
+			transaction->t_tid == this_tid) {
+		int batch_count = 0;
+		struct buffer_head *bhs[NR_BATCH];
+		struct journal_head *jh;
+		int retry = 0;
+
+		while (!retry && transaction->t_checkpoint_list) {
 			struct buffer_head *bh;
 
-			jh = next_jh;
-			next_jh = jh->b_cpnext;
+			jh = transaction->t_checkpoint_list;
 			bh = jh2bh(jh);
 			if (!jbd_trylock_bh_state(bh)) {
 				jbd_sync_bh(journal, bh);
-				spin_lock(&journal->j_list_lock);
 				retry = 1;
 				break;
 			}
-			retry = __flush_buffer(journal, jh, bhs, &batch_count, &drop_count);
-			if (cond_resched_lock(&journal->j_list_lock)) {
+			retry = __process_buffer(journal, jh, bhs,&batch_count);
+			if (!retry && lock_need_resched(&journal->j_list_lock)){
+				spin_unlock(&journal->j_list_lock);
 				retry = 1;
 				break;
 			}
-		} while (jh != last_jh && !retry);
+		}
 
 		if (batch_count) {
+			if (!retry) {
+				spin_unlock(&journal->j_list_lock);
+				retry = 1;
+			}
 			__flush_batch(journal, bhs, &batch_count);
-			retry = 1;
 		}
 
+		if (retry) {
+			spin_lock(&journal->j_list_lock);
+			goto restart;
+		}
 		/*
-		 * If someone cleaned up this transaction while we slept, we're
-		 * done
-		 */
-		if (journal->j_checkpoint_transactions != transaction)
-			break;
-		if (retry)
-			continue;
-		/*
-		 * Maybe it's a new transaction, but it fell at the same
-		 * address
-		 */
-		if (transaction->t_tid != this_tid)
-			continue;
-		/*
-		 * We have walked the whole transaction list without
-		 * finding anything to write to disk.  We had better be
-		 * able to make some progress or we are in trouble.
+		 * Now we have cleaned up the first transaction's checkpoint
+		 * list. Let's clean up the second one
 		 */
-		cleanup_ret = __cleanup_transaction(journal, transaction);
-		J_ASSERT(drop_count != 0 || cleanup_ret != 0);
-		if (journal->j_checkpoint_transactions != transaction)
-			break;
+		__wait_cp_io(journal, transaction);
 	}
+out:
 	spin_unlock(&journal->j_list_lock);
 	result = cleanup_journal_tail(journal);
 	if (result < 0)
 		return result;
-
 	return 0;
 }
 
@@ -455,6 +459,54 @@ int cleanup_journal_tail(journal_t *journal)
 
 /* Checkpoint list management */
 
+/*
+ * journal_clean_one_cp_list
+ *
+ * Find all the written-back checkpoint buffers in the given list and release them.
+ *
+ * Called with the journal locked.
+ * Called with j_list_lock held.
+ * Returns number of bufers reaped (for debug)
+ */
+
+static int journal_clean_one_cp_list(struct journal_head *jh, int *released)
+{
+	struct journal_head *last_jh;
+	struct journal_head *next_jh = jh;
+	int ret, freed = 0;
+
+	*released = 0;
+	if (!jh)
+		return 0;
+
+ 	last_jh = jh->b_cpprev;
+	do {
+		jh = next_jh;
+		next_jh = jh->b_cpnext;
+		/* Use trylock because of the ranking */
+		if (jbd_trylock_bh_state(jh2bh(jh))) {
+			ret = __try_to_free_cp_buf(jh);
+			if (ret) {
+				freed++;
+				if (ret == 2) {
+					*released = 1;
+					return freed;
+				}
+			}
+		}
+		/*
+		 * This function only frees up some memory
+		 * if possible so we dont have an obligation
+		 * to finish processing. Bail out if preemption
+		 * requested:
+		 */
+		if (need_resched())
+			return freed;
+	} while (jh != last_jh);
+
+	return freed;
+}
+
 /*
  * journal_clean_checkpoint_list
  *
@@ -462,46 +514,44 @@ int cleanup_journal_tail(journal_t *journal)
  *
  * Called with the journal locked.
  * Called with j_list_lock held.
- * Returns number of bufers reaped (for debug)
+ * Returns number of buffers reaped (for debug)
  */
 
 int __journal_clean_checkpoint_list(journal_t *journal)
 {
 	transaction_t *transaction, *last_transaction, *next_transaction;
 	int ret = 0;
+	int released;
 
 	transaction = journal->j_checkpoint_transactions;
-	if (transaction == 0)
+	if (!transaction)
 		goto out;
 
 	last_transaction = transaction->t_cpprev;
 	next_transaction = transaction;
 	do {
-		struct journal_head *jh;
-
 		transaction = next_transaction;
 		next_transaction = transaction->t_cpnext;
-		jh = transaction->t_checkpoint_list;
-		if (jh) {
-			struct journal_head *last_jh = jh->b_cpprev;
-			struct journal_head *next_jh = jh;
-
-			do {
-				jh = next_jh;
-				next_jh = jh->b_cpnext;
-				/* Use trylock because of the ranknig */
-				if (jbd_trylock_bh_state(jh2bh(jh)))
-					ret += __try_to_free_cp_buf(jh);
-				/*
-				 * This function only frees up some memory
-				 * if possible so we dont have an obligation
-				 * to finish processing. Bail out if preemption
-				 * requested:
-				 */
-				if (need_resched())
-					goto out;
-			} while (jh != last_jh);
-		}
+		ret += journal_clean_one_cp_list(transaction->
+				t_checkpoint_list, &released);
+		/*
+		 * This function only frees up some memory if possible so we
+		 * dont have an obligation to finish processing. Bail out if
+		 * preemption requested:
+		 */
+		if (need_resched())
+			goto out;
+		if (released)
+			continue;
+		/*
+		 * It is essential that we are as careful as in the case of
+		 * t_checkpoint_list with removing the buffer from the list as
+		 * we can possibly see not yet submitted buffers on io_list
+		 */
+		ret += journal_clean_one_cp_list(transaction->
+				t_checkpoint_io_list, &released);
+		if (need_resched())
+			goto out;
 	} while (transaction != last_transaction);
 out:
 	return ret;
@@ -516,18 +566,22 @@ out:
  * buffer updates committed in that transaction have safely been stored
  * elsewhere on disk.  To achieve this, all of the buffers in a
  * transaction need to be maintained on the transaction's checkpoint
- * list until they have been rewritten, at which point this function is
+ * lists until they have been rewritten, at which point this function is
  * called to remove the buffer from the existing transaction's
- * checkpoint list.
+ * checkpoint lists.
+ *
+ * The function returns 1 if it frees the transaction, 0 otherwise.
  *
  * This function is called with the journal locked.
  * This function is called with j_list_lock held.
+ * This function is called with jbd_lock_bh_state(jh2bh(jh))
  */
 
-void __journal_remove_checkpoint(struct journal_head *jh)
+int __journal_remove_checkpoint(struct journal_head *jh)
 {
 	transaction_t *transaction;
 	journal_t *journal;
+	int ret = 0;
 
 	JBUFFER_TRACE(jh, "entry");
 
@@ -538,8 +592,10 @@ void __journal_remove_checkpoint(struct journal_head *jh)
 	journal = transaction->t_journal;
 
 	__buffer_unlink(jh);
+	jh->b_cp_transaction = NULL;
 
-	if (transaction->t_checkpoint_list != NULL)
+	if (transaction->t_checkpoint_list != NULL ||
+	    transaction->t_checkpoint_io_list != NULL)
 		goto out;
 	JBUFFER_TRACE(jh, "transaction has no more buffers");
 
@@ -565,8 +621,10 @@ void __journal_remove_checkpoint(struct journal_head *jh)
 	/* Just in case anybody was waiting for more transactions to be
            checkpointed... */
 	wake_up(&journal->j_wait_logspace);
+	ret = 1;
 out:
 	JBUFFER_TRACE(jh, "exit");
+	return ret;
 }
 
 /*
@@ -628,6 +686,7 @@ void __journal_drop_transaction(journal_t *journal, transaction_t *transaction)
 	J_ASSERT(transaction->t_shadow_list == NULL);
 	J_ASSERT(transaction->t_log_list == NULL);
 	J_ASSERT(transaction->t_checkpoint_list == NULL);
+	J_ASSERT(transaction->t_checkpoint_io_list == NULL);
 	J_ASSERT(transaction->t_updates == 0);
 	J_ASSERT(journal->j_committing_transaction != transaction);
 	J_ASSERT(journal->j_running_transaction != transaction);
diff --git a/include/linux/jbd.h b/include/linux/jbd.h
index 6a425e370cb3..20eb34403d0c 100644
--- a/include/linux/jbd.h
+++ b/include/linux/jbd.h
@@ -500,6 +500,12 @@ struct transaction_s
 	 */
 	struct journal_head	*t_checkpoint_list;
 
+	/*
+	 * Doubly-linked circular list of all buffers submitted for IO while
+	 * checkpointing. [j_list_lock]
+	 */
+	struct journal_head	*t_checkpoint_io_list;
+
 	/*
 	 * Doubly-linked circular list of temporary buffers currently undergoing
 	 * IO in the log [j_list_lock]
@@ -849,7 +855,7 @@ extern void journal_commit_transaction(journal_t *);
 
 /* Checkpoint list management */
 int __journal_clean_checkpoint_list(journal_t *journal);
-void __journal_remove_checkpoint(struct journal_head *);
+int __journal_remove_checkpoint(struct journal_head *);
 void __journal_insert_checkpoint(struct journal_head *, transaction_t *);
 
 /* Buffer IO */
-- 
cgit v1.2.3


From fda151d9feafc0e8418f23c716587c44394fcdbf Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Mon, 5 Jun 2006 12:09:50 +0200
Subject: [PATCH] blktrace_api.h: endian annotations

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Jens Axboe <axboe@suse.de>
---
 include/linux/blktrace_api.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h
index eb1a867ed245..a7e8cef73d15 100644
--- a/include/linux/blktrace_api.h
+++ b/include/linux/blktrace_api.h
@@ -90,9 +90,9 @@ struct blk_io_trace {
  * The remap event
  */
 struct blk_io_trace_remap {
-	u32 device;
+	__be32 device;
 	u32 __pad;
-	u64 sector;
+	__be64 sector;
 };
 
 enum {
@@ -224,7 +224,7 @@ static inline void blk_add_trace_pdu_int(struct request_queue *q, u32 what,
 					 struct bio *bio, unsigned int pdu)
 {
 	struct blk_trace *bt = q->blk_trace;
-	u64 rpdu = cpu_to_be64(pdu);
+	__be64 rpdu = cpu_to_be64(pdu);
 
 	if (likely(!bt))
 		return;
-- 
cgit v1.2.3


From b31dc66a54ad986b6b73bdc49c8efc17cbad1833 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@suse.de>
Date: Tue, 13 Jun 2006 08:26:10 +0200
Subject: [PATCH] Kill PF_SYNCWRITE flag

A process flag to indicate whether we are doing sync io is incredibly
ugly. It also causes performance problems when one does a lot of async
io and then proceeds to sync it. Part of the io will go out as async,
and the other part as sync. This causes a disconnect between the
previously submitted io and the synced io. For io schedulers such as CFQ,
this will cause us lost merges and suboptimal behaviour in scheduling.

Remove PF_SYNCWRITE completely from the fsync/msync paths, and let
the O_DIRECT path just directly indicate that the writes are sync
by using WRITE_SYNC instead.

Signed-off-by: Jens Axboe <axboe@suse.de>
---
 block/as-iosched.c                |  2 +-
 block/cfq-iosched.c               |  4 +---
 block/ll_rw_blk.c                 |  3 +++
 drivers/usb/gadget/file_storage.c |  2 --
 fs/buffer.c                       |  2 --
 fs/direct-io.c                    | 18 ++++++++----------
 fs/fs-writeback.c                 |  2 --
 include/linux/blkdev.h            |  2 ++
 include/linux/sched.h             | 11 +++++------
 mm/msync.c                        |  3 ---
 10 files changed, 20 insertions(+), 29 deletions(-)

(limited to 'include/linux')

diff --git a/block/as-iosched.c b/block/as-iosched.c
index 9b13d72ffefa..56c99fa037df 100644
--- a/block/as-iosched.c
+++ b/block/as-iosched.c
@@ -1339,7 +1339,7 @@ static void as_add_request(request_queue_t *q, struct request *rq)
 	arq->state = AS_RQ_NEW;
 
 	if (rq_data_dir(arq->request) == READ
-			|| current->flags&PF_SYNCWRITE)
+			|| (arq->request->flags & REQ_RW_SYNC))
 		arq->is_sync = 1;
 	else
 		arq->is_sync = 0;
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index c88f161d3fb3..4c4e9cc3ae26 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -277,8 +277,6 @@ static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *, unsigned int, unsi
 static void cfq_dispatch_insert(request_queue_t *, struct cfq_rq *);
 static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, unsigned int key, struct task_struct *tsk, gfp_t gfp_mask);
 
-#define process_sync(tsk)	((tsk)->flags & PF_SYNCWRITE)
-
 /*
  * lots of deadline iosched dupes, can be abstracted later...
  */
@@ -334,7 +332,7 @@ static int cfq_queue_empty(request_queue_t *q)
 
 static inline pid_t cfq_queue_pid(struct task_struct *task, int rw)
 {
-	if (rw == READ || process_sync(task))
+	if (rw == READ || rw == WRITE_SYNC)
 		return task->pid;
 
 	return CFQ_KEY_ASYNC;
diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c
index 17c42ddd31db..2270bb451385 100644
--- a/block/ll_rw_blk.c
+++ b/block/ll_rw_blk.c
@@ -2827,6 +2827,9 @@ static void init_request_from_bio(struct request *req, struct bio *bio)
 	if (unlikely(bio_barrier(bio)))
 		req->flags |= (REQ_HARDBARRIER | REQ_NOMERGE);
 
+	if (bio_sync(bio))
+		req->flags |= REQ_RW_SYNC;
+
 	req->errors = 0;
 	req->hard_sector = req->sector = bio->bi_sector;
 	req->hard_nr_sectors = req->nr_sectors = bio_sectors(bio);
diff --git a/drivers/usb/gadget/file_storage.c b/drivers/usb/gadget/file_storage.c
index 6f887478b148..a43dc908ac59 100644
--- a/drivers/usb/gadget/file_storage.c
+++ b/drivers/usb/gadget/file_storage.c
@@ -1906,7 +1906,6 @@ static int fsync_sub(struct lun *curlun)
 
 	inode = filp->f_dentry->d_inode;
 	mutex_lock(&inode->i_mutex);
-	current->flags |= PF_SYNCWRITE;
 	rc = filemap_fdatawrite(inode->i_mapping);
 	err = filp->f_op->fsync(filp, filp->f_dentry, 1);
 	if (!rc)
@@ -1914,7 +1913,6 @@ static int fsync_sub(struct lun *curlun)
 	err = filemap_fdatawait(inode->i_mapping);
 	if (!rc)
 		rc = err;
-	current->flags &= ~PF_SYNCWRITE;
 	mutex_unlock(&inode->i_mutex);
 	VLDBG(curlun, "fdatasync -> %d\n", rc);
 	return rc;
diff --git a/fs/buffer.c b/fs/buffer.c
index 23f1f3a68077..373bb6292bdc 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -331,7 +331,6 @@ long do_fsync(struct file *file, int datasync)
 		goto out;
 	}
 
-	current->flags |= PF_SYNCWRITE;
 	ret = filemap_fdatawrite(mapping);
 
 	/*
@@ -346,7 +345,6 @@ long do_fsync(struct file *file, int datasync)
 	err = filemap_fdatawait(mapping);
 	if (!ret)
 		ret = err;
-	current->flags &= ~PF_SYNCWRITE;
 out:
 	return ret;
 }
diff --git a/fs/direct-io.c b/fs/direct-io.c
index b05d1b218776..538fb0418fba 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -162,7 +162,7 @@ static int dio_refill_pages(struct dio *dio)
 		NULL);				/* vmas */
 	up_read(&current->mm->mmap_sem);
 
-	if (ret < 0 && dio->blocks_available && (dio->rw == WRITE)) {
+	if (ret < 0 && dio->blocks_available && (dio->rw & WRITE)) {
 		struct page *page = ZERO_PAGE(dio->curr_user_address);
 		/*
 		 * A memory fault, but the filesystem has some outstanding
@@ -535,7 +535,7 @@ static int get_more_blocks(struct dio *dio)
 		map_bh->b_state = 0;
 		map_bh->b_size = fs_count << dio->inode->i_blkbits;
 
-		create = dio->rw == WRITE;
+		create = dio->rw & WRITE;
 		if (dio->lock_type == DIO_LOCKING) {
 			if (dio->block_in_file < (i_size_read(dio->inode) >>
 							dio->blkbits))
@@ -867,7 +867,7 @@ do_holes:
 				loff_t i_size_aligned;
 
 				/* AKPM: eargh, -ENOTBLK is a hack */
-				if (dio->rw == WRITE) {
+				if (dio->rw & WRITE) {
 					page_cache_release(page);
 					return -ENOTBLK;
 				}
@@ -1045,7 +1045,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
 		}
 	} /* end iovec loop */
 
-	if (ret == -ENOTBLK && rw == WRITE) {
+	if (ret == -ENOTBLK && (rw & WRITE)) {
 		/*
 		 * The remaining part of the request will be
 		 * be handled by buffered I/O when we return
@@ -1089,7 +1089,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
 	if (dio->is_async) {
 		int should_wait = 0;
 
-		if (dio->result < dio->size && rw == WRITE) {
+		if (dio->result < dio->size && (rw & WRITE)) {
 			dio->waiter = current;
 			should_wait = 1;
 		}
@@ -1142,7 +1142,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
 			ret = transferred;
 
 		/* We could have also come here on an AIO file extend */
-		if (!is_sync_kiocb(iocb) && rw == WRITE &&
+		if (!is_sync_kiocb(iocb) && (rw & WRITE) &&
 		    ret >= 0 && dio->result == dio->size)
 			/*
 			 * For AIO writes where we have completed the
@@ -1194,7 +1194,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	int acquire_i_mutex = 0;
 
 	if (rw & WRITE)
-		current->flags |= PF_SYNCWRITE;
+		rw = WRITE_SYNC;
 
 	if (bdev)
 		bdev_blkbits = blksize_bits(bdev_hardsect_size(bdev));
@@ -1270,7 +1270,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	 * even for AIO, we need to wait for i/o to complete before
 	 * returning in this case.
 	 */
-	dio->is_async = !is_sync_kiocb(iocb) && !((rw == WRITE) &&
+	dio->is_async = !is_sync_kiocb(iocb) && !((rw & WRITE) &&
 		(end > i_size_read(inode)));
 
 	retval = direct_io_worker(rw, iocb, inode, iov, offset,
@@ -1284,8 +1284,6 @@ out:
 		mutex_unlock(&inode->i_mutex);
 	else if (acquire_i_mutex)
 		mutex_lock(&inode->i_mutex);
-	if (rw & WRITE)
-		current->flags &= ~PF_SYNCWRITE;
 	return retval;
 }
 EXPORT_SYMBOL(__blockdev_direct_IO);
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 6db95cf3aaa2..031b27a4bc9a 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -623,7 +623,6 @@ int generic_osync_inode(struct inode *inode, struct address_space *mapping, int
 	int need_write_inode_now = 0;
 	int err2;
 
-	current->flags |= PF_SYNCWRITE;
 	if (what & OSYNC_DATA)
 		err = filemap_fdatawrite(mapping);
 	if (what & (OSYNC_METADATA|OSYNC_DATA)) {
@@ -636,7 +635,6 @@ int generic_osync_inode(struct inode *inode, struct address_space *mapping, int
 		if (!err)
 			err = err2;
 	}
-	current->flags &= ~PF_SYNCWRITE;
 
 	spin_lock(&inode_lock);
 	if ((inode->i_state & I_DIRTY) &&
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 3457e7b97363..482a21d67627 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -241,6 +241,7 @@ enum rq_flag_bits {
 	__REQ_PM_RESUME,	/* resume request */
 	__REQ_PM_SHUTDOWN,	/* shutdown request */
 	__REQ_ORDERED_COLOR,	/* is before or after barrier */
+	__REQ_RW_SYNC,		/* request is sync (O_DIRECT) */
 	__REQ_NR_BITS,		/* stops here */
 };
 
@@ -270,6 +271,7 @@ enum rq_flag_bits {
 #define REQ_PM_RESUME	(1 << __REQ_PM_RESUME)
 #define REQ_PM_SHUTDOWN	(1 << __REQ_PM_SHUTDOWN)
 #define REQ_ORDERED_COLOR	(1 << __REQ_ORDERED_COLOR)
+#define REQ_RW_SYNC	(1 << __REQ_RW_SYNC)
 
 /*
  * State information carried for REQ_PM_SUSPEND and REQ_PM_RESUME
diff --git a/include/linux/sched.h b/include/linux/sched.h
index a9d23c7d1b25..38b4791e6a5d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -941,12 +941,11 @@ static inline void put_task_struct(struct task_struct *t)
 #define PF_KSWAPD	0x00040000	/* I am kswapd */
 #define PF_SWAPOFF	0x00080000	/* I am in swapoff */
 #define PF_LESS_THROTTLE 0x00100000	/* Throttle me less: I clean memory */
-#define PF_SYNCWRITE	0x00200000	/* I am doing a sync write */
-#define PF_BORROWED_MM	0x00400000	/* I am a kthread doing use_mm */
-#define PF_RANDOMIZE	0x00800000	/* randomize virtual address space */
-#define PF_SWAPWRITE	0x01000000	/* Allowed to write to swap */
-#define PF_SPREAD_PAGE	0x04000000	/* Spread page cache over cpuset */
-#define PF_SPREAD_SLAB	0x08000000	/* Spread some slab caches over cpuset */
+#define PF_BORROWED_MM	0x00200000	/* I am a kthread doing use_mm */
+#define PF_RANDOMIZE	0x00400000	/* randomize virtual address space */
+#define PF_SWAPWRITE	0x00800000	/* Allowed to write to swap */
+#define PF_SPREAD_PAGE	0x01000000	/* Spread page cache over cpuset */
+#define PF_SPREAD_SLAB	0x02000000	/* Spread some slab caches over cpuset */
 #define PF_MEMPOLICY	0x10000000	/* Non-default NUMA mempolicy */
 
 /*
diff --git a/mm/msync.c b/mm/msync.c
index bc6c95376366..d083544df21b 100644
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -170,8 +170,6 @@ asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
 	 * just ignore them, but return -ENOMEM at the end.
 	 */
 	down_read(&current->mm->mmap_sem);
-	if (flags & MS_SYNC)
-		current->flags |= PF_SYNCWRITE;
 	vma = find_vma(current->mm, start);
 	if (!vma) {
 		error = -ENOMEM;
@@ -228,7 +226,6 @@ asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
 		}
 	} while (vma && !done);
 out_unlock:
-	current->flags &= ~PF_SYNCWRITE;
 	up_read(&current->mm->mmap_sem);
 out:
 	return error;
-- 
cgit v1.2.3


From ad3caddaa1708e506f20b8e25a4a8ae586fc7d5b Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@suse.de>
Date: Tue, 13 Jun 2006 08:46:57 +0200
Subject: [PATCH] Get rid of struct request request_pm_state member

The IDE power management can just use the ->end_io_data member to store
it's data.

Signed-off-by: Jens Axboe <axboe@suse.de>
---
 drivers/ide/ide-io.c   | 91 ++++++++++++++++++++++++++++----------------------
 drivers/ide/ide.c      |  4 +--
 include/linux/blkdev.h |  5 ---
 3 files changed, 54 insertions(+), 46 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c
index c01615dec202..4f2f138de2ca 100644
--- a/drivers/ide/ide-io.c
+++ b/drivers/ide/ide-io.c
@@ -142,38 +142,41 @@ enum {
 
 static void ide_complete_power_step(ide_drive_t *drive, struct request *rq, u8 stat, u8 error)
 {
+	struct request_pm_state *pm = rq->end_io_data;
+
 	if (drive->media != ide_disk)
 		return;
 
-	switch (rq->pm->pm_step) {
+	switch (pm->pm_step) {
 	case ide_pm_flush_cache:	/* Suspend step 1 (flush cache) complete */
-		if (rq->pm->pm_state == PM_EVENT_FREEZE)
-			rq->pm->pm_step = ide_pm_state_completed;
+		if (pm->pm_state == PM_EVENT_FREEZE)
+			pm->pm_step = ide_pm_state_completed;
 		else
-			rq->pm->pm_step = idedisk_pm_standby;
+			pm->pm_step = idedisk_pm_standby;
 		break;
 	case idedisk_pm_standby:	/* Suspend step 2 (standby) complete */
-		rq->pm->pm_step = ide_pm_state_completed;
+		pm->pm_step = ide_pm_state_completed;
 		break;
 	case idedisk_pm_idle:		/* Resume step 1 (idle) complete */
-		rq->pm->pm_step = ide_pm_restore_dma;
+		pm->pm_step = ide_pm_restore_dma;
 		break;
 	}
 }
 
 static ide_startstop_t ide_start_power_step(ide_drive_t *drive, struct request *rq)
 {
+	struct request_pm_state *pm = rq->end_io_data;
 	ide_task_t *args = rq->special;
 
 	memset(args, 0, sizeof(*args));
 
 	if (drive->media != ide_disk) {
 		/* skip idedisk_pm_idle for ATAPI devices */
-		if (rq->pm->pm_step == idedisk_pm_idle)
-			rq->pm->pm_step = ide_pm_restore_dma;
+		if (pm->pm_step == idedisk_pm_idle)
+			pm->pm_step = ide_pm_restore_dma;
 	}
 
-	switch (rq->pm->pm_step) {
+	switch (pm->pm_step) {
 	case ide_pm_flush_cache:	/* Suspend step 1 (flush cache) */
 		if (drive->media != ide_disk)
 			break;
@@ -215,7 +218,7 @@ static ide_startstop_t ide_start_power_step(ide_drive_t *drive, struct request *
 		drive->hwif->ide_dma_check(drive);
 		break;
 	}
-	rq->pm->pm_step = ide_pm_state_completed;
+	pm->pm_step = ide_pm_state_completed;
 	return ide_stopped;
 }
 
@@ -362,12 +365,13 @@ void ide_end_drive_cmd (ide_drive_t *drive, u8 stat, u8 err)
 			}
 		}
 	} else if (blk_pm_request(rq)) {
+		struct request_pm_state *pm = rq->end_io_data;
 #ifdef DEBUG_PM
 		printk("%s: complete_power_step(step: %d, stat: %x, err: %x)\n",
 			drive->name, rq->pm->pm_step, stat, err);
 #endif
 		ide_complete_power_step(drive, rq, stat, err);
-		if (rq->pm->pm_step == ide_pm_state_completed)
+		if (pm->pm_step == ide_pm_state_completed)
 			ide_complete_pm_request(drive, rq);
 		return;
 	}
@@ -871,6 +875,39 @@ done:
  	return ide_stopped;
 }
 
+static void ide_check_pm_state(ide_drive_t *drive, struct request *rq)
+{
+	struct request_pm_state *pm = rq->end_io_data;
+
+	if (blk_pm_suspend_request(rq) &&
+	    pm->pm_step == ide_pm_state_start_suspend)
+		/* Mark drive blocked when starting the suspend sequence. */
+		drive->blocked = 1;
+	else if (blk_pm_resume_request(rq) &&
+		 pm->pm_step == ide_pm_state_start_resume) {
+		/* 
+		 * The first thing we do on wakeup is to wait for BSY bit to
+		 * go away (with a looong timeout) as a drive on this hwif may
+		 * just be POSTing itself.
+		 * We do that before even selecting as the "other" device on
+		 * the bus may be broken enough to walk on our toes at this
+		 * point.
+		 */
+		int rc;
+#ifdef DEBUG_PM
+		printk("%s: Wakeup request inited, waiting for !BSY...\n", drive->name);
+#endif
+		rc = ide_wait_not_busy(HWIF(drive), 35000);
+		if (rc)
+			printk(KERN_WARNING "%s: bus not ready on wakeup\n", drive->name);
+		SELECT_DRIVE(drive);
+		HWIF(drive)->OUTB(8, HWIF(drive)->io_ports[IDE_CONTROL_OFFSET]);
+		rc = ide_wait_not_busy(HWIF(drive), 10000);
+		if (rc)
+			printk(KERN_WARNING "%s: drive not ready on wakeup\n", drive->name);
+	}
+}
+
 /**
  *	start_request	-	start of I/O and command issuing for IDE
  *
@@ -909,33 +946,8 @@ static ide_startstop_t start_request (ide_drive_t *drive, struct request *rq)
 	if (block == 0 && drive->remap_0_to_1 == 1)
 		block = 1;  /* redirect MBR access to EZ-Drive partn table */
 
-	if (blk_pm_suspend_request(rq) &&
-	    rq->pm->pm_step == ide_pm_state_start_suspend)
-		/* Mark drive blocked when starting the suspend sequence. */
-		drive->blocked = 1;
-	else if (blk_pm_resume_request(rq) &&
-		 rq->pm->pm_step == ide_pm_state_start_resume) {
-		/* 
-		 * The first thing we do on wakeup is to wait for BSY bit to
-		 * go away (with a looong timeout) as a drive on this hwif may
-		 * just be POSTing itself.
-		 * We do that before even selecting as the "other" device on
-		 * the bus may be broken enough to walk on our toes at this
-		 * point.
-		 */
-		int rc;
-#ifdef DEBUG_PM
-		printk("%s: Wakeup request inited, waiting for !BSY...\n", drive->name);
-#endif
-		rc = ide_wait_not_busy(HWIF(drive), 35000);
-		if (rc)
-			printk(KERN_WARNING "%s: bus not ready on wakeup\n", drive->name);
-		SELECT_DRIVE(drive);
-		HWIF(drive)->OUTB(8, HWIF(drive)->io_ports[IDE_CONTROL_OFFSET]);
-		rc = ide_wait_not_busy(HWIF(drive), 10000);
-		if (rc)
-			printk(KERN_WARNING "%s: drive not ready on wakeup\n", drive->name);
-	}
+	if (blk_pm_request(rq))
+		ide_check_pm_state(drive, rq);
 
 	SELECT_DRIVE(drive);
 	if (ide_wait_stat(&startstop, drive, drive->ready_stat, BUSY_STAT|DRQ_STAT, WAIT_READY)) {
@@ -950,13 +962,14 @@ static ide_startstop_t start_request (ide_drive_t *drive, struct request *rq)
 		else if (rq->flags & REQ_DRIVE_TASKFILE)
 			return execute_drive_cmd(drive, rq);
 		else if (blk_pm_request(rq)) {
+			struct request_pm_state *pm = rq->end_io_data;
 #ifdef DEBUG_PM
 			printk("%s: start_power_step(step: %d)\n",
 				drive->name, rq->pm->pm_step);
 #endif
 			startstop = ide_start_power_step(drive, rq);
 			if (startstop == ide_stopped &&
-			    rq->pm->pm_step == ide_pm_state_completed)
+			    pm->pm_step == ide_pm_state_completed)
 				ide_complete_pm_request(drive, rq);
 			return startstop;
 		}
diff --git a/drivers/ide/ide.c b/drivers/ide/ide.c
index 9799aed772e1..59fe358048b3 100644
--- a/drivers/ide/ide.c
+++ b/drivers/ide/ide.c
@@ -1226,7 +1226,7 @@ static int generic_ide_suspend(struct device *dev, pm_message_t state)
 	memset(&args, 0, sizeof(args));
 	rq.flags = REQ_PM_SUSPEND;
 	rq.special = &args;
-	rq.pm = &rqpm;
+	rq.end_io_data = &rqpm;
 	rqpm.pm_step = ide_pm_state_start_suspend;
 	rqpm.pm_state = state.event;
 
@@ -1245,7 +1245,7 @@ static int generic_ide_resume(struct device *dev)
 	memset(&args, 0, sizeof(args));
 	rq.flags = REQ_PM_RESUME;
 	rq.special = &args;
-	rq.pm = &rqpm;
+	rq.end_io_data = &rqpm;
 	rqpm.pm_step = ide_pm_state_start_resume;
 	rqpm.pm_state = PM_EVENT_ON;
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 482a21d67627..371c0ce5f630 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -195,11 +195,6 @@ struct request {
 	unsigned int timeout;
 	int retries;
 
-	/*
-	 * For Power Management requests
-	 */
-	struct request_pm_state *pm;
-
 	/*
 	 * completion callback. end_io_data should be folded in with waiting
 	 */
-- 
cgit v1.2.3


From 8f34ee75decb80007ba77bba5a7384eadff4866d Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@suse.de>
Date: Tue, 13 Jun 2006 09:02:34 +0200
Subject: [PATCH] Rearrange a few struct request members

This saves 8 bytes of data in 64-bit archs.

Signed-off-by: Jens Axboe <axboe@suse.de>
---
 include/linux/blkdev.h | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 371c0ce5f630..aafe82788b4e 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -151,11 +151,9 @@ struct request {
 	void *elevator_private;
 	void *completion_data;
 
-	unsigned short ioprio;
-
 	int rq_status;	/* should split this into a few status bits */
-	struct gendisk *rq_disk;
 	int errors;
+	struct gendisk *rq_disk;
 	unsigned long start_time;
 
 	/* Number of scatter-gather DMA addr+len pairs after
@@ -170,8 +168,9 @@ struct request {
 	 */
 	unsigned short nr_hw_segments;
 
+	unsigned short ioprio;
+
 	int tag;
-	char *buffer;
 
 	int ref_count;
 	request_queue_t *q;
@@ -179,6 +178,7 @@ struct request {
 
 	struct completion *waiting;
 	void *special;
+	char *buffer;
 
 	/*
 	 * when request is used as a packet command carrier
@@ -187,9 +187,8 @@ struct request {
 	unsigned char cmd[BLK_MAX_CDB];
 
 	unsigned int data_len;
-	void *data;
-
 	unsigned int sense_len;
+	void *data;
 	void *sense;
 
 	unsigned int timeout;
-- 
cgit v1.2.3


From dd67d051529387f6e44d22d1d5540ef281965fdd Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@suse.de>
Date: Wed, 21 Jun 2006 09:36:18 +0200
Subject: [PATCH] rbtree: support functions used by the io schedulers

They all duplicate macros to check for empty root and/or node, and
clearing a node. So put those in rbtree.h.

Signed-off-by: Jens Axboe <axboe@suse.de>
---
 block/as-iosched.c       | 17 +++++++----------
 block/cfq-iosched.c      | 22 ++++++++--------------
 block/deadline-iosched.c | 13 +++++--------
 include/linux/rbtree.h   |  4 ++++
 4 files changed, 24 insertions(+), 32 deletions(-)

(limited to 'include/linux')

diff --git a/block/as-iosched.c b/block/as-iosched.c
index 56c99fa037df..1ec5df466708 100644
--- a/block/as-iosched.c
+++ b/block/as-iosched.c
@@ -347,9 +347,6 @@ static struct request *as_find_arq_hash(struct as_data *ad, sector_t offset)
 /*
  * rb tree support functions
  */
-#define RB_EMPTY(root)	((root)->rb_node == NULL)
-#define ON_RB(node)	(rb_parent(node) != node)
-#define RB_CLEAR(node)	(rb_set_parent(node, node))
 #define rb_entry_arq(node)	rb_entry((node), struct as_rq, rb_node)
 #define ARQ_RB_ROOT(ad, arq)	(&(ad)->sort_list[(arq)->is_sync])
 #define rq_rb_key(rq)		(rq)->sector
@@ -418,13 +415,13 @@ static void as_add_arq_rb(struct as_data *ad, struct as_rq *arq)
 
 static inline void as_del_arq_rb(struct as_data *ad, struct as_rq *arq)
 {
-	if (!ON_RB(&arq->rb_node)) {
+	if (!RB_EMPTY_NODE(&arq->rb_node)) {
 		WARN_ON(1);
 		return;
 	}
 
 	rb_erase(&arq->rb_node, ARQ_RB_ROOT(ad, arq));
-	RB_CLEAR(&arq->rb_node);
+	RB_CLEAR_NODE(&arq->rb_node);
 }
 
 static struct request *
@@ -545,7 +542,7 @@ static struct as_rq *as_find_next_arq(struct as_data *ad, struct as_rq *last)
 	struct rb_node *rbprev = rb_prev(&last->rb_node);
 	struct as_rq *arq_next, *arq_prev;
 
-	BUG_ON(!ON_RB(&last->rb_node));
+	BUG_ON(!RB_EMPTY_NODE(&last->rb_node));
 
 	if (rbprev)
 		arq_prev = rb_entry_arq(rbprev);
@@ -1122,7 +1119,7 @@ static void as_move_to_dispatch(struct as_data *ad, struct as_rq *arq)
 	struct request *rq = arq->request;
 	const int data_dir = arq->is_sync;
 
-	BUG_ON(!ON_RB(&arq->rb_node));
+	BUG_ON(!RB_EMPTY_NODE(&arq->rb_node));
 
 	as_antic_stop(ad);
 	ad->antic_status = ANTIC_OFF;
@@ -1247,7 +1244,7 @@ static int as_dispatch_request(request_queue_t *q, int force)
 	 */
 
 	if (reads) {
-		BUG_ON(RB_EMPTY(&ad->sort_list[REQ_SYNC]));
+		BUG_ON(RB_EMPTY_ROOT(&ad->sort_list[REQ_SYNC]));
 
 		if (writes && ad->batch_data_dir == REQ_SYNC)
 			/*
@@ -1271,7 +1268,7 @@ static int as_dispatch_request(request_queue_t *q, int force)
 
 	if (writes) {
 dispatch_writes:
-		BUG_ON(RB_EMPTY(&ad->sort_list[REQ_ASYNC]));
+		BUG_ON(RB_EMPTY_ROOT(&ad->sort_list[REQ_ASYNC]));
 
 		if (ad->batch_data_dir == REQ_SYNC) {
 			ad->changed_batch = 1;
@@ -1591,7 +1588,7 @@ static int as_set_request(request_queue_t *q, struct request *rq,
 
 	if (arq) {
 		memset(arq, 0, sizeof(*arq));
-		RB_CLEAR(&arq->rb_node);
+		RB_CLEAR_NODE(&arq->rb_node);
 		arq->request = rq;
 		arq->state = AS_RQ_PRESCHED;
 		arq->io_context = NULL;
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 940364edf2b9..e25223e147a2 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -60,11 +60,6 @@ static DEFINE_SPINLOCK(cfq_exit_lock);
 /*
  * rb-tree defines
  */
-#define RB_EMPTY(node)		((node)->rb_node == NULL)
-#define RB_CLEAR(node)		do {	\
-		memset(node, 0, sizeof(*node)); \
-} while (0)
-#define RB_CLEAR_ROOT(root)	((root)->rb_node = NULL)
 #define rb_entry_crq(node)	rb_entry((node), struct cfq_rq, rb_node)
 #define rq_rb_key(rq)		(rq)->sector
 
@@ -559,7 +554,7 @@ static inline void cfq_del_crq_rb(struct cfq_rq *crq)
 
 	rb_erase(&crq->rb_node, &cfqq->sort_list);
 
-	if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY(&cfqq->sort_list))
+	if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list))
 		cfq_del_cfqq_rr(cfqd, cfqq);
 }
 
@@ -914,7 +909,7 @@ static int cfq_arm_slice_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 	struct cfq_io_context *cic;
 	unsigned long sl;
 
-	WARN_ON(!RB_EMPTY(&cfqq->sort_list));
+	WARN_ON(!RB_EMPTY_ROOT(&cfqq->sort_list));
 	WARN_ON(cfqq != cfqd->active_queue);
 
 	/*
@@ -1042,7 +1037,7 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
 	 * if queue has requests, dispatch one. if not, check if
 	 * enough slice is left to wait for one
 	 */
-	if (!RB_EMPTY(&cfqq->sort_list))
+	if (!RB_EMPTY_ROOT(&cfqq->sort_list))
 		goto keep_queue;
 	else if (cfq_cfqq_dispatched(cfqq)) {
 		cfqq = NULL;
@@ -1066,7 +1061,7 @@ __cfq_dispatch_requests(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 {
 	int dispatched = 0;
 
-	BUG_ON(RB_EMPTY(&cfqq->sort_list));
+	BUG_ON(RB_EMPTY_ROOT(&cfqq->sort_list));
 
 	do {
 		struct cfq_rq *crq;
@@ -1090,7 +1085,7 @@ __cfq_dispatch_requests(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 			cfqd->active_cic = crq->io_context;
 		}
 
-		if (RB_EMPTY(&cfqq->sort_list))
+		if (RB_EMPTY_ROOT(&cfqq->sort_list))
 			break;
 
 	} while (dispatched < max_dispatch);
@@ -1480,7 +1475,6 @@ retry:
 
 		INIT_HLIST_NODE(&cfqq->cfq_hash);
 		INIT_LIST_HEAD(&cfqq->cfq_list);
-		RB_CLEAR_ROOT(&cfqq->sort_list);
 		INIT_LIST_HEAD(&cfqq->fifo);
 
 		cfqq->key = key;
@@ -1873,7 +1867,7 @@ static void cfq_completed_request(request_queue_t *q, struct request *rq)
 	if (cfqd->active_queue == cfqq) {
 		if (time_after(now, cfqq->slice_end))
 			cfq_slice_expired(cfqd, 0);
-		else if (sync && RB_EMPTY(&cfqq->sort_list)) {
+		else if (sync && RB_EMPTY_ROOT(&cfqq->sort_list)) {
 			if (!cfq_arm_slice_timer(cfqd, cfqq))
 				cfq_schedule_dispatch(cfqd);
 		}
@@ -2059,7 +2053,7 @@ cfq_set_request(request_queue_t *q, struct request *rq, struct bio *bio,
 
 	crq = mempool_alloc(cfqd->crq_pool, gfp_mask);
 	if (crq) {
-		RB_CLEAR(&crq->rb_node);
+		RB_CLEAR_NODE(&crq->rb_node);
 		crq->rb_key = 0;
 		crq->request = rq;
 		INIT_HLIST_NODE(&crq->hash);
@@ -2151,7 +2145,7 @@ static void cfq_idle_slice_timer(unsigned long data)
 		/*
 		 * not expired and it has a request pending, let it dispatch
 		 */
-		if (!RB_EMPTY(&cfqq->sort_list)) {
+		if (!RB_EMPTY_ROOT(&cfqq->sort_list)) {
 			cfq_mark_cfqq_must_dispatch(cfqq);
 			goto out_kick;
 		}
diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c
index e5bccaaed563..4469dd84623c 100644
--- a/block/deadline-iosched.c
+++ b/block/deadline-iosched.c
@@ -159,9 +159,6 @@ deadline_find_drq_hash(struct deadline_data *dd, sector_t offset)
 /*
  * rb tree support functions
  */
-#define RB_EMPTY(root)	((root)->rb_node == NULL)
-#define ON_RB(node)	(rb_parent(node) != node)
-#define RB_CLEAR(node)	(rb_set_parent(node, node))
 #define rb_entry_drq(node)	rb_entry((node), struct deadline_rq, rb_node)
 #define DRQ_RB_ROOT(dd, drq)	(&(dd)->sort_list[rq_data_dir((drq)->request)])
 #define rq_rb_key(rq)		(rq)->sector
@@ -220,9 +217,9 @@ deadline_del_drq_rb(struct deadline_data *dd, struct deadline_rq *drq)
 			dd->next_drq[data_dir] = rb_entry_drq(rbnext);
 	}
 
-	BUG_ON(!ON_RB(&drq->rb_node));
+	BUG_ON(!RB_EMPTY_NODE(&drq->rb_node));
 	rb_erase(&drq->rb_node, DRQ_RB_ROOT(dd, drq));
-	RB_CLEAR(&drq->rb_node);
+	RB_CLEAR_NODE(&drq->rb_node);
 }
 
 static struct request *
@@ -496,7 +493,7 @@ static int deadline_dispatch_requests(request_queue_t *q, int force)
 	 */
 
 	if (reads) {
-		BUG_ON(RB_EMPTY(&dd->sort_list[READ]));
+		BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[READ]));
 
 		if (writes && (dd->starved++ >= dd->writes_starved))
 			goto dispatch_writes;
@@ -512,7 +509,7 @@ static int deadline_dispatch_requests(request_queue_t *q, int force)
 
 	if (writes) {
 dispatch_writes:
-		BUG_ON(RB_EMPTY(&dd->sort_list[WRITE]));
+		BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[WRITE]));
 
 		dd->starved = 0;
 
@@ -668,7 +665,7 @@ deadline_set_request(request_queue_t *q, struct request *rq, struct bio *bio,
 	drq = mempool_alloc(dd->drq_pool, gfp_mask);
 	if (drq) {
 		memset(drq, 0, sizeof(*drq));
-		RB_CLEAR(&drq->rb_node);
+		RB_CLEAR_NODE(&drq->rb_node);
 		drq->request = rq;
 
 		INIT_HLIST_NODE(&drq->hash);
diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h
index f37006f21664..8d5382e62c08 100644
--- a/include/linux/rbtree.h
+++ b/include/linux/rbtree.h
@@ -132,6 +132,10 @@ static inline void rb_set_color(struct rb_node *rb, int color)
 #define RB_ROOT	(struct rb_root) { NULL, }
 #define	rb_entry(ptr, type, member) container_of(ptr, type, member)
 
+#define RB_EMPTY_ROOT(root)	((root)->rb_node == NULL)
+#define RB_EMPTY_NODE(node)	(rb_parent(node) != node)
+#define RB_CLEAR_NODE(node)	(rb_set_parent(node, node))
+
 extern void rb_insert_color(struct rb_node *, struct rb_root *);
 extern void rb_erase(struct rb_node *, struct rb_root *);
 
-- 
cgit v1.2.3


From 06cf6f2ed0b19629700794727d86ed57b9c0583e Mon Sep 17 00:00:00 2001
From: Chuck Lever <cel@netapp.com>
Date: Tue, 20 Jun 2006 12:56:49 -0400
Subject: NFS: Eliminate nfs_get_user_pages()

Neil Brown observed that the kmalloc() in nfs_get_user_pages() is more
likely to fail if the I/O is large enough to require the allocation of more
than a single page to keep track of all the pinned pages in the user's
buffer.

Instead of tracking one large page array per dreq/iocb, track pages per
nfs_read/write_data, just like the cached I/O path does.  An array for
pages is already allocated for us by nfs_readdata_alloc() (and the write
and commit equivalents).

This is also required for adding support for vectored I/O to the NFS direct
I/O path.

The original reason to pin the user buffer and allocate all the NFS data
structures before trying to schedule I/O was to ensure all needed resources
are allocated on the client before starting to send requests.  This reduces
the chance that resource exhaustion on the client will cause a short read
or write.

On the other hand, for an application making very large application I/O
requests, this means that it will be nearly impossible for the application
to make forward progress on a resource-limited client.

Thus, moving the buffer pinning functionality into the I/O scheduling
loops should be good for scalability.  The next patch will do the same for
NFS data structure allocation.

Signed-off-by: Chuck Lever <cel@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/direct.c         | 205 ++++++++++++++++++++++++++----------------------
 include/linux/nfs_xdr.h |   2 +
 2 files changed, 113 insertions(+), 94 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 4cb3446220ba..b1630d53fbb1 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -73,8 +73,6 @@ struct nfs_direct_req {
 	struct nfs_open_context	*ctx;		/* file open context info */
 	struct kiocb *		iocb;		/* controlling i/o request */
 	struct inode *		inode;		/* target file of i/o */
-	struct page **		pages;		/* pages in our buffer */
-	unsigned int		npages;		/* count of pages */
 
 	/* completion state */
 	atomic_t		io_count;	/* i/os we're waiting for */
@@ -104,6 +102,20 @@ static inline int put_dreq(struct nfs_direct_req *dreq)
 	return atomic_dec_and_test(&dreq->io_count);
 }
 
+/*
+ * "size" is never larger than rsize or wsize.
+ */
+static inline int nfs_direct_count_pages(unsigned long user_addr, size_t size)
+{
+	int page_count;
+
+	page_count = (user_addr + size + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	page_count -= user_addr >> PAGE_SHIFT;
+	BUG_ON(page_count < 0);
+
+	return page_count;
+}
+
 /**
  * nfs_direct_IO - NFS address space operation for direct I/O
  * @rw: direction (read or write)
@@ -143,40 +155,6 @@ static void nfs_direct_release_pages(struct page **pages, int npages)
 		page_cache_release(pages[i]);
 }
 
-static inline int nfs_get_user_pages(int rw, unsigned long user_addr, size_t size, struct page ***pages)
-{
-	int result = -ENOMEM;
-	unsigned long page_count;
-	size_t array_size;
-
-	page_count = (user_addr + size + PAGE_SIZE - 1) >> PAGE_SHIFT;
-	page_count -= user_addr >> PAGE_SHIFT;
-
-	array_size = (page_count * sizeof(struct page *));
-	*pages = kmalloc(array_size, GFP_KERNEL);
-	if (*pages) {
-		down_read(&current->mm->mmap_sem);
-		result = get_user_pages(current, current->mm, user_addr,
-					page_count, (rw == READ), 0,
-					*pages, NULL);
-		up_read(&current->mm->mmap_sem);
-		if (result != page_count) {
-			/*
-			 * If we got fewer pages than expected from
-			 * get_user_pages(), the user buffer runs off the
-			 * end of a mapping; return EFAULT.
-			 */
-			if (result >= 0) {
-				nfs_direct_release_pages(*pages, result);
-				result = -EFAULT;
-			} else
-				kfree(*pages);
-			*pages = NULL;
-		}
-	}
-	return result;
-}
-
 static inline struct nfs_direct_req *nfs_direct_req_alloc(void)
 {
 	struct nfs_direct_req *dreq;
@@ -233,13 +211,8 @@ out:
 }
 
 /*
- * We must hold a reference to all the pages in this direct read request
- * until the RPCs complete.  This could be long *after* we are woken up in
- * nfs_direct_wait (for instance, if someone hits ^C on a slow server).
- *
- * In addition, synchronous I/O uses a stack-allocated iocb.  Thus we
- * can't trust the iocb is still valid here if this is a synchronous
- * request.  If the waiter is woken prematurely, the iocb is long gone.
+ * Synchronous I/O uses a stack-allocated iocb.  Thus we can't trust
+ * the iocb is still valid here if this is a synchronous request.
  */
 static void nfs_direct_complete(struct nfs_direct_req *dreq)
 {
@@ -297,6 +270,11 @@ static struct nfs_direct_req *nfs_direct_read_alloc(size_t nbytes, size_t rsize)
 	return dreq;
 }
 
+/*
+ * We must hold a reference to all the pages in this direct read request
+ * until the RPCs complete.  This could be long *after* we are woken up in
+ * nfs_direct_wait (for instance, if someone hits ^C on a slow server).
+ */
 static void nfs_direct_read_result(struct rpc_task *task, void *calldata)
 {
 	struct nfs_read_data *data = calldata;
@@ -305,6 +283,9 @@ static void nfs_direct_read_result(struct rpc_task *task, void *calldata)
 	if (nfs_readpage_result(task, data) != 0)
 		return;
 
+	nfs_direct_dirty_pages(data->pagevec, data->npages);
+	nfs_direct_release_pages(data->pagevec, data->npages);
+
 	spin_lock(&dreq->lock);
 
 	if (likely(task->tk_status >= 0))
@@ -314,11 +295,8 @@ static void nfs_direct_read_result(struct rpc_task *task, void *calldata)
 
 	spin_unlock(&dreq->lock);
 
-	if (put_dreq(dreq)) {
-		nfs_direct_dirty_pages(dreq->pages, dreq->npages);
-		nfs_direct_release_pages(dreq->pages, dreq->npages);
+	if (put_dreq(dreq))
 		nfs_direct_complete(dreq);
-	}
 }
 
 static const struct rpc_call_ops nfs_read_direct_ops = {
@@ -328,21 +306,23 @@ static const struct rpc_call_ops nfs_read_direct_ops = {
 
 /*
  * For each nfs_read_data struct that was allocated on the list, dispatch
- * an NFS READ operation
+ * an NFS READ operation.  If get_user_pages() fails, we stop sending reads.
+ * Read length accounting is handled by nfs_direct_read_result().
+ * Otherwise, if no requests have been sent, just return an error.
  */
-static void nfs_direct_read_schedule(struct nfs_direct_req *dreq, unsigned long user_addr, size_t count, loff_t pos)
+static ssize_t nfs_direct_read_schedule(struct nfs_direct_req *dreq, unsigned long user_addr, size_t count, loff_t pos)
 {
 	struct nfs_open_context *ctx = dreq->ctx;
 	struct inode *inode = ctx->dentry->d_inode;
 	struct list_head *list = &dreq->list;
-	struct page **pages = dreq->pages;
 	size_t rsize = NFS_SERVER(inode)->rsize;
-	unsigned int curpage, pgbase;
+	unsigned int pgbase;
+	int result;
+	ssize_t started = 0;
+	struct nfs_read_data *data;
 
-	curpage = 0;
 	pgbase = user_addr & ~PAGE_MASK;
 	do {
-		struct nfs_read_data *data;
 		size_t bytes;
 
 		bytes = rsize;
@@ -353,13 +333,21 @@ static void nfs_direct_read_schedule(struct nfs_direct_req *dreq, unsigned long
 		data = list_entry(list->next, struct nfs_read_data, pages);
 		list_del_init(&data->pages);
 
+		data->npages = nfs_direct_count_pages(user_addr, bytes);
+		down_read(&current->mm->mmap_sem);
+		result = get_user_pages(current, current->mm, user_addr,
+					data->npages, 1, 0, data->pagevec, NULL);
+		up_read(&current->mm->mmap_sem);
+		if (unlikely(result < data->npages))
+			goto out_err;
+
 		data->inode = inode;
 		data->cred = ctx->cred;
 		data->args.fh = NFS_FH(inode);
 		data->args.context = ctx;
 		data->args.offset = pos;
 		data->args.pgbase = pgbase;
-		data->args.pages = &pages[curpage];
+		data->args.pages = data->pagevec;
 		data->args.count = bytes;
 		data->res.fattr = &data->fattr;
 		data->res.eof = 0;
@@ -382,17 +370,36 @@ static void nfs_direct_read_schedule(struct nfs_direct_req *dreq, unsigned long
 				bytes,
 				(unsigned long long)data->args.offset);
 
+		started += bytes;
+		user_addr += bytes;
 		pos += bytes;
 		pgbase += bytes;
-		curpage += pgbase >> PAGE_SHIFT;
 		pgbase &= ~PAGE_MASK;
 
 		count -= bytes;
 	} while (count != 0);
 	BUG_ON(!list_empty(list));
+	return 0;
+
+out_err:
+	if (result > 0)
+		nfs_direct_release_pages(data->pagevec, result);
+
+	list_add(&data->pages, list);
+	while (!list_empty(list)) {
+		data = list_entry(list->next, struct nfs_read_data, pages);
+		list_del(&data->pages);
+		nfs_readdata_free(data);
+		if (put_dreq(dreq))
+			nfs_direct_complete(dreq);
+	}
+
+	if (started)
+		return 0;
+	return result < 0 ? (ssize_t) result : -EFAULT;
 }
 
-static ssize_t nfs_direct_read(struct kiocb *iocb, unsigned long user_addr, size_t count, loff_t pos, struct page **pages, unsigned int nr_pages)
+static ssize_t nfs_direct_read(struct kiocb *iocb, unsigned long user_addr, size_t count, loff_t pos)
 {
 	ssize_t result;
 	sigset_t oldset;
@@ -404,8 +411,6 @@ static ssize_t nfs_direct_read(struct kiocb *iocb, unsigned long user_addr, size
 	if (!dreq)
 		return -ENOMEM;
 
-	dreq->pages = pages;
-	dreq->npages = nr_pages;
 	dreq->inode = inode;
 	dreq->ctx = get_nfs_open_context((struct nfs_open_context *)iocb->ki_filp->private_data);
 	if (!is_sync_kiocb(iocb))
@@ -413,8 +418,9 @@ static ssize_t nfs_direct_read(struct kiocb *iocb, unsigned long user_addr, size
 
 	nfs_add_stats(inode, NFSIOS_DIRECTREADBYTES, count);
 	rpc_clnt_sigmask(clnt, &oldset);
-	nfs_direct_read_schedule(dreq, user_addr, count, pos);
-	result = nfs_direct_wait(dreq);
+	result = nfs_direct_read_schedule(dreq, user_addr, count, pos);
+	if (!result)
+		result = nfs_direct_wait(dreq);
 	rpc_clnt_sigunmask(clnt, &oldset);
 
 	return result;
@@ -426,9 +432,9 @@ static void nfs_direct_free_writedata(struct nfs_direct_req *dreq)
 	while (!list_empty(&dreq->list)) {
 		struct nfs_write_data *data = list_entry(dreq->list.next, struct nfs_write_data, pages);
 		list_del(&data->pages);
+		nfs_direct_release_pages(data->pagevec, data->npages);
 		nfs_writedata_release(data);
 	}
-	nfs_direct_release_pages(dreq->pages, dreq->npages);
 }
 
 #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
@@ -672,21 +678,23 @@ static const struct rpc_call_ops nfs_write_direct_ops = {
 
 /*
  * For each nfs_write_data struct that was allocated on the list, dispatch
- * an NFS WRITE operation
+ * an NFS WRITE operation.  If get_user_pages() fails, we stop sending writes.
+ * Write length accounting is handled by nfs_direct_write_result().
+ * Otherwise, if no requests have been sent, just return an error.
  */
-static void nfs_direct_write_schedule(struct nfs_direct_req *dreq, unsigned long user_addr, size_t count, loff_t pos, int sync)
+static ssize_t nfs_direct_write_schedule(struct nfs_direct_req *dreq, unsigned long user_addr, size_t count, loff_t pos, int sync)
 {
 	struct nfs_open_context *ctx = dreq->ctx;
 	struct inode *inode = ctx->dentry->d_inode;
 	struct list_head *list = &dreq->list;
-	struct page **pages = dreq->pages;
 	size_t wsize = NFS_SERVER(inode)->wsize;
-	unsigned int curpage, pgbase;
+	unsigned int pgbase;
+	int result;
+	ssize_t started = 0;
+	struct nfs_write_data *data;
 
-	curpage = 0;
 	pgbase = user_addr & ~PAGE_MASK;
 	do {
-		struct nfs_write_data *data;
 		size_t bytes;
 
 		bytes = wsize;
@@ -695,6 +703,15 @@ static void nfs_direct_write_schedule(struct nfs_direct_req *dreq, unsigned long
 
 		BUG_ON(list_empty(list));
 		data = list_entry(list->next, struct nfs_write_data, pages);
+
+		data->npages = nfs_direct_count_pages(user_addr, bytes);
+		down_read(&current->mm->mmap_sem);
+		result = get_user_pages(current, current->mm, user_addr,
+					data->npages, 0, 0, data->pagevec, NULL);
+		up_read(&current->mm->mmap_sem);
+		if (unlikely(result < data->npages))
+			goto out_err;
+
 		list_move_tail(&data->pages, &dreq->rewrite_list);
 
 		data->inode = inode;
@@ -703,7 +720,7 @@ static void nfs_direct_write_schedule(struct nfs_direct_req *dreq, unsigned long
 		data->args.context = ctx;
 		data->args.offset = pos;
 		data->args.pgbase = pgbase;
-		data->args.pages = &pages[curpage];
+		data->args.pages = data->pagevec;
 		data->args.count = bytes;
 		data->res.fattr = &data->fattr;
 		data->res.count = bytes;
@@ -727,17 +744,36 @@ static void nfs_direct_write_schedule(struct nfs_direct_req *dreq, unsigned long
 				bytes,
 				(unsigned long long)data->args.offset);
 
+		started += bytes;
+		user_addr += bytes;
 		pos += bytes;
 		pgbase += bytes;
-		curpage += pgbase >> PAGE_SHIFT;
 		pgbase &= ~PAGE_MASK;
 
 		count -= bytes;
 	} while (count != 0);
 	BUG_ON(!list_empty(list));
+	return 0;
+
+out_err:
+	if (result > 0)
+		nfs_direct_release_pages(data->pagevec, result);
+
+	list_add(&data->pages, list);
+	while (!list_empty(list)) {
+		data = list_entry(list->next, struct nfs_write_data, pages);
+		list_del(&data->pages);
+		nfs_writedata_free(data);
+		if (put_dreq(dreq))
+			nfs_direct_write_complete(dreq, inode);
+	}
+
+	if (started)
+		return 0;
+	return result < 0 ? (ssize_t) result : -EFAULT;
 }
 
-static ssize_t nfs_direct_write(struct kiocb *iocb, unsigned long user_addr, size_t count, loff_t pos, struct page **pages, int nr_pages)
+static ssize_t nfs_direct_write(struct kiocb *iocb, unsigned long user_addr, size_t count, loff_t pos)
 {
 	ssize_t result;
 	sigset_t oldset;
@@ -753,8 +789,6 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, unsigned long user_addr, siz
 	if (dreq->commit_data == NULL || count < wsize)
 		sync = FLUSH_STABLE;
 
-	dreq->pages = pages;
-	dreq->npages = nr_pages;
 	dreq->inode = inode;
 	dreq->ctx = get_nfs_open_context((struct nfs_open_context *)iocb->ki_filp->private_data);
 	if (!is_sync_kiocb(iocb))
@@ -765,8 +799,9 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, unsigned long user_addr, siz
 	nfs_begin_data_update(inode);
 
 	rpc_clnt_sigmask(clnt, &oldset);
-	nfs_direct_write_schedule(dreq, user_addr, count, pos, sync);
-	result = nfs_direct_wait(dreq);
+	result = nfs_direct_write_schedule(dreq, user_addr, count, pos, sync);
+	if (!result)
+		result = nfs_direct_wait(dreq);
 	rpc_clnt_sigunmask(clnt, &oldset);
 
 	return result;
@@ -796,8 +831,6 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, unsigned long user_addr, siz
 ssize_t nfs_file_direct_read(struct kiocb *iocb, char __user *buf, size_t count, loff_t pos)
 {
 	ssize_t retval = -EINVAL;
-	int page_count;
-	struct page **pages;
 	struct file *file = iocb->ki_filp;
 	struct address_space *mapping = file->f_mapping;
 
@@ -819,14 +852,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, char __user *buf, size_t count,
 	if (retval)
 		goto out;
 
-	retval = nfs_get_user_pages(READ, (unsigned long) buf,
-						count, &pages);
-	if (retval < 0)
-		goto out;
-	page_count = retval;
-
-	retval = nfs_direct_read(iocb, (unsigned long) buf, count, pos,
-						pages, page_count);
+	retval = nfs_direct_read(iocb, (unsigned long) buf, count, pos);
 	if (retval > 0)
 		iocb->ki_pos = pos + retval;
 
@@ -862,8 +888,6 @@ out:
 ssize_t nfs_file_direct_write(struct kiocb *iocb, const char __user *buf, size_t count, loff_t pos)
 {
 	ssize_t retval;
-	int page_count;
-	struct page **pages;
 	struct file *file = iocb->ki_filp;
 	struct address_space *mapping = file->f_mapping;
 
@@ -891,14 +915,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, const char __user *buf, size_t
 	if (retval)
 		goto out;
 
-	retval = nfs_get_user_pages(WRITE, (unsigned long) buf,
-						count, &pages);
-	if (retval < 0)
-		goto out;
-	page_count = retval;
-
-	retval = nfs_direct_write(iocb, (unsigned long) buf, count,
-					pos, pages, page_count);
+	retval = nfs_direct_write(iocb, (unsigned long) buf, count, pos);
 
 	/*
 	 * XXX: nfs_end_data_update() already ensures this file's
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 7c7320fa51aa..2d3fb6416d91 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -729,6 +729,7 @@ struct nfs_read_data {
 	struct list_head	pages;	/* Coalesced read requests */
 	struct nfs_page		*req;	/* multi ops per nfs_page */
 	struct page		**pagevec;
+	unsigned int		npages;	/* active pages in pagevec */
 	struct nfs_readargs args;
 	struct nfs_readres  res;
 #ifdef CONFIG_NFS_V4
@@ -747,6 +748,7 @@ struct nfs_write_data {
 	struct list_head	pages;		/* Coalesced requests we wish to flush */
 	struct nfs_page		*req;		/* multi ops per nfs_page */
 	struct page		**pagevec;
+	unsigned int		npages;		/* active pages in pagevec */
 	struct nfs_writeargs	args;		/* argument struct */
 	struct nfs_writeres	res;		/* result struct */
 #ifdef CONFIG_NFS_V4
-- 
cgit v1.2.3


From 73e55cb3b3549d0174d1dadb755200938232e8d0 Mon Sep 17 00:00:00 2001
From: Ben Dooks <ben-linux@fluff.org>
Date: Sat, 24 Jun 2006 21:21:32 +0100
Subject: [ARM] 3639/1: S3C2412: serial port support

Patch from Ben Dooks

Serial port support for the on-board UART blocks
on the Samsung S3C2412 and S3C2413 UARTs.

Signed-off-by: Ben Dooks <ben-linux@fluff.org>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 drivers/serial/Kconfig                     |   9 +-
 drivers/serial/s3c2410.c                   | 143 ++++++++++++++++++++++++++++-
 include/asm-arm/arch-s3c2410/regs-serial.h |  15 +++
 include/linux/serial_core.h                |   3 +
 4 files changed, 166 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/serial/Kconfig b/drivers/serial/Kconfig
index bef4a9622ed7..5b48ac22c9c5 100644
--- a/drivers/serial/Kconfig
+++ b/drivers/serial/Kconfig
@@ -354,21 +354,24 @@ config SERIAL_CLPS711X_CONSOLE
 	  kernel at boot time.)
 
 config SERIAL_S3C2410
-	tristate "Samsung S3C2410 Serial port support"
+	tristate "Samsung S3C2410/S3C2440/S3C2442/S3C2412 Serial port support"
 	depends on ARM && ARCH_S3C2410
 	select SERIAL_CORE
 	help
-	  Support for the on-chip UARTs on the Samsung S3C2410X CPU,
+	  Support for the on-chip UARTs on the Samsung S3C24XX series CPUs,
 	  providing /dev/ttySAC0, 1 and 2 (note, some machines may not
 	  provide all of these ports, depending on how the serial port
 	  pins are configured.
 
+	  Currently this driver supports the UARTS on the S3C2410, S3C2440,
+	  S3C2442, S3C2412 and S3C2413 CPUs.
+
 config SERIAL_S3C2410_CONSOLE
 	bool "Support for console on S3C2410 serial port"
 	depends on SERIAL_S3C2410=y
 	select SERIAL_CORE_CONSOLE
 	help
-	  Allow selection of the S3C2410 on-board serial ports for use as
+	  Allow selection of the S3C24XX on-board serial ports for use as
 	  an virtual console.
 
 	  Even if you say Y here, the currently visible virtual console
diff --git a/drivers/serial/s3c2410.c b/drivers/serial/s3c2410.c
index 53c2465bad2d..837b6da520b3 100644
--- a/drivers/serial/s3c2410.c
+++ b/drivers/serial/s3c2410.c
@@ -872,6 +872,8 @@ static const char *s3c24xx_serial_type(struct uart_port *port)
 		return "S3C2410";
 	case PORT_S3C2440:
 		return "S3C2440";
+	case PORT_S3C2412:
+		return "S3C2412";
 	default:
 		return NULL;
 	}
@@ -1528,6 +1530,141 @@ static inline void s3c2440_serial_exit(void)
 #define s3c2440_uart_inf_at NULL
 #endif /* CONFIG_CPU_S3C2440 */
 
+#if defined(CONFIG_CPU_S3C2412) || defined(CONFIG_CPU_S3C2413)
+
+static int s3c2412_serial_setsource(struct uart_port *port,
+				     struct s3c24xx_uart_clksrc *clk)
+{
+	unsigned long ucon = rd_regl(port, S3C2410_UCON);
+
+	ucon &= ~S3C2412_UCON_CLKMASK;
+
+	if (strcmp(clk->name, "uclk") == 0)
+		ucon |= S3C2440_UCON_UCLK;
+	else if (strcmp(clk->name, "pclk") == 0)
+		ucon |= S3C2440_UCON_PCLK;
+	else if (strcmp(clk->name, "usysclk") == 0)
+		ucon |= S3C2412_UCON_USYSCLK;
+	else {
+		printk(KERN_ERR "unknown clock source %s\n", clk->name);
+		return -EINVAL;
+	}
+
+	wr_regl(port, S3C2410_UCON, ucon);
+	return 0;
+}
+
+
+static int s3c2412_serial_getsource(struct uart_port *port,
+				    struct s3c24xx_uart_clksrc *clk)
+{
+	unsigned long ucon = rd_regl(port, S3C2410_UCON);
+
+	switch (ucon & S3C2412_UCON_CLKMASK) {
+	case S3C2412_UCON_UCLK:
+		clk->divisor = 1;
+		clk->name = "uclk";
+		break;
+
+	case S3C2412_UCON_PCLK:
+	case S3C2412_UCON_PCLK2:
+		clk->divisor = 1;
+		clk->name = "pclk";
+		break;
+
+	case S3C2412_UCON_USYSCLK:
+		clk->divisor = 1;
+		clk->name = "usysclk";
+		break;
+	}
+
+	return 0;
+}
+
+static int s3c2412_serial_resetport(struct uart_port *port,
+				    struct s3c2410_uartcfg *cfg)
+{
+	unsigned long ucon = rd_regl(port, S3C2410_UCON);
+
+	dbg("%s: port=%p (%08lx), cfg=%p\n",
+	    __FUNCTION__, port, port->mapbase, cfg);
+
+	/* ensure we don't change the clock settings... */
+
+	ucon &= S3C2412_UCON_CLKMASK;
+
+	wr_regl(port, S3C2410_UCON,  ucon | cfg->ucon);
+	wr_regl(port, S3C2410_ULCON, cfg->ulcon);
+
+	/* reset both fifos */
+
+	wr_regl(port, S3C2410_UFCON, cfg->ufcon | S3C2410_UFCON_RESETBOTH);
+	wr_regl(port, S3C2410_UFCON, cfg->ufcon);
+
+	return 0;
+}
+
+static struct s3c24xx_uart_info s3c2412_uart_inf = {
+	.name		= "Samsung S3C2412 UART",
+	.type		= PORT_S3C2412,
+	.fifosize	= 64,
+	.rx_fifomask	= S3C2440_UFSTAT_RXMASK,
+	.rx_fifoshift	= S3C2440_UFSTAT_RXSHIFT,
+	.rx_fifofull	= S3C2440_UFSTAT_RXFULL,
+	.tx_fifofull	= S3C2440_UFSTAT_TXFULL,
+	.tx_fifomask	= S3C2440_UFSTAT_TXMASK,
+	.tx_fifoshift	= S3C2440_UFSTAT_TXSHIFT,
+	.get_clksrc	= s3c2412_serial_getsource,
+	.set_clksrc	= s3c2412_serial_setsource,
+	.reset_port	= s3c2412_serial_resetport,
+};
+
+/* device management */
+
+static int s3c2412_serial_probe(struct platform_device *dev)
+{
+	dbg("s3c2440_serial_probe: dev=%p\n", dev);
+	return s3c24xx_serial_probe(dev, &s3c2440_uart_inf);
+}
+
+static struct platform_driver s3c2412_serial_drv = {
+	.probe		= s3c2412_serial_probe,
+	.remove		= s3c24xx_serial_remove,
+	.suspend	= s3c24xx_serial_suspend,
+	.resume		= s3c24xx_serial_resume,
+	.driver		= {
+		.name	= "s3c2412-uart",
+		.owner	= THIS_MODULE,
+	},
+};
+
+
+static inline int s3c2412_serial_init(void)
+{
+	return s3c24xx_serial_init(&s3c2412_serial_drv, &s3c2412_uart_inf);
+}
+
+static inline void s3c2412_serial_exit(void)
+{
+	platform_driver_unregister(&s3c2412_serial_drv);
+}
+
+#define s3c2412_uart_inf_at &s3c2412_uart_inf
+#else
+
+static inline int s3c2412_serial_init(void)
+{
+	return 0;
+}
+
+static inline void s3c2412_serial_exit(void)
+{
+}
+
+#define s3c2412_uart_inf_at NULL
+#endif /* CONFIG_CPU_S3C2440 */
+
+
 /* module initialisation code */
 
 static int __init s3c24xx_serial_modinit(void)
@@ -1542,6 +1679,7 @@ static int __init s3c24xx_serial_modinit(void)
 
 	s3c2400_serial_init();
 	s3c2410_serial_init();
+	s3c2412_serial_init();
 	s3c2440_serial_init();
 
 	return 0;
@@ -1551,6 +1689,7 @@ static void __exit s3c24xx_serial_modexit(void)
 {
 	s3c2400_serial_exit();
 	s3c2410_serial_exit();
+	s3c2412_serial_exit();
 	s3c2440_serial_exit();
 
 	uart_unregister_driver(&s3c24xx_uart_drv);
@@ -1773,6 +1912,8 @@ static int s3c24xx_serial_initconsole(void)
 		info = s3c2410_uart_inf_at;
 	} else if (strcmp(dev->name, "s3c2440-uart") == 0) {
 		info = s3c2440_uart_inf_at;
+	} else if (strcmp(dev->name, "s3c2412-uart") == 0) {
+		info = s3c2412_uart_inf_at;
 	} else {
 		printk(KERN_ERR "s3c24xx: no driver for %s\n", dev->name);
 		return 0;
@@ -1796,4 +1937,4 @@ console_initcall(s3c24xx_serial_initconsole);
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Ben Dooks <ben@simtec.co.uk>");
-MODULE_DESCRIPTION("Samsung S3C2410/S3C2440 Serial port driver");
+MODULE_DESCRIPTION("Samsung S3C2410/S3C2440/S3C2412 Serial port driver");
diff --git a/include/asm-arm/arch-s3c2410/regs-serial.h b/include/asm-arm/arch-s3c2410/regs-serial.h
index 83b01254c4ac..93f651ae2967 100644
--- a/include/asm-arm/arch-s3c2410/regs-serial.h
+++ b/include/asm-arm/arch-s3c2410/regs-serial.h
@@ -82,6 +82,12 @@
 #define S3C2440_UCON2_DIVMASK	  (7 << 12)
 #define S3C2440_UCON_DIVSHIFT	  (12)
 
+#define S3C2412_UCON_CLKMASK	(3<<10)
+#define S3C2412_UCON_UCLK	(1<<10)
+#define S3C2412_UCON_USYSCLK	(3<<10)
+#define S3C2412_UCON_PCLK	(0<<10)
+#define S3C2412_UCON_PCLK2	(2<<10)
+
 #define S3C2410_UCON_UCLK	  (1<<10)
 #define S3C2410_UCON_SBREAK	  (1<<4)
 
@@ -124,6 +130,15 @@
 #define	S3C2410_UMCOM_AFC	  (1<<4)
 #define	S3C2410_UMCOM_RTS_LOW	  (1<<0)
 
+#define S3C2412_UMCON_AFC_63	(0<<5)
+#define S3C2412_UMCON_AFC_56	(1<<5)
+#define S3C2412_UMCON_AFC_48	(2<<5)
+#define S3C2412_UMCON_AFC_40	(3<<5)
+#define S3C2412_UMCON_AFC_32	(4<<5)
+#define S3C2412_UMCON_AFC_24	(5<<5)
+#define S3C2412_UMCON_AFC_16	(6<<5)
+#define S3C2412_UMCON_AFC_8	(7<<5)
+
 #define S3C2410_UFSTAT_TXFULL	  (1<<9)
 #define S3C2410_UFSTAT_RXFULL	  (1<<8)
 #define S3C2410_UFSTAT_TXMASK	  (15<<4)
diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h
index 0ef50baa7da6..951c4e858274 100644
--- a/include/linux/serial_core.h
+++ b/include/linux/serial_core.h
@@ -130,6 +130,9 @@
 /* SUN4V Hypervisor Console */
 #define PORT_SUNHV	72
 
+#define PORT_S3C2412	73
+
+
 #ifdef __KERNEL__
 
 #include <linux/compiler.h>
-- 
cgit v1.2.3


From eb71c87a492b7090ff9e8ac46912c480a1687e38 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@g5.osdl.org>
Date: Sat, 24 Jun 2006 14:27:42 -0700
Subject: Add some basic resume trace facilities

Considering that there isn't a lot of hw we can depend on during resume,
this is about as good as it gets.

This is x86-only for now, although the basic concept (and most of the
code) will certainly work on almost any platform.

Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/vmlinux.lds.S |   7 ++
 drivers/base/power/Makefile    |   1 +
 drivers/base/power/trace.c     | 228 +++++++++++++++++++++++++++++++++++++++++
 include/asm-generic/rtc.h      |   7 +-
 include/linux/resume-trace.h   |  30 ++++++
 kernel/power/Kconfig           |   9 ++
 6 files changed, 279 insertions(+), 3 deletions(-)
 create mode 100644 drivers/base/power/trace.c
 create mode 100644 include/linux/resume-trace.h

(limited to 'include/linux')

diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S
index 8831303a473f..7512f39c9f25 100644
--- a/arch/i386/kernel/vmlinux.lds.S
+++ b/arch/i386/kernel/vmlinux.lds.S
@@ -37,6 +37,13 @@ SECTIONS
 
   RODATA
 
+  . = ALIGN(4);
+  __tracedata_start = .;
+  .tracedata : AT(ADDR(.tracedata) - LOAD_OFFSET) {
+	*(.tracedata)
+  }
+  __tracedata_end = .;
+
   /* writeable */
   .data : AT(ADDR(.data) - LOAD_OFFSET) {	/* Data */
 	*(.data)
diff --git a/drivers/base/power/Makefile b/drivers/base/power/Makefile
index ceeeba2c56c7..91f230939c1e 100644
--- a/drivers/base/power/Makefile
+++ b/drivers/base/power/Makefile
@@ -1,5 +1,6 @@
 obj-y			:= shutdown.o
 obj-$(CONFIG_PM)	+= main.o suspend.o resume.o runtime.o sysfs.o
+obj-$(CONFIG_PM_TRACE)	+= trace.o
 
 ifeq ($(CONFIG_DEBUG_DRIVER),y)
 EXTRA_CFLAGS += -DDEBUG
diff --git a/drivers/base/power/trace.c b/drivers/base/power/trace.c
new file mode 100644
index 000000000000..a9ab30fefffc
--- /dev/null
+++ b/drivers/base/power/trace.c
@@ -0,0 +1,228 @@
+/*
+ * drivers/base/power/trace.c
+ *
+ * Copyright (C) 2006 Linus Torvalds
+ *
+ * Trace facility for suspend/resume problems, when none of the
+ * devices may be working.
+ */
+
+#include <linux/resume-trace.h>
+#include <linux/rtc.h>
+
+#include <asm/rtc.h>
+
+#include "power.h"
+
+/*
+ * Horrid, horrid, horrid.
+ *
+ * It turns out that the _only_ piece of hardware that actually
+ * keeps its value across a hard boot (and, more importantly, the
+ * POST init sequence) is literally the realtime clock.
+ *
+ * Never mind that an RTC chip has 114 bytes (and often a whole
+ * other bank of an additional 128 bytes) of nice SRAM that is
+ * _designed_ to keep data - the POST will clear it. So we literally
+ * can just use the few bytes of actual time data, which means that
+ * we're really limited.
+ *
+ * It means, for example, that we can't use the seconds at all
+ * (since the time between the hang and the boot might be more
+ * than a minute), and we'd better not depend on the low bits of
+ * the minutes either.
+ *
+ * There are the wday fields etc, but I wouldn't guarantee those
+ * are dependable either. And if the date isn't valid, either the
+ * hw or POST will do strange things.
+ *
+ * So we're left with:
+ *  - year: 0-99
+ *  - month: 0-11
+ *  - day-of-month: 1-28
+ *  - hour: 0-23
+ *  - min: (0-30)*2
+ *
+ * Giving us a total range of 0-16128000 (0xf61800), ie less
+ * than 24 bits of actual data we can save across reboots.
+ *
+ * And if your box can't boot in less than three minutes,
+ * you're screwed.
+ *
+ * Now, almost 24 bits of data is pitifully small, so we need
+ * to be pretty dense if we want to use it for anything nice.
+ * What we do is that instead of saving off nice readable info,
+ * we save off _hashes_ of information that we can hopefully
+ * regenerate after the reboot.
+ *
+ * In particular, this means that we might be unlucky, and hit
+ * a case where we have a hash collision, and we end up not
+ * being able to tell for certain exactly which case happened.
+ * But that's hopefully unlikely.
+ *
+ * What we do is to take the bits we can fit, and split them
+ * into three parts (16*997*1009 = 16095568), and use the values
+ * for:
+ *  - 0-15: user-settable
+ *  - 0-996: file + line number
+ *  - 0-1008: device
+ */
+#define USERHASH (16)
+#define FILEHASH (997)
+#define DEVHASH (1009)
+
+#define DEVSEED (7919)
+
+static unsigned int dev_hash_value;
+
+static int set_magic_time(unsigned int user, unsigned int file, unsigned int device)
+{
+	unsigned int n = user + USERHASH*(file + FILEHASH*device);
+
+	// June 7th, 2006
+	static struct rtc_time time = {
+		.tm_sec = 0,
+		.tm_min = 0,
+		.tm_hour = 0,
+		.tm_mday = 7,
+		.tm_mon = 5,	// June - counting from zero
+		.tm_year = 106,
+		.tm_wday = 3,
+		.tm_yday = 160,
+		.tm_isdst = 1
+	};
+
+	time.tm_year = (n % 100);
+	n /= 100;
+	time.tm_mon = (n % 12);
+	n /= 12;
+	time.tm_mday = (n % 28) + 1;
+	n /= 28;
+	time.tm_hour = (n % 24);
+	n /= 24;
+	time.tm_min = (n % 20) * 3;
+	n /= 20;
+	set_rtc_time(&time);
+	return n ? -1 : 0;
+}
+
+static unsigned int read_magic_time(void)
+{
+	struct rtc_time time;
+	unsigned int val;
+
+	get_rtc_time(&time);
+	printk("Time: %2d:%02d:%02d  Date: %02d/%02d/%02d\n",
+		time.tm_hour, time.tm_min, time.tm_sec,
+		time.tm_mon, time.tm_mday, time.tm_year);
+	val = time.tm_year;				/* 100 years */
+	if (val > 100)
+		val -= 100;
+	val += time.tm_mon * 100;			/* 12 months */
+	val += (time.tm_mday-1) * 100 * 12;		/* 28 month-days */
+	val += time.tm_hour * 100 * 12 * 28;		/* 24 hours */
+	val += (time.tm_min / 3) * 100 * 12 * 28 * 24;	/* 20 3-minute intervals */
+	return val;
+}
+
+/*
+ * This is just the sdbm hash function with a user-supplied
+ * seed and final size parameter.
+ */
+static unsigned int hash_string(unsigned int seed, const char *data, unsigned int mod)
+{
+	unsigned char c;
+	while ((c = *data++) != 0) {
+		seed = (seed << 16) + (seed << 6) - seed + c;
+	}
+	return seed % mod;
+}
+
+void set_trace_device(struct device *dev)
+{
+	dev_hash_value = hash_string(DEVSEED, dev->bus_id, DEVHASH);
+}
+
+/*
+ * We could just take the "tracedata" index into the .tracedata
+ * section instead. Generating a hash of the data gives us a
+ * chance to work across kernel versions, and perhaps more
+ * importantly it also gives us valid/invalid check (ie we will
+ * likely not give totally bogus reports - if the hash matches,
+ * it's not any guarantee, but it's a high _likelihood_ that
+ * the match is valid).
+ */
+void generate_resume_trace(void *tracedata, unsigned int user)
+{
+	unsigned short lineno = *(unsigned short *)tracedata;
+	const char *file = *(const char **)(tracedata + 2);
+	unsigned int user_hash_value, file_hash_value;
+
+	user_hash_value = user % USERHASH;
+	file_hash_value = hash_string(lineno, file, FILEHASH);
+	set_magic_time(user_hash_value, file_hash_value, dev_hash_value);
+}
+
+extern char __tracedata_start, __tracedata_end;
+static int show_file_hash(unsigned int value)
+{
+	int match;
+	char *tracedata;
+
+	match = 0;
+	for (tracedata = &__tracedata_start ; tracedata < &__tracedata_end ; tracedata += 6) {
+		unsigned short lineno = *(unsigned short *)tracedata;
+		const char *file = *(const char **)(tracedata + 2);
+		unsigned int hash = hash_string(lineno, file, FILEHASH);
+		if (hash != value)
+			continue;
+		printk("  hash matches %s:%u\n", file, lineno);
+		match++;
+	}
+	return match;
+}
+
+static int show_dev_hash(unsigned int value)
+{
+	int match = 0;
+	struct list_head * entry = dpm_active.prev;
+
+	while (entry != &dpm_active) {
+		struct device * dev = to_device(entry);
+		unsigned int hash = hash_string(DEVSEED, dev->bus_id, DEVHASH);
+		if (hash == value) {
+			printk("  hash matches device %s\n", dev->bus_id);
+			match++;
+		}
+		entry = entry->prev;
+	}
+	return match;
+}
+
+static unsigned int hash_value_early_read;
+
+static int early_resume_init(void)
+{
+	hash_value_early_read = read_magic_time();
+	return 0;
+}
+
+static int late_resume_init(void)
+{
+	unsigned int val = hash_value_early_read;
+	unsigned int user, file, dev;
+
+	user = val % USERHASH;
+	val = val / USERHASH;
+	file = val % FILEHASH;
+	val = val / FILEHASH;
+	dev = val /* % DEVHASH */;
+
+	printk("  Magic number: %d:%d:%d\n", user, file, dev);
+	show_file_hash(file);
+	show_dev_hash(dev);
+	return 0;
+}
+
+core_initcall(early_resume_init);
+late_initcall(late_resume_init);
diff --git a/include/asm-generic/rtc.h b/include/asm-generic/rtc.h
index cef08db34ada..4087037a4225 100644
--- a/include/asm-generic/rtc.h
+++ b/include/asm-generic/rtc.h
@@ -114,6 +114,7 @@ static inline unsigned int get_rtc_time(struct rtc_time *time)
 /* Set the current date and time in the real time clock. */
 static inline int set_rtc_time(struct rtc_time *time)
 {
+	unsigned long flags;
 	unsigned char mon, day, hrs, min, sec;
 	unsigned char save_control, save_freq_select;
 	unsigned int yrs;
@@ -131,7 +132,7 @@ static inline int set_rtc_time(struct rtc_time *time)
 	if (yrs > 255)	/* They are unsigned */
 		return -EINVAL;
 
-	spin_lock_irq(&rtc_lock);
+	spin_lock_irqsave(&rtc_lock, flags);
 #ifdef CONFIG_MACH_DECSTATION
 	real_yrs = yrs;
 	leap_yr = ((!((yrs + 1900) % 4) && ((yrs + 1900) % 100)) ||
@@ -152,7 +153,7 @@ static inline int set_rtc_time(struct rtc_time *time)
 	 * whether the chip is in binary mode or not.
 	 */
 	if (yrs > 169) {
-		spin_unlock_irq(&rtc_lock);
+		spin_unlock_irqrestore(&rtc_lock, flags);
 		return -EINVAL;
 	}
 
@@ -187,7 +188,7 @@ static inline int set_rtc_time(struct rtc_time *time)
 	CMOS_WRITE(save_control, RTC_CONTROL);
 	CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
 
-	spin_unlock_irq(&rtc_lock);
+	spin_unlock_irqrestore(&rtc_lock, flags);
 
 	return 0;
 }
diff --git a/include/linux/resume-trace.h b/include/linux/resume-trace.h
new file mode 100644
index 000000000000..a376bd4ade39
--- /dev/null
+++ b/include/linux/resume-trace.h
@@ -0,0 +1,30 @@
+#ifndef RESUME_TRACE_H
+#define RESUME_TRACE_H
+
+#ifdef CONFIG_PM_TRACE
+
+struct device;
+extern void set_trace_device(struct device *);
+extern void generate_resume_trace(void *tracedata, unsigned int user);
+
+#define TRACE_DEVICE(dev) set_trace_device(dev)
+#define TRACE_RESUME(user) do {				\
+	void *tracedata;				\
+	asm volatile("movl $1f,%0\n"			\
+		".section .tracedata,\"a\"\n"		\
+		"1:\t.word %c1\n"			\
+		"\t.long %c2\n"				\
+		".previous"				\
+		:"=r" (tracedata)			\
+		: "i" (__LINE__), "i" (__FILE__));	\
+	generate_resume_trace(tracedata, user);		\
+} while (0)
+
+#else
+
+#define TRACE_DEVICE(dev) do { } while (0)
+#define TRACE_RESUME(dev) do { } while (0)
+
+#endif
+
+#endif
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index ce0dfb8f4a4e..cdf315e794ff 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -36,6 +36,15 @@ config PM_DEBUG
 	code. This is helpful when debugging and reporting various PM bugs, 
 	like suspend support.
 
+config PM_TRACE
+	bool "Suspend/resume event tracing"
+	depends on PM && PM_DEBUG && X86
+	default y
+	---help---
+	This enables some cheesy code to save the last PM event point in the
+	RTC across reboots, so that you can debug a machine that just hangs
+	during suspend (or more commonly, during resume).
+
 config SOFTWARE_SUSPEND
 	bool "Software Suspend"
 	depends on PM && SWAP && (X86 && (!SMP || SUSPEND_SMP)) || ((FRV || PPC32) && !SMP)
-- 
cgit v1.2.3


From 62838084b4c4c83cd511893132e2d8da84f48813 Mon Sep 17 00:00:00 2001
From: Andreas Oberritter <obi@linuxtv.org>
Date: Wed, 5 Apr 2006 16:36:51 -0300
Subject: V4L/DVB (3727): Remove DMX_GET_EVENT and associated data structures

The ioctl DMX_GET_EVENT has never been implemented.
I guess no software is using it because of its lack of implementation.
Future software won't use it, too, because this API doesn't make much
sense the way it is: Frontend events have their own different API.
Scrambling events can't be generated in a useful way by the hardware I
know of.

Signed-off-by: Andreas Oberritter <obi@linuxtv.org>
Signed-off-by: Mauro Carvalho Chehab <mchehab@infradead.org>
---
 drivers/media/dvb/dvb-core/dmxdev.c |  3 ---
 include/linux/dvb/dmx.h             | 26 --------------------------
 2 files changed, 29 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/media/dvb/dvb-core/dmxdev.c b/drivers/media/dvb/dvb-core/dmxdev.c
index 04578df3f249..988499dfddf8 100644
--- a/drivers/media/dvb/dvb-core/dmxdev.c
+++ b/drivers/media/dvb/dvb-core/dmxdev.c
@@ -872,9 +872,6 @@ static int dvb_demux_do_ioctl(struct inode *inode, struct file *file,
 		mutex_unlock(&dmxdevfilter->mutex);
 		break;
 
-	case DMX_GET_EVENT:
-		break;
-
 	case DMX_GET_PES_PIDS:
 		if (!dmxdev->demux->get_pes_pids) {
 			ret = -EINVAL;
diff --git a/include/linux/dvb/dmx.h b/include/linux/dvb/dmx.h
index 2787b8a22ff1..c6a2353c4e68 100644
--- a/include/linux/dvb/dmx.h
+++ b/include/linux/dvb/dmx.h
@@ -88,20 +88,6 @@ typedef enum
 #define DMX_PES_PCR      DMX_PES_PCR0
 
 
-typedef enum
-{
-	DMX_SCRAMBLING_EV,
-	DMX_FRONTEND_EV
-} dmx_event_t;
-
-
-typedef enum
-{
-	DMX_SCRAMBLING_OFF,
-	DMX_SCRAMBLING_ON
-} dmx_scrambling_status_t;
-
-
 typedef struct dmx_filter
 {
 	__u8  filter[DMX_FILTER_SIZE];
@@ -132,17 +118,6 @@ struct dmx_pes_filter_params
 	__u32          flags;
 };
 
-
-struct dmx_event
-{
-	dmx_event_t         event;
-	time_t              timeStamp;
-	union
-	{
-		dmx_scrambling_status_t scrambling;
-	} u;
-};
-
 typedef struct dmx_caps {
 	__u32 caps;
 	int num_decoders;
@@ -171,7 +146,6 @@ struct dmx_stc {
 #define DMX_SET_FILTER           _IOW('o', 43, struct dmx_sct_filter_params)
 #define DMX_SET_PES_FILTER       _IOW('o', 44, struct dmx_pes_filter_params)
 #define DMX_SET_BUFFER_SIZE      _IO('o', 45)
-#define DMX_GET_EVENT            _IOR('o', 46, struct dmx_event)
 #define DMX_GET_PES_PIDS         _IOR('o', 47, __u16[5])
 #define DMX_GET_CAPS             _IOR('o', 48, dmx_caps_t)
 #define DMX_SET_SOURCE           _IOW('o', 49, dmx_source_t)
-- 
cgit v1.2.3


From fbe60daac4c34e39d1ca69684bcb76e62461ac21 Mon Sep 17 00:00:00 2001
From: Martin Samuelsson <sam@home.se>
Date: Thu, 27 Apr 2006 10:17:00 -0300
Subject: V4L/DVB (3916): AverMedia 6 Eyes AVS6EYES support

Add support for the AverMedia 6 Eyes MJPEG card.
- Updated drivers/media/video/Kconfig with AVS6EYES
  options.
- Added CONFIG_VIDEO_ZORAN_AVS6EYES to
  drivers/media/video/Makefile.
- Added I2C_DRIVERID_BT866 and I2C_DRIVERID_KS0127 to
  include/linux/i2c-id.h
- Added drivers/media/video/ks0127.c, imported and modified from
  the Marvel project.
- Added drivers/media/video/ks0127.h, imported and modified from
  the Marvel project.
- Added drivers/media/video/bt866.c, ported from a 2.4 version
  by Christer Weinigel.
- Added AVS6EYES to drivers/media/video/zoran_card.c
- Added input_mux to all cards in drivers/media/video/zoran_card.c
- Added input mux module parameter to drivers/media/video/zoran_card.c
- Added AVS6EYES to card_type in drivers/media/video/zoran.h
- Added input_mux to card_info in drivers/media/video/zoran.h
- Upped BUZ_MAX_INPUT in drivers/media/video/zoran.h from 8 to 16,
  as the AVS6EYES has 10.
- Updated Documentation/video4linux/Zoran with information about AVS6EYES.

Signed-off-by: Martin Samuelsson <sam@home.se>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Mauro Carvalho Chehab <mchehab@infradead.org>
---
 Documentation/video4linux/Zoran  |  23 ++
 drivers/media/video/Kconfig      |   6 +
 drivers/media/video/Makefile     |   1 +
 drivers/media/video/bt866.c      | 377 +++++++++++++++++
 drivers/media/video/ks0127.c     | 846 +++++++++++++++++++++++++++++++++++++++
 drivers/media/video/ks0127.h     |  53 +++
 drivers/media/video/zoran.h      |   8 +-
 drivers/media/video/zoran_card.c |  87 ++++
 include/linux/i2c-id.h           |   2 +
 9 files changed, 1402 insertions(+), 1 deletion(-)
 create mode 100644 drivers/media/video/bt866.c
 create mode 100644 drivers/media/video/ks0127.c
 create mode 100644 drivers/media/video/ks0127.h

(limited to 'include/linux')

diff --git a/Documentation/video4linux/Zoran b/Documentation/video4linux/Zoran
index be9f21b84555..040a2c841ae9 100644
--- a/Documentation/video4linux/Zoran
+++ b/Documentation/video4linux/Zoran
@@ -33,6 +33,21 @@ Inputs/outputs: Composite and S-video
 Norms: PAL, SECAM (720x576 @ 25 fps), NTSC (720x480 @ 29.97 fps)
 Card number: 7
 
+AverMedia 6 Eyes AVS6EYES:
+* Zoran zr36067 PCI controller
+* Zoran zr36060 MJPEG codec
+* Samsung ks0127 TV decoder
+* Conexant bt866  TV encoder
+Drivers to use: videodev, i2c-core, i2c-algo-bit,
+		videocodec, ks0127, bt866, zr36060, zr36067
+Inputs/outputs: Six physical inputs. 1-6 are composite,
+		1-2, 3-4, 5-6 doubles as S-video,
+		1-3 triples as component.
+		One composite output.
+Norms: PAL, SECAM (720x576 @ 25 fps), NTSC (720x480 @ 29.97 fps)
+Card number: 8
+Not autodetected, card=8 is necessary.
+
 Linux Media Labs LML33:
 * Zoran zr36067 PCI controller
 * Zoran zr36060 MJPEG codec
@@ -192,6 +207,10 @@ Micronas vpx3220a TV decoder
 was introduced in 1996, is used in the DC30 and DC30+ and
 can handle: PAL B/G/H/I, PAL N, PAL M, NTSC M, NTSC 44, PAL 60, SECAM,NTSC Comb
 
+Samsung ks0127 TV decoder
+is used in the AVS6EYES card and
+can handle: NTSC-M/N/44, PAL-M/N/B/G/H/I/D/K/L and SECAM
+
 ===========================
 
 1.2 What the TV encoder can do an what not
@@ -221,6 +240,10 @@ ITT mse3000 TV encoder
 was introduced in 1991, is used in the DC10 old
 can generate: PAL , NTSC , SECAM
 
+Conexant bt866 TV encoder
+is used in AVS6EYES, and
+can generate: NTSC/PAL, PAL�M, PAL�N
+
 The adv717x, should be able to produce PAL N. But you find nothing PAL N
 specific in the registers. Seem that you have to reuse a other standard
 to generate PAL N, maybe it would work if you use the PAL M settings.
diff --git a/drivers/media/video/Kconfig b/drivers/media/video/Kconfig
index 6b4197018561..515e16acf0dd 100644
--- a/drivers/media/video/Kconfig
+++ b/drivers/media/video/Kconfig
@@ -224,6 +224,12 @@ config VIDEO_ZORAN_LML33R10
 	  support for the Linux Media Labs LML33R10 MJPEG capture/playback
 	  card.
 
+config VIDEO_ZORAN_AVS6EYES
+	tristate "AverMedia 6 Eyes support (EXPERIMENTAL)"
+	depends on VIDEO_ZORAN && EXPERIMENTAL && VIDEO_V4L1
+	help
+	  Support for the AverMedia 6 Eyes video surveillance card.
+
 config VIDEO_ZR36120
 	tristate "Zoran ZR36120/36125 Video For Linux"
 	depends on PCI && I2C && VIDEO_V4L1 && BROKEN
diff --git a/drivers/media/video/Makefile b/drivers/media/video/Makefile
index e5bf2687b76d..97e899074436 100644
--- a/drivers/media/video/Makefile
+++ b/drivers/media/video/Makefile
@@ -33,6 +33,7 @@ obj-$(CONFIG_VIDEO_ZORAN_DC30) += adv7175.o vpx3220.o zr36050.o \
 	zr36016.o
 obj-$(CONFIG_VIDEO_ZORAN_LML33) += bt819.o bt856.o zr36060.o
 obj-$(CONFIG_VIDEO_ZORAN_LML33R10) += saa7114.o adv7170.o zr36060.o
+obj-$(CONFIG_VIDEO_ZORAN_AVS6EYES) += bt866.o ks0127.o zr36060.o
 obj-$(CONFIG_VIDEO_ZORAN) += zr36067.o videocodec.o
 obj-$(CONFIG_VIDEO_PMS) += pms.o
 obj-$(CONFIG_VIDEO_PLANB) += planb.o
diff --git a/drivers/media/video/bt866.c b/drivers/media/video/bt866.c
new file mode 100644
index 000000000000..05e42bbcfc3d
--- /dev/null
+++ b/drivers/media/video/bt866.c
@@ -0,0 +1,377 @@
+/*
+    bt866 - BT866 Digital Video Encoder (Rockwell Part)
+
+    Copyright (C) 1999 Mike Bernson <mike@mlb.org>
+    Copyright (C) 1998 Dave Perks <dperks@ibm.net>
+
+    Modifications for LML33/DC10plus unified driver
+    Copyright (C) 2000 Serguei Miridonov <mirsev@cicese.mx>
+
+    This code was modify/ported from the saa7111 driver written
+    by Dave Perks.
+
+    This code was adapted for the bt866 by Christer Weinigel and ported
+    to 2.6 by Martin Samuelsson.
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+*/
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/kernel.h>
+#include <linux/major.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/pci.h>
+#include <linux/signal.h>
+#include <asm/io.h>
+#include <asm/pgtable.h>
+#include <asm/page.h>
+#include <linux/sched.h>
+#include <linux/types.h>
+#include <linux/i2c.h>
+
+#include <linux/videodev.h>
+#include <asm/uaccess.h>
+
+#include <linux/video_encoder.h>
+
+MODULE_LICENSE("GPL");
+
+#define	BT866_DEVNAME	"bt866"
+#define I2C_BT866	0x88
+
+MODULE_LICENSE("GPL");
+
+#define DEBUG(x)		/* Debug driver */
+
+/* ----------------------------------------------------------------------- */
+
+struct bt866 {
+	struct i2c_client *i2c;
+	int addr;
+	unsigned char reg[128];
+
+	int norm;
+	int enable;
+	int bright;
+	int contrast;
+	int hue;
+	int sat;
+};
+
+static int bt866_write(struct bt866 *dev,
+			unsigned char subaddr, unsigned char data);
+
+static int bt866_do_command(struct bt866 *encoder,
+			unsigned int cmd, void *arg)
+{
+	switch (cmd) {
+	case ENCODER_GET_CAPABILITIES:
+	{
+		struct video_encoder_capability *cap = arg;
+
+		DEBUG(printk
+		      (KERN_INFO "%s: get capabilities\n",
+		       encoder->i2c->name));
+
+		cap->flags
+			= VIDEO_ENCODER_PAL
+			| VIDEO_ENCODER_NTSC
+			| VIDEO_ENCODER_CCIR;
+		cap->inputs = 2;
+		cap->outputs = 1;
+	}
+	break;
+
+	case ENCODER_SET_NORM:
+	{
+		int *iarg = arg;
+
+		DEBUG(printk(KERN_INFO "%s: set norm %d\n",
+			     encoder->i2c->name, *iarg));
+
+		switch (*iarg) {
+
+		case VIDEO_MODE_NTSC:
+			break;
+
+		case VIDEO_MODE_PAL:
+			break;
+
+		default:
+			return -EINVAL;
+
+		}
+		encoder->norm = *iarg;
+	}
+	break;
+
+	case ENCODER_SET_INPUT:
+	{
+		int *iarg = arg;
+		static const __u8 init[] = {
+			0xc8, 0xcc, /* CRSCALE */
+			0xca, 0x91, /* CBSCALE */
+			0xcc, 0x24, /* YC16 | OSDNUM */
+			0xda, 0x00, /*  */
+			0xdc, 0x24, /* SETMODE | PAL */
+			0xde, 0x02, /* EACTIVE */
+
+			/* overlay colors */
+			0x70, 0xEB, 0x90, 0x80, 0xB0, 0x80, /* white */
+			0x72, 0xA2, 0x92, 0x8E, 0xB2, 0x2C, /* yellow */
+			0x74, 0x83, 0x94, 0x2C, 0xB4, 0x9C, /* cyan */
+			0x76, 0x70, 0x96, 0x3A, 0xB6, 0x48, /* green */
+			0x78, 0x54, 0x98, 0xC6, 0xB8, 0xB8, /* magenta */
+			0x7A, 0x41, 0x9A, 0xD4, 0xBA, 0x64, /* red */
+			0x7C, 0x23, 0x9C, 0x72, 0xBC, 0xD4, /* blue */
+			0x7E, 0x10, 0x9E, 0x80, 0xBE, 0x80, /* black */
+
+			0x60, 0xEB, 0x80, 0x80, 0xc0, 0x80, /* white */
+			0x62, 0xA2, 0x82, 0x8E, 0xc2, 0x2C, /* yellow */
+			0x64, 0x83, 0x84, 0x2C, 0xc4, 0x9C, /* cyan */
+			0x66, 0x70, 0x86, 0x3A, 0xc6, 0x48, /* green */
+			0x68, 0x54, 0x88, 0xC6, 0xc8, 0xB8, /* magenta */
+			0x6A, 0x41, 0x8A, 0xD4, 0xcA, 0x64, /* red */
+			0x6C, 0x23, 0x8C, 0x72, 0xcC, 0xD4, /* blue */
+			0x6E, 0x10, 0x8E, 0x80, 0xcE, 0x80, /* black */
+		};
+		int i;
+		u8 val;
+
+		for (i = 0; i < ARRAY_SIZE(init) / 2; i += 2)
+			bt866_write(encoder, init[i], init[i+1]);
+
+		val = encoder->reg[0xdc];
+
+		if (*iarg == 0)
+			val |= 0x40; /* CBSWAP */
+		else
+			val &= ~0x40; /* !CBSWAP */
+
+		bt866_write(encoder, 0xdc, val);
+
+		val = encoder->reg[0xcc];
+		if (*iarg == 2)
+			val |= 0x01; /* OSDBAR */
+		else
+			val &= ~0x01; /* !OSDBAR */
+		bt866_write(encoder, 0xcc, val);
+
+		DEBUG(printk(KERN_INFO "%s: set input %d\n",
+			     encoder->i2c->name, *iarg));
+
+		switch (*iarg) {
+		case 0:
+			break;
+		case 1:
+			break;
+		default:
+			return -EINVAL;
+
+		}
+	}
+	break;
+
+	case ENCODER_SET_OUTPUT:
+	{
+		int *iarg = arg;
+
+		DEBUG(printk(KERN_INFO "%s: set output %d\n",
+			     encoder->i2c->name, *iarg));
+
+		/* not much choice of outputs */
+		if (*iarg != 0)
+			return -EINVAL;
+	}
+	break;
+
+	case ENCODER_ENABLE_OUTPUT:
+	{
+		int *iarg = arg;
+		encoder->enable = !!*iarg;
+
+		DEBUG(printk
+		      (KERN_INFO "%s: enable output %d\n",
+		       encoder->i2c->name, encoder->enable));
+	}
+	break;
+
+	case 4711:
+	{
+		int *iarg = arg;
+		__u8 val;
+
+		printk("bt866: square = %d\n", *iarg);
+
+		val = encoder->reg[0xdc];
+		if (*iarg)
+			val |= 1; /* SQUARE */
+		else
+			val &= ~1; /* !SQUARE */
+		bt866_write(encoder, 0xdc, val);
+		break;
+	}
+
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int bt866_write(struct bt866 *encoder,
+			unsigned char subaddr, unsigned char data)
+{
+	unsigned char buffer[2];
+	int err;
+
+	buffer[0] = subaddr;
+	buffer[1] = data;
+
+	encoder->reg[subaddr] = data;
+
+	DEBUG(printk
+	      ("%s: write 0x%02X = 0x%02X\n",
+	       encoder->i2c->name, subaddr, data));
+
+	for (err = 0; err < 3;) {
+		if (i2c_master_send(encoder->i2c, buffer, 2) == 2)
+			break;
+		err++;
+		printk(KERN_WARNING "%s: I/O error #%d "
+		       "(write 0x%02x/0x%02x)\n",
+		       encoder->i2c->name, err, encoder->addr, subaddr);
+		schedule_timeout_interruptible(HZ/10);
+	}
+	if (err == 3) {
+		printk(KERN_WARNING "%s: giving up\n",
+		       encoder->i2c->name);
+		return -1;
+	}
+
+	return 0;
+}
+
+static int bt866_attach(struct i2c_adapter *adapter);
+static int bt866_detach(struct i2c_client *client);
+static int bt866_command(struct i2c_client *client,
+			 unsigned int cmd, void *arg);
+
+
+/* Addresses to scan */
+static unsigned short normal_i2c[]	= {I2C_BT866>>1, I2C_CLIENT_END};
+static unsigned short probe[2]		= {I2C_CLIENT_END, I2C_CLIENT_END};
+static unsigned short ignore[2]		= {I2C_CLIENT_END, I2C_CLIENT_END};
+
+static struct i2c_client_address_data addr_data = {
+	normal_i2c,
+	probe,
+	ignore,
+};
+
+static struct i2c_driver i2c_driver_bt866 = {
+	.driver.name = BT866_DEVNAME,
+	.id = I2C_DRIVERID_BT866,
+	.attach_adapter = bt866_attach,
+	.detach_client = bt866_detach,
+	.command = bt866_command
+};
+
+
+static struct i2c_client bt866_client_tmpl =
+{
+	.name = "(nil)",
+	.addr = 0,
+	.adapter = NULL,
+	.driver = &i2c_driver_bt866,
+	.usage_count = 0
+};
+
+static int bt866_found_proc(struct i2c_adapter *adapter,
+			    int addr, int kind)
+{
+	struct bt866 *encoder;
+	struct i2c_client *client;
+
+	client = kzalloc(sizeof(*client), GFP_KERNEL);
+	if (client == NULL)
+		return -ENOMEM;
+	memcpy(client, &bt866_client_tmpl, sizeof(*client));
+
+	encoder = kzalloc(sizeof(*encoder), GFP_KERNEL);
+	if (encoder == NULL) {
+		kfree(client);
+		return -ENOMEM;
+	}
+
+	i2c_set_clientdata(client, encoder);
+	client->adapter = adapter;
+	client->addr = addr;
+	sprintf(client->name, "%s-%02x", BT866_DEVNAME, adapter->id);
+
+	encoder->i2c = client;
+	encoder->addr = addr;
+	//encoder->encoder_type = ENCODER_TYPE_UNKNOWN;
+
+	/* initialize */
+
+	i2c_attach_client(client);
+
+	return 0;
+}
+
+static int bt866_attach(struct i2c_adapter *adapter)
+{
+	if (adapter->id == I2C_HW_B_ZR36067)
+		return i2c_probe(adapter, &addr_data, bt866_found_proc);
+	return 0;
+}
+
+static int bt866_detach(struct i2c_client *client)
+{
+	struct bt866 *encoder = i2c_get_clientdata(client);
+
+	i2c_detach_client(client);
+	kfree(encoder);
+	kfree(client);
+
+	return 0;
+}
+
+static int bt866_command(struct i2c_client *client,
+			 unsigned int cmd, void *arg)
+{
+	struct bt866 *encoder = i2c_get_clientdata(client);
+	return bt866_do_command(encoder, cmd, arg);
+}
+
+static int __devinit bt866_init(void)
+{
+	i2c_add_driver(&i2c_driver_bt866);
+	return 0;
+}
+
+static void __devexit bt866_exit(void)
+{
+	i2c_del_driver(&i2c_driver_bt866);
+}
+
+module_init(bt866_init);
+module_exit(bt866_exit);
diff --git a/drivers/media/video/ks0127.c b/drivers/media/video/ks0127.c
new file mode 100644
index 000000000000..3bf7ac4f5288
--- /dev/null
+++ b/drivers/media/video/ks0127.c
@@ -0,0 +1,846 @@
+/*
+ * Video Capture Driver (Video for Linux 1/2)
+ * for the Matrox Marvel G200,G400 and Rainbow Runner-G series
+ *
+ * This module is an interface to the KS0127 video decoder chip.
+ *
+ * Copyright (C) 1999  Ryan Drake <stiletto@mediaone.net>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+ *
+ *****************************************************************************
+ *
+ * Modified and extended by
+ *	Mike Bernson <mike@mlb.org>
+ *	Gerard v.d. Horst
+ *	Leon van Stuivenberg <l.vanstuivenberg@chello.nl>
+ *	Gernot Ziegler <gz@lysator.liu.se>
+ *
+ * Version History:
+ * V1.0 Ryan Drake	   Initial version by Ryan Drake
+ * V1.1 Gerard v.d. Horst  Added some debugoutput, reset the video-standard
+ */
+
+#ifndef __KERNEL__
+#define __KERNEL__
+#endif
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/delay.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/proc_fs.h>
+#include "ks0127.h"
+
+#include <linux/i2c.h>
+#include <linux/video_decoder.h>
+
+#define dprintk     if (debug) printk
+
+/* i2c identification */
+#define I2C_KS0127_ADDON   0xD8
+#define I2C_KS0127_ONBOARD 0xDA
+
+#define KS_TYPE_UNKNOWN	0
+#define KS_TYPE_0122S	1
+#define KS_TYPE_0127	2
+#define KS_TYPE_0127B	3
+
+/* ks0127 control registers */
+#define KS_STAT     0x00
+#define KS_CMDA     0x01
+#define KS_CMDB     0x02
+#define KS_CMDC     0x03
+#define KS_CMDD     0x04
+#define KS_HAVB     0x05
+#define KS_HAVE     0x06
+#define KS_HS1B     0x07
+#define KS_HS1E     0x08
+#define KS_HS2B     0x09
+#define KS_HS2E     0x0a
+#define KS_AGC      0x0b
+#define KS_HXTRA    0x0c
+#define KS_CDEM     0x0d
+#define KS_PORTAB   0x0e
+#define KS_LUMA     0x0f
+#define KS_CON      0x10
+#define KS_BRT      0x11
+#define KS_CHROMA   0x12
+#define KS_CHROMB   0x13
+#define KS_DEMOD    0x14
+#define KS_SAT      0x15
+#define KS_HUE      0x16
+#define KS_VERTIA   0x17
+#define KS_VERTIB   0x18
+#define KS_VERTIC   0x19
+#define KS_HSCLL    0x1a
+#define KS_HSCLH    0x1b
+#define KS_VSCLL    0x1c
+#define KS_VSCLH    0x1d
+#define KS_OFMTA    0x1e
+#define KS_OFMTB    0x1f
+#define KS_VBICTL   0x20
+#define KS_CCDAT2   0x21
+#define KS_CCDAT1   0x22
+#define KS_VBIL30   0x23
+#define KS_VBIL74   0x24
+#define KS_VBIL118  0x25
+#define KS_VBIL1512 0x26
+#define KS_TTFRAM   0x27
+#define KS_TESTA    0x28
+#define KS_UVOFFH   0x29
+#define KS_UVOFFL   0x2a
+#define KS_UGAIN    0x2b
+#define KS_VGAIN    0x2c
+#define KS_VAVB     0x2d
+#define KS_VAVE     0x2e
+#define KS_CTRACK   0x2f
+#define KS_POLCTL   0x30
+#define KS_REFCOD   0x31
+#define KS_INVALY   0x32
+#define KS_INVALU   0x33
+#define KS_INVALV   0x34
+#define KS_UNUSEY   0x35
+#define KS_UNUSEU   0x36
+#define KS_UNUSEV   0x37
+#define KS_USRSAV   0x38
+#define KS_USREAV   0x39
+#define KS_SHS1A    0x3a
+#define KS_SHS1B    0x3b
+#define KS_SHS1C    0x3c
+#define KS_CMDE     0x3d
+#define KS_VSDEL    0x3e
+#define KS_CMDF     0x3f
+#define KS_GAMMA0   0x40
+#define KS_GAMMA1   0x41
+#define KS_GAMMA2   0x42
+#define KS_GAMMA3   0x43
+#define KS_GAMMA4   0x44
+#define KS_GAMMA5   0x45
+#define KS_GAMMA6   0x46
+#define KS_GAMMA7   0x47
+#define KS_GAMMA8   0x48
+#define KS_GAMMA9   0x49
+#define KS_GAMMA10  0x4a
+#define KS_GAMMA11  0x4b
+#define KS_GAMMA12  0x4c
+#define KS_GAMMA13  0x4d
+#define KS_GAMMA14  0x4e
+#define KS_GAMMA15  0x4f
+#define KS_GAMMA16  0x50
+#define KS_GAMMA17  0x51
+#define KS_GAMMA18  0x52
+#define KS_GAMMA19  0x53
+#define KS_GAMMA20  0x54
+#define KS_GAMMA21  0x55
+#define KS_GAMMA22  0x56
+#define KS_GAMMA23  0x57
+#define KS_GAMMA24  0x58
+#define KS_GAMMA25  0x59
+#define KS_GAMMA26  0x5a
+#define KS_GAMMA27  0x5b
+#define KS_GAMMA28  0x5c
+#define KS_GAMMA29  0x5d
+#define KS_GAMMA30  0x5e
+#define KS_GAMMA31  0x5f
+#define KS_GAMMAD0  0x60
+#define KS_GAMMAD1  0x61
+#define KS_GAMMAD2  0x62
+#define KS_GAMMAD3  0x63
+#define KS_GAMMAD4  0x64
+#define KS_GAMMAD5  0x65
+#define KS_GAMMAD6  0x66
+#define KS_GAMMAD7  0x67
+#define KS_GAMMAD8  0x68
+#define KS_GAMMAD9  0x69
+#define KS_GAMMAD10 0x6a
+#define KS_GAMMAD11 0x6b
+#define KS_GAMMAD12 0x6c
+#define KS_GAMMAD13 0x6d
+#define KS_GAMMAD14 0x6e
+#define KS_GAMMAD15 0x6f
+#define KS_GAMMAD16 0x70
+#define KS_GAMMAD17 0x71
+#define KS_GAMMAD18 0x72
+#define KS_GAMMAD19 0x73
+#define KS_GAMMAD20 0x74
+#define KS_GAMMAD21 0x75
+#define KS_GAMMAD22 0x76
+#define KS_GAMMAD23 0x77
+#define KS_GAMMAD24 0x78
+#define KS_GAMMAD25 0x79
+#define KS_GAMMAD26 0x7a
+#define KS_GAMMAD27 0x7b
+#define KS_GAMMAD28 0x7c
+#define KS_GAMMAD29 0x7d
+#define KS_GAMMAD30 0x7e
+#define KS_GAMMAD31 0x7f
+
+
+/****************************************************************************
+* mga_dev : represents one ks0127 chip.
+****************************************************************************/
+
+struct adjust {
+	int	contrast;
+	int	bright;
+	int	hue;
+	int	ugain;
+	int	vgain;
+};
+
+struct ks0127 {
+	struct i2c_client *client;
+	unsigned char	addr;
+	int		format_width;
+	int		format_height;
+	int		cap_width;
+	int		cap_height;
+	int		norm;
+	int		ks_type;
+	u8 		regs[256];
+};
+
+
+static int debug; /* insmod parameter */
+
+module_param(debug, int, 0);
+MODULE_PARM_DESC(debug, "Debug output");
+MODULE_LICENSE("GPL");
+
+static u8 reg_defaults[64];
+
+
+
+static void init_reg_defaults(void)
+{
+	u8 *table = reg_defaults;
+
+	table[KS_CMDA]     = 0x2c;  /* VSE=0, CCIR 601, autodetect standard */
+	table[KS_CMDB]     = 0x12;  /* VALIGN=0, AGC control and input */
+	table[KS_CMDC]     = 0x00;  /* Test options */
+	/* clock & input select, write 1 to PORTA */
+	table[KS_CMDD]     = 0x01;
+	table[KS_HAVB]     = 0x00;  /* HAV Start Control */
+	table[KS_HAVE]     = 0x00;  /* HAV End Control */
+	table[KS_HS1B]     = 0x10;  /* HS1 Start Control */
+	table[KS_HS1E]     = 0x00;  /* HS1 End Control */
+	table[KS_HS2B]     = 0x00;  /* HS2 Start Control */
+	table[KS_HS2E]     = 0x00;  /* HS2 End Control */
+	table[KS_AGC]      = 0x53;  /* Manual setting for AGC */
+	table[KS_HXTRA]    = 0x00;  /* Extra Bits for HAV and HS1/2 */
+	table[KS_CDEM]     = 0x00;  /* Chroma Demodulation Control */
+	table[KS_PORTAB]   = 0x0f;  /* port B is input, port A output GPPORT */
+	table[KS_LUMA]     = 0x01;  /* Luma control */
+	table[KS_CON]      = 0x00;  /* Contrast Control */
+	table[KS_BRT]      = 0x00;  /* Brightness Control */
+	table[KS_CHROMA]   = 0x2a;  /* Chroma control A */
+	table[KS_CHROMB]   = 0x90;  /* Chroma control B */
+	table[KS_DEMOD]    = 0x00;  /* Chroma Demodulation Control & Status */
+	table[KS_SAT]      = 0x00;  /* Color Saturation Control*/
+	table[KS_HUE]      = 0x00;  /* Hue Control */
+	table[KS_VERTIA]   = 0x00;  /* Vertical Processing Control A */
+	/* Vertical Processing Control B, luma 1 line delayed */
+	table[KS_VERTIB]   = 0x12;
+	table[KS_VERTIC]   = 0x0b;  /* Vertical Processing Control C */
+	table[KS_HSCLL]    = 0x00;  /* Horizontal Scaling Ratio Low */
+	table[KS_HSCLH]    = 0x00;  /* Horizontal Scaling Ratio High */
+	table[KS_VSCLL]    = 0x00;  /* Vertical Scaling Ratio Low */
+	table[KS_VSCLH]    = 0x00;  /* Vertical Scaling Ratio High */
+	/* 16 bit YCbCr 4:2:2 output; I can't make the bt866 like 8 bit /Sam */
+	table[KS_OFMTA]    = 0x30;
+	table[KS_OFMTB]    = 0x00;  /* Output Control B */
+	/* VBI Decoder Control; 4bit fmt: avoid Y overflow */
+	table[KS_VBICTL]   = 0x5d;
+	table[KS_CCDAT2]   = 0x00;  /* Read Only register */
+	table[KS_CCDAT1]   = 0x00;  /* Read Only register */
+	table[KS_VBIL30]   = 0xa8;  /* VBI data decoding options */
+	table[KS_VBIL74]   = 0xaa;  /* VBI data decoding options */
+	table[KS_VBIL118]  = 0x2a;  /* VBI data decoding options */
+	table[KS_VBIL1512] = 0x00;  /* VBI data decoding options */
+	table[KS_TTFRAM]   = 0x00;  /* Teletext frame alignment pattern */
+	table[KS_TESTA]    = 0x00;  /* test register, shouldn't be written */
+	table[KS_UVOFFH]   = 0x00;  /* UV Offset Adjustment High */
+	table[KS_UVOFFL]   = 0x00;  /* UV Offset Adjustment Low */
+	table[KS_UGAIN]    = 0x00;  /* U Component Gain Adjustment */
+	table[KS_VGAIN]    = 0x00;  /* V Component Gain Adjustment */
+	table[KS_VAVB]     = 0x07;  /* VAV Begin */
+	table[KS_VAVE]     = 0x00;  /* VAV End */
+	table[KS_CTRACK]   = 0x00;  /* Chroma Tracking Control */
+	table[KS_POLCTL]   = 0x41;  /* Timing Signal Polarity Control */
+	table[KS_REFCOD]   = 0x80;  /* Reference Code Insertion Control */
+	table[KS_INVALY]   = 0x10;  /* Invalid Y Code */
+	table[KS_INVALU]   = 0x80;  /* Invalid U Code */
+	table[KS_INVALV]   = 0x80;  /* Invalid V Code */
+	table[KS_UNUSEY]   = 0x10;  /* Unused Y Code */
+	table[KS_UNUSEU]   = 0x80;  /* Unused U Code */
+	table[KS_UNUSEV]   = 0x80;  /* Unused V Code */
+	table[KS_USRSAV]   = 0x00;  /* reserved */
+	table[KS_USREAV]   = 0x00;  /* reserved */
+	table[KS_SHS1A]    = 0x00;  /* User Defined SHS1 A */
+	/* User Defined SHS1 B, ALT656=1 on 0127B */
+	table[KS_SHS1B]    = 0x80;
+	table[KS_SHS1C]    = 0x00;  /* User Defined SHS1 C */
+	table[KS_CMDE]     = 0x00;  /* Command Register E */
+	table[KS_VSDEL]    = 0x00;  /* VS Delay Control */
+	/* Command Register F, update -immediately- */
+	/* (there might come no vsync)*/
+	table[KS_CMDF]     = 0x02;
+}
+
+
+/* We need to manually read because of a bug in the KS0127 chip.
+ *
+ * An explanation from kayork@mail.utexas.edu:
+ *
+ * During I2C reads, the KS0127 only samples for a stop condition
+ * during the place where the acknoledge bit should be. Any standard
+ * I2C implementation (correctly) throws in another clock transition
+ * at the 9th bit, and the KS0127 will not recognize the stop condition
+ * and will continue to clock out data.
+ *
+ * So we have to do the read ourself.  Big deal.
+	   workaround in i2c-algo-bit
+ */
+
+
+static u8 ks0127_read(struct ks0127 *ks, u8 reg)
+{
+	struct i2c_client *c = ks->client;
+	char val = 0;
+	struct i2c_msg msgs[] = {
+		{c->addr, 0, sizeof(reg), &reg},
+		{c->addr, I2C_M_RD | I2C_M_NO_RD_ACK, sizeof(val), &val}};
+	int ret;
+
+	ret = i2c_transfer(c->adapter, msgs, ARRAY_SIZE(msgs));
+	if (ret != ARRAY_SIZE(msgs))
+		dprintk("ks0127_write error\n");
+
+	return val;
+}
+
+
+static void ks0127_write(struct ks0127 *ks, u8 reg, u8 val)
+{
+	char msg[] = {reg, val};
+
+	if (i2c_master_send(ks->client, msg, sizeof(msg)) != sizeof(msg))
+		dprintk("ks0127_write error\n");
+
+	ks->regs[reg] = val;
+}
+
+
+/* generic bit-twiddling */
+static void ks0127_and_or(struct ks0127 *ks, u8 reg, u8 and_v, u8 or_v)
+{
+	u8 val = ks->regs[reg];
+	val = (val & and_v) | or_v;
+	ks0127_write(ks, reg, val);
+}
+
+
+
+/****************************************************************************
+* ks0127 private api
+****************************************************************************/
+static void ks0127_reset(struct ks0127* ks)
+{
+	int i;
+	u8 *table = reg_defaults;
+
+	ks->ks_type = KS_TYPE_UNKNOWN;
+
+	dprintk("ks0127: reset\n");
+	msleep(1);
+
+	/* initialize all registers to known values */
+	/* (except STAT, 0x21, 0x22, TEST and 0x38,0x39) */
+
+	for(i = 1; i < 33; i++)
+		ks0127_write(ks, i, table[i]);
+
+	for(i = 35; i < 40; i++)
+		ks0127_write(ks, i, table[i]);
+
+	for(i = 41; i < 56; i++)
+		ks0127_write(ks, i, table[i]);
+
+	for(i = 58; i < 64; i++)
+		ks0127_write(ks, i, table[i]);
+
+
+	if ((ks0127_read(ks, KS_STAT) & 0x80) == 0) {
+		ks->ks_type = KS_TYPE_0122S;
+		dprintk("ks0127: ks0122s Found\n");
+		return;
+	}
+
+	switch(ks0127_read(ks, KS_CMDE) & 0x0f) {
+
+	case 0:
+		ks->ks_type = KS_TYPE_0127;
+		dprintk("ks0127: ks0127 found\n");
+		break;
+
+	case 9:
+		ks->ks_type = KS_TYPE_0127B;
+		dprintk("ks0127: ks0127B Revision A found\n");
+		break;
+
+	default:
+		dprintk("ks0127: unknown revision\n");
+		break;
+	}
+}
+
+static int ks0127_command(struct i2c_client *client,
+			  unsigned int cmd, void *arg)
+{
+	struct ks0127 *ks = i2c_get_clientdata(client);
+
+	int		*iarg = (int*)arg;
+
+	int		status;
+
+	if (!ks)
+		return -ENODEV;
+
+	switch (cmd) {
+
+	case DECODER_INIT:
+		dprintk("ks0127: command DECODER_INIT\n");
+		ks0127_reset(ks);
+		break;
+
+	case DECODER_SET_INPUT:
+		switch(*iarg) {
+		case KS_INPUT_COMPOSITE_1:
+		case KS_INPUT_COMPOSITE_2:
+		case KS_INPUT_COMPOSITE_3:
+		case KS_INPUT_COMPOSITE_4:
+		case KS_INPUT_COMPOSITE_5:
+		case KS_INPUT_COMPOSITE_6:
+			dprintk("ks0127: command DECODER_SET_INPUT %d: "
+				"Composite\n", *iarg);
+			/* autodetect 50/60 Hz */
+			ks0127_and_or(ks, KS_CMDA,   0xfc, 0x00);
+			/* VSE=0 */
+			ks0127_and_or(ks, KS_CMDA,   ~0x40, 0x00);
+			/* set input line */
+			ks0127_and_or(ks, KS_CMDB,   0xb0, *iarg);
+			/* non-freerunning mode */
+			ks0127_and_or(ks, KS_CMDC,   0x70, 0x0a);
+			/* analog input */
+			ks0127_and_or(ks, KS_CMDD,   0x03, 0x00);
+			/* enable chroma demodulation */
+			ks0127_and_or(ks, KS_CTRACK, 0xcf, 0x00);
+			/* chroma trap, HYBWR=1 */
+			ks0127_and_or(ks, KS_LUMA,   0x00,
+				       (reg_defaults[KS_LUMA])|0x0c);
+			/* scaler fullbw, luma comb off */
+			ks0127_and_or(ks, KS_VERTIA, 0x08, 0x81);
+			/* manual chroma comb .25 .5 .25 */
+			ks0127_and_or(ks, KS_VERTIC, 0x0f, 0x90);
+
+			/* chroma path delay */
+			ks0127_and_or(ks, KS_CHROMB, 0x0f, 0x90);
+
+			ks0127_write(ks, KS_UGAIN, reg_defaults[KS_UGAIN]);
+			ks0127_write(ks, KS_VGAIN, reg_defaults[KS_VGAIN]);
+			ks0127_write(ks, KS_UVOFFH, reg_defaults[KS_UVOFFH]);
+			ks0127_write(ks, KS_UVOFFL, reg_defaults[KS_UVOFFL]);
+			break;
+
+		case KS_INPUT_SVIDEO_1:
+		case KS_INPUT_SVIDEO_2:
+		case KS_INPUT_SVIDEO_3:
+			dprintk("ks0127: command DECODER_SET_INPUT %d: "
+				"S-Video\n", *iarg);
+			/* autodetect 50/60 Hz */
+			ks0127_and_or(ks, KS_CMDA,   0xfc, 0x00);
+			/* VSE=0 */
+			ks0127_and_or(ks, KS_CMDA,   ~0x40, 0x00);
+			/* set input line */
+			ks0127_and_or(ks, KS_CMDB,   0xb0, *iarg);
+			/* non-freerunning mode */
+			ks0127_and_or(ks, KS_CMDC,   0x70, 0x0a);
+			/* analog input */
+			ks0127_and_or(ks, KS_CMDD,   0x03, 0x00);
+			/* enable chroma demodulation */
+			ks0127_and_or(ks, KS_CTRACK, 0xcf, 0x00);
+			ks0127_and_or(ks, KS_LUMA, 0x00,
+				       reg_defaults[KS_LUMA]);
+			/* disable luma comb */
+			ks0127_and_or(ks, KS_VERTIA, 0x08,
+				       (reg_defaults[KS_VERTIA]&0xf0)|0x01);
+			ks0127_and_or(ks, KS_VERTIC, 0x0f,
+				       reg_defaults[KS_VERTIC]&0xf0);
+
+			ks0127_and_or(ks, KS_CHROMB, 0x0f,
+				       reg_defaults[KS_CHROMB]&0xf0);
+
+			ks0127_write(ks, KS_UGAIN, reg_defaults[KS_UGAIN]);
+			ks0127_write(ks, KS_VGAIN, reg_defaults[KS_VGAIN]);
+			ks0127_write(ks, KS_UVOFFH, reg_defaults[KS_UVOFFH]);
+			ks0127_write(ks, KS_UVOFFL, reg_defaults[KS_UVOFFL]);
+			break;
+
+		case KS_INPUT_YUV656:
+			dprintk("ks0127: command DECODER_SET_INPUT 15: "
+				"YUV656\n");
+			if (ks->norm == VIDEO_MODE_NTSC ||
+			    ks->norm == KS_STD_PAL_M)
+				/* force 60 Hz */
+				ks0127_and_or(ks, KS_CMDA,   0xfc, 0x03);
+			else
+				/* force 50 Hz */
+				ks0127_and_or(ks, KS_CMDA,   0xfc, 0x02);
+
+			ks0127_and_or(ks, KS_CMDA,   0xff, 0x40); /* VSE=1 */
+			/* set input line and VALIGN */
+			ks0127_and_or(ks, KS_CMDB,   0xb0, (*iarg | 0x40));
+			/* freerunning mode, */
+			/* TSTGEN = 1 TSTGFR=11 TSTGPH=0 TSTGPK=0  VMEM=1*/
+			ks0127_and_or(ks, KS_CMDC,   0x70, 0x87);
+			/* digital input, SYNDIR = 0 INPSL=01 CLKDIR=0 EAV=0 */
+			ks0127_and_or(ks, KS_CMDD,   0x03, 0x08);
+			/* disable chroma demodulation */
+			ks0127_and_or(ks, KS_CTRACK, 0xcf, 0x30);
+			/* HYPK =01 CTRAP = 0 HYBWR=0 PED=1 RGBH=1 UNIT=1 */
+			ks0127_and_or(ks, KS_LUMA,   0x00, 0x71);
+			ks0127_and_or(ks, KS_VERTIC, 0x0f,
+				       reg_defaults[KS_VERTIC]&0xf0);
+
+			/* scaler fullbw, luma comb off */
+			ks0127_and_or(ks, KS_VERTIA, 0x08, 0x81);
+
+			ks0127_and_or(ks, KS_CHROMB, 0x0f,
+				       reg_defaults[KS_CHROMB]&0xf0);
+
+			ks0127_and_or(ks, KS_CON, 0x00, 0x00);
+			ks0127_and_or(ks, KS_BRT, 0x00, 32);	/* spec: 34 */
+				/* spec: 229 (e5) */
+			ks0127_and_or(ks, KS_SAT, 0x00, 0xe8);
+			ks0127_and_or(ks, KS_HUE, 0x00, 0);
+
+			ks0127_and_or(ks, KS_UGAIN, 0x00, 238);
+			ks0127_and_or(ks, KS_VGAIN, 0x00, 0x00);
+
+			/*UOFF:0x30, VOFF:0x30, TSTCGN=1 */
+			ks0127_and_or(ks, KS_UVOFFH, 0x00, 0x4f);
+			ks0127_and_or(ks, KS_UVOFFL, 0x00, 0x00);
+			break;
+
+		default:
+			dprintk("ks0127: command DECODER_SET_INPUT: "
+				"Unknown input %d\n", *iarg);
+			break;
+		}
+
+		/* hack: CDMLPF sometimes spontaneously switches on; */
+		/* force back off */
+		ks0127_write(ks, KS_DEMOD, reg_defaults[KS_DEMOD]);
+		break;
+
+	case DECODER_SET_OUTPUT:
+		switch(*iarg) {
+		case KS_OUTPUT_YUV656E:
+			dprintk("ks0127: command DECODER_SET_OUTPUT: "
+				"OUTPUT_YUV656E (Missing)\n");
+			return -EINVAL;
+			break;
+
+		case KS_OUTPUT_EXV:
+			dprintk("ks0127: command DECODER_SET_OUTPUT: "
+				"OUTPUT_EXV\n");
+			ks0127_and_or(ks, KS_OFMTA, 0xf0, 0x09);
+			break;
+		}
+		break;
+
+	case DECODER_SET_NORM: //sam This block mixes old and new norm names...
+		/* Set to automatic SECAM/Fsc mode */
+		ks0127_and_or(ks, KS_DEMOD, 0xf0, 0x00);
+
+		ks->norm = *iarg;
+		switch(*iarg)
+		{
+		/* this is untested !! */
+		/* It just detects PAL_N/NTSC_M (no special frequencies) */
+		/* And you have to set the standard a second time afterwards */
+		case VIDEO_MODE_AUTO:
+			dprintk("ks0127: command DECODER_SET_NORM: AUTO\n");
+
+			/* The chip determines the format */
+			/* based on the current field rate */
+			ks0127_and_or(ks, KS_CMDA,   0xfc, 0x00);
+			ks0127_and_or(ks, KS_CHROMA, 0x9f, 0x20);
+			/* This is wrong for PAL ! As I said, */
+			/* you need to set the standard once again !! */
+			ks->format_height = 240;
+			ks->format_width = 704;
+			break;
+
+		case VIDEO_MODE_NTSC:
+			dprintk("ks0127: command DECODER_SET_NORM: NTSC_M\n");
+			ks0127_and_or(ks, KS_CHROMA, 0x9f, 0x20);
+			ks->format_height = 240;
+			ks->format_width = 704;
+			break;
+
+		case KS_STD_NTSC_N:
+			dprintk("ks0127: command KS0127_SET_STANDARD: "
+				"NTSC_N (fixme)\n");
+			ks0127_and_or(ks, KS_CHROMA, 0x9f, 0x40);
+			ks->format_height = 240;
+			ks->format_width = 704;
+			break;
+
+		case VIDEO_MODE_PAL:
+			dprintk("ks0127: command DECODER_SET_NORM: PAL_N\n");
+			ks0127_and_or(ks, KS_CHROMA, 0x9f, 0x20);
+			ks->format_height = 290;
+			ks->format_width = 704;
+			break;
+
+		case KS_STD_PAL_M:
+			dprintk("ks0127: command KS0127_SET_STANDARD: "
+				"PAL_M (fixme)\n");
+			ks0127_and_or(ks, KS_CHROMA, 0x9f, 0x40);
+			ks->format_height = 290;
+			ks->format_width = 704;
+			break;
+
+		case VIDEO_MODE_SECAM:
+			dprintk("ks0127: command KS0127_SET_STANDARD: "
+				"SECAM\n");
+			ks->format_height = 290;
+			ks->format_width = 704;
+
+			/* set to secam autodetection */
+			ks0127_and_or(ks, KS_CHROMA, 0xdf, 0x20);
+			ks0127_and_or(ks, KS_DEMOD, 0xf0, 0x00);
+			schedule_timeout_interruptible(HZ/10+1);
+
+			/* did it autodetect? */
+			if (ks0127_read(ks, KS_DEMOD) & 0x40)
+				break;
+
+			/* force to secam mode */
+			ks0127_and_or(ks, KS_DEMOD, 0xf0, 0x0f);
+			break;
+
+		default:
+			dprintk("ks0127: command DECODER_SET_NORM: "
+				"Unknown norm %d\n", *iarg);
+			break;
+		}
+		break;
+
+	case DECODER_SET_PICTURE:
+		dprintk("ks0127: command DECODER_SET_PICTURE "
+			"not yet supported (fixme)\n");
+		return -EINVAL;
+
+	//sam todo: KS0127_SET_BRIGHTNESS: Merge into DECODER_SET_PICTURE
+	//sam todo: KS0127_SET_CONTRAST: Merge into DECODER_SET_PICTURE
+	//sam todo: KS0127_SET_HUE: Merge into DECODER_SET_PICTURE?
+	//sam todo: KS0127_SET_SATURATION: Merge into DECODER_SET_PICTURE
+	//sam todo: KS0127_SET_AGC_MODE:
+	//sam todo: KS0127_SET_AGC:
+	//sam todo: KS0127_SET_CHROMA_MODE:
+	//sam todo: KS0127_SET_PIXCLK_MODE:
+	//sam todo: KS0127_SET_GAMMA_MODE:
+	//sam todo: KS0127_SET_UGAIN:
+	//sam todo: KS0127_SET_VGAIN:
+	//sam todo: KS0127_SET_INVALY:
+	//sam todo: KS0127_SET_INVALU:
+	//sam todo: KS0127_SET_INVALV:
+	//sam todo: KS0127_SET_UNUSEY:
+	//sam todo: KS0127_SET_UNUSEU:
+	//sam todo: KS0127_SET_UNUSEV:
+	//sam todo: KS0127_SET_VSALIGN_MODE:
+
+	case DECODER_ENABLE_OUTPUT:
+	{
+
+		int *iarg = arg;
+		int enable = (*iarg != 0);
+			if (enable) {
+				dprintk("ks0127: command "
+					"DECODER_ENABLE_OUTPUT on "
+					"(%d)\n", enable);
+				/* All output pins on */
+				ks0127_and_or(ks, KS_OFMTA, 0xcf, 0x30);
+				/* Obey the OEN pin */
+				ks0127_and_or(ks, KS_CDEM, 0x7f, 0x00);
+			} else {
+				dprintk("ks0127: command "
+					"DECODER_ENABLE_OUTPUT off "
+					"(%d)\n", enable);
+				/* Video output pins off */
+				ks0127_and_or(ks, KS_OFMTA, 0xcf, 0x00);
+				/* Ignore the OEN pin */
+				ks0127_and_or(ks, KS_CDEM, 0x7f, 0x80);
+			}
+	}
+		break;
+
+	//sam todo: KS0127_SET_OUTPUT_MODE:
+	//sam todo: KS0127_SET_WIDTH:
+	//sam todo: KS0127_SET_HEIGHT:
+	//sam todo: KS0127_SET_HSCALE:
+
+	case DECODER_GET_STATUS:
+		dprintk("ks0127: command DECODER_GET_STATUS\n");
+		*iarg = 0;
+		status = ks0127_read(ks, KS_STAT);
+		if (!(status & 0x20))		 /* NOVID not set */
+			*iarg = (*iarg & DECODER_STATUS_GOOD);
+		if ((status & 0x01))		      /* CLOCK set */
+			*iarg = (*iarg & DECODER_STATUS_COLOR);
+		if ((status & 0x08))		   /* PALDET set */
+			*iarg = (*iarg & DECODER_STATUS_PAL);
+		else
+			*iarg = (*iarg & DECODER_STATUS_NTSC);
+		break;
+
+	//Catch any unknown command
+	default:
+		dprintk("ks0127: command unknown: %04X\n", cmd);
+		return -EINVAL;
+	}
+	return 0;
+}
+
+
+
+
+static int ks0127_probe(struct i2c_adapter *adapter);
+static int ks0127_detach(struct i2c_client *client);
+static int ks0127_command(struct i2c_client *client,
+			  unsigned int cmd, void *arg);
+
+
+
+/* Addresses to scan */
+static unsigned short normal_i2c[] = {I2C_KS0127_ADDON>>1,
+				       I2C_KS0127_ONBOARD>>1, I2C_CLIENT_END};
+static unsigned short probe[2] =	{I2C_CLIENT_END, I2C_CLIENT_END};
+static unsigned short ignore[2] = 	{I2C_CLIENT_END, I2C_CLIENT_END};
+static struct i2c_client_address_data addr_data = {
+	normal_i2c,
+	probe,
+	ignore,
+};
+
+static struct i2c_driver i2c_driver_ks0127 = {
+	.driver.name = "ks0127",
+	.id             = I2C_DRIVERID_KS0127,
+	.attach_adapter = ks0127_probe,
+	.detach_client  = ks0127_detach,
+	.command        = ks0127_command
+};
+
+static struct i2c_client ks0127_client_tmpl =
+{
+	.name = "(ks0127 unset)",
+	.addr = 0,
+	.adapter = NULL,
+	.driver = &i2c_driver_ks0127,
+	.usage_count = 0
+};
+
+static int ks0127_found_proc(struct i2c_adapter *adapter, int addr, int kind)
+{
+	struct ks0127 *ks;
+	struct i2c_client *client;
+
+	client = kzalloc(sizeof(*client), GFP_KERNEL);
+	if (client == NULL)
+		return -ENOMEM;
+	memcpy(client, &ks0127_client_tmpl, sizeof(*client));
+
+	ks = kzalloc(sizeof(*ks), GFP_KERNEL);
+	if (ks == NULL) {
+		kfree(client);
+		return -ENOMEM;
+	}
+
+	i2c_set_clientdata(client, ks);
+	client->adapter = adapter;
+	client->addr = addr;
+	sprintf(client->name, "ks0127-%02x", adapter->id);
+
+	ks->client = client;
+	ks->addr = addr;
+	ks->ks_type = KS_TYPE_UNKNOWN;
+
+	/* power up */
+	ks0127_write(ks, KS_CMDA, 0x2c);
+	mdelay(10);
+
+	/* reset the device */
+	ks0127_reset(ks);
+	printk(KERN_INFO "ks0127: attach: %s video decoder\n",
+	       ks->addr==(I2C_KS0127_ADDON>>1) ? "addon" : "on-board");
+
+	i2c_attach_client(client);
+	return 0;
+}
+
+
+static int ks0127_probe(struct i2c_adapter *adapter)
+{
+	if (adapter->id == I2C_HW_B_ZR36067)
+		return i2c_probe(adapter, &addr_data, ks0127_found_proc);
+	return 0;
+}
+
+static int ks0127_detach(struct i2c_client *client)
+{
+	struct ks0127 *ks = i2c_get_clientdata(client);
+
+	ks0127_write(ks, KS_OFMTA, 0x20); /*tristate*/
+	ks0127_write(ks, KS_CMDA, 0x2c | 0x80); /* power down */
+
+	i2c_detach_client(client);
+	kfree(ks);
+	kfree(client);
+
+	dprintk("ks0127: detach\n");
+	return 0;
+}
+
+
+static int __devinit ks0127_init_module(void)
+{
+	init_reg_defaults();
+	i2c_add_driver(&i2c_driver_ks0127);
+	return 0;
+}
+
+static void __devexit ks0127_cleanup_module(void)
+{
+	i2c_del_driver(&i2c_driver_ks0127);
+}
+
+
+module_init(ks0127_init_module);
+module_exit(ks0127_cleanup_module);
diff --git a/drivers/media/video/ks0127.h b/drivers/media/video/ks0127.h
new file mode 100644
index 000000000000..1ec578833aea
--- /dev/null
+++ b/drivers/media/video/ks0127.h
@@ -0,0 +1,53 @@
+/*
+ * Video Capture Driver ( Video for Linux 1/2 )
+ * for the Matrox Marvel G200,G400 and Rainbow Runner-G series
+ *
+ * This module is an interface to the KS0127 video decoder chip.
+ *
+ * Copyright (C) 1999  Ryan Drake <stiletto@mediaone.net>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+ */
+
+#ifndef KS0127_H
+#define KS0127_H
+
+#include <linux/videodev.h>
+
+/* input channels */
+#define KS_INPUT_COMPOSITE_1    0
+#define KS_INPUT_COMPOSITE_2    1
+#define KS_INPUT_COMPOSITE_3    2
+#define KS_INPUT_COMPOSITE_4    4
+#define KS_INPUT_COMPOSITE_5    5
+#define KS_INPUT_COMPOSITE_6    6
+
+#define KS_INPUT_SVIDEO_1       8
+#define KS_INPUT_SVIDEO_2       9
+#define KS_INPUT_SVIDEO_3       10
+
+#define KS_INPUT_YUV656		15
+#define KS_INPUT_COUNT          10
+
+/* output channels */
+#define KS_OUTPUT_YUV656E       0
+#define KS_OUTPUT_EXV           1
+
+/* video standards */
+#define KS_STD_NTSC_N           112       /* 50 Hz NTSC */
+#define KS_STD_PAL_M            113       /* 60 Hz PAL  */
+
+#endif /* KS0127_H */
+
diff --git a/drivers/media/video/zoran.h b/drivers/media/video/zoran.h
index 0166f555a5ca..ffcda95ed9d4 100644
--- a/drivers/media/video/zoran.h
+++ b/drivers/media/video/zoran.h
@@ -159,7 +159,7 @@ Private IOCTL to set up for displaying MJPEG
 #define BUZ_MAX_FRAME     256	/* Must be a power of 2 */
 #define BUZ_MASK_FRAME    255	/* Must be BUZ_MAX_FRAME-1 */
 
-#define BUZ_MAX_INPUT       8
+#define BUZ_MAX_INPUT       16
 
 #if VIDEO_MAX_FRAME <= 32
 #   define   V4L_MAX_FRAME   32
@@ -191,6 +191,9 @@ enum card_type {
 	/* Iomega */
 	BUZ,
 
+	/* AverMedia */
+	AVS6EYES,
+
 	/* total number of cards */
 	NUM_CARDS
 };
@@ -379,6 +382,9 @@ struct card_info {
 	/* is the /GWS line conected? */
 	u8 gws_not_connected;
 
+	/* avs6eyes mux setting */
+	u8 input_mux;
+
 	void (*init) (struct zoran * zr);
 };
 
diff --git a/drivers/media/video/zoran_card.c b/drivers/media/video/zoran_card.c
index 0a85c9e7fb48..798138599bec 100644
--- a/drivers/media/video/zoran_card.c
+++ b/drivers/media/video/zoran_card.c
@@ -27,6 +27,8 @@
  * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
+#include <linux/delay.h>
+
 #include <linux/config.h>
 #include <linux/types.h>
 #include <linux/kernel.h>
@@ -93,6 +95,11 @@ module_param(default_input, int, 0);
 MODULE_PARM_DESC(default_input,
 		 "Default input (0=Composite, 1=S-Video, 2=Internal)");
 
+static int default_mux = 1;	/* 6 Eyes input selection */
+module_param(default_mux, int, 0);
+MODULE_PARM_DESC(default_mux,
+		 "Default 6 Eyes mux setting (Input selection)");
+
 static int default_norm = 0;	/* 0=PAL, 1=NTSC 2=SECAM */
 module_param(default_norm, int, 0);
 MODULE_PARM_DESC(default_norm, "Default norm (0=PAL, 1=NTSC, 2=SECAM)");
@@ -301,6 +308,30 @@ lml33_init (struct zoran *zr)
 	GPIO(zr, 2, 1);		// Set Composite input/output
 }
 
+static void
+avs6eyes_init (struct zoran *zr)
+{
+	// AverMedia 6-Eyes original driver by Christer Weinigel
+
+	// Lifted straight from Christer's old driver and
+	// modified slightly by Martin Samuelsson.
+
+	int mux = default_mux; /* 1 = BT866, 7 = VID1 */
+
+	GPIO(zr, 4, 1); /* Bt866 SLEEP on */
+	udelay(2);
+
+	GPIO(zr, 0, 1); /* ZR36060 /RESET on */
+	GPIO(zr, 1, 0); /* ZR36060 /SLEEP on */
+	GPIO(zr, 2, mux & 1);   /* MUX S0 */
+	GPIO(zr, 3, 0); /* /FRAME on */
+	GPIO(zr, 4, 0); /* Bt866 SLEEP off */
+	GPIO(zr, 5, mux & 2);   /* MUX S1 */
+	GPIO(zr, 6, 0); /* ? */
+	GPIO(zr, 7, mux & 4);   /* MUX S2 */
+
+}
+
 static char *
 i2cid_to_modulename (u16 i2c_id)
 {
@@ -391,6 +422,14 @@ static struct tvnorm f60sqpixel_dc10 = { 780, 640, 0, 716, 525, 480, 12 };
 static struct tvnorm f50ccir601_lm33r10 = { 864, 720, 74+54, 804, 625, 576, 18 };
 static struct tvnorm f60ccir601_lm33r10 = { 858, 720, 56+54, 788, 525, 480, 16 };
 
+/* FIXME: The ks0127 seem incapable of swapping U and V, too, which is why I
+ * copy Maxim's left shift hack for the 6 Eyes.
+ *
+ * Christer's driver used the unshifted norms, though...
+ * /Sam  */
+static struct tvnorm f50ccir601_avs6eyes = { 864, 720, 74, 804, 625, 576, 18 };
+static struct tvnorm f60ccir601_avs6eyes = { 858, 720, 56, 788, 525, 480, 16 };
+
 static struct card_info zoran_cards[NUM_CARDS] __devinitdata = {
 	{
 		.type = DC10_old,
@@ -419,6 +458,7 @@ static struct card_info zoran_cards[NUM_CARDS] __devinitdata = {
 		.gpcs = { -1, 0 },
 		.vfe_pol = { 0, 0, 0, 0, 0, 0, 0, 0 },
 		.gws_not_connected = 0,
+		.input_mux = 0,
 		.init = &dc10_init,
 	}, {
 		.type = DC10_new,
@@ -445,6 +485,7 @@ static struct card_info zoran_cards[NUM_CARDS] __devinitdata = {
 		.gpcs = { -1, 1},
 		.vfe_pol = { 1, 1, 1, 1, 0, 0, 0, 0 },
 		.gws_not_connected = 0,
+		.input_mux = 0,
 		.init = &dc10plus_init,
 	}, {
 		.type = DC10plus,
@@ -474,6 +515,7 @@ static struct card_info zoran_cards[NUM_CARDS] __devinitdata = {
 		.gpcs = { -1, 1 },
 		.vfe_pol = { 1, 1, 1, 1, 0, 0, 0, 0 },
 		.gws_not_connected = 0,
+		.input_mux = 0,
 		.init = &dc10plus_init,
 	}, {
 		.type = DC30,
@@ -502,6 +544,7 @@ static struct card_info zoran_cards[NUM_CARDS] __devinitdata = {
 		.gpcs = { -1, 0 },
 		.vfe_pol = { 0, 0, 0, 0, 0, 0, 0, 0 },
 		.gws_not_connected = 0,
+		.input_mux = 0,
 		.init = &dc10_init,
 	}, {
 		.type = DC30plus,
@@ -532,6 +575,7 @@ static struct card_info zoran_cards[NUM_CARDS] __devinitdata = {
 		.gpcs = { -1, 0 },
 		.vfe_pol = { 0, 0, 0, 0, 0, 0, 0, 0 },
 		.gws_not_connected = 0,
+		.input_mux = 0,
 		.init = &dc10_init,
 	}, {
 		.type = LML33,
@@ -558,6 +602,7 @@ static struct card_info zoran_cards[NUM_CARDS] __devinitdata = {
 		.gpcs = { 3, 1 },
 		.vfe_pol = { 1, 1, 0, 0, 0, 1, 0, 0 },
 		.gws_not_connected = 1,
+		.input_mux = 0,
 		.init = &lml33_init,
 	}, {
 		.type = LML33R10,
@@ -586,6 +631,7 @@ static struct card_info zoran_cards[NUM_CARDS] __devinitdata = {
 		.gpcs = { 3, 1 },
 		.vfe_pol = { 1, 1, 0, 0, 0, 1, 0, 0 },
 		.gws_not_connected = 1,
+		.input_mux = 0,
 		.init = &lml33_init,
 	}, {
 		.type = BUZ,
@@ -614,8 +660,49 @@ static struct card_info zoran_cards[NUM_CARDS] __devinitdata = {
 		.gpcs = { 3, 1 },
 		.vfe_pol = { 1, 1, 0, 0, 0, 1, 0, 0 },
 		.gws_not_connected = 1,
+		.input_mux = 0,
 		.init = &buz_init,
+	}, {
+		.type = AVS6EYES,
+		.name = "6-Eyes",
+		/* AverMedia chose not to brand the 6-Eyes. Thus it
+		   can't be autodetected, and requires card=x. */
+		.vendor_id = -1,
+		.device_id = -1,
+		.i2c_decoder = I2C_DRIVERID_KS0127,
+		.i2c_encoder = I2C_DRIVERID_BT866,
+		.video_codec = CODEC_TYPE_ZR36060,
+
+		.inputs = 10,
+		.input = {
+			{ 0, "Composite 1" },
+			{ 1, "Composite 2" },
+			{ 2, "Composite 3" },
+			{ 4, "Composite 4" },
+			{ 5, "Composite 5" },
+			{ 6, "Composite 6" },
+			{ 8, "S-Video 1" },
+			{ 9, "S-Video 2" },
+			{10, "S-Video 3" },
+			{15, "YCbCr" }
+		},
+		.norms = 2,
+		.tvn = {
+			&f50ccir601_avs6eyes,
+			&f60ccir601_avs6eyes,
+			NULL
+		},
+		.jpeg_int = ZR36057_ISR_GIRQ1,
+		.vsync_int = ZR36057_ISR_GIRQ0,
+		.gpio = { 1, 0, 3, -1, -1, -1, -1, -1 },// Validity unknown /Sam
+		.gpio_pol = { 0, 0, 0, 0, 0, 0, 0, 0 }, // Validity unknown /Sam
+		.gpcs = { 3, 1 },			// Validity unknown /Sam
+		.vfe_pol = { 1, 0, 0, 0, 0, 1, 0, 0 },  // Validity unknown /Sam
+		.gws_not_connected = 1,
+		.input_mux = 1,
+		.init = &avs6eyes_init,
 	}
+
 };
 
 /*
diff --git a/include/linux/i2c-id.h b/include/linux/i2c-id.h
index c8b81f419fd8..748bbf7c327c 100644
--- a/include/linux/i2c-id.h
+++ b/include/linux/i2c-id.h
@@ -112,6 +112,8 @@
 #define I2C_DRIVERID_X1205	82	/* Xicor/Intersil X1205 RTC	*/
 #define I2C_DRIVERID_PCF8563	83	/* Philips PCF8563 RTC		*/
 #define I2C_DRIVERID_RS5C372	84	/* Ricoh RS5C372 RTC		*/
+#define I2C_DRIVERID_BT866	85	/* Conexant bt866 video encoder */
+#define I2C_DRIVERID_KS0127	86	/* Samsung ks0127 video decoder */
 
 #define I2C_DRIVERID_I2CDEV	900
 #define I2C_DRIVERID_ARP        902    /* SMBus ARP Client              */
-- 
cgit v1.2.3


From c003d467bd71a7da22554e0d812a646ab58abea5 Mon Sep 17 00:00:00 2001
From: David Mosberger-Tang <David.Mosberger@acm.org>
Date: Fri, 26 May 2006 10:28:13 -0300
Subject: V4L/DVB (4046): Trivial videodev2.h patch

linux/videodev2.h uses types such as __u8 but it fails to include
<linux/types.h>.  Within the kernel, that's not a problem because
<linux/time.h> already includes <linux/types.h>.  However, there are
user apps that try to include videodev2.h (e.g., ekiga) and at least
on ia64, it causes compilation failures since <linux/types.h> doesn't
get included for any other reason, leaving __u8 etc. undefined.  The
attached patch fixes the problem for me.

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Mauro Carvalho Chehab <mchehab@infradead.org>
---
 include/linux/videodev2.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/videodev2.h b/include/linux/videodev2.h
index ad7fa9c86c10..b3a848b6fb1c 100644
--- a/include/linux/videodev2.h
+++ b/include/linux/videodev2.h
@@ -19,6 +19,7 @@
 #include <linux/device.h>
 #include <linux/mutex.h>
 #endif
+#include <linux/types.h>
 #include <linux/compiler.h> /* need __user */
 
 
-- 
cgit v1.2.3


From 88ca8ed0b7f2f04a055ff3c389f398ba3ad3d27d Mon Sep 17 00:00:00 2001
From: Scott Alfter <salfter@ssai.us>
Date: Sat, 20 May 2006 16:04:31 -0300
Subject: V4L/DVB (4048): Add support for the Texas Instruments TLV320AIC23B
 audio codec

Signed-off-by: Scott Alfter <salfter@ssai.us>
Signed-off-by: Hans Verkuil <hverkuil@xs4all.nl>
Signed-off-by: Mauro Carvalho Chehab <mchehab@infradead.org>
---
 drivers/media/video/Kconfig        |   9 ++
 drivers/media/video/Makefile       |   1 +
 drivers/media/video/tlv320aic23b.c | 217 +++++++++++++++++++++++++++++++++++++
 include/linux/i2c-id.h             |   1 +
 4 files changed, 228 insertions(+)
 create mode 100644 drivers/media/video/tlv320aic23b.c

(limited to 'include/linux')

diff --git a/drivers/media/video/Kconfig b/drivers/media/video/Kconfig
index 515e16acf0dd..1732fb56c8b0 100644
--- a/drivers/media/video/Kconfig
+++ b/drivers/media/video/Kconfig
@@ -363,6 +363,15 @@ config VIDEO_CS53L32A
 	  To compile this driver as a module, choose M here: the
 	  module will be called cs53l32a.
 
+config VIDEO_TLV320AIC23B
+	tristate "Texas Instruments TLV320AIC23B audio codec"
+	depends on VIDEO_DEV && I2C && EXPERIMENTAL
+	---help---
+	  Support for the Texas Instruments TLV320AIC23B audio codec.
+
+	  To compile this driver as a module, choose M here: the
+	  module will be called tlv320aic23b.
+
 config VIDEO_WM8775
 	tristate "Wolfson Microelectronics WM8775 audio ADC with input mixer"
 	depends on VIDEO_DEV && I2C && EXPERIMENTAL
diff --git a/drivers/media/video/Makefile b/drivers/media/video/Makefile
index 97e899074436..6e897151d911 100644
--- a/drivers/media/video/Makefile
+++ b/drivers/media/video/Makefile
@@ -49,6 +49,7 @@ obj-$(CONFIG_VIDEO_EM28XX) += em28xx/
 obj-$(CONFIG_VIDEO_EM28XX) += tvp5150.o
 obj-$(CONFIG_VIDEO_MSP3400) += msp3400.o
 obj-$(CONFIG_VIDEO_CS53L32A) += cs53l32a.o
+obj-$(CONFIG_VIDEO_TLV320AIC23B) += tlv320aic23b.o
 obj-$(CONFIG_VIDEO_WM8775) += wm8775.o
 obj-$(CONFIG_VIDEO_WM8739) += wm8739.o
 obj-$(CONFIG_VIDEO_OVCAMCHIP) += ovcamchip/
diff --git a/drivers/media/video/tlv320aic23b.c b/drivers/media/video/tlv320aic23b.c
new file mode 100644
index 000000000000..76b2e96429d9
--- /dev/null
+++ b/drivers/media/video/tlv320aic23b.c
@@ -0,0 +1,217 @@
+/*
+ * tlv320aic23b - driver version 0.0.1
+ *
+ * Copyright (C) 2006 Scott Alfter <salfter@ssai.us>
+ *
+ * Based on wm8775 driver
+ *
+ * Copyright (C) 2004 Ulf Eklund <ivtv at eklund.to>
+ * Copyright (C) 2005 Hans Verkuil <hverkuil@xs4all.nl>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/ioctl.h>
+#include <asm/uaccess.h>
+#include <linux/i2c.h>
+#include <linux/i2c-id.h>
+#include <linux/videodev.h>
+#include <media/v4l2-common.h>
+
+MODULE_DESCRIPTION("tlv320aic23b driver");
+MODULE_AUTHOR("Scott Alfter, Ulf Eklund, Hans Verkuil");
+MODULE_LICENSE("GPL");
+
+static unsigned short normal_i2c[] = { 0x34 >> 1, I2C_CLIENT_END };
+
+
+I2C_CLIENT_INSMOD;
+
+/* ----------------------------------------------------------------------- */
+
+struct tlv320aic23b_state {
+	u8 muted;
+};
+
+static int tlv320aic23b_write(struct i2c_client *client, int reg, u16 val)
+{
+	int i;
+
+	if ((reg < 0 || reg > 9) && (reg != 15)) {
+		v4l_err(client, "Invalid register R%d\n", reg);
+		return -1;
+	}
+
+	for (i = 0; i < 3; i++) {
+		if (i2c_smbus_write_byte_data(client, (reg << 1) |
+					(val >> 8), val & 0xff) == 0) {
+			return 0;
+		}
+	}
+	v4l_err(client, "I2C: cannot write %03x to register R%d\n", val, reg);
+	return -1;
+}
+
+static int tlv320aic23b_command(struct i2c_client *client, unsigned int cmd,
+			  void *arg)
+{
+	struct tlv320aic23b_state *state = i2c_get_clientdata(client);
+	struct v4l2_control *ctrl = arg;
+	u32* freq = arg;
+
+	switch (cmd) {
+	case VIDIOC_INT_AUDIO_CLOCK_FREQ:
+		switch (*freq) {
+			case 32000: /* set sample rate to 32 kHz */
+				tlv320aic23b_write(client, 8, 0x018);
+				break;
+			case 44100: /* set sample rate to 44.1 kHz */
+				tlv320aic23b_write(client, 8, 0x022);
+				break;
+			case 48000: /* set sample rate to 48 kHz */
+				tlv320aic23b_write(client, 8, 0x000);
+				break;
+			default:
+				return -EINVAL;
+		}
+		break;
+
+	case VIDIOC_G_CTRL:
+		if (ctrl->id != V4L2_CID_AUDIO_MUTE)
+			return -EINVAL;
+		ctrl->value = state->muted;
+		break;
+
+	case VIDIOC_S_CTRL:
+		if (ctrl->id != V4L2_CID_AUDIO_MUTE)
+			return -EINVAL;
+		state->muted = ctrl->value;
+		tlv320aic23b_write(client, 0, 0x180); /* mute both channels */
+		/* set gain on both channels to +3.0 dB */
+		if (!state->muted)
+			tlv320aic23b_write(client, 0, 0x119);
+		break;
+
+	case VIDIOC_LOG_STATUS:
+		v4l_info(client, "Input: %s\n",
+			    state->muted ? "muted" : "active");
+		break;
+
+	default:
+		return -EINVAL;
+	}
+	return 0;
+}
+
+/* ----------------------------------------------------------------------- */
+
+/* i2c implementation */
+
+/*
+ * Generic i2c probe
+ * concerning the addresses: i2c wants 7 bit (without the r/w bit), so '>>1'
+ */
+
+static struct i2c_driver i2c_driver;
+
+static int tlv320aic23b_attach(struct i2c_adapter *adapter, int address, int kind)
+{
+	struct i2c_client *client;
+	struct tlv320aic23b_state *state;
+
+	/* Check if the adapter supports the needed features */
+	if (!i2c_check_functionality(adapter, I2C_FUNC_SMBUS_BYTE_DATA))
+		return 0;
+
+	client = kzalloc(sizeof(struct i2c_client), GFP_KERNEL);
+	if (client == 0)
+		return -ENOMEM;
+
+	client->addr = address;
+	client->adapter = adapter;
+	client->driver = &i2c_driver;
+	snprintf(client->name, sizeof(client->name) - 1, "tlv320aic23b");
+
+	v4l_info(client, "chip found @ 0x%x (%s)\n", address << 1, adapter->name);
+
+	state = kmalloc(sizeof(struct tlv320aic23b_state), GFP_KERNEL);
+	if (state == NULL) {
+		kfree(client);
+		return -ENOMEM;
+	}
+	state->muted = 0;
+	i2c_set_clientdata(client, state);
+
+	/* initialize tlv320aic23b */
+	tlv320aic23b_write(client, 15, 0x000);	/* RESET */
+	tlv320aic23b_write(client, 6, 0x00A);   /* turn off DAC & mic input */
+	tlv320aic23b_write(client, 7, 0x049);   /* left-justified, 24-bit, master mode */
+	tlv320aic23b_write(client, 0, 0x119);   /* set gain on both channels to +3.0 dB */
+	tlv320aic23b_write(client, 8, 0x000);   /* set sample rate to 48 kHz */
+	tlv320aic23b_write(client, 9, 0x001);   /* activate digital interface */
+
+	i2c_attach_client(client);
+
+	return 0;
+}
+
+static int tlv320aic23b_probe(struct i2c_adapter *adapter)
+{
+	if (adapter->class & I2C_CLASS_TV_ANALOG)
+		return i2c_probe(adapter, &addr_data, tlv320aic23b_attach);
+	return 0;
+}
+
+static int tlv320aic23b_detach(struct i2c_client *client)
+{
+	int err;
+
+	err = i2c_detach_client(client);
+	if (err) {
+		return err;
+	}
+	kfree(client);
+
+	return 0;
+}
+
+/* ----------------------------------------------------------------------- */
+
+/* i2c implementation */
+static struct i2c_driver i2c_driver = {
+	.driver = {
+		.name = "tlv320aic23b",
+	},
+	.id             = I2C_DRIVERID_TLV320AIC23B,
+	.attach_adapter = tlv320aic23b_probe,
+	.detach_client  = tlv320aic23b_detach,
+	.command        = tlv320aic23b_command,
+};
+
+
+static int __init tlv320aic23b_init_module(void)
+{
+	return i2c_add_driver(&i2c_driver);
+}
+
+static void __exit tlv320aic23b_cleanup_module(void)
+{
+	i2c_del_driver(&i2c_driver);
+}
+
+module_init(tlv320aic23b_init_module);
+module_exit(tlv320aic23b_cleanup_module);
diff --git a/include/linux/i2c-id.h b/include/linux/i2c-id.h
index 748bbf7c327c..21338bb3441d 100644
--- a/include/linux/i2c-id.h
+++ b/include/linux/i2c-id.h
@@ -114,6 +114,7 @@
 #define I2C_DRIVERID_RS5C372	84	/* Ricoh RS5C372 RTC		*/
 #define I2C_DRIVERID_BT866	85	/* Conexant bt866 video encoder */
 #define I2C_DRIVERID_KS0127	86	/* Samsung ks0127 video decoder */
+#define I2C_DRIVERID_TLV320AIC23B 87	/* TI TLV320AIC23B audio codec  */
 
 #define I2C_DRIVERID_I2CDEV	900
 #define I2C_DRIVERID_ARP        902    /* SMBus ARP Client              */
-- 
cgit v1.2.3


From 401998fa96fe18b057af3f906527196522dd2d9d Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab@infradead.org>
Date: Sun, 4 Jun 2006 10:06:18 -0300
Subject: V4L/DVB (4065): Several improvements at videodev.c

Videodev now is capable of better handling V4L2 api, by
processing V4L2 ioctls and using callbacks to the driver.
The drivers should be migrated to the newer way and the older
one will be obsoleted soon.

Signed-off-by: Mauro Carvalho Chehab <mchehab@infradead.org>
---
 drivers/media/video/cx88/cx88.h       |    1 +
 drivers/media/video/saa7134/saa7134.h |    1 +
 drivers/media/video/v4l2-common.c     |   13 +
 drivers/media/video/videodev.c        | 1172 ++++++++++++++++++++++++++++++++-
 include/linux/videodev.h              |   58 +-
 include/linux/videodev2.h             |  135 +---
 include/media/v4l2-common.h           |    6 +
 include/media/v4l2-dev.h              |  371 +++++++++++
 include/media/video-buf.h             |    1 +
 9 files changed, 1579 insertions(+), 179 deletions(-)
 create mode 100644 include/media/v4l2-dev.h

(limited to 'include/linux')

diff --git a/drivers/media/video/cx88/cx88.h b/drivers/media/video/cx88/cx88.h
index 46a21f8f79ef..b9501295a416 100644
--- a/drivers/media/video/cx88/cx88.h
+++ b/drivers/media/video/cx88/cx88.h
@@ -25,6 +25,7 @@
 #include <linux/videodev2.h>
 #include <linux/kdev_t.h>
 
+#include <media/v4l2-common.h>
 #include <media/tuner.h>
 #include <media/tveeprom.h>
 #include <media/video-buf.h>
diff --git a/drivers/media/video/saa7134/saa7134.h b/drivers/media/video/saa7134/saa7134.h
index 9048d2c29899..6cba6c1ef584 100644
--- a/drivers/media/video/saa7134/saa7134.h
+++ b/drivers/media/video/saa7134/saa7134.h
@@ -33,6 +33,7 @@
 
 #include <asm/io.h>
 
+#include <media/v4l2-common.h>
 #include <media/tuner.h>
 #include <media/ir-common.h>
 #include <media/ir-kbd-i2c.h>
diff --git a/drivers/media/video/v4l2-common.c b/drivers/media/video/v4l2-common.c
index ad92e07e74f7..bffe48275eb0 100644
--- a/drivers/media/video/v4l2-common.c
+++ b/drivers/media/video/v4l2-common.c
@@ -59,6 +59,7 @@
 #include <asm/io.h>
 #include <asm/div64.h>
 #include <linux/video_decoder.h>
+#define __OLD_VIDIOC_ /* To allow fixing old calls*/
 #include <media/v4l2-common.h>
 
 #ifdef CONFIG_KMOD
@@ -424,7 +425,9 @@ void v4l_printk_ioctl_arg(char *s,unsigned int cmd, void *arg)
 	case TUNER_SET_TYPE_ADDR:
 	case TUNER_SET_STANDBY:
 	case TDA9887_SET_CONFIG:
+#ifdef __OLD_VIDIOC_
 	case VIDIOC_OVERLAY_OLD:
+#endif
 	case VIDIOC_STREAMOFF:
 	case VIDIOC_G_OUTPUT:
 	case VIDIOC_S_OUTPUT:
@@ -440,7 +443,9 @@ void v4l_printk_ioctl_arg(char *s,unsigned int cmd, void *arg)
 	case VIDIOC_G_AUDIO:
 	case VIDIOC_S_AUDIO:
 	case VIDIOC_ENUMAUDIO:
+#ifdef __OLD_VIDIOC_
 	case VIDIOC_G_AUDIO_OLD:
+#endif
 	{
 		struct v4l2_audio *p=arg;
 
@@ -451,7 +456,9 @@ void v4l_printk_ioctl_arg(char *s,unsigned int cmd, void *arg)
 	case VIDIOC_G_AUDOUT:
 	case VIDIOC_S_AUDOUT:
 	case VIDIOC_ENUMAUDOUT:
+#ifdef __OLD_VIDIOC_
 	case VIDIOC_G_AUDOUT_OLD:
+#endif
 	{
 		struct v4l2_audioout *p=arg;
 		printk ("%s: index=%d, name=%s, capability=%d, mode=%d\n", s,
@@ -496,7 +503,9 @@ void v4l_printk_ioctl_arg(char *s,unsigned int cmd, void *arg)
 	}
 	case VIDIOC_G_CTRL:
 	case VIDIOC_S_CTRL:
+#ifdef __OLD_VIDIOC_
 	case VIDIOC_S_CTRL_OLD:
+#endif
 	{
 		struct v4l2_control *p=arg;
 		printk ("%s: id=%d, value=%d\n", s, p->id, p->value);
@@ -511,7 +520,9 @@ void v4l_printk_ioctl_arg(char *s,unsigned int cmd, void *arg)
 		break;
 	}
 	case VIDIOC_CROPCAP:
+#ifdef __OLD_VIDIOC_
 	case VIDIOC_CROPCAP_OLD:
+#endif
 	{
 		struct v4l2_cropcap *p=arg;
 		/*FIXME: Should also show rect structs */
@@ -703,7 +714,9 @@ void v4l_printk_ioctl_arg(char *s,unsigned int cmd, void *arg)
 	}
 	case VIDIOC_G_PARM:
 	case VIDIOC_S_PARM:
+#ifdef __OLD_VIDIOC_
 	case VIDIOC_S_PARM_OLD:
+#endif
 	{
 		struct v4l2_streamparm *p=arg;
 		printk ("%s: type=%d\n", s, p->type);
diff --git a/drivers/media/video/videodev.c b/drivers/media/video/videodev.c
index 5f87dd5f1d0b..3f7a94b80c63 100644
--- a/drivers/media/video/videodev.c
+++ b/drivers/media/video/videodev.c
@@ -1,20 +1,31 @@
 /*
- * Video capture interface for Linux
+ * Video capture interface for Linux version 2
  *
- *		A generic video device interface for the LINUX operating system
- *		using a set of device structures/vectors for low level operations.
+ *	A generic video device interface for the LINUX operating system
+ *	using a set of device structures/vectors for low level operations.
  *
- *		This program is free software; you can redistribute it and/or
- *		modify it under the terms of the GNU General Public License
- *		as published by the Free Software Foundation; either version
- *		2 of the License, or (at your option) any later version.
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	as published by the Free Software Foundation; either version
+ *	2 of the License, or (at your option) any later version.
  *
- * Author:	Alan Cox, <alan@redhat.com>
+ * Authors:	Alan Cox, <alan@redhat.com> (version 1)
+ *              Mauro Carvalho Chehab <mchehab@infradead.org> (version 2)
  *
  * Fixes:	20000516  Claudio Matsuoka <claudio@conectiva.com>
  *		- Added procfs support
  */
 
+#define dbgarg(cmd, fmt, arg...) \
+		if (vfd->debug & V4L2_DEBUG_IOCTL_ARG)			\
+			printk (KERN_DEBUG "%s: ",  vfd->name);		\
+			v4l_printk_ioctl(cmd);				\
+			printk (KERN_DEBUG "%s: " fmt, vfd->name, ## arg);
+
+#define dbgarg2(fmt, arg...) \
+		if (vfd->debug & V4L2_DEBUG_IOCTL_ARG)			\
+			printk (KERN_DEBUG "%s: " fmt, vfd->name, ## arg);
+
 #include <linux/module.h>
 #include <linux/types.h>
 #include <linux/kernel.h>
@@ -30,7 +41,13 @@
 #include <asm/uaccess.h>
 #include <asm/system.h>
 
+#define __OLD_VIDIOC_ /* To allow fixing old calls*/
+#include <linux/videodev2.h>
+
+#ifdef CONFIG_VIDEO_V4L1
 #include <linux/videodev.h>
+#endif
+#include <media/v4l2-common.h>
 
 #define VIDEO_NUM_DEVICES	256
 #define VIDEO_NAME              "video4linux"
@@ -41,7 +58,8 @@
 
 static ssize_t show_name(struct class_device *cd, char *buf)
 {
-	struct video_device *vfd = container_of(cd, struct video_device, class_dev);
+	struct video_device *vfd = container_of(cd, struct video_device,
+								class_dev);
 	return sprintf(buf,"%.*s\n",(int)sizeof(vfd->name),vfd->name);
 }
 
@@ -62,7 +80,8 @@ void video_device_release(struct video_device *vfd)
 
 static void video_release(struct class_device *cd)
 {
-	struct video_device *vfd = container_of(cd, struct video_device, class_dev);
+	struct video_device *vfd = container_of(cd, struct video_device,
+								class_dev);
 
 #if 1
 	/* needed until all drivers are fixed */
@@ -90,7 +109,7 @@ struct video_device* video_devdata(struct file *file)
 }
 
 /*
- *	Open a video device.
+ *	Open a video device - FIXME: Obsoleted
  */
 static int video_open(struct inode *inode, struct file *file)
 {
@@ -130,6 +149,7 @@ static int video_open(struct inode *inode, struct file *file)
  * helper function -- handles userspace copying for ioctl arguments
  */
 
+#ifdef __OLD_VIDIOC_
 static unsigned int
 video_fix_command(unsigned int cmd)
 {
@@ -155,7 +175,11 @@ video_fix_command(unsigned int cmd)
 	}
 	return cmd;
 }
+#endif
 
+/*
+ * Obsolete usercopy function - Should be removed soon
+ */
 int
 video_usercopy(struct inode *inode, struct file *file,
 	       unsigned int cmd, unsigned long arg,
@@ -167,7 +191,9 @@ video_usercopy(struct inode *inode, struct file *file,
 	void	*parg = NULL;
 	int	err  = -EINVAL;
 
+#ifdef __OLD_VIDIOC_
 	cmd = video_fix_command(cmd);
+#endif
 
 	/*  Copy arguments into temp kernel buffer  */
 	switch (_IOC_DIR(cmd)) {
@@ -189,7 +215,8 @@ video_usercopy(struct inode *inode, struct file *file,
 
 		err = -EFAULT;
 		if (_IOC_DIR(cmd) & _IOC_WRITE)
-			if (copy_from_user(parg, (void __user *)arg, _IOC_SIZE(cmd)))
+			if (copy_from_user(parg, (void __user *)arg,
+							_IOC_SIZE(cmd)))
 				goto out;
 		break;
 	}
@@ -218,6 +245,7 @@ out:
 
 /*
  * open/release helper functions -- handle exclusive opens
+ * Should be removed soon
  */
 int video_exclusive_open(struct inode *inode, struct file *file)
 {
@@ -242,6 +270,1115 @@ int video_exclusive_release(struct inode *inode, struct file *file)
 	return 0;
 }
 
+static char *v4l2_memory_names[] = {
+	[V4L2_MEMORY_MMAP]    = "mmap",
+	[V4L2_MEMORY_USERPTR] = "userptr",
+	[V4L2_MEMORY_OVERLAY] = "overlay",
+};
+
+
+/* FIXME: Those stuff are replicated also on v4l2-common.c */
+static char *v4l2_type_names_FIXME[] = {
+	[V4L2_BUF_TYPE_VIDEO_CAPTURE]      = "video-cap",
+	[V4L2_BUF_TYPE_VIDEO_OVERLAY]      = "video-over",
+	[V4L2_BUF_TYPE_VIDEO_OUTPUT]       = "video-out",
+	[V4L2_BUF_TYPE_VBI_CAPTURE]        = "vbi-cap",
+	[V4L2_BUF_TYPE_VBI_OUTPUT]         = "vbi-out",
+	[V4L2_BUF_TYPE_SLICED_VBI_OUTPUT]  = "sliced-vbi-out",
+	[V4L2_BUF_TYPE_SLICED_VBI_CAPTURE] = "sliced-vbi-capture",
+	[V4L2_BUF_TYPE_PRIVATE]            = "private",
+};
+
+static char *v4l2_field_names_FIXME[] = {
+	[V4L2_FIELD_ANY]        = "any",
+	[V4L2_FIELD_NONE]       = "none",
+	[V4L2_FIELD_TOP]        = "top",
+	[V4L2_FIELD_BOTTOM]     = "bottom",
+	[V4L2_FIELD_INTERLACED] = "interlaced",
+	[V4L2_FIELD_SEQ_TB]     = "seq-tb",
+	[V4L2_FIELD_SEQ_BT]     = "seq-bt",
+	[V4L2_FIELD_ALTERNATE]  = "alternate",
+};
+
+#define prt_names(a,arr) (((a)>=0)&&((a)<ARRAY_SIZE(arr)))?arr[a]:"unknown"
+
+static void dbgbuf(unsigned int cmd, struct video_device *vfd,
+					struct v4l2_buffer *p)
+{
+	struct v4l2_timecode *tc=&p->timecode;
+
+	dbgarg (cmd, "%02ld:%02d:%02d.%08ld index=%d, type=%s, "
+		"bytesused=%d, flags=0x%08d, "
+		"field=%0d, sequence=%d, memory=%s, offset/userptr=0x%08lx\n",
+			(p->timestamp.tv_sec/3600),
+			(int)(p->timestamp.tv_sec/60)%60,
+			(int)(p->timestamp.tv_sec%60),
+			p->timestamp.tv_usec,
+			p->index,
+			prt_names(p->type,v4l2_type_names_FIXME),
+			p->bytesused,p->flags,
+			p->field,p->sequence,
+			prt_names(p->memory,v4l2_memory_names),
+			p->m.userptr);
+	dbgarg2 ("timecode= %02d:%02d:%02d type=%d, "
+		"flags=0x%08d, frames=%d, userbits=0x%08x\n",
+			tc->hours,tc->minutes,tc->seconds,
+			tc->type, tc->flags, tc->frames, (__u32) tc->userbits);
+}
+
+static inline void dbgrect(struct video_device *vfd, char *s,
+							struct v4l2_rect *r)
+{
+	dbgarg2 ("%sRect start at %dx%d, size= %dx%d\n", s, r->left, r->top,
+						r->width, r->height);
+};
+
+static inline void v4l_print_pix_fmt (struct video_device *vfd,
+						struct v4l2_pix_format *fmt)
+{
+	dbgarg2 ("width=%d, height=%d, format=0x%08x, field=%s, "
+		"bytesperline=%d sizeimage=%d, colorspace=%d\n",
+		fmt->width,fmt->height,fmt->pixelformat,
+		prt_names(fmt->field,v4l2_field_names_FIXME),
+		fmt->bytesperline,fmt->sizeimage,fmt->colorspace);
+};
+
+
+static int check_fmt (struct video_device *vfd, enum v4l2_buf_type type)
+{
+	switch (type) {
+	case V4L2_BUF_TYPE_VIDEO_CAPTURE:
+		if (vfd->vidioc_try_fmt_cap)
+			return (0);
+		break;
+	case V4L2_BUF_TYPE_VIDEO_OVERLAY:
+		if (vfd->vidioc_try_fmt_overlay)
+			return (0);
+		break;
+	case V4L2_BUF_TYPE_VBI_CAPTURE:
+		if (vfd->vidioc_try_fmt_vbi)
+			return (0);
+		break;
+	case V4L2_BUF_TYPE_SLICED_VBI_OUTPUT:
+		if (vfd->vidioc_try_fmt_vbi_output)
+			return (0);
+		break;
+	case V4L2_BUF_TYPE_SLICED_VBI_CAPTURE:
+		if (vfd->vidioc_try_fmt_vbi_capture)
+			return (0);
+		break;
+	case V4L2_BUF_TYPE_VIDEO_OUTPUT:
+		if (vfd->vidioc_try_fmt_video_output)
+			return (0);
+		break;
+	case V4L2_BUF_TYPE_VBI_OUTPUT:
+		if (vfd->vidioc_try_fmt_vbi_output)
+			return (0);
+		break;
+	case V4L2_BUF_TYPE_PRIVATE:
+		if (vfd->vidioc_try_fmt_type_private)
+			return (0);
+		break;
+	}
+	return (-EINVAL);
+}
+
+static int __video_do_ioctl(struct inode *inode, struct file *file,
+		unsigned int cmd, void *arg)
+{
+	struct video_device *vfd = video_devdata(file);
+	void                 *fh = file->private_data;
+	int                  ret = -EINVAL;
+
+	if ( (vfd->debug & V4L2_DEBUG_IOCTL) &&
+				!(vfd->debug | V4L2_DEBUG_IOCTL_ARG)) {
+		v4l_print_ioctl(vfd->name, cmd);
+	}
+
+	switch(cmd) {
+	/* --- capabilities ------------------------------------------ */
+	case VIDIOC_QUERYCAP:
+	{
+		struct v4l2_capability *cap = (struct v4l2_capability*)arg;
+		memset(cap, 0, sizeof(*cap));
+
+		if (!vfd->vidioc_querycap)
+			break;
+
+		ret=vfd->vidioc_querycap(file, fh, cap);
+		if (!ret)
+			dbgarg (cmd, "driver=%s, card=%s, bus=%s, "
+					"version=0x%08x, "
+					"capabilities=0x%08x\n",
+					cap->driver,cap->card,cap->bus_info,
+					cap->version,
+					cap->capabilities);
+		break;
+	}
+
+	/* --- priority ------------------------------------------ */
+	case VIDIOC_G_PRIORITY:
+	{
+		enum v4l2_priority *p=arg;
+
+		if (!vfd->vidioc_g_priority)
+			break;
+		ret=vfd->vidioc_g_priority(file, fh, p);
+		if (!ret)
+			dbgarg(cmd, "priority is %d\n", *p);
+		break;
+	}
+	case VIDIOC_S_PRIORITY:
+	{
+		enum v4l2_priority *p=arg;
+
+		if (!vfd->vidioc_s_priority)
+			break;
+		dbgarg(cmd, "setting priority to %d\n", *p);
+		ret=vfd->vidioc_s_priority(file, fh, *p);
+		break;
+	}
+
+	/* --- capture ioctls ---------------------------------------- */
+	case VIDIOC_ENUM_FMT:
+	{
+		struct v4l2_fmtdesc *f = arg;
+		enum v4l2_buf_type type;
+		unsigned int index;
+
+		index = f->index;
+		type  = f->type;
+		memset(f,0,sizeof(*f));
+		f->index = index;
+		f->type  = type;
+
+		switch (type) {
+		case V4L2_BUF_TYPE_VIDEO_CAPTURE:
+			if (vfd->vidioc_enum_fmt_cap)
+				ret=vfd->vidioc_enum_fmt_cap(file, fh, f);
+			break;
+		case V4L2_BUF_TYPE_VIDEO_OVERLAY:
+			if (vfd->vidioc_enum_fmt_overlay)
+				ret=vfd->vidioc_enum_fmt_overlay(file, fh, f);
+			break;
+		case V4L2_BUF_TYPE_VBI_CAPTURE:
+			if (vfd->vidioc_enum_fmt_vbi)
+				ret=vfd->vidioc_enum_fmt_vbi(file, fh, f);
+			break;
+		case V4L2_BUF_TYPE_SLICED_VBI_OUTPUT:
+			if (vfd->vidioc_enum_fmt_vbi_output)
+				ret=vfd->vidioc_enum_fmt_vbi_output(file,
+								fh, f);
+			break;
+		case V4L2_BUF_TYPE_SLICED_VBI_CAPTURE:
+			if (vfd->vidioc_enum_fmt_vbi_capture)
+				ret=vfd->vidioc_enum_fmt_vbi_capture(file,
+								fh, f);
+			break;
+		case V4L2_BUF_TYPE_VIDEO_OUTPUT:
+			if (vfd->vidioc_enum_fmt_video_output)
+				ret=vfd->vidioc_enum_fmt_video_output(file,
+								fh, f);
+			break;
+		case V4L2_BUF_TYPE_VBI_OUTPUT:
+			if (vfd->vidioc_enum_fmt_vbi_output)
+				ret=vfd->vidioc_enum_fmt_vbi_output(file,
+								fh, f);
+			break;
+		case V4L2_BUF_TYPE_PRIVATE:
+			if (vfd->vidioc_enum_fmt_type_private)
+				ret=vfd->vidioc_enum_fmt_type_private(file,
+								fh, f);
+			break;
+		}
+		if (!ret)
+			dbgarg (cmd, "index=%d, type=%d, flags=%d, "
+					"description=%s,"
+					" pixelformat=0x%8x\n",
+					f->index, f->type, f->flags,
+					f->description,
+					f->pixelformat);
+
+		break;
+	}
+	case VIDIOC_G_FMT:
+	{
+		struct v4l2_format *f = (struct v4l2_format *)arg;
+		enum v4l2_buf_type type=f->type;
+
+		memset(&f->fmt.pix,0,sizeof(f->fmt.pix));
+		f->type=type;
+
+		/* FIXME: Should be one dump per type */
+		dbgarg (cmd, "type=%s\n", prt_names(type,
+					v4l2_type_names_FIXME));
+
+		switch (type) {
+		case V4L2_BUF_TYPE_VIDEO_CAPTURE:
+			if (vfd->vidioc_g_fmt_cap)
+				ret=vfd->vidioc_g_fmt_cap(file, fh, f);
+			if (!ret)
+				v4l_print_pix_fmt(vfd,&f->fmt.pix);
+			break;
+		case V4L2_BUF_TYPE_VIDEO_OVERLAY:
+			if (vfd->vidioc_g_fmt_overlay)
+				ret=vfd->vidioc_g_fmt_overlay(file, fh, f);
+			break;
+		case V4L2_BUF_TYPE_VBI_CAPTURE:
+			if (vfd->vidioc_g_fmt_vbi)
+				ret=vfd->vidioc_g_fmt_vbi(file, fh, f);
+			break;
+		case V4L2_BUF_TYPE_SLICED_VBI_OUTPUT:
+			if (vfd->vidioc_g_fmt_vbi_output)
+				ret=vfd->vidioc_g_fmt_vbi_output(file, fh, f);
+			break;
+		case V4L2_BUF_TYPE_SLICED_VBI_CAPTURE:
+			if (vfd->vidioc_g_fmt_vbi_capture)
+				ret=vfd->vidioc_g_fmt_vbi_capture(file, fh, f);
+			break;
+		case V4L2_BUF_TYPE_VIDEO_OUTPUT:
+			if (vfd->vidioc_g_fmt_video_output)
+				ret=vfd->vidioc_g_fmt_video_output(file,
+								fh, f);
+			break;
+		case V4L2_BUF_TYPE_VBI_OUTPUT:
+			if (vfd->vidioc_g_fmt_vbi_output)
+				ret=vfd->vidioc_g_fmt_vbi_output(file, fh, f);
+			break;
+		case V4L2_BUF_TYPE_PRIVATE:
+			if (vfd->vidioc_g_fmt_type_private)
+				ret=vfd->vidioc_g_fmt_type_private(file,
+								fh, f);
+			break;
+		}
+
+		break;
+	}
+	case VIDIOC_S_FMT:
+	{
+		struct v4l2_format *f = (struct v4l2_format *)arg;
+
+		/* FIXME: Should be one dump per type */
+		dbgarg (cmd, "type=%s\n", prt_names(f->type,
+					v4l2_type_names_FIXME));
+
+		switch (f->type) {
+		case V4L2_BUF_TYPE_VIDEO_CAPTURE:
+			v4l_print_pix_fmt(vfd,&f->fmt.pix);
+			if (vfd->vidioc_s_fmt_cap)
+				ret=vfd->vidioc_s_fmt_cap(file, fh, f);
+			break;
+		case V4L2_BUF_TYPE_VIDEO_OVERLAY:
+			if (vfd->vidioc_s_fmt_overlay)
+				ret=vfd->vidioc_s_fmt_overlay(file, fh, f);
+			break;
+		case V4L2_BUF_TYPE_VBI_CAPTURE:
+			if (vfd->vidioc_s_fmt_vbi)
+				ret=vfd->vidioc_s_fmt_vbi(file, fh, f);
+			break;
+		case V4L2_BUF_TYPE_SLICED_VBI_OUTPUT:
+			if (vfd->vidioc_s_fmt_vbi_output)
+				ret=vfd->vidioc_s_fmt_vbi_output(file, fh, f);
+			break;
+		case V4L2_BUF_TYPE_SLICED_VBI_CAPTURE:
+			if (vfd->vidioc_s_fmt_vbi_capture)
+				ret=vfd->vidioc_s_fmt_vbi_capture(file, fh, f);
+			break;
+		case V4L2_BUF_TYPE_VIDEO_OUTPUT:
+			if (vfd->vidioc_s_fmt_video_output)
+				ret=vfd->vidioc_s_fmt_video_output(file,
+								fh, f);
+			break;
+		case V4L2_BUF_TYPE_VBI_OUTPUT:
+			if (vfd->vidioc_s_fmt_vbi_output)
+				ret=vfd->vidioc_s_fmt_vbi_output(file,
+								fh, f);
+			break;
+		case V4L2_BUF_TYPE_PRIVATE:
+			if (vfd->vidioc_s_fmt_type_private)
+				ret=vfd->vidioc_s_fmt_type_private(file,
+								fh, f);
+			break;
+		}
+		break;
+	}
+	case VIDIOC_TRY_FMT:
+	{
+		struct v4l2_format *f = (struct v4l2_format *)arg;
+
+		/* FIXME: Should be one dump per type */
+		dbgarg (cmd, "type=%s\n", prt_names(f->type,
+						v4l2_type_names_FIXME));
+		switch (f->type) {
+		case V4L2_BUF_TYPE_VIDEO_CAPTURE:
+			if (vfd->vidioc_try_fmt_cap)
+				ret=vfd->vidioc_try_fmt_cap(file, fh, f);
+			if (!ret)
+				v4l_print_pix_fmt(vfd,&f->fmt.pix);
+			break;
+		case V4L2_BUF_TYPE_VIDEO_OVERLAY:
+			if (vfd->vidioc_try_fmt_overlay)
+				ret=vfd->vidioc_try_fmt_overlay(file, fh, f);
+			break;
+		case V4L2_BUF_TYPE_VBI_CAPTURE:
+			if (vfd->vidioc_try_fmt_vbi)
+				ret=vfd->vidioc_try_fmt_vbi(file, fh, f);
+			break;
+		case V4L2_BUF_TYPE_SLICED_VBI_OUTPUT:
+			if (vfd->vidioc_try_fmt_vbi_output)
+				ret=vfd->vidioc_try_fmt_vbi_output(file,
+								fh, f);
+			break;
+		case V4L2_BUF_TYPE_SLICED_VBI_CAPTURE:
+			if (vfd->vidioc_try_fmt_vbi_capture)
+				ret=vfd->vidioc_try_fmt_vbi_capture(file,
+								fh, f);
+			break;
+		case V4L2_BUF_TYPE_VIDEO_OUTPUT:
+			if (vfd->vidioc_try_fmt_video_output)
+				ret=vfd->vidioc_try_fmt_video_output(file,
+								fh, f);
+			break;
+		case V4L2_BUF_TYPE_VBI_OUTPUT:
+			if (vfd->vidioc_try_fmt_vbi_output)
+				ret=vfd->vidioc_try_fmt_vbi_output(file,
+								fh, f);
+			break;
+		case V4L2_BUF_TYPE_PRIVATE:
+			if (vfd->vidioc_try_fmt_type_private)
+				ret=vfd->vidioc_try_fmt_type_private(file,
+								fh, f);
+			break;
+		}
+
+		break;
+	}
+	/* FIXME: Those buf reqs could be handled here,
+	   with some changes on videobuf to allow its header to be included at
+	   videodev2.h or being merged at videodev2.
+	 */
+	case VIDIOC_REQBUFS:
+	{
+		struct v4l2_requestbuffers *p=arg;
+
+		if (!vfd->vidioc_reqbufs)
+			break;
+		ret = check_fmt (vfd, p->type);
+		if (ret)
+			break;
+
+		ret=vfd->vidioc_reqbufs(file, fh, p);
+		dbgarg (cmd, "count=%d, type=%s, memory=%s\n",
+				p->count,
+				prt_names(p->type,v4l2_type_names_FIXME),
+				prt_names(p->memory,v4l2_memory_names));
+		break;
+	}
+	case VIDIOC_QUERYBUF:
+	{
+		struct v4l2_buffer *p=arg;
+
+		if (!vfd->vidioc_querybuf)
+			break;
+		ret = check_fmt (vfd, p->type);
+		if (ret)
+			break;
+
+		ret=vfd->vidioc_querybuf(file, fh, p);
+		if (!ret)
+			dbgbuf(cmd,vfd,p);
+		break;
+	}
+	case VIDIOC_QBUF:
+	{
+		struct v4l2_buffer *p=arg;
+
+		if (!vfd->vidioc_qbuf)
+			break;
+		ret = check_fmt (vfd, p->type);
+		if (ret)
+			break;
+
+		ret=vfd->vidioc_qbuf(file, fh, p);
+		if (!ret)
+			dbgbuf(cmd,vfd,p);
+		break;
+	}
+	case VIDIOC_DQBUF:
+	{
+		struct v4l2_buffer *p=arg;
+		if (!vfd->vidioc_qbuf)
+			break;
+		ret = check_fmt (vfd, p->type);
+		if (ret)
+			break;
+
+		ret=vfd->vidioc_qbuf(file, fh, p);
+		if (!ret)
+			dbgbuf(cmd,vfd,p);
+		break;
+	}
+	case VIDIOC_OVERLAY:
+	{
+		int *i = arg;
+
+		if (!vfd->vidioc_overlay)
+			break;
+		dbgarg (cmd, "value=%d\n",*i);
+		ret=vfd->vidioc_overlay(file, fh, *i);
+		break;
+	}
+#ifdef HAVE_V4L1
+	/* --- streaming capture ------------------------------------- */
+	case VIDIOCGMBUF:
+	{
+		struct video_mbuf *p=arg;
+
+		memset(&p,0,sizeof(p));
+
+		if (!vfd->vidiocgmbuf)
+			break;
+		ret=vfd->vidiocgmbuf(file, fh, p);
+		if (!ret)
+			dbgarg (cmd, "size=%d, frames=%d, offsets=0x%08lx\n",
+						p->size, p->frames,
+						(unsigned long)p->offsets);
+		break;
+	}
+#endif
+	case VIDIOC_G_FBUF:
+	{
+		struct v4l2_framebuffer *p=arg;
+		if (!vfd->vidioc_g_fbuf)
+			break;
+		ret=vfd->vidioc_g_fbuf(file, fh, arg);
+		if (!ret) {
+			dbgarg (cmd, "capability=%d, flags=%d, base=0x%08lx\n",
+					p->capability,p->flags,
+					(unsigned long)p->base);
+			v4l_print_pix_fmt (vfd, &p->fmt);
+		}
+		break;
+	}
+	case VIDIOC_S_FBUF:
+	{
+		struct v4l2_framebuffer *p=arg;
+		if (!vfd->vidioc_s_fbuf)
+			break;
+
+		dbgarg (cmd, "capability=%d, flags=%d, base=0x%08lx\n",
+				p->capability,p->flags,(unsigned long)p->base);
+		v4l_print_pix_fmt (vfd, &p->fmt);
+		ret=vfd->vidioc_s_fbuf(file, fh, arg);
+
+		break;
+	}
+	case VIDIOC_STREAMON:
+	{
+		enum v4l2_buf_type i = *(int *)arg;
+		if (!vfd->vidioc_streamon)
+			break;
+		dbgarg (cmd, "type=%s\n", prt_names(i,v4l2_type_names_FIXME));
+		ret=vfd->vidioc_streamon(file, fh,i);
+		break;
+	}
+	case VIDIOC_STREAMOFF:
+	{
+		enum v4l2_buf_type i = *(int *)arg;
+
+		if (!vfd->vidioc_streamoff)
+			break;
+		dbgarg (cmd, "type=%s\n", prt_names(i,v4l2_type_names_FIXME));
+		ret=vfd->vidioc_streamoff(file, fh, i);
+		break;
+	}
+	/* ---------- tv norms ---------- */
+	case VIDIOC_ENUMSTD:
+	{
+		struct v4l2_standard *p = arg;
+		unsigned int index = p->index;
+
+		if (!vfd->tvnormsize) {
+			printk (KERN_WARNING "%s: no TV norms defined!\n",
+						vfd->name);
+			break;
+		}
+
+		if (index<=0 || index >= vfd->tvnormsize) {
+			ret=-EINVAL;
+			break;
+		}
+		v4l2_video_std_construct(p, vfd->tvnorms[p->index].id,
+					 vfd->tvnorms[p->index].name);
+		p->index = index;
+
+		dbgarg (cmd, "index=%d, id=%Ld, name=%s, fps=%d/%d, "
+				"framelines=%d\n", p->index,
+				(unsigned long long)p->id, p->name,
+				p->frameperiod.numerator,
+				p->frameperiod.denominator,
+				p->framelines);
+
+		ret=0;
+		break;
+	}
+	case VIDIOC_G_STD:
+	{
+		v4l2_std_id *id = arg;
+
+		*id = vfd->current_norm;
+
+		dbgarg (cmd, "value=%Lu\n", (long long unsigned) *id);
+
+		ret=0;
+		break;
+	}
+	case VIDIOC_S_STD:
+	{
+		v4l2_std_id *id = arg;
+		unsigned int i;
+
+		if (!vfd->tvnormsize) {
+			printk (KERN_WARNING "%s: no TV norms defined!\n",
+						vfd->name);
+			break;
+		}
+
+		dbgarg (cmd, "value=%Lu\n", (long long unsigned) *id);
+
+		/* First search for exact match */
+		for (i = 0; i < vfd->tvnormsize; i++)
+			if (*id == vfd->tvnorms[i].id)
+				break;
+		/* Then for a generic video std that contains desired std */
+		if (i == vfd->tvnormsize)
+			for (i = 0; i < vfd->tvnormsize; i++)
+				if (*id & vfd->tvnorms[i].id)
+					break;
+		if (i == vfd->tvnormsize) {
+			break;
+		}
+
+		/* Calls the specific handler */
+		if (vfd->vidioc_s_std)
+			ret=vfd->vidioc_s_std(file, fh, i);
+		else
+			ret=-EINVAL;
+
+		/* Updates standard information */
+		if (!ret)
+			vfd->current_norm=*id;
+
+		break;
+	}
+	case VIDIOC_QUERYSTD:
+	{
+		v4l2_std_id *p=arg;
+
+		if (!vfd->vidioc_querystd)
+			break;
+		ret=vfd->vidioc_querystd(file, fh, arg);
+		if (!ret)
+			dbgarg (cmd, "detected std=%Lu\n",
+						(unsigned long long)*p);
+		break;
+	}
+	/* ------ input switching ---------- */
+	/* FIXME: Inputs can be handled inside videodev2 */
+	case VIDIOC_ENUMINPUT:
+	{
+		struct v4l2_input *p=arg;
+		int i=p->index;
+
+		if (!vfd->vidioc_enum_input)
+			break;
+		memset(p, 0, sizeof(*p));
+		p->index=i;
+
+		ret=vfd->vidioc_enum_input(file, fh, p);
+		if (!ret)
+			dbgarg (cmd, "index=%d, name=%s, type=%d, "
+					"audioset=%d, "
+					"tuner=%d, std=%Ld, status=%d\n",
+					p->index,p->name,p->type,p->audioset,
+					p->tuner,
+					(unsigned long long)p->std,
+					p->status);
+		break;
+	}
+	case VIDIOC_G_INPUT:
+	{
+		unsigned int *i = arg;
+
+		if (!vfd->vidioc_g_input)
+			break;
+		ret=vfd->vidioc_g_input(file, fh, i);
+		if (!ret)
+			dbgarg (cmd, "value=%d\n",*i);
+		break;
+	}
+	case VIDIOC_S_INPUT:
+	{
+		unsigned int *i = arg;
+
+		if (!vfd->vidioc_s_input)
+			break;
+		dbgarg (cmd, "value=%d\n",*i);
+		ret=vfd->vidioc_s_input(file, fh, *i);
+		break;
+	}
+
+	/* ------ output switching ---------- */
+	case VIDIOC_G_OUTPUT:
+	{
+		unsigned int *i = arg;
+
+		if (!vfd->vidioc_g_output)
+			break;
+		ret=vfd->vidioc_g_output(file, fh, i);
+		if (!ret)
+			dbgarg (cmd, "value=%d\n",*i);
+		break;
+	}
+	case VIDIOC_S_OUTPUT:
+	{
+		unsigned int *i = arg;
+
+		if (!vfd->vidioc_s_output)
+			break;
+		dbgarg (cmd, "value=%d\n",*i);
+		ret=vfd->vidioc_s_output(file, fh, *i);
+		break;
+	}
+
+	/* --- controls ---------------------------------------------- */
+	case VIDIOC_QUERYCTRL:
+	{
+		struct v4l2_queryctrl *p=arg;
+
+		if (!vfd->vidioc_queryctrl)
+			break;
+		ret=vfd->vidioc_queryctrl(file, fh, p);
+
+		if (!ret)
+			dbgarg (cmd, "id=%d, type=%d, name=%s, "
+					"min/max=%d/%d,"
+					" step=%d, default=%d, flags=0x%08x\n",
+					p->id,p->type,p->name,p->minimum,
+					p->maximum,p->step,p->default_value,
+					p->flags);
+		break;
+	}
+	case VIDIOC_G_CTRL:
+	{
+		struct v4l2_control *p = arg;
+
+		if (!vfd->vidioc_g_ctrl)
+			break;
+		dbgarg(cmd, "Enum for index=%d\n", p->id);
+
+		ret=vfd->vidioc_g_ctrl(file, fh, p);
+		if (!ret)
+			dbgarg2 ( "id=%d, value=%d\n", p->id, p->value);
+		break;
+	}
+	case VIDIOC_S_CTRL:
+	{
+		struct v4l2_control *p = arg;
+
+		if (!vfd->vidioc_s_ctrl)
+			break;
+		dbgarg (cmd, "id=%d, value=%d\n", p->id, p->value);
+
+		ret=vfd->vidioc_s_ctrl(file, fh, p);
+		break;
+	}
+	case VIDIOC_QUERYMENU:
+	{
+		struct v4l2_querymenu *p=arg;
+		if (!vfd->vidioc_querymenu)
+			break;
+		ret=vfd->vidioc_querymenu(file, fh, p);
+		if (!ret)
+			dbgarg (cmd, "id=%d, index=%d, name=%s\n",
+						p->id,p->index,p->name);
+		break;
+	}
+	/* --- audio ---------------------------------------------- */
+	case VIDIOC_ENUMAUDIO:
+	{
+		struct v4l2_audio *p=arg;
+
+		if (!vfd->vidioc_enumaudio)
+			break;
+		dbgarg(cmd, "Enum for index=%d\n", p->index);
+		ret=vfd->vidioc_enumaudio(file, fh, p);
+		if (!ret)
+			dbgarg2("index=%d, name=%s, capability=%d, "
+					"mode=%d\n",p->index,p->name,
+					p->capability, p->mode);
+		break;
+	}
+	case VIDIOC_G_AUDIO:
+	{
+		struct v4l2_audio *p=arg;
+
+		if (!vfd->vidioc_g_audio)
+			break;
+		dbgarg(cmd, "Get for index=%d\n", p->index);
+		ret=vfd->vidioc_g_audio(file, fh, p);
+		if (!ret)
+			dbgarg2("index=%d, name=%s, capability=%d, "
+					"mode=%d\n",p->index,
+					p->name,p->capability, p->mode);
+		break;
+	}
+	case VIDIOC_S_AUDIO:
+	{
+		struct v4l2_audio *p=arg;
+
+		if (!vfd->vidioc_s_audio)
+			break;
+		dbgarg(cmd, "index=%d, name=%s, capability=%d, "
+					"mode=%d\n", p->index, p->name,
+					p->capability, p->mode);
+		ret=vfd->vidioc_s_audio(file, fh, p);
+		break;
+	}
+	case VIDIOC_ENUMAUDOUT:
+	{
+		struct v4l2_audioout *p=arg;
+
+		if (!vfd->vidioc_enumaudout)
+			break;
+		dbgarg(cmd, "Enum for index=%d\n", p->index);
+		ret=vfd->vidioc_enumaudout(file, fh, p);
+		if (!ret)
+			dbgarg2("index=%d, name=%s, capability=%d, "
+					"mode=%d\n", p->index, p->name,
+					p->capability,p->mode);
+		break;
+	}
+	case VIDIOC_G_AUDOUT:
+	{
+		struct v4l2_audioout *p=arg;
+
+		if (!vfd->vidioc_g_audout)
+			break;
+		dbgarg(cmd, "Enum for index=%d\n", p->index);
+		ret=vfd->vidioc_g_audout(file, fh, p);
+		if (!ret)
+			dbgarg2("index=%d, name=%s, capability=%d, "
+					"mode=%d\n", p->index, p->name,
+					p->capability,p->mode);
+		break;
+	}
+	case VIDIOC_S_AUDOUT:
+	{
+		struct v4l2_audioout *p=arg;
+
+		if (!vfd->vidioc_s_audout)
+			break;
+		dbgarg(cmd, "index=%d, name=%s, capability=%d, "
+					"mode=%d\n", p->index, p->name,
+					p->capability,p->mode);
+
+		ret=vfd->vidioc_s_audout(file, fh, p);
+		break;
+	}
+	case VIDIOC_G_MODULATOR:
+	{
+		struct v4l2_modulator *p=arg;
+		if (!vfd->vidioc_g_modulator)
+			break;
+		ret=vfd->vidioc_g_modulator(file, fh, p);
+		if (!ret)
+			dbgarg(cmd, "index=%d, name=%s, "
+					"capability=%d, rangelow=%d,"
+					" rangehigh=%d, txsubchans=%d\n",
+					p->index, p->name,p->capability,
+					p->rangelow, p->rangehigh,
+					p->txsubchans);
+		break;
+	}
+	case VIDIOC_S_MODULATOR:
+	{
+		struct v4l2_modulator *p=arg;
+		if (!vfd->vidioc_s_modulator)
+			break;
+		dbgarg(cmd, "index=%d, name=%s, capability=%d, "
+				"rangelow=%d, rangehigh=%d, txsubchans=%d\n",
+				p->index, p->name,p->capability,p->rangelow,
+				p->rangehigh,p->txsubchans);
+			ret=vfd->vidioc_s_modulator(file, fh, p);
+		break;
+	}
+	case VIDIOC_G_CROP:
+	{
+		struct v4l2_crop *p=arg;
+		if (!vfd->vidioc_g_crop)
+			break;
+		ret=vfd->vidioc_g_crop(file, fh, p);
+		if (!ret) {
+			dbgarg(cmd, "type=%d\n", p->type);
+			dbgrect(vfd, "", &p->c);
+		}
+		break;
+	}
+	case VIDIOC_S_CROP:
+	{
+		struct v4l2_crop *p=arg;
+		if (!vfd->vidioc_s_crop)
+			break;
+		dbgarg(cmd, "type=%d\n", p->type);
+		dbgrect(vfd, "", &p->c);
+		ret=vfd->vidioc_s_crop(file, fh, p);
+		break;
+	}
+	case VIDIOC_CROPCAP:
+	{
+		struct v4l2_cropcap *p=arg;
+		/*FIXME: Should also show v4l2_fract pixelaspect */
+		if (!vfd->vidioc_cropcap)
+			break;
+		dbgarg(cmd, "type=%d\n", p->type);
+		dbgrect(vfd, "bounds ", &p->bounds);
+		dbgrect(vfd, "defrect ", &p->defrect);
+		ret=vfd->vidioc_cropcap(file, fh, p);
+		break;
+	}
+	case VIDIOC_G_MPEGCOMP:
+	{
+		struct v4l2_mpeg_compression *p=arg;
+		/*FIXME: Several fields not shown */
+		if (!vfd->vidioc_g_mpegcomp)
+			break;
+		ret=vfd->vidioc_g_mpegcomp(file, fh, p);
+		if (!ret)
+			dbgarg (cmd, "ts_pid_pmt=%d, ts_pid_audio=%d,"
+					" ts_pid_video=%d, ts_pid_pcr=%d, "
+					"ps_size=%d, au_sample_rate=%d, "
+					"au_pesid=%c, vi_frame_rate=%d, "
+					"vi_frames_per_gop=%d, "
+					"vi_bframes_count=%d, vi_pesid=%c\n",
+					p->ts_pid_pmt,p->ts_pid_audio,
+					p->ts_pid_video,p->ts_pid_pcr,
+					p->ps_size, p->au_sample_rate,
+					p->au_pesid, p->vi_frame_rate,
+					p->vi_frames_per_gop,
+					p->vi_bframes_count, p->vi_pesid);
+		break;
+	}
+	case VIDIOC_S_MPEGCOMP:
+	{
+		struct v4l2_mpeg_compression *p=arg;
+		/*FIXME: Several fields not shown */
+		if (!vfd->vidioc_s_mpegcomp)
+			break;
+		dbgarg (cmd, "ts_pid_pmt=%d, ts_pid_audio=%d, "
+				"ts_pid_video=%d, ts_pid_pcr=%d, ps_size=%d, "
+				"au_sample_rate=%d, au_pesid=%c, "
+				"vi_frame_rate=%d, vi_frames_per_gop=%d, "
+				"vi_bframes_count=%d, vi_pesid=%c\n",
+				p->ts_pid_pmt,p->ts_pid_audio, p->ts_pid_video,
+				p->ts_pid_pcr, p->ps_size, p->au_sample_rate,
+				p->au_pesid, p->vi_frame_rate,
+				p->vi_frames_per_gop, p->vi_bframes_count,
+				p->vi_pesid);
+		ret=vfd->vidioc_s_mpegcomp(file, fh, p);
+		break;
+	}
+	case VIDIOC_G_JPEGCOMP:
+	{
+		struct v4l2_jpegcompression *p=arg;
+		if (!vfd->vidioc_g_jpegcomp)
+			break;
+		ret=vfd->vidioc_g_jpegcomp(file, fh, p);
+		if (!ret)
+			dbgarg (cmd, "quality=%d, APPn=%d, "
+						"APP_len=%d, COM_len=%d, "
+						"jpeg_markers=%d\n",
+						p->quality,p->APPn,p->APP_len,
+						p->COM_len,p->jpeg_markers);
+		break;
+	}
+	case VIDIOC_S_JPEGCOMP:
+	{
+		struct v4l2_jpegcompression *p=arg;
+		if (!vfd->vidioc_g_jpegcomp)
+			break;
+		dbgarg (cmd, "quality=%d, APPn=%d, APP_len=%d, "
+					"COM_len=%d, jpeg_markers=%d\n",
+					p->quality,p->APPn,p->APP_len,
+					p->COM_len,p->jpeg_markers);
+			ret=vfd->vidioc_s_jpegcomp(file, fh, p);
+		break;
+	}
+	case VIDIOC_G_PARM:
+	{
+		struct v4l2_streamparm *p=arg;
+		if (!vfd->vidioc_g_parm)
+			break;
+		ret=vfd->vidioc_g_parm(file, fh, p);
+		dbgarg (cmd, "type=%d\n", p->type);
+		break;
+	}
+	case VIDIOC_S_PARM:
+	{
+		struct v4l2_streamparm *p=arg;
+		if (!vfd->vidioc_s_parm)
+			break;
+		dbgarg (cmd, "type=%d\n", p->type);
+		ret=vfd->vidioc_s_parm(file, fh, p);
+		break;
+	}
+	case VIDIOC_G_TUNER:
+	{
+		struct v4l2_tuner *p=arg;
+		if (!vfd->vidioc_g_tuner)
+			break;
+		ret=vfd->vidioc_g_tuner(file, fh, p);
+		if (!ret)
+			dbgarg (cmd, "index=%d, name=%s, type=%d, "
+					"capability=%d, rangelow=%d, "
+					"rangehigh=%d, signal=%d, afc=%d, "
+					"rxsubchans=%d, audmode=%d\n",
+					p->index, p->name, p->type,
+					p->capability, p->rangelow,
+					p->rangehigh, p->rxsubchans,
+					p->audmode, p->signal, p->afc);
+		break;
+	}
+	case VIDIOC_S_TUNER:
+	{
+		struct v4l2_tuner *p=arg;
+		if (!vfd->vidioc_s_tuner)
+			break;
+		dbgarg (cmd, "index=%d, name=%s, type=%d, "
+				"capability=%d, rangelow=%d, rangehigh=%d, "
+				"signal=%d, afc=%d, rxsubchans=%d, "
+				"audmode=%d\n",p->index, p->name, p->type,
+				p->capability, p->rangelow,p->rangehigh,
+				p->rxsubchans, p->audmode, p->signal,
+				p->afc);
+		ret=vfd->vidioc_s_tuner(file, fh, p);
+		break;
+	}
+	case VIDIOC_G_FREQUENCY:
+	{
+		struct v4l2_frequency *p=arg;
+		if (!vfd->vidioc_g_frequency)
+			break;
+		ret=vfd->vidioc_g_frequency(file, fh, p);
+		if (!ret)
+			dbgarg (cmd, "tuner=%d, type=%d, frequency=%d\n",
+						p->tuner,p->type,p->frequency);
+		break;
+	}
+	case VIDIOC_S_FREQUENCY:
+	{
+		struct v4l2_frequency *p=arg;
+		if (!vfd->vidioc_s_frequency)
+			break;
+		dbgarg (cmd, "tuner=%d, type=%d, frequency=%d\n",
+				p->tuner,p->type,p->frequency);
+		ret=vfd->vidioc_s_frequency(file, fh, p);
+		break;
+	}
+	case VIDIOC_G_SLICED_VBI_CAP:
+	{
+		struct v4l2_sliced_vbi_cap *p=arg;
+		if (!vfd->vidioc_g_sliced_vbi_cap)
+			break;
+		ret=vfd->vidioc_g_sliced_vbi_cap(file, fh, p);
+		if (!ret)
+			dbgarg (cmd, "service_set=%d\n", p->service_set);
+		break;
+	}
+	case VIDIOC_LOG_STATUS:
+	{
+		if (!vfd->vidioc_log_status)
+			break;
+		ret=vfd->vidioc_log_status(file, fh);
+		break;
+	}
+
+	/* --- Others --------------------------------------------- */
+
+	default:
+		ret=v4l_compat_translate_ioctl(inode,file,cmd,arg,__video_do_ioctl);
+	}
+
+	if (vfd->debug & V4L2_DEBUG_IOCTL_ARG) {
+		if (ret<0) {
+			printk ("%s: err:\n", vfd->name);
+			v4l_print_ioctl(vfd->name, cmd);
+		}
+	}
+
+	return ret;
+}
+
+int video_ioctl2 (struct inode *inode, struct file *file,
+	       unsigned int cmd, unsigned long arg)
+{
+	char	sbuf[128];
+	void    *mbuf = NULL;
+	void	*parg = NULL;
+	int	err  = -EINVAL;
+
+#ifdef __OLD_VIDIOC_
+	cmd = video_fix_command(cmd);
+#endif
+
+	/*  Copy arguments into temp kernel buffer  */
+	switch (_IOC_DIR(cmd)) {
+	case _IOC_NONE:
+		parg = NULL;
+		break;
+	case _IOC_READ:
+	case _IOC_WRITE:
+	case (_IOC_WRITE | _IOC_READ):
+		if (_IOC_SIZE(cmd) <= sizeof(sbuf)) {
+			parg = sbuf;
+		} else {
+			/* too big to allocate from stack */
+			mbuf = kmalloc(_IOC_SIZE(cmd),GFP_KERNEL);
+			if (NULL == mbuf)
+				return -ENOMEM;
+			parg = mbuf;
+		}
+
+		err = -EFAULT;
+		if (_IOC_DIR(cmd) & _IOC_WRITE)
+			if (copy_from_user(parg, (void __user *)arg, _IOC_SIZE(cmd)))
+				goto out;
+		break;
+	}
+
+	/* Handles IOCTL */
+	err = __video_do_ioctl(inode, file, cmd, parg);
+	if (err == -ENOIOCTLCMD)
+		err = -EINVAL;
+	if (err < 0)
+		goto out;
+
+	/*  Copy results into user buffer  */
+	switch (_IOC_DIR(cmd))
+	{
+	case _IOC_READ:
+	case (_IOC_WRITE | _IOC_READ):
+		if (copy_to_user((void __user *)arg, parg, _IOC_SIZE(cmd)))
+			err = -EFAULT;
+		break;
+	}
+
+out:
+	kfree(mbuf);
+	return err;
+}
+
+
 static struct file_operations video_fops;
 
 /**
@@ -371,7 +1508,9 @@ void video_unregister_device(struct video_device *vfd)
 	mutex_unlock(&videodev_lock);
 }
 
-
+/*
+ * Video fs operations
+ */
 static struct file_operations video_fops=
 {
 	.owner		= THIS_MODULE,
@@ -387,7 +1526,7 @@ static int __init videodev_init(void)
 {
 	int ret;
 
-	printk(KERN_INFO "Linux video capture interface: v1.00\n");
+	printk(KERN_INFO "Linux video capture interface: v2.00\n");
 	if (register_chrdev(VIDEO_MAJOR, VIDEO_NAME, &video_fops)) {
 		printk(KERN_WARNING "video_dev: unable to get major %d\n", VIDEO_MAJOR);
 		return -EIO;
@@ -418,11 +1557,12 @@ EXPORT_SYMBOL(video_devdata);
 EXPORT_SYMBOL(video_usercopy);
 EXPORT_SYMBOL(video_exclusive_open);
 EXPORT_SYMBOL(video_exclusive_release);
+EXPORT_SYMBOL(video_ioctl2);
 EXPORT_SYMBOL(video_device_alloc);
 EXPORT_SYMBOL(video_device_release);
 
-MODULE_AUTHOR("Alan Cox");
-MODULE_DESCRIPTION("Device registrar for Video4Linux drivers");
+MODULE_AUTHOR("Alan Cox, Mauro Carvalho Chehab <mchehab@infradead.org>");
+MODULE_DESCRIPTION("Device registrar for Video4Linux drivers v2");
 MODULE_LICENSE("GPL");
 
 
diff --git a/include/linux/videodev.h b/include/linux/videodev.h
index 91140091ced2..5b6205544a7a 100644
--- a/include/linux/videodev.h
+++ b/include/linux/videodev.h
@@ -1,49 +1,28 @@
+/*
+ *	Video for Linux version 1 - OBSOLETE
+ *
+ *	Header file for v4l1 drivers and applications, for
+ *	Linux kernels 2.2.x or 2.4.x.
+ *
+ *	Provides header for legacy drivers and applications
+ *
+ *	See http://linuxtv.org for more info
+ *
+ */
 #ifndef __LINUX_VIDEODEV_H
 #define __LINUX_VIDEODEV_H
 
 #include <linux/types.h>
+#include <linux/poll.h>
+#include <linux/fs.h>
+#include <linux/device.h>
+#include <linux/mutex.h>
+#include <linux/compiler.h> /* need __user */
 
 #define HAVE_V4L1 1
 
 #include <linux/videodev2.h>
 
-#ifdef __KERNEL__
-
-#include <linux/mm.h>
-
-extern struct video_device* video_devdata(struct file*);
-
-#define to_video_device(cd) container_of(cd, struct video_device, class_dev)
-static inline void
-video_device_create_file(struct video_device *vfd,
-			 struct class_device_attribute *attr)
-{
-	class_device_create_file(&vfd->class_dev, attr);
-}
-static inline void
-video_device_remove_file(struct video_device *vfd,
-			 struct class_device_attribute *attr)
-{
-	class_device_remove_file(&vfd->class_dev, attr);
-}
-
-#if OBSOLETE_OWNER /* to be removed in 2.6.15 */
-/* helper functions to access driver private data. */
-static inline void *video_get_drvdata(struct video_device *dev)
-{
-	return dev->priv;
-}
-
-static inline void video_set_drvdata(struct video_device *dev, void *data)
-{
-	dev->priv = data;
-}
-#endif
-
-extern int video_exclusive_open(struct inode *inode, struct file *file);
-extern int video_exclusive_release(struct inode *inode, struct file *file);
-#endif /* __KERNEL__ */
-
 struct video_capability
 {
 	char name[32];
@@ -363,6 +342,11 @@ struct video_code
 #define VID_HARDWARE_SAA7114H   37
 #define VID_HARDWARE_SN9C102	38
 #define VID_HARDWARE_ARV	39
+
+#ifdef __KERNEL__
+#include <media/v4l2-dev.h>
+#endif /* __KERNEL__ */
+
 #endif /* __LINUX_VIDEODEV_H */
 
 /*
diff --git a/include/linux/videodev2.h b/include/linux/videodev2.h
index b3a848b6fb1c..bb58197ab6b9 100644
--- a/include/linux/videodev2.h
+++ b/include/linux/videodev2.h
@@ -1,29 +1,23 @@
-#ifndef __LINUX_VIDEODEV2_H
-#define __LINUX_VIDEODEV2_H
 /*
  *	Video for Linux Two
  *
- *	Header file for v4l or V4L2 drivers and applications, for
- *	Linux kernels 2.2.x or 2.4.x.
+ *	Header file for v4l or V4L2 drivers and applications
+ * with public API.
+ * All kernel-specific stuff were moved to media/v4l2-dev.h, so
+ * no #if __KERNEL tests are allowed here
  *
- *	See http://bytesex.org/v4l/ for API specs and other
- *	v4l2 documentation.
+ *	See http://linuxtv.org for more info
  *
  *	Author: Bill Dirks <bdirks@pacbell.net>
  *		Justin Schoeman
  *		et al.
  */
-#ifdef __KERNEL__
+#ifndef __LINUX_VIDEODEV2_H
+#define __LINUX_VIDEODEV2_H
 #include <linux/time.h> /* need struct timeval */
-#include <linux/poll.h>
-#include <linux/device.h>
-#include <linux/mutex.h>
-#endif
 #include <linux/types.h>
 #include <linux/compiler.h> /* need __user */
 
-
-#define OBSOLETE_OWNER 1 /* It will be removed for 2.6.17 */
 #define HAVE_V4L2 1
 
 /*
@@ -48,71 +42,6 @@
 #define VID_TYPE_MJPEG_DECODER	4096	/* Can decode MJPEG streams */
 #define VID_TYPE_MJPEG_ENCODER	8192	/* Can encode MJPEG streams */
 
-#ifdef __KERNEL__
-
-/* Minor device allocation */
-#define MINOR_VFL_TYPE_GRABBER_MIN   0
-#define MINOR_VFL_TYPE_GRABBER_MAX  63
-#define MINOR_VFL_TYPE_RADIO_MIN    64
-#define MINOR_VFL_TYPE_RADIO_MAX   127
-#define MINOR_VFL_TYPE_VTX_MIN     192
-#define MINOR_VFL_TYPE_VTX_MAX     223
-#define MINOR_VFL_TYPE_VBI_MIN     224
-#define MINOR_VFL_TYPE_VBI_MAX     255
-
-#define VFL_TYPE_GRABBER	0
-#define VFL_TYPE_VBI		1
-#define VFL_TYPE_RADIO		2
-#define VFL_TYPE_VTX		3
-
-struct video_device
-{
-	/* device info */
-	struct device *dev;
-	char name[32];
-	int type;       /* v4l1 */
-	int type2;      /* v4l2 */
-	int hardware;
-	int minor;
-
-	/* device ops + callbacks */
-	const struct file_operations *fops;
-	void (*release)(struct video_device *vfd);
-
-
-#if OBSOLETE_OWNER /* to be removed in 2.6.15 */
-	/* obsolete -- fops->owner is used instead */
-	struct module *owner;
-	/* dev->driver_data will be used instead some day.
-	 * Use the video_{get|set}_drvdata() helper functions,
-	 * so the switch over will be transparent for you.
-	 * Or use {pci|usb}_{get|set}_drvdata() directly. */
-	void *priv;
-#endif
-
-	/* for videodev.c intenal usage -- please don't touch */
-	int users;                     /* video_exclusive_{open|close} ... */
-	struct mutex lock;             /* ... helper function uses these   */
-	char devfs_name[64];           /* devfs */
-	struct class_device class_dev; /* sysfs */
-};
-
-#define VIDEO_MAJOR	81
-
-extern int video_register_device(struct video_device *, int type, int nr);
-extern void video_unregister_device(struct video_device *);
-extern int video_usercopy(struct inode *inode, struct file *file,
-			  unsigned int cmd, unsigned long arg,
-			  int (*func)(struct inode *inode, struct file *file,
-				      unsigned int cmd, void *arg));
-
-/* helper functions to alloc / release struct video_device, the
-   later can be used for video_device->release() */
-struct video_device *video_device_alloc(void);
-void video_device_release(struct video_device *vfd);
-
-#endif
-
 /*
  *	M I S C E L L A N E O U S
  */
@@ -1098,6 +1027,7 @@ struct v4l2_streamparm
 #endif
 #define VIDIOC_LOG_STATUS       _IO   ('V', 70)
 
+#ifdef __OLD_VIDIOC_
 /* for compatibility, will go away some day */
 #define VIDIOC_OVERLAY_OLD     	_IOWR ('V', 14, int)
 #define VIDIOC_S_PARM_OLD      	_IOW  ('V', 22, struct v4l2_streamparm)
@@ -1105,57 +1035,10 @@ struct v4l2_streamparm
 #define VIDIOC_G_AUDIO_OLD     	_IOWR ('V', 33, struct v4l2_audio)
 #define VIDIOC_G_AUDOUT_OLD    	_IOWR ('V', 49, struct v4l2_audioout)
 #define VIDIOC_CROPCAP_OLD     	_IOR  ('V', 58, struct v4l2_cropcap)
-
-#define BASE_VIDIOC_PRIVATE	192		/* 192-255 are private */
-
-
-#ifdef __KERNEL__
-/*
- *
- *	V 4 L 2   D R I V E R   H E L P E R   A P I
- *
- *	Some commonly needed functions for drivers (v4l2-common.o module)
- */
-#include <linux/fs.h>
-
-/*  Video standard functions  */
-extern unsigned int v4l2_video_std_fps(struct v4l2_standard *vs);
-extern int v4l2_video_std_construct(struct v4l2_standard *vs,
-				    int id, char *name);
-
-/* prority handling */
-struct v4l2_prio_state {
-	atomic_t prios[4];
-};
-int v4l2_prio_init(struct v4l2_prio_state *global);
-int v4l2_prio_change(struct v4l2_prio_state *global, enum v4l2_priority *local,
-		     enum v4l2_priority new);
-int v4l2_prio_open(struct v4l2_prio_state *global, enum v4l2_priority *local);
-int v4l2_prio_close(struct v4l2_prio_state *global, enum v4l2_priority *local);
-enum v4l2_priority v4l2_prio_max(struct v4l2_prio_state *global);
-int v4l2_prio_check(struct v4l2_prio_state *global, enum v4l2_priority *local);
-
-/* names for fancy debug output */
-extern char *v4l2_field_names[];
-extern char *v4l2_type_names[];
-
-/*  Compatibility layer interface  --  v4l1-compat module */
-typedef int (*v4l2_kioctl)(struct inode *inode, struct file *file,
-			   unsigned int cmd, void *arg);
-
-#ifdef CONFIG_VIDEO_V4L1_COMPAT
-int v4l_compat_translate_ioctl(struct inode *inode, struct file *file,
-			       int cmd, void *arg, v4l2_kioctl driver_ioctl);
-#else
-#define v4l_compat_translate_ioctl(inode,file,cmd,arg,ioctl) -EINVAL
 #endif
 
-/* 32 Bits compatibility layer for 64 bits processors */
-extern long v4l_compat_ioctl32(struct file *file, unsigned int cmd,
-				unsigned long arg);
-
+#define BASE_VIDIOC_PRIVATE	192		/* 192-255 are private */
 
-#endif /* __KERNEL__ */
 #endif /* __LINUX_VIDEODEV2_H */
 
 /*
diff --git a/include/media/v4l2-common.h b/include/media/v4l2-common.h
index e94aff029cc5..1440d4ab6af9 100644
--- a/include/media/v4l2-common.h
+++ b/include/media/v4l2-common.h
@@ -26,8 +26,14 @@
 #ifndef V4L2_COMMON_H_
 #define V4L2_COMMON_H_
 
+#include <media/v4l2-dev.h>
+
 /* v4l debugging and diagnostics */
 
+/* Debug bitmask flags to be used on V4L2 */
+#define V4L2_DEBUG_IOCTL     0x01
+#define V4L2_DEBUG_IOCTL_ARG 0x02
+
 /* Common printk constucts for v4l-i2c drivers. These macros create a unique
    prefix consisting of the driver name, the adapter number and the i2c
    address. */
diff --git a/include/media/v4l2-dev.h b/include/media/v4l2-dev.h
new file mode 100644
index 000000000000..c2f54d2c9a4c
--- /dev/null
+++ b/include/media/v4l2-dev.h
@@ -0,0 +1,371 @@
+/*
+ *
+ *	V 4 L 2   D R I V E R   H E L P E R   A P I
+ *
+ * Moved from videodev2.h
+ *
+ *	Some commonly needed functions for drivers (v4l2-common.o module)
+ */
+#ifndef _V4L2_DEV_H
+#define _V4L2_DEV_H
+
+#define OBSOLETE_OWNER 1 /* to be removed soon */
+
+#include <linux/poll.h>
+#include <linux/fs.h>
+#include <linux/device.h>
+#include <linux/mutex.h>
+#include <linux/compiler.h> /* need __user */
+#ifdef CONFIG_VIDEO_V4L1
+#include <linux/videodev.h>
+#else
+#include <linux/videodev2.h>
+#endif
+
+#include <linux/fs.h>
+
+#define VIDEO_MAJOR	81
+/* Minor device allocation */
+#define MINOR_VFL_TYPE_GRABBER_MIN   0
+#define MINOR_VFL_TYPE_GRABBER_MAX  63
+#define MINOR_VFL_TYPE_RADIO_MIN    64
+#define MINOR_VFL_TYPE_RADIO_MAX   127
+#define MINOR_VFL_TYPE_VTX_MIN     192
+#define MINOR_VFL_TYPE_VTX_MAX     223
+#define MINOR_VFL_TYPE_VBI_MIN     224
+#define MINOR_VFL_TYPE_VBI_MAX     255
+
+#define VFL_TYPE_GRABBER	0
+#define VFL_TYPE_VBI		1
+#define VFL_TYPE_RADIO		2
+#define VFL_TYPE_VTX		3
+
+	const struct file_operations *fops;
+
+/*  Video standard functions  */
+extern unsigned int v4l2_video_std_fps(struct v4l2_standard *vs);
+extern int v4l2_video_std_construct(struct v4l2_standard *vs,
+				    int id, char *name);
+
+/* prority handling */
+struct v4l2_prio_state {
+	atomic_t prios[4];
+};
+int v4l2_prio_init(struct v4l2_prio_state *global);
+int v4l2_prio_change(struct v4l2_prio_state *global, enum v4l2_priority *local,
+		     enum v4l2_priority new);
+int v4l2_prio_open(struct v4l2_prio_state *global, enum v4l2_priority *local);
+int v4l2_prio_close(struct v4l2_prio_state *global, enum v4l2_priority *local);
+enum v4l2_priority v4l2_prio_max(struct v4l2_prio_state *global);
+int v4l2_prio_check(struct v4l2_prio_state *global, enum v4l2_priority *local);
+
+/* names for fancy debug output */
+extern char *v4l2_field_names[];
+extern char *v4l2_type_names[];
+
+/*  Compatibility layer interface  --  v4l1-compat module */
+typedef int (*v4l2_kioctl)(struct inode *inode, struct file *file,
+			   unsigned int cmd, void *arg);
+#ifdef CONFIG_VIDEO_V4L1_COMPAT
+int v4l_compat_translate_ioctl(struct inode *inode, struct file *file,
+			       int cmd, void *arg, v4l2_kioctl driver_ioctl);
+#else
+#define v4l_compat_translate_ioctl(inode,file,cmd,arg,ioctl) -EINVAL
+#endif
+
+/* 32 Bits compatibility layer for 64 bits processors */
+extern long v4l_compat_ioctl32(struct file *file, unsigned int cmd,
+				unsigned long arg);
+
+/*
+ * Newer version of video_device, handled by videodev2.c
+ * 	This version moves redundant code from video device code to
+ *	the common handler
+ */
+struct v4l2_tvnorm {
+	char          *name;
+	v4l2_std_id   id;
+
+	void          *priv_data;
+};
+
+struct video_device
+{
+	/* device ops */
+	struct file_operations *fops;
+
+	/* device info */
+	struct device *dev;
+	char name[32];
+	int type;       /* v4l1 */
+	int type2;      /* v4l2 */
+	int hardware;
+	int minor;
+
+	int debug;	/* Activates debug level*/
+
+	/* Video standard vars */
+	int tvnormsize;	/* Size of tvnorm array */
+	v4l2_std_id current_norm; /* Current tvnorm */
+	struct v4l2_tvnorm *tvnorms;
+
+	/* callbacks */
+	void (*release)(struct video_device *vfd);
+
+	/* ioctl callbacks */
+
+	/* VIDIOC_QUERYCAP handler */
+	int (*vidioc_querycap)(struct file *file, void *fh, struct v4l2_capability *cap);
+
+	/* Priority handling */
+	int (*vidioc_g_priority)   (struct file *file, void *fh,
+				    enum v4l2_priority *p);
+	int (*vidioc_s_priority)   (struct file *file, void *fh,
+				    enum v4l2_priority p);
+
+	/* VIDIOC_ENUM_FMT handlers */
+	int (*vidioc_enum_fmt_cap)         (struct file *file, void *fh,
+					    struct v4l2_fmtdesc *f);
+	int (*vidioc_enum_fmt_overlay)     (struct file *file, void *fh,
+					    struct v4l2_fmtdesc *f);
+	int (*vidioc_enum_fmt_vbi)         (struct file *file, void *fh,
+					    struct v4l2_fmtdesc *f);
+	int (*vidioc_enum_fmt_vbi_capture) (struct file *file, void *fh,
+					    struct v4l2_fmtdesc *f);
+	int (*vidioc_enum_fmt_video_output)(struct file *file, void *fh,
+					    struct v4l2_fmtdesc *f);
+	int (*vidioc_enum_fmt_vbi_output)  (struct file *file, void *fh,
+					    struct v4l2_fmtdesc *f);
+	int (*vidioc_enum_fmt_type_private)(struct file *file, void *fh,
+					    struct v4l2_fmtdesc *f);
+
+	/* VIDIOC_G_FMT handlers */
+	int (*vidioc_g_fmt_cap)        (struct file *file, void *fh,
+					struct v4l2_format *f);
+	int (*vidioc_g_fmt_overlay)    (struct file *file, void *fh,
+					struct v4l2_format *f);
+	int (*vidioc_g_fmt_vbi)        (struct file *file, void *fh,
+					struct v4l2_format *f);
+	int (*vidioc_g_fmt_vbi_output) (struct file *file, void *fh,
+					struct v4l2_format *f);
+	int (*vidioc_g_fmt_vbi_capture)(struct file *file, void *fh,
+					struct v4l2_format *f);
+	int (*vidioc_g_fmt_video_output)(struct file *file, void *fh,
+					struct v4l2_format *f);
+	int (*vidioc_g_fmt_type_private)(struct file *file, void *fh,
+					struct v4l2_format *f);
+
+	/* VIDIOC_S_FMT handlers */
+	int (*vidioc_s_fmt_cap)        (struct file *file, void *fh,
+					struct v4l2_format *f);
+
+	int (*vidioc_s_fmt_overlay)    (struct file *file, void *fh,
+					struct v4l2_format *f);
+	int (*vidioc_s_fmt_vbi)        (struct file *file, void *fh,
+					struct v4l2_format *f);
+	int (*vidioc_s_fmt_vbi_output) (struct file *file, void *fh,
+					struct v4l2_format *f);
+	int (*vidioc_s_fmt_vbi_capture)(struct file *file, void *fh,
+					struct v4l2_format *f);
+	int (*vidioc_s_fmt_video_output)(struct file *file, void *fh,
+					struct v4l2_format *f);
+	int (*vidioc_s_fmt_type_private)(struct file *file, void *fh,
+					struct v4l2_format *f);
+
+	/* VIDIOC_TRY_FMT handlers */
+	int (*vidioc_try_fmt_cap)        (struct file *file, void *fh,
+					  struct v4l2_format *f);
+	int (*vidioc_try_fmt_overlay)    (struct file *file, void *fh,
+					  struct v4l2_format *f);
+	int (*vidioc_try_fmt_vbi)        (struct file *file, void *fh,
+					  struct v4l2_format *f);
+	int (*vidioc_try_fmt_vbi_output) (struct file *file, void *fh,
+					  struct v4l2_format *f);
+	int (*vidioc_try_fmt_vbi_capture)(struct file *file, void *fh,
+					  struct v4l2_format *f);
+	int (*vidioc_try_fmt_video_output)(struct file *file, void *fh,
+					  struct v4l2_format *f);
+	int (*vidioc_try_fmt_type_private)(struct file *file, void *fh,
+					  struct v4l2_format *f);
+
+	/* Buffer handlers */
+	int (*vidioc_reqbufs) (struct file *file, void *fh, struct v4l2_requestbuffers *b);
+	int (*vidioc_querybuf)(struct file *file, void *fh, struct v4l2_buffer *b);
+	int (*vidioc_qbuf)    (struct file *file, void *fh, struct v4l2_buffer *b);
+	int (*vidioc_dqbuf)   (struct file *file, void *fh, struct v4l2_buffer *b);
+
+
+	int (*vidioc_overlay) (struct file *file, void *fh, unsigned int i);
+#ifdef HAVE_V4L1
+			/* buffer type is struct vidio_mbuf * */
+	int (*vidiocgmbuf)  (struct file *file, void *fh, struct video_mbuf *p);
+#endif
+	int (*vidioc_g_fbuf)   (struct file *file, void *fh,
+				struct v4l2_framebuffer *a);
+	int (*vidioc_s_fbuf)   (struct file *file, void *fh,
+				struct v4l2_framebuffer *a);
+
+		/* Stream on/off */
+	int (*vidioc_streamon) (struct file *file, void *fh, enum v4l2_buf_type i);
+	int (*vidioc_streamoff)(struct file *file, void *fh, enum v4l2_buf_type i);
+
+		/* Standard handling
+			G_STD and ENUMSTD are handled by videodev.c
+		 */
+	int (*vidioc_s_std)    (struct file *file, void *fh, v4l2_std_id a);
+	int (*vidioc_querystd) (struct file *file, void *fh, v4l2_std_id *a);
+
+		/* Input handling */
+	int (*vidioc_enum_input)(struct file *file, void *fh,
+				 struct v4l2_input *inp);
+	int (*vidioc_g_input)   (struct file *file, void *fh, unsigned int *i);
+	int (*vidioc_s_input)   (struct file *file, void *fh, unsigned int i);
+
+		/* Output handling */
+	int (*vidioc_enumoutput) (struct file *file, void *fh,
+				  struct v4l2_output *a);
+	int (*vidioc_g_output)   (struct file *file, void *fh, unsigned int *i);
+	int (*vidioc_s_output)   (struct file *file, void *fh, unsigned int i);
+
+		/* Control handling */
+	int (*vidioc_queryctrl)        (struct file *file, void *fh,
+					struct v4l2_queryctrl *a);
+	int (*vidioc_g_ctrl)           (struct file *file, void *fh,
+					struct v4l2_control *a);
+	int (*vidioc_s_ctrl)           (struct file *file, void *fh,
+					struct v4l2_control *a);
+	int (*vidioc_querymenu)        (struct file *file, void *fh,
+					struct v4l2_querymenu *a);
+
+	/* Audio ioctls */
+	int (*vidioc_enumaudio)        (struct file *file, void *fh,
+					struct v4l2_audio *a);
+	int (*vidioc_g_audio)          (struct file *file, void *fh,
+					struct v4l2_audio *a);
+	int (*vidioc_s_audio)          (struct file *file, void *fh,
+					struct v4l2_audio *a);
+
+	/* Audio out ioctls */
+	int (*vidioc_enumaudout)       (struct file *file, void *fh,
+					struct v4l2_audioout *a);
+	int (*vidioc_g_audout)         (struct file *file, void *fh,
+					struct v4l2_audioout *a);
+	int (*vidioc_s_audout)         (struct file *file, void *fh,
+					struct v4l2_audioout *a);
+	int (*vidioc_g_modulator)      (struct file *file, void *fh,
+					struct v4l2_modulator *a);
+	int (*vidioc_s_modulator)      (struct file *file, void *fh,
+					struct v4l2_modulator *a);
+	/* Crop ioctls */
+	int (*vidioc_cropcap)          (struct file *file, void *fh,
+					struct v4l2_cropcap *a);
+	int (*vidioc_g_crop)           (struct file *file, void *fh,
+					struct v4l2_crop *a);
+	int (*vidioc_s_crop)           (struct file *file, void *fh,
+					struct v4l2_crop *a);
+	/* Compression ioctls */
+	int (*vidioc_g_mpegcomp)       (struct file *file, void *fh,
+					struct v4l2_mpeg_compression *a);
+	int (*vidioc_s_mpegcomp)       (struct file *file, void *fh,
+					struct v4l2_mpeg_compression *a);
+	int (*vidioc_g_jpegcomp)       (struct file *file, void *fh,
+					struct v4l2_jpegcompression *a);
+	int (*vidioc_s_jpegcomp)       (struct file *file, void *fh,
+					struct v4l2_jpegcompression *a);
+
+	/* Stream type-dependent parameter ioctls */
+	int (*vidioc_g_parm)           (struct file *file, void *fh,
+					struct v4l2_streamparm *a);
+	int (*vidioc_s_parm)           (struct file *file, void *fh,
+					struct v4l2_streamparm *a);
+
+	/* Tuner ioctls */
+	int (*vidioc_g_tuner)          (struct file *file, void *fh,
+					struct v4l2_tuner *a);
+	int (*vidioc_s_tuner)          (struct file *file, void *fh,
+					struct v4l2_tuner *a);
+	int (*vidioc_g_frequency)      (struct file *file, void *fh,
+					struct v4l2_frequency *a);
+	int (*vidioc_s_frequency)      (struct file *file, void *fh,
+					struct v4l2_frequency *a);
+
+	/* Sliced VBI cap */
+	int (*vidioc_g_sliced_vbi_cap) (struct file *file, void *fh,
+					struct v4l2_sliced_vbi_cap *a);
+
+	/* Log status ioctl */
+	int (*vidioc_log_status)       (struct file *file, void *fh);
+
+
+#if OBSOLETE_OWNER /* to be removed soon */
+/* obsolete -- fops->owner is used instead */
+struct module *owner;
+/* dev->driver_data will be used instead some day.
+	* Use the video_{get|set}_drvdata() helper functions,
+	* so the switch over will be transparent for you.
+	* Or use {pci|usb}_{get|set}_drvdata() directly. */
+void *priv;
+#endif
+
+	/* for videodev.c intenal usage -- please don't touch */
+	int users;                     /* video_exclusive_{open|close} ... */
+	struct mutex lock;             /* ... helper function uses these   */
+	char devfs_name[64];           /* devfs */
+	struct class_device class_dev; /* sysfs */
+};
+
+/* Version 2 functions */
+extern int video_register_device(struct video_device *vfd, int type, int nr);
+void video_unregister_device(struct video_device *);
+extern int video_ioctl2(struct inode *inode, struct file *file,
+			  unsigned int cmd, unsigned long arg);
+
+/* helper functions to alloc / release struct video_device, the
+   later can be used for video_device->release() */
+struct video_device *video_device_alloc(void);
+void video_device_release(struct video_device *vfd);
+
+/* Include support for obsoleted stuff */
+extern int video_usercopy(struct inode *inode, struct file *file,
+			  unsigned int cmd, unsigned long arg,
+			  int (*func)(struct inode *inode, struct file *file,
+				      unsigned int cmd, void *arg));
+
+
+#ifdef HAVE_V4L1
+#include <linux/mm.h>
+
+extern struct video_device* video_devdata(struct file*);
+
+#define to_video_device(cd) container_of(cd, struct video_device, class_dev)
+static inline void
+video_device_create_file(struct video_device *vfd,
+			 struct class_device_attribute *attr)
+{
+	class_device_create_file(&vfd->class_dev, attr);
+}
+static inline void
+video_device_remove_file(struct video_device *vfd,
+			 struct class_device_attribute *attr)
+{
+	class_device_remove_file(&vfd->class_dev, attr);
+}
+
+#if OBSOLETE_OWNER /* to be removed soon */
+/* helper functions to access driver private data. */
+static inline void *video_get_drvdata(struct video_device *dev)
+{
+	return dev->priv;
+}
+
+static inline void video_set_drvdata(struct video_device *dev, void *data)
+{
+	dev->priv = data;
+}
+#endif
+
+extern int video_exclusive_open(struct inode *inode, struct file *file);
+extern int video_exclusive_release(struct inode *inode, struct file *file);
+#endif /* HAVE_V4L1 */
+
+#endif /* _V4L2_DEV_H */
diff --git a/include/media/video-buf.h b/include/media/video-buf.h
index fff3fd0fbf94..1115a256969f 100644
--- a/include/media/video-buf.h
+++ b/include/media/video-buf.h
@@ -23,6 +23,7 @@
  */
 
 #include <linux/videodev2.h>
+#include <linux/poll.h>
 
 #define UNSET (-1U)
 
-- 
cgit v1.2.3


From 5e87efa3b29c105f81fea785babafb098e4e046d Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab@infradead.org>
Date: Mon, 5 Jun 2006 10:26:32 -0300
Subject: V4L/DVB (4068): Removed all references to kernel stuff from
 videodev.h and videodev2.h

The videodev.h and videodev2.h describe the public API for V4L and V4L2.
It shouldn't have there any kernel-specific stuff. Those were moved to
v4l2-dev.h.
This patch removes some uneeded headers and include v4l2-common.h on all
V4L driver. This header includes device implementation of V4L2 API provided
on v4l2-dev.h as well as V4L2 internal ioctls that provides connections
between master driver and its i2c devices.

Signed-off-by: Mauro Carvalho Chehab <mchehab@infradead.org>
---
 drivers/media/radio/miropcm20-radio.c   |  1 +
 drivers/media/radio/radio-aimslab.c     |  1 +
 drivers/media/radio/radio-aztech.c      |  1 +
 drivers/media/radio/radio-cadet.c       |  1 +
 drivers/media/radio/radio-gemtek-pci.c  |  1 +
 drivers/media/radio/radio-gemtek.c      |  1 +
 drivers/media/radio/radio-maestro.c     |  2 +-
 drivers/media/radio/radio-maxiradio.c   |  1 +
 drivers/media/radio/radio-rtrack2.c     |  1 +
 drivers/media/radio/radio-sf16fmi.c     |  1 +
 drivers/media/radio/radio-sf16fmr2.c    |  1 +
 drivers/media/radio/radio-terratec.c    |  1 +
 drivers/media/radio/radio-trust.c       |  1 +
 drivers/media/radio/radio-typhoon.c     |  1 +
 drivers/media/radio/radio-zoltrix.c     |  1 +
 drivers/media/video/arv.c               |  1 +
 drivers/media/video/bt8xx/bttvp.h       |  1 +
 drivers/media/video/bw-qcam.c           |  1 +
 drivers/media/video/c-qcam.c            |  1 +
 drivers/media/video/cpia.h              |  1 +
 drivers/media/video/cpia2/cpia2.h       |  1 +
 drivers/media/video/dsbr100.c           |  1 +
 drivers/media/video/meye.c              |  1 +
 drivers/media/video/ov511.h             |  1 +
 drivers/media/video/planb.c             |  1 +
 drivers/media/video/pms.c               |  1 +
 drivers/media/video/pwc/pwc.h           |  2 +-
 drivers/media/video/saa5246a.c          |  1 +
 drivers/media/video/saa5249.c           |  1 +
 drivers/media/video/saa7110.c           |  1 +
 drivers/media/video/se401.h             |  1 +
 drivers/media/video/stradis.c           |  1 +
 drivers/media/video/stv680.c            |  1 +
 drivers/media/video/tda9875.c           |  1 +
 drivers/media/video/tuner-3036.c        |  1 +
 drivers/media/video/usbvideo/usbvideo.h |  1 +
 drivers/media/video/v4l1-compat.c       |  1 +
 drivers/media/video/vino.c              |  2 +-
 drivers/media/video/vpx3220.c           |  1 +
 drivers/media/video/w9966.c             |  1 +
 drivers/media/video/zoran_card.c        |  1 +
 drivers/media/video/zoran_driver.c      |  1 +
 drivers/media/video/zoran_procfs.c      |  1 +
 include/linux/videodev.h                | 11 -----------
 include/linux/videodev2.h               | 18 ++++--------------
 include/media/ovcamchip.h               |  1 +
 include/media/saa7146_vv.h              |  2 +-
 include/media/v4l2-dev.h                |  8 +++-----
 48 files changed, 52 insertions(+), 34 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/media/radio/miropcm20-radio.c b/drivers/media/radio/miropcm20-radio.c
index 7765c6a0cfe6..c4312fa0e2f5 100644
--- a/drivers/media/radio/miropcm20-radio.c
+++ b/drivers/media/radio/miropcm20-radio.c
@@ -22,6 +22,7 @@
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/videodev.h>
+#include <media/v4l2-common.h>
 #include "oss/aci.h"
 #include "miropcm20-rds-core.h"
 
diff --git a/drivers/media/radio/radio-aimslab.c b/drivers/media/radio/radio-aimslab.c
index 36119d77f868..df22a582e7a2 100644
--- a/drivers/media/radio/radio-aimslab.c
+++ b/drivers/media/radio/radio-aimslab.c
@@ -34,6 +34,7 @@
 #include <asm/io.h>		/* outb, outb_p			*/
 #include <asm/uaccess.h>	/* copy to/from user		*/
 #include <linux/videodev.h>	/* kernel radio structs		*/
+#include <media/v4l2-common.h>
 #include <linux/config.h>	/* CONFIG_RADIO_RTRACK_PORT 	*/
 #include <asm/semaphore.h>	/* Lock for the I/O 		*/
 
diff --git a/drivers/media/radio/radio-aztech.c b/drivers/media/radio/radio-aztech.c
index ca676245c071..95e6322133ee 100644
--- a/drivers/media/radio/radio-aztech.c
+++ b/drivers/media/radio/radio-aztech.c
@@ -31,6 +31,7 @@
 #include <asm/io.h>		/* outb, outb_p			*/
 #include <asm/uaccess.h>	/* copy to/from user		*/
 #include <linux/videodev.h>	/* kernel radio structs		*/
+#include <media/v4l2-common.h>
 #include <linux/config.h>	/* CONFIG_RADIO_AZTECH_PORT 	*/
 
 /* acceptable ports: 0x350 (JP3 shorted), 0x358 (JP3 open) */
diff --git a/drivers/media/radio/radio-cadet.c b/drivers/media/radio/radio-cadet.c
index c048454c7ba5..8641aec7baf8 100644
--- a/drivers/media/radio/radio-cadet.c
+++ b/drivers/media/radio/radio-cadet.c
@@ -34,6 +34,7 @@
 #include <asm/io.h>		/* outb, outb_p			*/
 #include <asm/uaccess.h>	/* copy to/from user		*/
 #include <linux/videodev.h>	/* kernel radio structs		*/
+#include <media/v4l2-common.h>
 #include <linux/param.h>
 #include <linux/pnp.h>
 
diff --git a/drivers/media/radio/radio-gemtek-pci.c b/drivers/media/radio/radio-gemtek-pci.c
index d5c3c4b878c7..9f249e7e60c9 100644
--- a/drivers/media/radio/radio-gemtek-pci.c
+++ b/drivers/media/radio/radio-gemtek-pci.c
@@ -44,6 +44,7 @@
 #include <linux/init.h>
 #include <linux/pci.h>
 #include <linux/videodev.h>
+#include <media/v4l2-common.h>
 #include <linux/errno.h>
 
 #include <asm/io.h>
diff --git a/drivers/media/radio/radio-gemtek.c b/drivers/media/radio/radio-gemtek.c
index 77a1e12333a3..162f37d8bf96 100644
--- a/drivers/media/radio/radio-gemtek.c
+++ b/drivers/media/radio/radio-gemtek.c
@@ -22,6 +22,7 @@
 #include <asm/io.h>		/* outb, outb_p			*/
 #include <asm/uaccess.h>	/* copy to/from user		*/
 #include <linux/videodev.h>	/* kernel radio structs		*/
+#include <media/v4l2-common.h>
 #include <linux/config.h>	/* CONFIG_RADIO_GEMTEK_PORT 	*/
 #include <linux/spinlock.h>
 
diff --git a/drivers/media/radio/radio-maestro.c b/drivers/media/radio/radio-maestro.c
index 2501792e1fa8..fcfa6c9fe225 100644
--- a/drivers/media/radio/radio-maestro.c
+++ b/drivers/media/radio/radio-maestro.c
@@ -26,7 +26,7 @@
 #include <linux/mutex.h>
 #include <linux/pci.h>
 #include <linux/videodev.h>
-
+#include <media/v4l2-common.h>
 
 #define DRIVER_VERSION	"0.05"
 
diff --git a/drivers/media/radio/radio-maxiradio.c b/drivers/media/radio/radio-maxiradio.c
index fe2552569690..f93d7afe7304 100644
--- a/drivers/media/radio/radio-maxiradio.c
+++ b/drivers/media/radio/radio-maxiradio.c
@@ -41,6 +41,7 @@
 
 #include <linux/pci.h>
 #include <linux/videodev.h>
+#include <media/v4l2-common.h>
 
 /* version 0.75      Sun Feb  4 22:51:27 EET 2001 */
 #define DRIVER_VERSION	"0.75"
diff --git a/drivers/media/radio/radio-rtrack2.c b/drivers/media/radio/radio-rtrack2.c
index 3821d25ed411..5b68ac4c7322 100644
--- a/drivers/media/radio/radio-rtrack2.c
+++ b/drivers/media/radio/radio-rtrack2.c
@@ -15,6 +15,7 @@
 #include <asm/io.h>		/* outb, outb_p			*/
 #include <asm/uaccess.h>	/* copy to/from user		*/
 #include <linux/videodev.h>	/* kernel radio structs		*/
+#include <media/v4l2-common.h>
 #include <linux/config.h>	/* CONFIG_RADIO_RTRACK2_PORT 	*/
 #include <linux/spinlock.h>
 
diff --git a/drivers/media/radio/radio-sf16fmi.c b/drivers/media/radio/radio-sf16fmi.c
index 70cfbc3910dd..efee6e339d15 100644
--- a/drivers/media/radio/radio-sf16fmi.c
+++ b/drivers/media/radio/radio-sf16fmi.c
@@ -21,6 +21,7 @@
 #include <linux/ioport.h>	/* request_region		*/
 #include <linux/delay.h>	/* udelay			*/
 #include <linux/videodev.h>	/* kernel radio structs		*/
+#include <media/v4l2-common.h>
 #include <linux/isapnp.h>
 #include <asm/io.h>		/* outb, outb_p			*/
 #include <asm/uaccess.h>	/* copy to/from user		*/
diff --git a/drivers/media/radio/radio-sf16fmr2.c b/drivers/media/radio/radio-sf16fmr2.c
index ca560a4cd41f..3483b2c7bc9d 100644
--- a/drivers/media/radio/radio-sf16fmr2.c
+++ b/drivers/media/radio/radio-sf16fmr2.c
@@ -19,6 +19,7 @@
 #include <asm/io.h>		/* outb, outb_p			*/
 #include <asm/uaccess.h>	/* copy to/from user		*/
 #include <linux/videodev.h>	/* kernel radio structs		*/
+#include <media/v4l2-common.h>
 #include <linux/mutex.h>
 
 static struct mutex lock;
diff --git a/drivers/media/radio/radio-terratec.c b/drivers/media/radio/radio-terratec.c
index 3ac0c361b9a2..dfba4ae596cd 100644
--- a/drivers/media/radio/radio-terratec.c
+++ b/drivers/media/radio/radio-terratec.c
@@ -30,6 +30,7 @@
 #include <asm/io.h>		/* outb, outb_p			*/
 #include <asm/uaccess.h>	/* copy to/from user		*/
 #include <linux/videodev.h>	/* kernel radio structs		*/
+#include <media/v4l2-common.h>
 #include <linux/config.h>	/* CONFIG_RADIO_TERRATEC_PORT 	*/
 #include <linux/spinlock.h>
 
diff --git a/drivers/media/radio/radio-trust.c b/drivers/media/radio/radio-trust.c
index efcec0181c2c..8da4badc22b4 100644
--- a/drivers/media/radio/radio-trust.c
+++ b/drivers/media/radio/radio-trust.c
@@ -22,6 +22,7 @@
 #include <asm/io.h>
 #include <asm/uaccess.h>
 #include <linux/videodev.h>
+#include <media/v4l2-common.h>
 #include <linux/config.h>	/* CONFIG_RADIO_TRUST_PORT 	*/
 
 /* acceptable ports: 0x350 (JP3 shorted), 0x358 (JP3 open) */
diff --git a/drivers/media/radio/radio-typhoon.c b/drivers/media/radio/radio-typhoon.c
index e42409906682..cf4fc08bd8a7 100644
--- a/drivers/media/radio/radio-typhoon.c
+++ b/drivers/media/radio/radio-typhoon.c
@@ -36,6 +36,7 @@
 #include <asm/io.h>		/* outb, outb_p                   */
 #include <asm/uaccess.h>	/* copy to/from user              */
 #include <linux/videodev.h>	/* kernel radio structs           */
+#include <media/v4l2-common.h>
 #include <linux/config.h>	/* CONFIG_RADIO_TYPHOON_*         */
 
 #define BANNER "Typhoon Radio Card driver v0.1\n"
diff --git a/drivers/media/radio/radio-zoltrix.c b/drivers/media/radio/radio-zoltrix.c
index 8aceea083980..59b86a6b4b0e 100644
--- a/drivers/media/radio/radio-zoltrix.c
+++ b/drivers/media/radio/radio-zoltrix.c
@@ -33,6 +33,7 @@
 #include <asm/io.h>		/* outb, outb_p                   */
 #include <asm/uaccess.h>	/* copy to/from user              */
 #include <linux/videodev.h>	/* kernel radio structs           */
+#include <media/v4l2-common.h>
 #include <linux/config.h>	/* CONFIG_RADIO_ZOLTRIX_PORT      */
 
 #ifndef CONFIG_RADIO_ZOLTRIX_PORT
diff --git a/drivers/media/video/arv.c b/drivers/media/video/arv.c
index dbe025170599..53824cc229fa 100644
--- a/drivers/media/video/arv.c
+++ b/drivers/media/video/arv.c
@@ -31,6 +31,7 @@
 #include <linux/mm.h>
 #include <linux/sched.h>
 #include <linux/videodev.h>
+#include <media/v4l2-common.h>
 #include <linux/mutex.h>
 
 #include <asm/uaccess.h>
diff --git a/drivers/media/video/bt8xx/bttvp.h b/drivers/media/video/bt8xx/bttvp.h
index 4183a21cf161..d2956010f763 100644
--- a/drivers/media/video/bt8xx/bttvp.h
+++ b/drivers/media/video/bt8xx/bttvp.h
@@ -33,6 +33,7 @@
 #include <linux/i2c.h>
 #include <linux/i2c-algo-bit.h>
 #include <linux/videodev.h>
+#include <media/v4l2-common.h>
 #include <linux/pci.h>
 #include <linux/input.h>
 #include <linux/mutex.h>
diff --git a/drivers/media/video/bw-qcam.c b/drivers/media/video/bw-qcam.c
index cf61c590f4ad..709c07cee290 100644
--- a/drivers/media/video/bw-qcam.c
+++ b/drivers/media/video/bw-qcam.c
@@ -73,6 +73,7 @@ OTHER DEALINGS IN THE SOFTWARE.
 #include <linux/parport.h>
 #include <linux/sched.h>
 #include <linux/videodev.h>
+#include <media/v4l2-common.h>
 #include <linux/mutex.h>
 #include <asm/uaccess.h>
 
diff --git a/drivers/media/video/c-qcam.c b/drivers/media/video/c-qcam.c
index 22a7386bbea6..a3989bd2f81b 100644
--- a/drivers/media/video/c-qcam.c
+++ b/drivers/media/video/c-qcam.c
@@ -34,6 +34,7 @@
 #include <linux/parport.h>
 #include <linux/sched.h>
 #include <linux/videodev.h>
+#include <media/v4l2-common.h>
 #include <linux/mutex.h>
 
 #include <asm/uaccess.h>
diff --git a/drivers/media/video/cpia.h b/drivers/media/video/cpia.h
index 3dc88b7558bf..6eaa692021c5 100644
--- a/drivers/media/video/cpia.h
+++ b/drivers/media/video/cpia.h
@@ -45,6 +45,7 @@
 
 #include <asm/uaccess.h>
 #include <linux/videodev.h>
+#include <media/v4l2-common.h>
 #include <linux/list.h>
 #include <linux/smp_lock.h>
 #include <linux/mutex.h>
diff --git a/drivers/media/video/cpia2/cpia2.h b/drivers/media/video/cpia2/cpia2.h
index 1764991b0ac9..c5ecb2be5f93 100644
--- a/drivers/media/video/cpia2/cpia2.h
+++ b/drivers/media/video/cpia2/cpia2.h
@@ -33,6 +33,7 @@
 
 #include <linux/version.h>
 #include <linux/videodev.h>
+#include <media/v4l2-common.h>
 #include <linux/usb.h>
 #include <linux/poll.h>
 
diff --git a/drivers/media/video/dsbr100.c b/drivers/media/video/dsbr100.c
index 3b4e9985c3d7..f7e33f9ee8e9 100644
--- a/drivers/media/video/dsbr100.c
+++ b/drivers/media/video/dsbr100.c
@@ -72,6 +72,7 @@
 #include <linux/slab.h>
 #include <linux/input.h>
 #include <linux/videodev.h>
+#include <media/v4l2-common.h>
 #include <linux/usb.h>
 #include <linux/smp_lock.h>
 
diff --git a/drivers/media/video/meye.c b/drivers/media/video/meye.c
index 595a3ea7574e..f68ca7d9f531 100644
--- a/drivers/media/video/meye.c
+++ b/drivers/media/video/meye.c
@@ -32,6 +32,7 @@
 #include <linux/sched.h>
 #include <linux/init.h>
 #include <linux/videodev.h>
+#include <media/v4l2-common.h>
 #include <asm/uaccess.h>
 #include <asm/io.h>
 #include <linux/delay.h>
diff --git a/drivers/media/video/ov511.h b/drivers/media/video/ov511.h
index 12b3d51e1c34..68b082bcee1d 100644
--- a/drivers/media/video/ov511.h
+++ b/drivers/media/video/ov511.h
@@ -3,6 +3,7 @@
 
 #include <asm/uaccess.h>
 #include <linux/videodev.h>
+#include <media/v4l2-common.h>
 #include <linux/smp_lock.h>
 #include <linux/usb.h>
 #include <linux/mutex.h>
diff --git a/drivers/media/video/planb.c b/drivers/media/video/planb.c
index d9e3cada52f4..3484e36b6801 100644
--- a/drivers/media/video/planb.c
+++ b/drivers/media/video/planb.c
@@ -40,6 +40,7 @@
 #include <linux/mm.h>
 #include <linux/sched.h>
 #include <linux/videodev.h>
+#include <media/v4l2-common.h>
 #include <linux/wait.h>
 #include <asm/uaccess.h>
 #include <asm/io.h>
diff --git a/drivers/media/video/pms.c b/drivers/media/video/pms.c
index 09835ca098b1..fb6471e74dc5 100644
--- a/drivers/media/video/pms.c
+++ b/drivers/media/video/pms.c
@@ -30,6 +30,7 @@
 #include <asm/io.h>
 #include <linux/sched.h>
 #include <linux/videodev.h>
+#include <media/v4l2-common.h>
 #include <linux/mutex.h>
 
 #include <asm/uaccess.h>
diff --git a/drivers/media/video/pwc/pwc.h b/drivers/media/video/pwc/pwc.h
index 1fd8c34d1181..a087108d0e02 100644
--- a/drivers/media/video/pwc/pwc.h
+++ b/drivers/media/video/pwc/pwc.h
@@ -35,7 +35,7 @@
 #include <asm/semaphore.h>
 #include <asm/errno.h>
 #include <linux/videodev.h>
-#include <linux/videodev2.h>
+#include <media/v4l2-common.h>
 
 #include "pwc-uncompress.h"
 #include <media/pwc-ioctl.h>
diff --git a/drivers/media/video/saa5246a.c b/drivers/media/video/saa5246a.c
index dd830e0e5e96..59a187272c83 100644
--- a/drivers/media/video/saa5246a.c
+++ b/drivers/media/video/saa5246a.c
@@ -46,6 +46,7 @@
 #include <linux/i2c.h>
 #include <linux/videotext.h>
 #include <linux/videodev.h>
+#include <media/v4l2-common.h>
 #include <linux/mutex.h>
 
 #include "saa5246a.h"
diff --git a/drivers/media/video/saa5249.c b/drivers/media/video/saa5249.c
index 531e9461cb66..19a8d65699f8 100644
--- a/drivers/media/video/saa5249.c
+++ b/drivers/media/video/saa5249.c
@@ -56,6 +56,7 @@
 #include <linux/i2c.h>
 #include <linux/videotext.h>
 #include <linux/videodev.h>
+#include <media/v4l2-common.h>
 #include <linux/mutex.h>
 
 
diff --git a/drivers/media/video/saa7110.c b/drivers/media/video/saa7110.c
index 41d951db6ec0..676b9970eb2e 100644
--- a/drivers/media/video/saa7110.c
+++ b/drivers/media/video/saa7110.c
@@ -43,6 +43,7 @@ MODULE_LICENSE("GPL");
 #define I2C_NAME(s) (s)->name
 
 #include <linux/videodev.h>
+#include <media/v4l2-common.h>
 #include <linux/video_decoder.h>
 
 static int debug = 0;
diff --git a/drivers/media/video/se401.h b/drivers/media/video/se401.h
index a7a216bd4413..c0891b3e0018 100644
--- a/drivers/media/video/se401.h
+++ b/drivers/media/video/se401.h
@@ -4,6 +4,7 @@
 
 #include <asm/uaccess.h>
 #include <linux/videodev.h>
+#include <media/v4l2-common.h>
 #include <linux/smp_lock.h>
 #include <linux/mutex.h>
 
diff --git a/drivers/media/video/stradis.c b/drivers/media/video/stradis.c
index 07476c71174a..6be9c1131e1f 100644
--- a/drivers/media/video/stradis.c
+++ b/drivers/media/video/stradis.c
@@ -42,6 +42,7 @@
 #include <asm/uaccess.h>
 #include <linux/vmalloc.h>
 #include <linux/videodev.h>
+#include <media/v4l2-common.h>
 
 #include "saa7146.h"
 #include "saa7146reg.h"
diff --git a/drivers/media/video/stv680.c b/drivers/media/video/stv680.c
index b38bda83a7c5..351b182d921f 100644
--- a/drivers/media/video/stv680.c
+++ b/drivers/media/video/stv680.c
@@ -66,6 +66,7 @@
 #include <linux/pagemap.h>
 #include <linux/errno.h>
 #include <linux/videodev.h>
+#include <media/v4l2-common.h>
 #include <linux/usb.h>
 #include <linux/mutex.h>
 
diff --git a/drivers/media/video/tda9875.c b/drivers/media/video/tda9875.c
index 103ccb919292..5b6aa49bf4f2 100644
--- a/drivers/media/video/tda9875.c
+++ b/drivers/media/video/tda9875.c
@@ -26,6 +26,7 @@
 #include <linux/errno.h>
 #include <linux/slab.h>
 #include <linux/videodev.h>
+#include <media/v4l2-common.h>
 #include <linux/i2c.h>
 #include <linux/i2c-algo-bit.h>
 #include <linux/init.h>
diff --git a/drivers/media/video/tuner-3036.c b/drivers/media/video/tuner-3036.c
index 74ab48c09c6a..bdf506e6ae27 100644
--- a/drivers/media/video/tuner-3036.c
+++ b/drivers/media/video/tuner-3036.c
@@ -25,6 +25,7 @@
 
 #include <linux/i2c.h>
 #include <linux/videodev.h>
+#include <media/v4l2-common.h>
 
 #include <media/tuner.h>
 
diff --git a/drivers/media/video/usbvideo/usbvideo.h b/drivers/media/video/usbvideo/usbvideo.h
index 3cbf4fc499a3..49dbee5f5628 100644
--- a/drivers/media/video/usbvideo/usbvideo.h
+++ b/drivers/media/video/usbvideo/usbvideo.h
@@ -18,6 +18,7 @@
 
 #include <linux/config.h>
 #include <linux/videodev.h>
+#include <media/v4l2-common.h>
 #include <linux/usb.h>
 #include <linux/mutex.h>
 
diff --git a/drivers/media/video/v4l1-compat.c b/drivers/media/video/v4l1-compat.c
index 474a29bc1760..19d3c20dc7ef 100644
--- a/drivers/media/video/v4l1-compat.c
+++ b/drivers/media/video/v4l1-compat.c
@@ -32,6 +32,7 @@
 #include <linux/errno.h>
 #include <linux/slab.h>
 #include <linux/videodev.h>
+#include <media/v4l2-common.h>
 
 #include <asm/uaccess.h>
 #include <asm/system.h>
diff --git a/drivers/media/video/vino.c b/drivers/media/video/vino.c
index a8c101494cf5..268e69fdefc6 100644
--- a/drivers/media/video/vino.c
+++ b/drivers/media/video/vino.c
@@ -40,7 +40,7 @@
 #include <linux/i2c-algo-sgi.h>
 
 #include <linux/videodev.h>
-#include <linux/videodev2.h>
+#include <media/v4l2-common.h>
 #include <linux/video_decoder.h>
 #include <linux/mutex.h>
 
diff --git a/drivers/media/video/vpx3220.c b/drivers/media/video/vpx3220.c
index 40b205b91481..1eca7e65d235 100644
--- a/drivers/media/video/vpx3220.c
+++ b/drivers/media/video/vpx3220.c
@@ -34,6 +34,7 @@
 #define I2C_NAME(x) (x)->name
 
 #include <linux/videodev.h>
+#include <media/v4l2-common.h>
 #include <linux/video_decoder.h>
 
 #define I2C_VPX3220        0x86
diff --git a/drivers/media/video/w9966.c b/drivers/media/video/w9966.c
index 80ef8a1b8f63..4bdc886abc4c 100644
--- a/drivers/media/video/w9966.c
+++ b/drivers/media/video/w9966.c
@@ -58,6 +58,7 @@
 #include <linux/init.h>
 #include <linux/delay.h>
 #include <linux/videodev.h>
+#include <media/v4l2-common.h>
 #include <linux/parport.h>
 
 //#define DEBUG				// Undef me for production
diff --git a/drivers/media/video/zoran_card.c b/drivers/media/video/zoran_card.c
index 798138599bec..958c1e6fc852 100644
--- a/drivers/media/video/zoran_card.c
+++ b/drivers/media/video/zoran_card.c
@@ -40,6 +40,7 @@
 #include <linux/i2c.h>
 #include <linux/i2c-algo-bit.h>
 #include <linux/videodev.h>
+#include <media/v4l2-common.h>
 #include <linux/spinlock.h>
 #include <linux/sem.h>
 #include <linux/kmod.h>
diff --git a/drivers/media/video/zoran_driver.c b/drivers/media/video/zoran_driver.c
index b5a576a37fd2..fd05a7fc4922 100644
--- a/drivers/media/video/zoran_driver.c
+++ b/drivers/media/video/zoran_driver.c
@@ -73,6 +73,7 @@
 			     )
 
 #include <linux/videodev.h>
+#include <media/v4l2-common.h>
 #include "videocodec.h"
 
 #include <asm/io.h>
diff --git a/drivers/media/video/zoran_procfs.c b/drivers/media/video/zoran_procfs.c
index a00fae90229a..f4ffe79bdc5b 100644
--- a/drivers/media/video/zoran_procfs.c
+++ b/drivers/media/video/zoran_procfs.c
@@ -43,6 +43,7 @@
 #include <linux/seq_file.h>
 
 #include <linux/ctype.h>
+#include <linux/poll.h>
 #include <asm/io.h>
 
 #include "videocodec.h"
diff --git a/include/linux/videodev.h b/include/linux/videodev.h
index 5b6205544a7a..41bc7e9603cd 100644
--- a/include/linux/videodev.h
+++ b/include/linux/videodev.h
@@ -12,13 +12,6 @@
 #ifndef __LINUX_VIDEODEV_H
 #define __LINUX_VIDEODEV_H
 
-#include <linux/types.h>
-#include <linux/poll.h>
-#include <linux/fs.h>
-#include <linux/device.h>
-#include <linux/mutex.h>
-#include <linux/compiler.h> /* need __user */
-
 #define HAVE_V4L1 1
 
 #include <linux/videodev2.h>
@@ -343,10 +336,6 @@ struct video_code
 #define VID_HARDWARE_SN9C102	38
 #define VID_HARDWARE_ARV	39
 
-#ifdef __KERNEL__
-#include <media/v4l2-dev.h>
-#endif /* __KERNEL__ */
-
 #endif /* __LINUX_VIDEODEV_H */
 
 /*
diff --git a/include/linux/videodev2.h b/include/linux/videodev2.h
index bb58197ab6b9..795831d9f4d4 100644
--- a/include/linux/videodev2.h
+++ b/include/linux/videodev2.h
@@ -24,7 +24,6 @@
  * Common stuff for both V4L1 and V4L2
  * Moved from videodev.h
  */
-
 #define VIDEO_MAX_FRAME               32
 
 #define VID_TYPE_CAPTURE	1	/* Can capture */
@@ -200,7 +199,6 @@ struct v4l2_capability
 /*
  *	V I D E O   I M A G E   F O R M A T
  */
-
 struct v4l2_pix_format
 {
 	__u32         		width;
@@ -213,7 +211,7 @@ struct v4l2_pix_format
 	__u32			priv;		/* private data, depends on pixelformat */
 };
 
-/*           Pixel format    FOURCC                  depth  Description   */
+/*      Pixel format         FOURCC                        depth  Description  */
 #define V4L2_PIX_FMT_RGB332  v4l2_fourcc('R','G','B','1') /*  8  RGB-3-3-2     */
 #define V4L2_PIX_FMT_RGB555  v4l2_fourcc('R','G','B','O') /* 16  RGB-5-5-5     */
 #define V4L2_PIX_FMT_RGB565  v4l2_fourcc('R','G','B','P') /* 16  RGB-5-6-5     */
@@ -273,7 +271,6 @@ struct v4l2_fmtdesc
 
 #define V4L2_FMT_FLAG_COMPRESSED 0x0001
 
-
 /*
  *	T I M E C O D E
  */
@@ -303,7 +300,6 @@ struct v4l2_timecode
 #define V4L2_TC_USERBITS_8BITCHARS	0x0008
 /* The above is based on SMPTE timecodes */
 
-
 /*
  *	M P E G   C O M P R E S S I O N   P A R A M E T E R S
  *
@@ -311,8 +307,6 @@ struct v4l2_timecode
  *  ###          there will be some incompatible changes.
  *
  */
-
-
 enum v4l2_bitrate_mode {
 	V4L2_BITRATE_NONE = 0,	/* not specified */
 	V4L2_BITRATE_CBR,	/* constant bitrate */
@@ -421,7 +415,6 @@ struct v4l2_jpegcompression
 					* allways use APP0 */
 };
 
-
 /*
  *	M E M O R Y - M A P P I N G   B U F F E R S
  */
@@ -503,7 +496,6 @@ struct v4l2_window
 	void			__user *bitmap;
 };
 
-
 /*
  *	C A P T U R E   P A R A M E T E R S
  */
@@ -516,6 +508,7 @@ struct v4l2_captureparm
 	__u32              readbuffers;   /*  # of buffers for read */
 	__u32		   reserved[4];
 };
+
 /*  Flags for 'capability' and 'capturemode' fields */
 #define V4L2_MODE_HIGHQUALITY	0x0001	/*  High quality imaging mode */
 #define V4L2_CAP_TIMEPERFRAME	0x1000	/*  timeperframe field is supported */
@@ -533,7 +526,6 @@ struct v4l2_outputparm
 /*
  *	I N P U T   I M A G E   C R O P P I N G
  */
-
 struct v4l2_cropcap {
 	enum v4l2_buf_type      type;
 	struct v4l2_rect        bounds;
@@ -640,7 +632,6 @@ struct v4l2_standard
 	__u32		     reserved[4];
 };
 
-
 /*
  *	V I D E O   I N P U T S
  */
@@ -655,6 +646,7 @@ struct v4l2_input
 	__u32	     status;
 	__u32	     reserved[4];
 };
+
 /*  Values for the 'type' field */
 #define V4L2_INPUT_TYPE_TUNER		1
 #define V4L2_INPUT_TYPE_CAMERA		2
@@ -834,6 +826,7 @@ struct v4l2_audio
 	__u32	mode;
 	__u32	reserved[2];
 };
+
 /*  Flags for the 'capability' field */
 #define V4L2_AUDCAP_STEREO		0x00001
 #define V4L2_AUDCAP_AVL			0x00002
@@ -857,7 +850,6 @@ struct v4l2_audioout
  */
 
 /* Raw VBI */
-
 struct v4l2_vbi_format
 {
 	__u32	sampling_rate;		/* in 1 Hz */
@@ -964,8 +956,6 @@ struct v4l2_streamparm
 	} parm;
 };
 
-
-
 /*
  *	I O C T L   C O D E S   F O R   V I D E O   D E V I C E S
  *
diff --git a/include/media/ovcamchip.h b/include/media/ovcamchip.h
index 8138983adced..0f43451f8bb3 100644
--- a/include/media/ovcamchip.h
+++ b/include/media/ovcamchip.h
@@ -15,6 +15,7 @@
 #define __LINUX_OVCAMCHIP_H
 
 #include <linux/videodev.h>
+#include <media/v4l2-common.h>
 #include <linux/i2c.h>
 
 /* --------------------------------- */
diff --git a/include/media/saa7146_vv.h b/include/media/saa7146_vv.h
index 4507cb61ae93..83fe2e3d1e25 100644
--- a/include/media/saa7146_vv.h
+++ b/include/media/saa7146_vv.h
@@ -2,7 +2,7 @@
 #define __SAA7146_VV__
 
 #include <linux/videodev.h>
-
+#include <media/v4l2-common.h>
 #include <media/saa7146.h>
 #include <media/video-buf.h>
 
diff --git a/include/media/v4l2-dev.h b/include/media/v4l2-dev.h
index c2f54d2c9a4c..dec6b24e4c42 100644
--- a/include/media/v4l2-dev.h
+++ b/include/media/v4l2-dev.h
@@ -40,8 +40,6 @@
 #define VFL_TYPE_RADIO		2
 #define VFL_TYPE_VTX		3
 
-	const struct file_operations *fops;
-
 /*  Video standard functions  */
 extern unsigned int v4l2_video_std_fps(struct v4l2_standard *vs);
 extern int v4l2_video_std_construct(struct v4l2_standard *vs,
@@ -92,7 +90,7 @@ struct v4l2_tvnorm {
 struct video_device
 {
 	/* device ops */
-	struct file_operations *fops;
+	const struct file_operations *fops;
 
 	/* device info */
 	struct device *dev;
@@ -297,7 +295,7 @@ struct video_device
 	int (*vidioc_log_status)       (struct file *file, void *fh);
 
 
-#if OBSOLETE_OWNER /* to be removed soon */
+#ifdef OBSOLETE_OWNER /* to be removed soon */
 /* obsolete -- fops->owner is used instead */
 struct module *owner;
 /* dev->driver_data will be used instead some day.
@@ -351,7 +349,7 @@ video_device_remove_file(struct video_device *vfd,
 	class_device_remove_file(&vfd->class_dev, attr);
 }
 
-#if OBSOLETE_OWNER /* to be removed soon */
+#ifdef OBSOLETE_OWNER /* to be removed soon */
 /* helper functions to access driver private data. */
 static inline void *video_get_drvdata(struct video_device *dev)
 {
-- 
cgit v1.2.3


From 845f16abad37e2a255ac1c167375f6588502f93f Mon Sep 17 00:00:00 2001
From: Eric Sesterhenn <snakebyte@gmx.de>
Date: Tue, 6 Jun 2006 11:20:08 -0300
Subject: V4L/DVB (4070): Zoran strncpy() fix

The zoran driver uses strncpy() in an unsafe way.  This patch uses the proper
sizeof()-1 size parameter.  Since all strncpy() targets are initialised with
memset() the trailing '\0' is already set.  Where std->name was the target for
the strncpy() we overwrote 8 Bytes of the std structure with zeros.

Signed-off-by: Eric Sesterhenn <snakebyte@gmx.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Mauro Carvalho Chehab <mchehab@infradead.org>
---
 drivers/media/video/zoran_driver.c | 26 +++++++++++++-------------
 include/linux/videodev2.h          |  2 ++
 2 files changed, 15 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/media/video/zoran_driver.c b/drivers/media/video/zoran_driver.c
index fd05a7fc4922..9711f6248ef7 100644
--- a/drivers/media/video/zoran_driver.c
+++ b/drivers/media/video/zoran_driver.c
@@ -2048,7 +2048,7 @@ zoran_do_ioctl (struct inode *inode,
 		dprintk(3, KERN_DEBUG "%s: VIDIOCGCAP\n", ZR_DEVNAME(zr));
 
 		memset(vcap, 0, sizeof(struct video_capability));
-		strncpy(vcap->name, ZR_DEVNAME(zr), sizeof(vcap->name));
+		strncpy(vcap->name, ZR_DEVNAME(zr), sizeof(vcap->name)-1);
 		vcap->type = ZORAN_VID_TYPE;
 
 		vcap->channels = zr->card.inputs;
@@ -2690,8 +2690,8 @@ zoran_do_ioctl (struct inode *inode,
 		dprintk(3, KERN_DEBUG "%s: VIDIOC_QUERYCAP\n", ZR_DEVNAME(zr));
 
 		memset(cap, 0, sizeof(*cap));
-		strncpy(cap->card, ZR_DEVNAME(zr), sizeof(cap->card));
-		strncpy(cap->driver, "zoran", sizeof(cap->driver));
+		strncpy(cap->card, ZR_DEVNAME(zr), sizeof(cap->card)-1);
+		strncpy(cap->driver, "zoran", sizeof(cap->driver)-1);
 		snprintf(cap->bus_info, sizeof(cap->bus_info), "PCI:%s",
 			 pci_name(zr->pci_dev));
 		cap->version =
@@ -2743,7 +2743,7 @@ zoran_do_ioctl (struct inode *inode,
 		memset(fmt, 0, sizeof(*fmt));
 		fmt->index = index;
 		fmt->type = type;
-		strncpy(fmt->description, zoran_formats[i].name, 31);
+		strncpy(fmt->description, zoran_formats[i].name, sizeof(fmt->description)-1);
 		fmt->pixelformat = zoran_formats[i].fourcc;
 		if (zoran_formats[i].flags & ZORAN_FORMAT_COMPRESSED)
 			fmt->flags |= V4L2_FMT_FLAG_COMPRESSED;
@@ -3567,16 +3567,16 @@ zoran_do_ioctl (struct inode *inode,
 
 		switch (ctrl->id) {
 		case V4L2_CID_BRIGHTNESS:
-			strncpy(ctrl->name, "Brightness", 31);
+			strncpy(ctrl->name, "Brightness", sizeof(ctrl->name)-1);
 			break;
 		case V4L2_CID_CONTRAST:
-			strncpy(ctrl->name, "Contrast", 31);
+			strncpy(ctrl->name, "Contrast", sizeof(ctrl->name)-1);
 			break;
 		case V4L2_CID_SATURATION:
-			strncpy(ctrl->name, "Saturation", 31);
+			strncpy(ctrl->name, "Saturation", sizeof(ctrl->name)-1);
 			break;
 		case V4L2_CID_HUE:
-			strncpy(ctrl->name, "Hue", 31);
+			strncpy(ctrl->name, "Hue", sizeof(ctrl->name)-1);
 			break;
 		}
 
@@ -3694,7 +3694,7 @@ zoran_do_ioctl (struct inode *inode,
 					&caps);
 			if (caps.flags & VIDEO_DECODER_AUTO) {
 				std->id = V4L2_STD_ALL;
-				strncpy(std->name, "Autodetect", 31);
+				strncpy(std->name, "Autodetect", sizeof(std->name)-1);
 				return 0;
 			} else
 				return -EINVAL;
@@ -3702,21 +3702,21 @@ zoran_do_ioctl (struct inode *inode,
 		switch (std->index) {
 		case 0:
 			std->id = V4L2_STD_PAL;
-			strncpy(std->name, "PAL", 31);
+			strncpy(std->name, "PAL", sizeof(std->name)-1);
 			std->frameperiod.numerator = 1;
 			std->frameperiod.denominator = 25;
 			std->framelines = zr->card.tvn[0]->Ht;
 			break;
 		case 1:
 			std->id = V4L2_STD_NTSC;
-			strncpy(std->name, "NTSC", 31);
+			strncpy(std->name, "NTSC", sizeof(std->name)-1);
 			std->frameperiod.numerator = 1001;
 			std->frameperiod.denominator = 30000;
 			std->framelines = zr->card.tvn[1]->Ht;
 			break;
 		case 2:
 			std->id = V4L2_STD_SECAM;
-			strncpy(std->name, "SECAM", 31);
+			strncpy(std->name, "SECAM", sizeof(std->name)-1);
 			std->frameperiod.numerator = 1;
 			std->frameperiod.denominator = 25;
 			std->framelines = zr->card.tvn[2]->Ht;
@@ -3872,7 +3872,7 @@ zoran_do_ioctl (struct inode *inode,
 		memset(outp, 0, sizeof(*outp));
 		outp->index = 0;
 		outp->type = V4L2_OUTPUT_TYPE_ANALOGVGAOVERLAY;
-		strncpy(outp->name, "Autodetect", 31);
+		strncpy(outp->name, "Autodetect", sizeof(outp->name)-1);
 
 		return 0;
 	}
diff --git a/include/linux/videodev2.h b/include/linux/videodev2.h
index 795831d9f4d4..bc957d83a127 100644
--- a/include/linux/videodev2.h
+++ b/include/linux/videodev2.h
@@ -14,7 +14,9 @@
  */
 #ifndef __LINUX_VIDEODEV2_H
 #define __LINUX_VIDEODEV2_H
+#ifdef __KERNEL__
 #include <linux/time.h> /* need struct timeval */
+#endif
 #include <linux/types.h>
 #include <linux/compiler.h> /* need __user */
 
-- 
cgit v1.2.3


From 89a58c83f8f1056583f30c4d4258b1fec39bb0e2 Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab@infradead.org>
Date: Sat, 17 Jun 2006 07:57:03 -0300
Subject: V4L/DVB (4108): Fixes some userspace dependencies at V4L2 public api
 header

Make life easier for distro guys, by removing the need of including
at the userspace header.
Also, linux/compiler.h is not needed at userspace.

Signed-off-by: Mauro Carvalho Chehab <mchehab@infradead.org>
---
 include/linux/videodev2.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/videodev2.h b/include/linux/videodev2.h
index bc957d83a127..9a70c5654ceb 100644
--- a/include/linux/videodev2.h
+++ b/include/linux/videodev2.h
@@ -15,10 +15,12 @@
 #ifndef __LINUX_VIDEODEV2_H
 #define __LINUX_VIDEODEV2_H
 #ifdef __KERNEL__
-#include <linux/time.h> /* need struct timeval */
+#include <linux/time.h>     /* need struct timeval */
+#include <linux/compiler.h> /* need __user */
+#else
+#define __user
 #endif
 #include <linux/types.h>
-#include <linux/compiler.h> /* need __user */
 
 #define HAVE_V4L2 1
 
-- 
cgit v1.2.3


From 4f341712120abde54d9113856e9118e6580d7061 Mon Sep 17 00:00:00 2001
From: Hans Verkuil <hverkuil@xs4all.nl>
Date: Sun, 18 Jun 2006 13:27:19 -0300
Subject: V4L/DVB (4188): Add new MPEG control/ioctl definitions to videodev2.h

The old, experimental, VIDIOC_S/G_CODEC API to pass MPEG parameters is now
obsolete and is replaced by 'extended controls' which offer more flexibility
and are hopefully more future proof.

Signed-off-by: Hans Verkuil <hverkuil@xs4all.nl>
Signed-off-by: Mauro Carvalho Chehab <mchehab@infradead.org>
---
 include/linux/videodev2.h | 225 +++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 222 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/videodev2.h b/include/linux/videodev2.h
index 9a70c5654ceb..337c31409cd7 100644
--- a/include/linux/videodev2.h
+++ b/include/linux/videodev2.h
@@ -105,6 +105,8 @@ enum v4l2_ctrl_type {
 	V4L2_CTRL_TYPE_BOOLEAN	     = 2,
 	V4L2_CTRL_TYPE_MENU	     = 3,
 	V4L2_CTRL_TYPE_BUTTON	     = 4,
+	V4L2_CTRL_TYPE_INTEGER64     = 5,
+	V4L2_CTRL_TYPE_CTRL_CLASS    = 6,
 };
 
 enum v4l2_tuner_type {
@@ -251,7 +253,7 @@ struct v4l2_pix_format
 #define V4L2_PIX_FMT_MJPEG    v4l2_fourcc('M','J','P','G') /* Motion-JPEG   */
 #define V4L2_PIX_FMT_JPEG     v4l2_fourcc('J','P','E','G') /* JFIF JPEG     */
 #define V4L2_PIX_FMT_DV       v4l2_fourcc('d','v','s','d') /* 1394          */
-#define V4L2_PIX_FMT_MPEG     v4l2_fourcc('M','P','E','G') /* MPEG          */
+#define V4L2_PIX_FMT_MPEG     v4l2_fourcc('M','P','E','G') /* MPEG-1/2/4    */
 
 /*  Vendor-specific formats   */
 #define V4L2_PIX_FMT_WNVA     v4l2_fourcc('W','N','V','A') /* Winnov hw compress */
@@ -701,6 +703,34 @@ struct v4l2_control
 	__s32		     value;
 };
 
+struct v4l2_ext_control
+{
+	__u32 id;
+	__u32 reserved2[2];
+	union {
+		__s32 value;
+		__s64 value64;
+		void *reserved;
+	};
+};
+
+struct v4l2_ext_controls
+{
+	__u32 ctrl_class;
+	__u32 count;
+	__u32 error_idx;
+	__u32 reserved[2];
+	struct v4l2_ext_control *controls;
+};
+
+/*  Values for ctrl_class field */
+#define V4L2_CTRL_CLASS_USER 0x00980000	/* Old-style 'user' controls */
+#define V4L2_CTRL_CLASS_MPEG 0x00990000	/* MPEG-compression controls */
+
+#define V4L2_CTRL_ID_MASK      	  (0x0fffffff)
+#define V4L2_CTRL_ID2CLASS(id)    ((id) & 0x0fff0000UL)
+#define V4L2_CTRL_DRIVER_PRIV(id) (((id) & 0xffff) >= 0x1000)
+
 /*  Used in the VIDIOC_QUERYCTRL ioctl for querying controls */
 struct v4l2_queryctrl
 {
@@ -727,12 +757,21 @@ struct v4l2_querymenu
 /*  Control flags  */
 #define V4L2_CTRL_FLAG_DISABLED		0x0001
 #define V4L2_CTRL_FLAG_GRABBED		0x0002
+#define V4L2_CTRL_FLAG_READ_ONLY 	0x0004
+#define V4L2_CTRL_FLAG_UPDATE 		0x0008
+#define V4L2_CTRL_FLAG_INACTIVE 	0x0010
+#define V4L2_CTRL_FLAG_SLIDER 		0x0020
+
+/*  Query flag, to be ORed with the control ID */
+#define V4L2_CTRL_FLAG_NEXT_CTRL	0x80000000
 
-/*  Control IDs defined by V4L2 */
-#define V4L2_CID_BASE			0x00980900
+/*  User-class control IDs defined by V4L2 */
+#define V4L2_CID_BASE			(V4L2_CTRL_CLASS_USER | 0x900)
+#define V4L2_CID_USER_BASE 		V4L2_CID_BASE
 /*  IDs reserved for driver specific controls */
 #define V4L2_CID_PRIVATE_BASE		0x08000000
 
+#define V4L2_CID_USER_CLASS 		(V4L2_CTRL_CLASS_USER | 1)
 #define V4L2_CID_BRIGHTNESS		(V4L2_CID_BASE+0)
 #define V4L2_CID_CONTRAST		(V4L2_CID_BASE+1)
 #define V4L2_CID_SATURATION		(V4L2_CID_BASE+2)
@@ -759,6 +798,183 @@ struct v4l2_querymenu
 #define V4L2_CID_VCENTER		(V4L2_CID_BASE+23)
 #define V4L2_CID_LASTP1			(V4L2_CID_BASE+24) /* last CID + 1 */
 
+/*  MPEG-class control IDs defined by V4L2 */
+#define V4L2_CID_MPEG_BASE 			(V4L2_CTRL_CLASS_MPEG | 0x900)
+#define V4L2_CID_MPEG_CLASS 			(V4L2_CTRL_CLASS_MPEG | 1)
+
+/*  MPEG streams */
+#define V4L2_CID_MPEG_STREAM_TYPE 		(V4L2_CID_MPEG_BASE+0)
+enum v4l2_mpeg_stream_type {
+	V4L2_MPEG_STREAM_TYPE_MPEG2_PS,   /* MPEG-2 program stream */
+	V4L2_MPEG_STREAM_TYPE_MPEG2_TS,   /* MPEG-2 transport stream */
+	V4L2_MPEG_STREAM_TYPE_MPEG1_SS,   /* MPEG-1 system stream */
+	V4L2_MPEG_STREAM_TYPE_MPEG2_DVD,  /* MPEG-2 DVD-compatible stream */
+	V4L2_MPEG_STREAM_TYPE_MPEG1_VCD,  /* MPEG-1 VCD-compatible stream */
+	V4L2_MPEG_STREAM_TYPE_MPEG2_SVCD, /* MPEG-2 SVCD-compatible stream */
+};
+#define V4L2_CID_MPEG_STREAM_PID_PMT 		(V4L2_CID_MPEG_BASE+1)
+#define V4L2_CID_MPEG_STREAM_PID_AUDIO 		(V4L2_CID_MPEG_BASE+2)
+#define V4L2_CID_MPEG_STREAM_PID_VIDEO 		(V4L2_CID_MPEG_BASE+3)
+#define V4L2_CID_MPEG_STREAM_PID_PCR 		(V4L2_CID_MPEG_BASE+4)
+#define V4L2_CID_MPEG_STREAM_PES_ID_AUDIO 	(V4L2_CID_MPEG_BASE+5)
+#define V4L2_CID_MPEG_STREAM_PES_ID_VIDEO 	(V4L2_CID_MPEG_BASE+6)
+
+/*  MPEG audio */
+#define V4L2_CID_MPEG_AUDIO_SAMPLING_FREQ 	(V4L2_CID_MPEG_BASE+100)
+enum v4l2_mpeg_audio_sampling_freq {
+	V4L2_MPEG_AUDIO_SAMPLING_FREQ_44100,
+	V4L2_MPEG_AUDIO_SAMPLING_FREQ_48000,
+	V4L2_MPEG_AUDIO_SAMPLING_FREQ_32000,
+};
+#define V4L2_CID_MPEG_AUDIO_ENCODING 		(V4L2_CID_MPEG_BASE+101)
+enum v4l2_mpeg_audio_encoding {
+	V4L2_MPEG_AUDIO_ENCODING_LAYER_1,
+	V4L2_MPEG_AUDIO_ENCODING_LAYER_2,
+	V4L2_MPEG_AUDIO_ENCODING_LAYER_3,
+};
+#define V4L2_CID_MPEG_AUDIO_L1_BITRATE 		(V4L2_CID_MPEG_BASE+102)
+enum v4l2_mpeg_audio_l1_bitrate {
+	V4L2_MPEG_AUDIO_L1_BITRATE_32K,
+	V4L2_MPEG_AUDIO_L1_BITRATE_64K,
+	V4L2_MPEG_AUDIO_L1_BITRATE_96K,
+	V4L2_MPEG_AUDIO_L1_BITRATE_128K,
+	V4L2_MPEG_AUDIO_L1_BITRATE_160K,
+	V4L2_MPEG_AUDIO_L1_BITRATE_192K,
+	V4L2_MPEG_AUDIO_L1_BITRATE_224K,
+	V4L2_MPEG_AUDIO_L1_BITRATE_256K,
+	V4L2_MPEG_AUDIO_L1_BITRATE_288K,
+	V4L2_MPEG_AUDIO_L1_BITRATE_320K,
+	V4L2_MPEG_AUDIO_L1_BITRATE_352K,
+	V4L2_MPEG_AUDIO_L1_BITRATE_384K,
+	V4L2_MPEG_AUDIO_L1_BITRATE_416K,
+	V4L2_MPEG_AUDIO_L1_BITRATE_448K,
+};
+#define V4L2_CID_MPEG_AUDIO_L2_BITRATE 		(V4L2_CID_MPEG_BASE+103)
+enum v4l2_mpeg_audio_l2_bitrate {
+	V4L2_MPEG_AUDIO_L2_BITRATE_32K,
+	V4L2_MPEG_AUDIO_L2_BITRATE_48K,
+	V4L2_MPEG_AUDIO_L2_BITRATE_56K,
+	V4L2_MPEG_AUDIO_L2_BITRATE_64K,
+	V4L2_MPEG_AUDIO_L2_BITRATE_80K,
+	V4L2_MPEG_AUDIO_L2_BITRATE_96K,
+	V4L2_MPEG_AUDIO_L2_BITRATE_112K,
+	V4L2_MPEG_AUDIO_L2_BITRATE_128K,
+	V4L2_MPEG_AUDIO_L2_BITRATE_160K,
+	V4L2_MPEG_AUDIO_L2_BITRATE_192K,
+	V4L2_MPEG_AUDIO_L2_BITRATE_224K,
+	V4L2_MPEG_AUDIO_L2_BITRATE_256K,
+	V4L2_MPEG_AUDIO_L2_BITRATE_320K,
+	V4L2_MPEG_AUDIO_L2_BITRATE_384K,
+};
+#define V4L2_CID_MPEG_AUDIO_L3_BITRATE 		(V4L2_CID_MPEG_BASE+104)
+enum v4l2_mpeg_audio_l3_bitrate {
+	V4L2_MPEG_AUDIO_L3_BITRATE_32K,
+	V4L2_MPEG_AUDIO_L3_BITRATE_40K,
+	V4L2_MPEG_AUDIO_L3_BITRATE_48K,
+	V4L2_MPEG_AUDIO_L3_BITRATE_56K,
+	V4L2_MPEG_AUDIO_L3_BITRATE_64K,
+	V4L2_MPEG_AUDIO_L3_BITRATE_80K,
+	V4L2_MPEG_AUDIO_L3_BITRATE_96K,
+	V4L2_MPEG_AUDIO_L3_BITRATE_112K,
+	V4L2_MPEG_AUDIO_L3_BITRATE_128K,
+	V4L2_MPEG_AUDIO_L3_BITRATE_160K,
+	V4L2_MPEG_AUDIO_L3_BITRATE_192K,
+	V4L2_MPEG_AUDIO_L3_BITRATE_224K,
+	V4L2_MPEG_AUDIO_L3_BITRATE_256K,
+	V4L2_MPEG_AUDIO_L3_BITRATE_320K,
+};
+#define V4L2_CID_MPEG_AUDIO_MODE 		(V4L2_CID_MPEG_BASE+105)
+enum v4l2_mpeg_audio_mode {
+	V4L2_MPEG_AUDIO_MODE_STEREO,
+	V4L2_MPEG_AUDIO_MODE_JOINT_STEREO,
+	V4L2_MPEG_AUDIO_MODE_DUAL,
+	V4L2_MPEG_AUDIO_MODE_MONO,
+};
+#define V4L2_CID_MPEG_AUDIO_MODE_EXTENSION 	(V4L2_CID_MPEG_BASE+106)
+enum v4l2_mpeg_audio_mode_extension {
+	V4L2_MPEG_AUDIO_MODE_EXTENSION_BOUND_4,
+	V4L2_MPEG_AUDIO_MODE_EXTENSION_BOUND_8,
+	V4L2_MPEG_AUDIO_MODE_EXTENSION_BOUND_12,
+	V4L2_MPEG_AUDIO_MODE_EXTENSION_BOUND_16,
+};
+#define V4L2_CID_MPEG_AUDIO_EMPHASIS 		(V4L2_CID_MPEG_BASE+107)
+enum v4l2_mpeg_audio_emphasis {
+	V4L2_MPEG_AUDIO_EMPHASIS_NONE,
+	V4L2_MPEG_AUDIO_EMPHASIS_50_DIV_15_uS,
+	V4L2_MPEG_AUDIO_EMPHASIS_CCITT_J17,
+};
+#define V4L2_CID_MPEG_AUDIO_CRC 		(V4L2_CID_MPEG_BASE+108)
+enum v4l2_mpeg_audio_crc {
+	V4L2_MPEG_AUDIO_CRC_NONE,
+	V4L2_MPEG_AUDIO_CRC_CRC16,
+};
+
+/*  MPEG video */
+#define V4L2_CID_MPEG_VIDEO_ENCODING 		(V4L2_CID_MPEG_BASE+200)
+enum v4l2_mpeg_video_encoding {
+	V4L2_MPEG_VIDEO_ENCODING_MPEG_1,
+	V4L2_MPEG_VIDEO_ENCODING_MPEG_2,
+};
+#define V4L2_CID_MPEG_VIDEO_ASPECT 		(V4L2_CID_MPEG_BASE+201)
+enum v4l2_mpeg_video_aspect {
+	V4L2_MPEG_VIDEO_ASPECT_1x1,
+	V4L2_MPEG_VIDEO_ASPECT_4x3,
+	V4L2_MPEG_VIDEO_ASPECT_16x9,
+	V4L2_MPEG_VIDEO_ASPECT_221x100,
+};
+#define V4L2_CID_MPEG_VIDEO_B_FRAMES 		(V4L2_CID_MPEG_BASE+202)
+#define V4L2_CID_MPEG_VIDEO_GOP_SIZE 		(V4L2_CID_MPEG_BASE+203)
+#define V4L2_CID_MPEG_VIDEO_GOP_CLOSURE 	(V4L2_CID_MPEG_BASE+204)
+#define V4L2_CID_MPEG_VIDEO_PULLDOWN 		(V4L2_CID_MPEG_BASE+205)
+#define V4L2_CID_MPEG_VIDEO_BITRATE_MODE 	(V4L2_CID_MPEG_BASE+206)
+enum v4l2_mpeg_video_bitrate_mode {
+	V4L2_MPEG_VIDEO_BITRATE_MODE_VBR,
+	V4L2_MPEG_VIDEO_BITRATE_MODE_CBR,
+};
+#define V4L2_CID_MPEG_VIDEO_BITRATE 		(V4L2_CID_MPEG_BASE+207)
+#define V4L2_CID_MPEG_VIDEO_BITRATE_PEAK 	(V4L2_CID_MPEG_BASE+208)
+#define V4L2_CID_MPEG_VIDEO_TEMPORAL_DECIMATION (V4L2_CID_MPEG_BASE+209)
+
+/*  MPEG-class control IDs specific to the CX2584x driver as defined by V4L2 */
+#define V4L2_CID_MPEG_CX2341X_BASE 				(V4L2_CTRL_CLASS_MPEG | 0x1000)
+#define V4L2_CID_MPEG_CX2341X_VIDEO_SPATIAL_FILTER_MODE 	(V4L2_CID_MPEG_CX2341X_BASE+0)
+enum v4l2_mpeg_cx2341x_video_spatial_filter_mode {
+	V4L2_MPEG_CX2341X_VIDEO_SPATIAL_FILTER_MODE_MANUAL,
+	V4L2_MPEG_CX2341X_VIDEO_SPATIAL_FILTER_MODE_AUTO,
+};
+#define V4L2_CID_MPEG_CX2341X_VIDEO_SPATIAL_FILTER 		(V4L2_CID_MPEG_CX2341X_BASE+1)
+#define V4L2_CID_MPEG_CX2341X_VIDEO_LUMA_SPATIAL_FILTER_TYPE 	(V4L2_CID_MPEG_CX2341X_BASE+2)
+enum v4l2_mpeg_cx2341x_video_luma_spatial_filter_type {
+	V4L2_MPEG_CX2341X_VIDEO_LUMA_SPATIAL_FILTER_TYPE_OFF,
+	V4L2_MPEG_CX2341X_VIDEO_LUMA_SPATIAL_FILTER_TYPE_1D_HOR,
+	V4L2_MPEG_CX2341X_VIDEO_LUMA_SPATIAL_FILTER_TYPE_1D_VERT,
+	V4L2_MPEG_CX2341X_VIDEO_LUMA_SPATIAL_FILTER_TYPE_2D_HV_SEPARABLE,
+	V4L2_MPEG_CX2341X_VIDEO_LUMA_SPATIAL_FILTER_TYPE_2D_SYM_NON_SEPARABLE,
+};
+#define V4L2_CID_MPEG_CX2341X_VIDEO_CHROMA_SPATIAL_FILTER_TYPE 	(V4L2_CID_MPEG_CX2341X_BASE+3)
+enum v4l2_mpeg_cx2341x_video_chroma_spatial_filter_type {
+	V4L2_MPEG_CX2341X_VIDEO_CHROMA_SPATIAL_FILTER_TYPE_OFF,
+	V4L2_MPEG_CX2341X_VIDEO_CHROMA_SPATIAL_FILTER_TYPE_1D_HOR,
+};
+#define V4L2_CID_MPEG_CX2341X_VIDEO_TEMPORAL_FILTER_MODE 	(V4L2_CID_MPEG_CX2341X_BASE+4)
+enum v4l2_mpeg_cx2341x_video_temporal_filter_mode {
+	V4L2_MPEG_CX2341X_VIDEO_TEMPORAL_FILTER_MODE_MANUAL,
+	V4L2_MPEG_CX2341X_VIDEO_TEMPORAL_FILTER_MODE_AUTO,
+};
+#define V4L2_CID_MPEG_CX2341X_VIDEO_TEMPORAL_FILTER 		(V4L2_CID_MPEG_CX2341X_BASE+5)
+#define V4L2_CID_MPEG_CX2341X_VIDEO_MEDIAN_FILTER_TYPE 		(V4L2_CID_MPEG_CX2341X_BASE+6)
+enum v4l2_mpeg_cx2341x_video_median_filter_type {
+	V4L2_MPEG_CX2341X_VIDEO_MEDIAN_FILTER_TYPE_OFF,
+	V4L2_MPEG_CX2341X_VIDEO_MEDIAN_FILTER_TYPE_HOR,
+	V4L2_MPEG_CX2341X_VIDEO_MEDIAN_FILTER_TYPE_VERT,
+	V4L2_MPEG_CX2341X_VIDEO_MEDIAN_FILTER_TYPE_HOR_VERT,
+	V4L2_MPEG_CX2341X_VIDEO_MEDIAN_FILTER_TYPE_DIAG,
+};
+#define V4L2_CID_MPEG_CX2341X_VIDEO_LUMA_MEDIAN_FILTER_BOTTOM 	(V4L2_CID_MPEG_CX2341X_BASE+7)
+#define V4L2_CID_MPEG_CX2341X_VIDEO_LUMA_MEDIAN_FILTER_TOP 	(V4L2_CID_MPEG_CX2341X_BASE+8)
+#define V4L2_CID_MPEG_CX2341X_VIDEO_CHROMA_MEDIAN_FILTER_BOTTOM	(V4L2_CID_MPEG_CX2341X_BASE+9)
+#define V4L2_CID_MPEG_CX2341X_VIDEO_CHROMA_MEDIAN_FILTER_TOP 	(V4L2_CID_MPEG_CX2341X_BASE+10)
+
 /*
  *	T U N I N G
  */
@@ -1020,6 +1236,9 @@ struct v4l2_streamparm
 #define VIDIOC_G_SLICED_VBI_CAP _IOR  ('V', 69, struct v4l2_sliced_vbi_cap)
 #endif
 #define VIDIOC_LOG_STATUS       _IO   ('V', 70)
+#define VIDIOC_G_EXT_CTRLS	_IOWR ('V', 71, struct v4l2_ext_controls)
+#define VIDIOC_S_EXT_CTRLS	_IOWR ('V', 72, struct v4l2_ext_controls)
+#define VIDIOC_TRY_EXT_CTRLS	_IOWR ('V', 73, struct v4l2_ext_controls)
 
 #ifdef __OLD_VIDIOC_
 /* for compatibility, will go away some day */
-- 
cgit v1.2.3


From f81cf7533b4b8411a0d2fa943adcede340dfdab6 Mon Sep 17 00:00:00 2001
From: Hans Verkuil <hverkuil@xs4all.nl>
Date: Sun, 18 Jun 2006 16:54:20 -0300
Subject: V4L/DVB (4198): Avoid newer usages of obsoleted experimental MPEGCOMP
 API

Put old MPEGCOMP API under #if __KERNEL__ and issue warnings when used.

Signed-off-by: Hans Verkuil <hverkuil@xs4all.nl>
Signed-off-by: Mauro Carvalho Chehab <mchehab@infradead.org>
---
 drivers/media/video/cx88/cx88-blackbird.c     | 4 ++++
 drivers/media/video/saa7134/saa7134-empress.c | 4 ++++
 drivers/media/video/videodev.c                | 1 +
 include/linux/videodev2.h                     | 9 +++++++--
 4 files changed, 16 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/media/video/cx88/cx88-blackbird.c b/drivers/media/video/cx88/cx88-blackbird.c
index 9fa1ddbe3fd8..6e945de1f20a 100644
--- a/drivers/media/video/cx88/cx88-blackbird.c
+++ b/drivers/media/video/cx88/cx88-blackbird.c
@@ -805,10 +805,14 @@ static int mpeg_do_ioctl(struct inode *inode, struct file *file,
 	{
 		struct v4l2_mpeg_compression *f = arg;
 
+		printk(KERN_WARNING "VIDIOC_G_MPEGCOMP is obsolete. "
+				    "Replace with VIDIOC_G_EXT_CTRLS!");
 		memcpy(f,&default_mpeg_params,sizeof(*f));
 		return 0;
 	}
 	case VIDIOC_S_MPEGCOMP:
+		printk(KERN_WARNING "VIDIOC_S_MPEGCOMP is obsolete. "
+				    "Replace with VIDIOC_S_EXT_CTRLS!");
 		return 0;
 	case VIDIOC_G_EXT_CTRLS:
 	{
diff --git a/drivers/media/video/saa7134/saa7134-empress.c b/drivers/media/video/saa7134/saa7134-empress.c
index bc743b9253fd..65d044086ce9 100644
--- a/drivers/media/video/saa7134/saa7134-empress.c
+++ b/drivers/media/video/saa7134/saa7134-empress.c
@@ -281,10 +281,14 @@ static int ts_do_ioctl(struct inode *inode, struct file *file,
 		return saa7134_common_ioctl(dev, cmd, arg);
 
 	case VIDIOC_S_MPEGCOMP:
+		printk(KERN_WARNING "VIDIOC_S_MPEGCOMP is obsolete. "
+				    "Replace with VIDIOC_S_EXT_CTRLS!");
 		saa7134_i2c_call_clients(dev, VIDIOC_S_MPEGCOMP, arg);
 		ts_init_encoder(dev);
 		return 0;
 	case VIDIOC_G_MPEGCOMP:
+		printk(KERN_WARNING "VIDIOC_G_MPEGCOMP is obsolete. "
+				    "Replace with VIDIOC_G_EXT_CTRLS!");
 		saa7134_i2c_call_clients(dev, VIDIOC_G_MPEGCOMP, arg);
 		return 0;
 	case VIDIOC_S_EXT_CTRLS:
diff --git a/drivers/media/video/videodev.c b/drivers/media/video/videodev.c
index 763e178555d0..2dfa7f23d0ca 100644
--- a/drivers/media/video/videodev.c
+++ b/drivers/media/video/videodev.c
@@ -1216,6 +1216,7 @@ static int __video_do_ioctl(struct inode *inode, struct file *file,
 	case VIDIOC_G_MPEGCOMP:
 	{
 		struct v4l2_mpeg_compression *p=arg;
+
 		/*FIXME: Several fields not shown */
 		if (!vfd->vidioc_g_mpegcomp)
 			break;
diff --git a/include/linux/videodev2.h b/include/linux/videodev2.h
index 337c31409cd7..260ff6787ad4 100644
--- a/include/linux/videodev2.h
+++ b/include/linux/videodev2.h
@@ -306,11 +306,13 @@ struct v4l2_timecode
 #define V4L2_TC_USERBITS_8BITCHARS	0x0008
 /* The above is based on SMPTE timecodes */
 
+#ifdef __KERNEL__
 /*
  *	M P E G   C O M P R E S S I O N   P A R A M E T E R S
  *
- *  ### WARNING: this is still work-in-progress right now, most likely
- *  ###          there will be some incompatible changes.
+ *  ### WARNING: This experimental MPEG compression API is obsolete.
+ *  ###          It is replaced by the MPEG controls API.
+ *  ###          This old API will disappear in the near future!
  *
  */
 enum v4l2_bitrate_mode {
@@ -390,6 +392,7 @@ struct v4l2_mpeg_compression {
 	/* I don't expect the above being perfect yet ;) */
 	__u32				reserved_5[8];
 };
+#endif
 
 struct v4l2_jpegcompression
 {
@@ -1185,8 +1188,10 @@ struct v4l2_streamparm
 #define VIDIOC_ENUM_FMT         _IOWR ('V',  2, struct v4l2_fmtdesc)
 #define VIDIOC_G_FMT		_IOWR ('V',  4, struct v4l2_format)
 #define VIDIOC_S_FMT		_IOWR ('V',  5, struct v4l2_format)
+#ifdef __KERNEL__
 #define VIDIOC_G_MPEGCOMP       _IOR  ('V',  6, struct v4l2_mpeg_compression)
 #define VIDIOC_S_MPEGCOMP     	_IOW  ('V',  7, struct v4l2_mpeg_compression)
+#endif
 #define VIDIOC_REQBUFS		_IOWR ('V',  8, struct v4l2_requestbuffers)
 #define VIDIOC_QUERYBUF		_IOWR ('V',  9, struct v4l2_buffer)
 #define VIDIOC_G_FBUF		_IOR  ('V', 10, struct v4l2_framebuffer)
-- 
cgit v1.2.3


From 0ccac4af1a8f22e2e96d89b9bf8766dc7286a972 Mon Sep 17 00:00:00 2001
From: Hans Verkuil <hverkuil@xs4all.nl>
Date: Fri, 23 Jun 2006 15:52:50 -0300
Subject: V4L/DVB (4203): Explicitly set the enum values.

It's better to use explicit enums. It reduces the chance of someone
inserting new enums in the middle which would break things.

Signed-off-by: Hans Verkuil <hverkuil@xs4all.nl>
Signed-off-by: Mauro Carvalho Chehab <mchehab@infradead.org>
---
 include/linux/videodev2.h | 182 +++++++++++++++++++++++-----------------------
 include/media/cx2341x.h   |   6 +-
 2 files changed, 94 insertions(+), 94 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/videodev2.h b/include/linux/videodev2.h
index 260ff6787ad4..4f428547ec09 100644
--- a/include/linux/videodev2.h
+++ b/include/linux/videodev2.h
@@ -808,12 +808,12 @@ struct v4l2_querymenu
 /*  MPEG streams */
 #define V4L2_CID_MPEG_STREAM_TYPE 		(V4L2_CID_MPEG_BASE+0)
 enum v4l2_mpeg_stream_type {
-	V4L2_MPEG_STREAM_TYPE_MPEG2_PS,   /* MPEG-2 program stream */
-	V4L2_MPEG_STREAM_TYPE_MPEG2_TS,   /* MPEG-2 transport stream */
-	V4L2_MPEG_STREAM_TYPE_MPEG1_SS,   /* MPEG-1 system stream */
-	V4L2_MPEG_STREAM_TYPE_MPEG2_DVD,  /* MPEG-2 DVD-compatible stream */
-	V4L2_MPEG_STREAM_TYPE_MPEG1_VCD,  /* MPEG-1 VCD-compatible stream */
-	V4L2_MPEG_STREAM_TYPE_MPEG2_SVCD, /* MPEG-2 SVCD-compatible stream */
+	V4L2_MPEG_STREAM_TYPE_MPEG2_PS   = 0, /* MPEG-2 program stream */
+	V4L2_MPEG_STREAM_TYPE_MPEG2_TS   = 1, /* MPEG-2 transport stream */
+	V4L2_MPEG_STREAM_TYPE_MPEG1_SS   = 2, /* MPEG-1 system stream */
+	V4L2_MPEG_STREAM_TYPE_MPEG2_DVD  = 3, /* MPEG-2 DVD-compatible stream */
+	V4L2_MPEG_STREAM_TYPE_MPEG1_VCD  = 4, /* MPEG-1 VCD-compatible stream */
+	V4L2_MPEG_STREAM_TYPE_MPEG2_SVCD = 5, /* MPEG-2 SVCD-compatible stream */
 };
 #define V4L2_CID_MPEG_STREAM_PID_PMT 		(V4L2_CID_MPEG_BASE+1)
 #define V4L2_CID_MPEG_STREAM_PID_AUDIO 		(V4L2_CID_MPEG_BASE+2)
@@ -825,105 +825,105 @@ enum v4l2_mpeg_stream_type {
 /*  MPEG audio */
 #define V4L2_CID_MPEG_AUDIO_SAMPLING_FREQ 	(V4L2_CID_MPEG_BASE+100)
 enum v4l2_mpeg_audio_sampling_freq {
-	V4L2_MPEG_AUDIO_SAMPLING_FREQ_44100,
-	V4L2_MPEG_AUDIO_SAMPLING_FREQ_48000,
-	V4L2_MPEG_AUDIO_SAMPLING_FREQ_32000,
+	V4L2_MPEG_AUDIO_SAMPLING_FREQ_44100 = 0,
+	V4L2_MPEG_AUDIO_SAMPLING_FREQ_48000 = 1,
+	V4L2_MPEG_AUDIO_SAMPLING_FREQ_32000 = 2,
 };
 #define V4L2_CID_MPEG_AUDIO_ENCODING 		(V4L2_CID_MPEG_BASE+101)
 enum v4l2_mpeg_audio_encoding {
-	V4L2_MPEG_AUDIO_ENCODING_LAYER_1,
-	V4L2_MPEG_AUDIO_ENCODING_LAYER_2,
-	V4L2_MPEG_AUDIO_ENCODING_LAYER_3,
+	V4L2_MPEG_AUDIO_ENCODING_LAYER_1 = 0,
+	V4L2_MPEG_AUDIO_ENCODING_LAYER_2 = 1,
+	V4L2_MPEG_AUDIO_ENCODING_LAYER_3 = 2,
 };
 #define V4L2_CID_MPEG_AUDIO_L1_BITRATE 		(V4L2_CID_MPEG_BASE+102)
 enum v4l2_mpeg_audio_l1_bitrate {
-	V4L2_MPEG_AUDIO_L1_BITRATE_32K,
-	V4L2_MPEG_AUDIO_L1_BITRATE_64K,
-	V4L2_MPEG_AUDIO_L1_BITRATE_96K,
-	V4L2_MPEG_AUDIO_L1_BITRATE_128K,
-	V4L2_MPEG_AUDIO_L1_BITRATE_160K,
-	V4L2_MPEG_AUDIO_L1_BITRATE_192K,
-	V4L2_MPEG_AUDIO_L1_BITRATE_224K,
-	V4L2_MPEG_AUDIO_L1_BITRATE_256K,
-	V4L2_MPEG_AUDIO_L1_BITRATE_288K,
-	V4L2_MPEG_AUDIO_L1_BITRATE_320K,
-	V4L2_MPEG_AUDIO_L1_BITRATE_352K,
-	V4L2_MPEG_AUDIO_L1_BITRATE_384K,
-	V4L2_MPEG_AUDIO_L1_BITRATE_416K,
-	V4L2_MPEG_AUDIO_L1_BITRATE_448K,
+	V4L2_MPEG_AUDIO_L1_BITRATE_32K  = 0,
+	V4L2_MPEG_AUDIO_L1_BITRATE_64K  = 1,
+	V4L2_MPEG_AUDIO_L1_BITRATE_96K  = 2,
+	V4L2_MPEG_AUDIO_L1_BITRATE_128K = 3,
+	V4L2_MPEG_AUDIO_L1_BITRATE_160K = 4,
+	V4L2_MPEG_AUDIO_L1_BITRATE_192K = 5,
+	V4L2_MPEG_AUDIO_L1_BITRATE_224K = 6,
+	V4L2_MPEG_AUDIO_L1_BITRATE_256K = 7,
+	V4L2_MPEG_AUDIO_L1_BITRATE_288K = 8,
+	V4L2_MPEG_AUDIO_L1_BITRATE_320K = 9,
+	V4L2_MPEG_AUDIO_L1_BITRATE_352K = 10,
+	V4L2_MPEG_AUDIO_L1_BITRATE_384K = 11,
+	V4L2_MPEG_AUDIO_L1_BITRATE_416K = 12,
+	V4L2_MPEG_AUDIO_L1_BITRATE_448K = 13,
 };
 #define V4L2_CID_MPEG_AUDIO_L2_BITRATE 		(V4L2_CID_MPEG_BASE+103)
 enum v4l2_mpeg_audio_l2_bitrate {
-	V4L2_MPEG_AUDIO_L2_BITRATE_32K,
-	V4L2_MPEG_AUDIO_L2_BITRATE_48K,
-	V4L2_MPEG_AUDIO_L2_BITRATE_56K,
-	V4L2_MPEG_AUDIO_L2_BITRATE_64K,
-	V4L2_MPEG_AUDIO_L2_BITRATE_80K,
-	V4L2_MPEG_AUDIO_L2_BITRATE_96K,
-	V4L2_MPEG_AUDIO_L2_BITRATE_112K,
-	V4L2_MPEG_AUDIO_L2_BITRATE_128K,
-	V4L2_MPEG_AUDIO_L2_BITRATE_160K,
-	V4L2_MPEG_AUDIO_L2_BITRATE_192K,
-	V4L2_MPEG_AUDIO_L2_BITRATE_224K,
-	V4L2_MPEG_AUDIO_L2_BITRATE_256K,
-	V4L2_MPEG_AUDIO_L2_BITRATE_320K,
-	V4L2_MPEG_AUDIO_L2_BITRATE_384K,
+	V4L2_MPEG_AUDIO_L2_BITRATE_32K  = 0,
+	V4L2_MPEG_AUDIO_L2_BITRATE_48K  = 1,
+	V4L2_MPEG_AUDIO_L2_BITRATE_56K  = 2,
+	V4L2_MPEG_AUDIO_L2_BITRATE_64K  = 3,
+	V4L2_MPEG_AUDIO_L2_BITRATE_80K  = 4,
+	V4L2_MPEG_AUDIO_L2_BITRATE_96K  = 5,
+	V4L2_MPEG_AUDIO_L2_BITRATE_112K = 6,
+	V4L2_MPEG_AUDIO_L2_BITRATE_128K = 7,
+	V4L2_MPEG_AUDIO_L2_BITRATE_160K = 8,
+	V4L2_MPEG_AUDIO_L2_BITRATE_192K = 9,
+	V4L2_MPEG_AUDIO_L2_BITRATE_224K = 10,
+	V4L2_MPEG_AUDIO_L2_BITRATE_256K = 11,
+	V4L2_MPEG_AUDIO_L2_BITRATE_320K = 12,
+	V4L2_MPEG_AUDIO_L2_BITRATE_384K = 13,
 };
 #define V4L2_CID_MPEG_AUDIO_L3_BITRATE 		(V4L2_CID_MPEG_BASE+104)
 enum v4l2_mpeg_audio_l3_bitrate {
-	V4L2_MPEG_AUDIO_L3_BITRATE_32K,
-	V4L2_MPEG_AUDIO_L3_BITRATE_40K,
-	V4L2_MPEG_AUDIO_L3_BITRATE_48K,
-	V4L2_MPEG_AUDIO_L3_BITRATE_56K,
-	V4L2_MPEG_AUDIO_L3_BITRATE_64K,
-	V4L2_MPEG_AUDIO_L3_BITRATE_80K,
-	V4L2_MPEG_AUDIO_L3_BITRATE_96K,
-	V4L2_MPEG_AUDIO_L3_BITRATE_112K,
-	V4L2_MPEG_AUDIO_L3_BITRATE_128K,
-	V4L2_MPEG_AUDIO_L3_BITRATE_160K,
-	V4L2_MPEG_AUDIO_L3_BITRATE_192K,
-	V4L2_MPEG_AUDIO_L3_BITRATE_224K,
-	V4L2_MPEG_AUDIO_L3_BITRATE_256K,
-	V4L2_MPEG_AUDIO_L3_BITRATE_320K,
+	V4L2_MPEG_AUDIO_L3_BITRATE_32K  = 0,
+	V4L2_MPEG_AUDIO_L3_BITRATE_40K  = 1,
+	V4L2_MPEG_AUDIO_L3_BITRATE_48K  = 2,
+	V4L2_MPEG_AUDIO_L3_BITRATE_56K  = 3,
+	V4L2_MPEG_AUDIO_L3_BITRATE_64K  = 4,
+	V4L2_MPEG_AUDIO_L3_BITRATE_80K  = 5,
+	V4L2_MPEG_AUDIO_L3_BITRATE_96K  = 6,
+	V4L2_MPEG_AUDIO_L3_BITRATE_112K = 7,
+	V4L2_MPEG_AUDIO_L3_BITRATE_128K = 8,
+	V4L2_MPEG_AUDIO_L3_BITRATE_160K = 9,
+	V4L2_MPEG_AUDIO_L3_BITRATE_192K = 10,
+	V4L2_MPEG_AUDIO_L3_BITRATE_224K = 11,
+	V4L2_MPEG_AUDIO_L3_BITRATE_256K = 12,
+	V4L2_MPEG_AUDIO_L3_BITRATE_320K = 13,
 };
 #define V4L2_CID_MPEG_AUDIO_MODE 		(V4L2_CID_MPEG_BASE+105)
 enum v4l2_mpeg_audio_mode {
-	V4L2_MPEG_AUDIO_MODE_STEREO,
-	V4L2_MPEG_AUDIO_MODE_JOINT_STEREO,
-	V4L2_MPEG_AUDIO_MODE_DUAL,
-	V4L2_MPEG_AUDIO_MODE_MONO,
+	V4L2_MPEG_AUDIO_MODE_STEREO       = 0,
+	V4L2_MPEG_AUDIO_MODE_JOINT_STEREO = 1,
+	V4L2_MPEG_AUDIO_MODE_DUAL         = 2,
+	V4L2_MPEG_AUDIO_MODE_MONO         = 3,
 };
 #define V4L2_CID_MPEG_AUDIO_MODE_EXTENSION 	(V4L2_CID_MPEG_BASE+106)
 enum v4l2_mpeg_audio_mode_extension {
-	V4L2_MPEG_AUDIO_MODE_EXTENSION_BOUND_4,
-	V4L2_MPEG_AUDIO_MODE_EXTENSION_BOUND_8,
-	V4L2_MPEG_AUDIO_MODE_EXTENSION_BOUND_12,
-	V4L2_MPEG_AUDIO_MODE_EXTENSION_BOUND_16,
+	V4L2_MPEG_AUDIO_MODE_EXTENSION_BOUND_4  = 0,
+	V4L2_MPEG_AUDIO_MODE_EXTENSION_BOUND_8  = 1,
+	V4L2_MPEG_AUDIO_MODE_EXTENSION_BOUND_12 = 2,
+	V4L2_MPEG_AUDIO_MODE_EXTENSION_BOUND_16 = 3,
 };
 #define V4L2_CID_MPEG_AUDIO_EMPHASIS 		(V4L2_CID_MPEG_BASE+107)
 enum v4l2_mpeg_audio_emphasis {
-	V4L2_MPEG_AUDIO_EMPHASIS_NONE,
-	V4L2_MPEG_AUDIO_EMPHASIS_50_DIV_15_uS,
-	V4L2_MPEG_AUDIO_EMPHASIS_CCITT_J17,
+	V4L2_MPEG_AUDIO_EMPHASIS_NONE         = 0,
+	V4L2_MPEG_AUDIO_EMPHASIS_50_DIV_15_uS = 1,
+	V4L2_MPEG_AUDIO_EMPHASIS_CCITT_J17    = 2,
 };
 #define V4L2_CID_MPEG_AUDIO_CRC 		(V4L2_CID_MPEG_BASE+108)
 enum v4l2_mpeg_audio_crc {
-	V4L2_MPEG_AUDIO_CRC_NONE,
-	V4L2_MPEG_AUDIO_CRC_CRC16,
+	V4L2_MPEG_AUDIO_CRC_NONE  = 0,
+	V4L2_MPEG_AUDIO_CRC_CRC16 = 1,
 };
 
 /*  MPEG video */
 #define V4L2_CID_MPEG_VIDEO_ENCODING 		(V4L2_CID_MPEG_BASE+200)
 enum v4l2_mpeg_video_encoding {
-	V4L2_MPEG_VIDEO_ENCODING_MPEG_1,
-	V4L2_MPEG_VIDEO_ENCODING_MPEG_2,
+	V4L2_MPEG_VIDEO_ENCODING_MPEG_1 = 0,
+	V4L2_MPEG_VIDEO_ENCODING_MPEG_2 = 1,
 };
 #define V4L2_CID_MPEG_VIDEO_ASPECT 		(V4L2_CID_MPEG_BASE+201)
 enum v4l2_mpeg_video_aspect {
-	V4L2_MPEG_VIDEO_ASPECT_1x1,
-	V4L2_MPEG_VIDEO_ASPECT_4x3,
-	V4L2_MPEG_VIDEO_ASPECT_16x9,
-	V4L2_MPEG_VIDEO_ASPECT_221x100,
+	V4L2_MPEG_VIDEO_ASPECT_1x1     = 0,
+	V4L2_MPEG_VIDEO_ASPECT_4x3     = 1,
+	V4L2_MPEG_VIDEO_ASPECT_16x9    = 2,
+	V4L2_MPEG_VIDEO_ASPECT_221x100 = 3,
 };
 #define V4L2_CID_MPEG_VIDEO_B_FRAMES 		(V4L2_CID_MPEG_BASE+202)
 #define V4L2_CID_MPEG_VIDEO_GOP_SIZE 		(V4L2_CID_MPEG_BASE+203)
@@ -931,8 +931,8 @@ enum v4l2_mpeg_video_aspect {
 #define V4L2_CID_MPEG_VIDEO_PULLDOWN 		(V4L2_CID_MPEG_BASE+205)
 #define V4L2_CID_MPEG_VIDEO_BITRATE_MODE 	(V4L2_CID_MPEG_BASE+206)
 enum v4l2_mpeg_video_bitrate_mode {
-	V4L2_MPEG_VIDEO_BITRATE_MODE_VBR,
-	V4L2_MPEG_VIDEO_BITRATE_MODE_CBR,
+	V4L2_MPEG_VIDEO_BITRATE_MODE_VBR = 0,
+	V4L2_MPEG_VIDEO_BITRATE_MODE_CBR = 1,
 };
 #define V4L2_CID_MPEG_VIDEO_BITRATE 		(V4L2_CID_MPEG_BASE+207)
 #define V4L2_CID_MPEG_VIDEO_BITRATE_PEAK 	(V4L2_CID_MPEG_BASE+208)
@@ -942,36 +942,36 @@ enum v4l2_mpeg_video_bitrate_mode {
 #define V4L2_CID_MPEG_CX2341X_BASE 				(V4L2_CTRL_CLASS_MPEG | 0x1000)
 #define V4L2_CID_MPEG_CX2341X_VIDEO_SPATIAL_FILTER_MODE 	(V4L2_CID_MPEG_CX2341X_BASE+0)
 enum v4l2_mpeg_cx2341x_video_spatial_filter_mode {
-	V4L2_MPEG_CX2341X_VIDEO_SPATIAL_FILTER_MODE_MANUAL,
-	V4L2_MPEG_CX2341X_VIDEO_SPATIAL_FILTER_MODE_AUTO,
+	V4L2_MPEG_CX2341X_VIDEO_SPATIAL_FILTER_MODE_MANUAL = 0,
+	V4L2_MPEG_CX2341X_VIDEO_SPATIAL_FILTER_MODE_AUTO   = 1,
 };
 #define V4L2_CID_MPEG_CX2341X_VIDEO_SPATIAL_FILTER 		(V4L2_CID_MPEG_CX2341X_BASE+1)
 #define V4L2_CID_MPEG_CX2341X_VIDEO_LUMA_SPATIAL_FILTER_TYPE 	(V4L2_CID_MPEG_CX2341X_BASE+2)
 enum v4l2_mpeg_cx2341x_video_luma_spatial_filter_type {
-	V4L2_MPEG_CX2341X_VIDEO_LUMA_SPATIAL_FILTER_TYPE_OFF,
-	V4L2_MPEG_CX2341X_VIDEO_LUMA_SPATIAL_FILTER_TYPE_1D_HOR,
-	V4L2_MPEG_CX2341X_VIDEO_LUMA_SPATIAL_FILTER_TYPE_1D_VERT,
-	V4L2_MPEG_CX2341X_VIDEO_LUMA_SPATIAL_FILTER_TYPE_2D_HV_SEPARABLE,
-	V4L2_MPEG_CX2341X_VIDEO_LUMA_SPATIAL_FILTER_TYPE_2D_SYM_NON_SEPARABLE,
+	V4L2_MPEG_CX2341X_VIDEO_LUMA_SPATIAL_FILTER_TYPE_OFF                  = 0,
+	V4L2_MPEG_CX2341X_VIDEO_LUMA_SPATIAL_FILTER_TYPE_1D_HOR               = 1,
+	V4L2_MPEG_CX2341X_VIDEO_LUMA_SPATIAL_FILTER_TYPE_1D_VERT              = 2,
+	V4L2_MPEG_CX2341X_VIDEO_LUMA_SPATIAL_FILTER_TYPE_2D_HV_SEPARABLE      = 3,
+	V4L2_MPEG_CX2341X_VIDEO_LUMA_SPATIAL_FILTER_TYPE_2D_SYM_NON_SEPARABLE = 4,
 };
 #define V4L2_CID_MPEG_CX2341X_VIDEO_CHROMA_SPATIAL_FILTER_TYPE 	(V4L2_CID_MPEG_CX2341X_BASE+3)
 enum v4l2_mpeg_cx2341x_video_chroma_spatial_filter_type {
-	V4L2_MPEG_CX2341X_VIDEO_CHROMA_SPATIAL_FILTER_TYPE_OFF,
-	V4L2_MPEG_CX2341X_VIDEO_CHROMA_SPATIAL_FILTER_TYPE_1D_HOR,
+	V4L2_MPEG_CX2341X_VIDEO_CHROMA_SPATIAL_FILTER_TYPE_OFF    = 0,
+	V4L2_MPEG_CX2341X_VIDEO_CHROMA_SPATIAL_FILTER_TYPE_1D_HOR = 1,
 };
 #define V4L2_CID_MPEG_CX2341X_VIDEO_TEMPORAL_FILTER_MODE 	(V4L2_CID_MPEG_CX2341X_BASE+4)
 enum v4l2_mpeg_cx2341x_video_temporal_filter_mode {
-	V4L2_MPEG_CX2341X_VIDEO_TEMPORAL_FILTER_MODE_MANUAL,
-	V4L2_MPEG_CX2341X_VIDEO_TEMPORAL_FILTER_MODE_AUTO,
+	V4L2_MPEG_CX2341X_VIDEO_TEMPORAL_FILTER_MODE_MANUAL = 0,
+	V4L2_MPEG_CX2341X_VIDEO_TEMPORAL_FILTER_MODE_AUTO   = 1,
 };
 #define V4L2_CID_MPEG_CX2341X_VIDEO_TEMPORAL_FILTER 		(V4L2_CID_MPEG_CX2341X_BASE+5)
 #define V4L2_CID_MPEG_CX2341X_VIDEO_MEDIAN_FILTER_TYPE 		(V4L2_CID_MPEG_CX2341X_BASE+6)
 enum v4l2_mpeg_cx2341x_video_median_filter_type {
-	V4L2_MPEG_CX2341X_VIDEO_MEDIAN_FILTER_TYPE_OFF,
-	V4L2_MPEG_CX2341X_VIDEO_MEDIAN_FILTER_TYPE_HOR,
-	V4L2_MPEG_CX2341X_VIDEO_MEDIAN_FILTER_TYPE_VERT,
-	V4L2_MPEG_CX2341X_VIDEO_MEDIAN_FILTER_TYPE_HOR_VERT,
-	V4L2_MPEG_CX2341X_VIDEO_MEDIAN_FILTER_TYPE_DIAG,
+	V4L2_MPEG_CX2341X_VIDEO_MEDIAN_FILTER_TYPE_OFF      = 0,
+	V4L2_MPEG_CX2341X_VIDEO_MEDIAN_FILTER_TYPE_HOR      = 1,
+	V4L2_MPEG_CX2341X_VIDEO_MEDIAN_FILTER_TYPE_VERT     = 2,
+	V4L2_MPEG_CX2341X_VIDEO_MEDIAN_FILTER_TYPE_HOR_VERT = 3,
+	V4L2_MPEG_CX2341X_VIDEO_MEDIAN_FILTER_TYPE_DIAG     = 4,
 };
 #define V4L2_CID_MPEG_CX2341X_VIDEO_LUMA_MEDIAN_FILTER_BOTTOM 	(V4L2_CID_MPEG_CX2341X_BASE+7)
 #define V4L2_CID_MPEG_CX2341X_VIDEO_LUMA_MEDIAN_FILTER_TOP 	(V4L2_CID_MPEG_CX2341X_BASE+8)
diff --git a/include/media/cx2341x.h b/include/media/cx2341x.h
index fb170d4b5235..51fb06b4c394 100644
--- a/include/media/cx2341x.h
+++ b/include/media/cx2341x.h
@@ -20,9 +20,9 @@
 #define CX2341X_H
 
 enum cx2341x_port {
-	CX2341X_PORT_MEMORY,
-	CX2341X_PORT_STREAMING,
-	CX2341X_PORT_SERIAL
+	CX2341X_PORT_MEMORY    = 0,
+	CX2341X_PORT_STREAMING = 1,
+	CX2341X_PORT_SERIAL    = 2
 };
 
 struct cx2341x_mpeg_params {
-- 
cgit v1.2.3


From ccf01ef7aa9c6c293a1c64c27331a2ce227916ec Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sun, 25 Jun 2006 06:27:31 -0400
Subject: Merge branch 'odirect'

---
 fs/nfs/direct.c         | 435 ++++++++++++++++++++++++++----------------------
 include/linux/nfs_xdr.h |   2 -
 2 files changed, 234 insertions(+), 203 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index e25b7595b7ad..402005c35ab3 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -68,19 +68,25 @@ struct nfs_direct_req {
 	struct kref		kref;		/* release manager */
 
 	/* I/O parameters */
+	struct list_head	list,		/* nfs_read/write_data structs */
+				rewrite_list;	/* saved nfs_write_data structs */
 	struct nfs_open_context	*ctx;		/* file open context info */
 	struct kiocb *		iocb;		/* controlling i/o request */
 	struct inode *		inode;		/* target file of i/o */
+	unsigned long		user_addr;	/* location of user's buffer */
+	size_t			user_count;	/* total bytes to move */
+	loff_t			pos;		/* starting offset in file */
+	struct page **		pages;		/* pages in our buffer */
+	unsigned int		npages;		/* count of pages */
 
 	/* completion state */
-	atomic_t		io_count;	/* i/os we're waiting for */
 	spinlock_t		lock;		/* protect completion state */
+	int			outstanding;	/* i/os we're waiting for */
 	ssize_t			count,		/* bytes actually processed */
 				error;		/* any reported error */
 	struct completion	completion;	/* wait for i/o completion */
 
 	/* commit state */
-	struct list_head	rewrite_list;	/* saved nfs_write_data structs */
 	struct nfs_write_data *	commit_data;	/* special write_data for commits */
 	int			flags;
 #define NFS_ODIRECT_DO_COMMIT		(1)	/* an unstable reply was received */
@@ -88,37 +94,8 @@ struct nfs_direct_req {
 	struct nfs_writeverf	verf;		/* unstable write verifier */
 };
 
+static void nfs_direct_write_schedule(struct nfs_direct_req *dreq, int sync);
 static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode);
-static const struct rpc_call_ops nfs_write_direct_ops;
-
-static inline void get_dreq(struct nfs_direct_req *dreq)
-{
-	atomic_inc(&dreq->io_count);
-}
-
-static inline int put_dreq(struct nfs_direct_req *dreq)
-{
-	return atomic_dec_and_test(&dreq->io_count);
-}
-
-/*
- * "size" is never larger than rsize or wsize.
- */
-static inline int nfs_direct_count_pages(unsigned long user_addr, size_t size)
-{
-	int page_count;
-
-	page_count = (user_addr + size + PAGE_SIZE - 1) >> PAGE_SHIFT;
-	page_count -= user_addr >> PAGE_SHIFT;
-	BUG_ON(page_count < 0);
-
-	return page_count;
-}
-
-static inline unsigned int nfs_max_pages(unsigned int size)
-{
-	return (size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-}
 
 /**
  * nfs_direct_IO - NFS address space operation for direct I/O
@@ -142,21 +119,50 @@ ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_
 	return -EINVAL;
 }
 
-static void nfs_direct_dirty_pages(struct page **pages, int npages)
+static void nfs_free_user_pages(struct page **pages, int npages, int do_dirty)
 {
 	int i;
 	for (i = 0; i < npages; i++) {
 		struct page *page = pages[i];
-		if (!PageCompound(page))
+		if (do_dirty && !PageCompound(page))
 			set_page_dirty_lock(page);
+		page_cache_release(page);
 	}
+	kfree(pages);
 }
 
-static void nfs_direct_release_pages(struct page **pages, int npages)
+static inline int nfs_get_user_pages(int rw, unsigned long user_addr, size_t size, struct page ***pages)
 {
-	int i;
-	for (i = 0; i < npages; i++)
-		page_cache_release(pages[i]);
+	int result = -ENOMEM;
+	unsigned long page_count;
+	size_t array_size;
+
+	page_count = (user_addr + size + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	page_count -= user_addr >> PAGE_SHIFT;
+
+	array_size = (page_count * sizeof(struct page *));
+	*pages = kmalloc(array_size, GFP_KERNEL);
+	if (*pages) {
+		down_read(&current->mm->mmap_sem);
+		result = get_user_pages(current, current->mm, user_addr,
+					page_count, (rw == READ), 0,
+					*pages, NULL);
+		up_read(&current->mm->mmap_sem);
+		if (result != page_count) {
+			/*
+			 * If we got fewer pages than expected from
+			 * get_user_pages(), the user buffer runs off the
+			 * end of a mapping; return EFAULT.
+			 */
+			if (result >= 0) {
+				nfs_free_user_pages(*pages, result, 0);
+				result = -EFAULT;
+			} else
+				kfree(*pages);
+			*pages = NULL;
+		}
+	}
+	return result;
 }
 
 static inline struct nfs_direct_req *nfs_direct_req_alloc(void)
@@ -168,13 +174,13 @@ static inline struct nfs_direct_req *nfs_direct_req_alloc(void)
 		return NULL;
 
 	kref_init(&dreq->kref);
-	kref_get(&dreq->kref);
 	init_completion(&dreq->completion);
+	INIT_LIST_HEAD(&dreq->list);
 	INIT_LIST_HEAD(&dreq->rewrite_list);
 	dreq->iocb = NULL;
 	dreq->ctx = NULL;
 	spin_lock_init(&dreq->lock);
-	atomic_set(&dreq->io_count, 0);
+	dreq->outstanding = 0;
 	dreq->count = 0;
 	dreq->error = 0;
 	dreq->flags = 0;
@@ -215,11 +221,18 @@ out:
 }
 
 /*
- * Synchronous I/O uses a stack-allocated iocb.  Thus we can't trust
- * the iocb is still valid here if this is a synchronous request.
+ * We must hold a reference to all the pages in this direct read request
+ * until the RPCs complete.  This could be long *after* we are woken up in
+ * nfs_direct_wait (for instance, if someone hits ^C on a slow server).
+ *
+ * In addition, synchronous I/O uses a stack-allocated iocb.  Thus we
+ * can't trust the iocb is still valid here if this is a synchronous
+ * request.  If the waiter is woken prematurely, the iocb is long gone.
  */
 static void nfs_direct_complete(struct nfs_direct_req *dreq)
 {
+	nfs_free_user_pages(dreq->pages, dreq->npages, 1);
+
 	if (dreq->iocb) {
 		long res = (long) dreq->error;
 		if (!res)
@@ -232,10 +245,48 @@ static void nfs_direct_complete(struct nfs_direct_req *dreq)
 }
 
 /*
- * We must hold a reference to all the pages in this direct read request
- * until the RPCs complete.  This could be long *after* we are woken up in
- * nfs_direct_wait (for instance, if someone hits ^C on a slow server).
+ * Note we also set the number of requests we have in the dreq when we are
+ * done.  This prevents races with I/O completion so we will always wait
+ * until all requests have been dispatched and completed.
  */
+static struct nfs_direct_req *nfs_direct_read_alloc(size_t nbytes, size_t rsize)
+{
+	struct list_head *list;
+	struct nfs_direct_req *dreq;
+	unsigned int rpages = (rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+
+	dreq = nfs_direct_req_alloc();
+	if (!dreq)
+		return NULL;
+
+	list = &dreq->list;
+	for(;;) {
+		struct nfs_read_data *data = nfs_readdata_alloc(rpages);
+
+		if (unlikely(!data)) {
+			while (!list_empty(list)) {
+				data = list_entry(list->next,
+						  struct nfs_read_data, pages);
+				list_del(&data->pages);
+				nfs_readdata_free(data);
+			}
+			kref_put(&dreq->kref, nfs_direct_req_release);
+			return NULL;
+		}
+
+		INIT_LIST_HEAD(&data->pages);
+		list_add(&data->pages, list);
+
+		data->req = (struct nfs_page *) dreq;
+		dreq->outstanding++;
+		if (nbytes <= rsize)
+			break;
+		nbytes -= rsize;
+	}
+	kref_get(&dreq->kref);
+	return dreq;
+}
+
 static void nfs_direct_read_result(struct rpc_task *task, void *calldata)
 {
 	struct nfs_read_data *data = calldata;
@@ -244,9 +295,6 @@ static void nfs_direct_read_result(struct rpc_task *task, void *calldata)
 	if (nfs_readpage_result(task, data) != 0)
 		return;
 
-	nfs_direct_dirty_pages(data->pagevec, data->npages);
-	nfs_direct_release_pages(data->pagevec, data->npages);
-
 	spin_lock(&dreq->lock);
 
 	if (likely(task->tk_status >= 0))
@@ -254,10 +302,13 @@ static void nfs_direct_read_result(struct rpc_task *task, void *calldata)
 	else
 		dreq->error = task->tk_status;
 
-	spin_unlock(&dreq->lock);
+	if (--dreq->outstanding) {
+		spin_unlock(&dreq->lock);
+		return;
+	}
 
-	if (put_dreq(dreq))
-		nfs_direct_complete(dreq);
+	spin_unlock(&dreq->lock);
+	nfs_direct_complete(dreq);
 }
 
 static const struct rpc_call_ops nfs_read_direct_ops = {
@@ -266,60 +317,41 @@ static const struct rpc_call_ops nfs_read_direct_ops = {
 };
 
 /*
- * For each rsize'd chunk of the user's buffer, dispatch an NFS READ
- * operation.  If nfs_readdata_alloc() or get_user_pages() fails,
- * bail and stop sending more reads.  Read length accounting is
- * handled automatically by nfs_direct_read_result().  Otherwise, if
- * no requests have been sent, just return an error.
+ * For each nfs_read_data struct that was allocated on the list, dispatch
+ * an NFS READ operation
  */
-static ssize_t nfs_direct_read_schedule(struct nfs_direct_req *dreq, unsigned long user_addr, size_t count, loff_t pos)
+static void nfs_direct_read_schedule(struct nfs_direct_req *dreq)
 {
 	struct nfs_open_context *ctx = dreq->ctx;
 	struct inode *inode = ctx->dentry->d_inode;
+	struct list_head *list = &dreq->list;
+	struct page **pages = dreq->pages;
+	size_t count = dreq->user_count;
+	loff_t pos = dreq->pos;
 	size_t rsize = NFS_SERVER(inode)->rsize;
-	unsigned int rpages = nfs_max_pages(rsize);
-	unsigned int pgbase;
-	int result;
-	ssize_t started = 0;
-
-	get_dreq(dreq);
+	unsigned int curpage, pgbase;
 
-	pgbase = user_addr & ~PAGE_MASK;
+	curpage = 0;
+	pgbase = dreq->user_addr & ~PAGE_MASK;
 	do {
 		struct nfs_read_data *data;
 		size_t bytes;
 
-		result = -ENOMEM;
-		data = nfs_readdata_alloc(rpages);
-		if (unlikely(!data))
-			break;
-
 		bytes = rsize;
 		if (count < rsize)
 			bytes = count;
 
-		data->npages = nfs_direct_count_pages(user_addr, bytes);
-		down_read(&current->mm->mmap_sem);
-		result = get_user_pages(current, current->mm, user_addr,
-					data->npages, 1, 0, data->pagevec, NULL);
-		up_read(&current->mm->mmap_sem);
-		if (unlikely(result < data->npages)) {
-			if (result > 0)
-				nfs_direct_release_pages(data->pagevec, result);
-			nfs_readdata_release(data);
-			break;
-		}
-
-		get_dreq(dreq);
+		BUG_ON(list_empty(list));
+		data = list_entry(list->next, struct nfs_read_data, pages);
+		list_del_init(&data->pages);
 
-		data->req = (struct nfs_page *) dreq;
 		data->inode = inode;
 		data->cred = ctx->cred;
 		data->args.fh = NFS_FH(inode);
 		data->args.context = ctx;
 		data->args.offset = pos;
 		data->args.pgbase = pgbase;
-		data->args.pages = data->pagevec;
+		data->args.pages = &pages[curpage];
 		data->args.count = bytes;
 		data->res.fattr = &data->fattr;
 		data->res.eof = 0;
@@ -342,35 +374,33 @@ static ssize_t nfs_direct_read_schedule(struct nfs_direct_req *dreq, unsigned lo
 				bytes,
 				(unsigned long long)data->args.offset);
 
-		started += bytes;
-		user_addr += bytes;
 		pos += bytes;
 		pgbase += bytes;
+		curpage += pgbase >> PAGE_SHIFT;
 		pgbase &= ~PAGE_MASK;
 
 		count -= bytes;
 	} while (count != 0);
-
-	if (put_dreq(dreq))
-		nfs_direct_complete(dreq);
-
-	if (started)
-		return 0;
-	return result < 0 ? (ssize_t) result : -EFAULT;
+	BUG_ON(!list_empty(list));
 }
 
-static ssize_t nfs_direct_read(struct kiocb *iocb, unsigned long user_addr, size_t count, loff_t pos)
+static ssize_t nfs_direct_read(struct kiocb *iocb, unsigned long user_addr, size_t count, loff_t pos, struct page **pages, unsigned int nr_pages)
 {
-	ssize_t result = 0;
+	ssize_t result;
 	sigset_t oldset;
 	struct inode *inode = iocb->ki_filp->f_mapping->host;
 	struct rpc_clnt *clnt = NFS_CLIENT(inode);
 	struct nfs_direct_req *dreq;
 
-	dreq = nfs_direct_req_alloc();
+	dreq = nfs_direct_read_alloc(count, NFS_SERVER(inode)->rsize);
 	if (!dreq)
 		return -ENOMEM;
 
+	dreq->user_addr = user_addr;
+	dreq->user_count = count;
+	dreq->pos = pos;
+	dreq->pages = pages;
+	dreq->npages = nr_pages;
 	dreq->inode = inode;
 	dreq->ctx = get_nfs_open_context((struct nfs_open_context *)iocb->ki_filp->private_data);
 	if (!is_sync_kiocb(iocb))
@@ -378,9 +408,8 @@ static ssize_t nfs_direct_read(struct kiocb *iocb, unsigned long user_addr, size
 
 	nfs_add_stats(inode, NFSIOS_DIRECTREADBYTES, count);
 	rpc_clnt_sigmask(clnt, &oldset);
-	result = nfs_direct_read_schedule(dreq, user_addr, count, pos);
-	if (!result)
-		result = nfs_direct_wait(dreq);
+	nfs_direct_read_schedule(dreq);
+	result = nfs_direct_wait(dreq);
 	rpc_clnt_sigunmask(clnt, &oldset);
 
 	return result;
@@ -388,10 +417,10 @@ static ssize_t nfs_direct_read(struct kiocb *iocb, unsigned long user_addr, size
 
 static void nfs_direct_free_writedata(struct nfs_direct_req *dreq)
 {
-	while (!list_empty(&dreq->rewrite_list)) {
-		struct nfs_write_data *data = list_entry(dreq->rewrite_list.next, struct nfs_write_data, pages);
+	list_splice_init(&dreq->rewrite_list, &dreq->list);
+	while (!list_empty(&dreq->list)) {
+		struct nfs_write_data *data = list_entry(dreq->list.next, struct nfs_write_data, pages);
 		list_del(&data->pages);
-		nfs_direct_release_pages(data->pagevec, data->npages);
 		nfs_writedata_release(data);
 	}
 }
@@ -399,51 +428,14 @@ static void nfs_direct_free_writedata(struct nfs_direct_req *dreq)
 #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
 static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
 {
-	struct inode *inode = dreq->inode;
-	struct list_head *p;
-	struct nfs_write_data *data;
+	struct list_head *pos;
 
+	list_splice_init(&dreq->rewrite_list, &dreq->list);
+	list_for_each(pos, &dreq->list)
+		dreq->outstanding++;
 	dreq->count = 0;
-	get_dreq(dreq);
-
-	list_for_each(p, &dreq->rewrite_list) {
-		data = list_entry(p, struct nfs_write_data, pages);
-
-		get_dreq(dreq);
-
-		/*
-		 * Reset data->res.
-		 */
-		nfs_fattr_init(&data->fattr);
-		data->res.count = data->args.count;
-		memset(&data->verf, 0, sizeof(data->verf));
-
-		/*
-		 * Reuse data->task; data->args should not have changed
-		 * since the original request was sent.
-		 */
-		rpc_init_task(&data->task, NFS_CLIENT(inode), RPC_TASK_ASYNC,
-				&nfs_write_direct_ops, data);
-		NFS_PROTO(inode)->write_setup(data, FLUSH_STABLE);
-
-		data->task.tk_priority = RPC_PRIORITY_NORMAL;
-		data->task.tk_cookie = (unsigned long) inode;
-
-		/*
-		 * We're called via an RPC callback, so BKL is already held.
-		 */
-		rpc_execute(&data->task);
-
-		dprintk("NFS: %5u rescheduled direct write call (req %s/%Ld, %u bytes @ offset %Lu)\n",
-				data->task.tk_pid,
-				inode->i_sb->s_id,
-				(long long)NFS_FILEID(inode),
-				data->args.count,
-				(unsigned long long)data->args.offset);
-	}
 
-	if (put_dreq(dreq))
-		nfs_direct_write_complete(dreq, inode);
+	nfs_direct_write_schedule(dreq, FLUSH_STABLE);
 }
 
 static void nfs_direct_commit_result(struct rpc_task *task, void *calldata)
@@ -480,8 +472,8 @@ static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
 	data->cred = dreq->ctx->cred;
 
 	data->args.fh = NFS_FH(data->inode);
-	data->args.offset = 0;
-	data->args.count = 0;
+	data->args.offset = dreq->pos;
+	data->args.count = dreq->user_count;
 	data->res.count = 0;
 	data->res.fattr = &data->fattr;
 	data->res.verf = &data->verf;
@@ -543,6 +535,47 @@ static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode
 }
 #endif
 
+static struct nfs_direct_req *nfs_direct_write_alloc(size_t nbytes, size_t wsize)
+{
+	struct list_head *list;
+	struct nfs_direct_req *dreq;
+	unsigned int wpages = (wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+
+	dreq = nfs_direct_req_alloc();
+	if (!dreq)
+		return NULL;
+
+	list = &dreq->list;
+	for(;;) {
+		struct nfs_write_data *data = nfs_writedata_alloc(wpages);
+
+		if (unlikely(!data)) {
+			while (!list_empty(list)) {
+				data = list_entry(list->next,
+						  struct nfs_write_data, pages);
+				list_del(&data->pages);
+				nfs_writedata_free(data);
+			}
+			kref_put(&dreq->kref, nfs_direct_req_release);
+			return NULL;
+		}
+
+		INIT_LIST_HEAD(&data->pages);
+		list_add(&data->pages, list);
+
+		data->req = (struct nfs_page *) dreq;
+		dreq->outstanding++;
+		if (nbytes <= wsize)
+			break;
+		nbytes -= wsize;
+	}
+
+	nfs_alloc_commit_data(dreq);
+
+	kref_get(&dreq->kref);
+	return dreq;
+}
+
 static void nfs_direct_write_result(struct rpc_task *task, void *calldata)
 {
 	struct nfs_write_data *data = calldata;
@@ -572,6 +605,8 @@ static void nfs_direct_write_result(struct rpc_task *task, void *calldata)
 				}
 		}
 	}
+	/* In case we have to resend */
+	data->args.stable = NFS_FILE_SYNC;
 
 	spin_unlock(&dreq->lock);
 }
@@ -585,8 +620,14 @@ static void nfs_direct_write_release(void *calldata)
 	struct nfs_write_data *data = calldata;
 	struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req;
 
-	if (put_dreq(dreq))
-		nfs_direct_write_complete(dreq, data->inode);
+	spin_lock(&dreq->lock);
+	if (--dreq->outstanding) {
+		spin_unlock(&dreq->lock);
+		return;
+	}
+	spin_unlock(&dreq->lock);
+
+	nfs_direct_write_complete(dreq, data->inode);
 }
 
 static const struct rpc_call_ops nfs_write_direct_ops = {
@@ -595,62 +636,41 @@ static const struct rpc_call_ops nfs_write_direct_ops = {
 };
 
 /*
- * For each wsize'd chunk of the user's buffer, dispatch an NFS WRITE
- * operation.  If nfs_writedata_alloc() or get_user_pages() fails,
- * bail and stop sending more writes.  Write length accounting is
- * handled automatically by nfs_direct_write_result().  Otherwise, if
- * no requests have been sent, just return an error.
+ * For each nfs_write_data struct that was allocated on the list, dispatch
+ * an NFS WRITE operation
  */
-static ssize_t nfs_direct_write_schedule(struct nfs_direct_req *dreq, unsigned long user_addr, size_t count, loff_t pos, int sync)
+static void nfs_direct_write_schedule(struct nfs_direct_req *dreq, int sync)
 {
 	struct nfs_open_context *ctx = dreq->ctx;
 	struct inode *inode = ctx->dentry->d_inode;
+	struct list_head *list = &dreq->list;
+	struct page **pages = dreq->pages;
+	size_t count = dreq->user_count;
+	loff_t pos = dreq->pos;
 	size_t wsize = NFS_SERVER(inode)->wsize;
-	unsigned int wpages = nfs_max_pages(wsize);
-	unsigned int pgbase;
-	int result;
-	ssize_t started = 0;
+	unsigned int curpage, pgbase;
 
-	get_dreq(dreq);
-
-	pgbase = user_addr & ~PAGE_MASK;
+	curpage = 0;
+	pgbase = dreq->user_addr & ~PAGE_MASK;
 	do {
 		struct nfs_write_data *data;
 		size_t bytes;
 
-		result = -ENOMEM;
-		data = nfs_writedata_alloc(wpages);
-		if (unlikely(!data))
-			break;
-
 		bytes = wsize;
 		if (count < wsize)
 			bytes = count;
 
-		data->npages = nfs_direct_count_pages(user_addr, bytes);
-		down_read(&current->mm->mmap_sem);
-		result = get_user_pages(current, current->mm, user_addr,
-					data->npages, 0, 0, data->pagevec, NULL);
-		up_read(&current->mm->mmap_sem);
-		if (unlikely(result < data->npages)) {
-			if (result > 0)
-				nfs_direct_release_pages(data->pagevec, result);
-			nfs_writedata_release(data);
-			break;
-		}
-
-		get_dreq(dreq);
-
+		BUG_ON(list_empty(list));
+		data = list_entry(list->next, struct nfs_write_data, pages);
 		list_move_tail(&data->pages, &dreq->rewrite_list);
 
-		data->req = (struct nfs_page *) dreq;
 		data->inode = inode;
 		data->cred = ctx->cred;
 		data->args.fh = NFS_FH(inode);
 		data->args.context = ctx;
 		data->args.offset = pos;
 		data->args.pgbase = pgbase;
-		data->args.pages = data->pagevec;
+		data->args.pages = &pages[curpage];
 		data->args.count = bytes;
 		data->res.fattr = &data->fattr;
 		data->res.count = bytes;
@@ -674,26 +694,19 @@ static ssize_t nfs_direct_write_schedule(struct nfs_direct_req *dreq, unsigned l
 				bytes,
 				(unsigned long long)data->args.offset);
 
-		started += bytes;
-		user_addr += bytes;
 		pos += bytes;
 		pgbase += bytes;
+		curpage += pgbase >> PAGE_SHIFT;
 		pgbase &= ~PAGE_MASK;
 
 		count -= bytes;
 	} while (count != 0);
-
-	if (put_dreq(dreq))
-		nfs_direct_write_complete(dreq, inode);
-
-	if (started)
-		return 0;
-	return result < 0 ? (ssize_t) result : -EFAULT;
+	BUG_ON(!list_empty(list));
 }
 
-static ssize_t nfs_direct_write(struct kiocb *iocb, unsigned long user_addr, size_t count, loff_t pos)
+static ssize_t nfs_direct_write(struct kiocb *iocb, unsigned long user_addr, size_t count, loff_t pos, struct page **pages, int nr_pages)
 {
-	ssize_t result = 0;
+	ssize_t result;
 	sigset_t oldset;
 	struct inode *inode = iocb->ki_filp->f_mapping->host;
 	struct rpc_clnt *clnt = NFS_CLIENT(inode);
@@ -701,14 +714,17 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, unsigned long user_addr, siz
 	size_t wsize = NFS_SERVER(inode)->wsize;
 	int sync = 0;
 
-	dreq = nfs_direct_req_alloc();
+	dreq = nfs_direct_write_alloc(count, wsize);
 	if (!dreq)
 		return -ENOMEM;
-	nfs_alloc_commit_data(dreq);
-
 	if (dreq->commit_data == NULL || count < wsize)
 		sync = FLUSH_STABLE;
 
+	dreq->user_addr = user_addr;
+	dreq->user_count = count;
+	dreq->pos = pos;
+	dreq->pages = pages;
+	dreq->npages = nr_pages;
 	dreq->inode = inode;
 	dreq->ctx = get_nfs_open_context((struct nfs_open_context *)iocb->ki_filp->private_data);
 	if (!is_sync_kiocb(iocb))
@@ -719,9 +735,8 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, unsigned long user_addr, siz
 	nfs_begin_data_update(inode);
 
 	rpc_clnt_sigmask(clnt, &oldset);
-	result = nfs_direct_write_schedule(dreq, user_addr, count, pos, sync);
-	if (!result)
-		result = nfs_direct_wait(dreq);
+	nfs_direct_write_schedule(dreq, sync);
+	result = nfs_direct_wait(dreq);
 	rpc_clnt_sigunmask(clnt, &oldset);
 
 	return result;
@@ -751,6 +766,8 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, unsigned long user_addr, siz
 ssize_t nfs_file_direct_read(struct kiocb *iocb, char __user *buf, size_t count, loff_t pos)
 {
 	ssize_t retval = -EINVAL;
+	int page_count;
+	struct page **pages;
 	struct file *file = iocb->ki_filp;
 	struct address_space *mapping = file->f_mapping;
 
@@ -772,7 +789,14 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, char __user *buf, size_t count,
 	if (retval)
 		goto out;
 
-	retval = nfs_direct_read(iocb, (unsigned long) buf, count, pos);
+	retval = nfs_get_user_pages(READ, (unsigned long) buf,
+						count, &pages);
+	if (retval < 0)
+		goto out;
+	page_count = retval;
+
+	retval = nfs_direct_read(iocb, (unsigned long) buf, count, pos,
+						pages, page_count);
 	if (retval > 0)
 		iocb->ki_pos = pos + retval;
 
@@ -808,6 +832,8 @@ out:
 ssize_t nfs_file_direct_write(struct kiocb *iocb, const char __user *buf, size_t count, loff_t pos)
 {
 	ssize_t retval;
+	int page_count;
+	struct page **pages;
 	struct file *file = iocb->ki_filp;
 	struct address_space *mapping = file->f_mapping;
 
@@ -835,7 +861,14 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, const char __user *buf, size_t
 	if (retval)
 		goto out;
 
-	retval = nfs_direct_write(iocb, (unsigned long) buf, count, pos);
+	retval = nfs_get_user_pages(WRITE, (unsigned long) buf,
+						count, &pages);
+	if (retval < 0)
+		goto out;
+	page_count = retval;
+
+	retval = nfs_direct_write(iocb, (unsigned long) buf, count,
+					pos, pages, page_count);
 
 	/*
 	 * XXX: nfs_end_data_update() already ensures this file's
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 2d3fb6416d91..7c7320fa51aa 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -729,7 +729,6 @@ struct nfs_read_data {
 	struct list_head	pages;	/* Coalesced read requests */
 	struct nfs_page		*req;	/* multi ops per nfs_page */
 	struct page		**pagevec;
-	unsigned int		npages;	/* active pages in pagevec */
 	struct nfs_readargs args;
 	struct nfs_readres  res;
 #ifdef CONFIG_NFS_V4
@@ -748,7 +747,6 @@ struct nfs_write_data {
 	struct list_head	pages;		/* Coalesced requests we wish to flush */
 	struct nfs_page		*req;		/* multi ops per nfs_page */
 	struct page		**pagevec;
-	unsigned int		npages;		/* active pages in pagevec */
 	struct nfs_writeargs	args;		/* argument struct */
 	struct nfs_writeres	res;		/* result struct */
 #ifdef CONFIG_NFS_V4
-- 
cgit v1.2.3


From d75d54147db9db5194040bd1c5022df6ba36ee48 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 25 Jun 2006 02:41:26 -0700
Subject: git-nfs-build-fixes

Fix various problems with nfs4 disabled.  And various other things.

In file included from fs/nfs/inode.c:50:
fs/nfs/internal.h:24: error: static declaration of 'nfs_do_refmount' follows non-static declaration
include/linux/nfs_fs.h:320: error: previous declaration of 'nfs_do_refmount' was here
fs/nfs/internal.h:65: warning: 'struct nfs4_fs_locations' declared inside parameter list
fs/nfs/internal.h:65: warning: its scope is only this definition or declaration, which is probably not what you want
fs/nfs/internal.h: In function 'nfs4_path':
fs/nfs/internal.h:97: error: 'struct nfs_server' has no member named 'mnt_path'
fs/nfs/inode.c: In function 'init_once':
fs/nfs/inode.c:1116: error: 'struct nfs_inode' has no member named 'open_states'
fs/nfs/inode.c:1116: error: 'struct nfs_inode' has no member named 'delegation'
fs/nfs/inode.c:1116: error: 'struct nfs_inode' has no member named 'delegation_state'
fs/nfs/inode.c:1116: error: 'struct nfs_inode' has no member named 'rwsem'
distcc[26452] ERROR: compile fs/nfs/inode.c on g5/64 failed
make[1]: *** [fs/nfs/inode.o] Error 1
make: *** [fs/nfs/inode.o] Error 2
make: *** Waiting for unfinished jobs....
In file included from fs/nfs/nfs3xdr.c:26:
fs/nfs/internal.h:24: error: static declaration of 'nfs_do_refmount' follows non-static declaration
include/linux/nfs_fs.h:320: error: previous declaration of 'nfs_do_refmount' was here
fs/nfs/internal.h:65: warning: 'struct nfs4_fs_locations' declared inside parameter list
fs/nfs/internal.h:65: warning: its scope is only this definition or declaration, which is probably not what you want
fs/nfs/internal.h: In function 'nfs4_path':
fs/nfs/internal.h:97: error: 'struct nfs_server' has no member named 'mnt_path'
distcc[26486] ERROR: compile fs/nfs/nfs3xdr.c on g5/64 failed
make[1]: *** [fs/nfs/nfs3xdr.o] Error 1
make: *** [fs/nfs/nfs3xdr.o] Error 2
In file included from fs/nfs/nfs3proc.c:24:
fs/nfs/internal.h:24: error: static declaration of 'nfs_do_refmount' follows non-static declaration
include/linux/nfs_fs.h:320: error: previous declaration of 'nfs_do_refmount' was here
fs/nfs/internal.h:65: warning: 'struct nfs4_fs_locations' declared inside parameter list
fs/nfs/internal.h:65: warning: its scope is only this definition or declaration, which is probably not what you want
fs/nfs/internal.h: In function 'nfs4_path':
fs/nfs/internal.h:97: error: 'struct nfs_server' has no member named 'mnt_path'
distcc[26469] ERROR: compile fs/nfs/nfs3proc.c on bix/32 failed
make[1]: *** [fs/nfs/nfs3proc.o] Error 1
make: *** [fs/nfs/nfs3proc.o] Error 2
**FAILED**

Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Andreas Gruenbacher <agruen@suse.de>
Cc: Andy Adamson <andros@citi.umich.edu>
Cc: Chuck Lever <cel@netapp.com>
Cc: David Howells <dhowells@redhat.com>
Cc: J. Bruce Fields <bfields@fieldses.org>
Cc: Manoj Naik <manoj@almaden.ibm.com>
Cc: Marc Eshel <eshel@almaden.ibm.com>
Cc: Trond Myklebust <Trond.Myklebust@netapp.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/inode.c         | 16 +++++++++-------
 fs/nfs/internal.h      |  9 ++++++++-
 fs/nfs/nfs2xdr.c       |  2 ++
 include/linux/nfs_fs.h |  2 --
 4 files changed, 19 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 24a7139d3449..51bc88b662fe 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1089,13 +1089,15 @@ void nfs_destroy_inode(struct inode *inode)
 	kmem_cache_free(nfs_inode_cachep, NFS_I(inode));
 }
 
-#define nfs4_init_once(nfsi) \
-	do { \
-		INIT_LIST_HEAD(&(nfsi)->open_states); \
-		nfsi->delegation = NULL; \
-		nfsi->delegation_state = 0; \
-		init_rwsem(&nfsi->rwsem); \
-	} while(0)
+static inline void nfs4_init_once(struct nfs_inode *nfsi)
+{
+#ifdef CONFIG_NFS_V4
+	INIT_LIST_HEAD(&nfsi->open_states);
+	nfsi->delegation = NULL;
+	nfsi->delegation_state = 0;
+	init_rwsem(&nfsi->rwsem);
+#endif
+}
 
 static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags)
 {
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 5e51c4535b6f..bd2815e2dec1 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -58,11 +58,13 @@ extern int nfs_stat_to_errno(int);
 extern u32 *nfs4_decode_dirent(u32 *p, struct nfs_entry *entry, int plus);
 
 /* nfs4proc.c */
+#ifdef CONFIG_NFS_V4
 extern struct rpc_procinfo nfs4_procedures[];
 
 extern int nfs4_proc_fs_locations(struct inode *dir, struct dentry *dentry,
 				  struct nfs4_fs_locations *fs_locations,
 				  struct page *page);
+#endif
 
 /* inode.c */
 extern struct inode *nfs_alloc_inode(struct super_block *sb);
@@ -92,9 +94,14 @@ extern char *nfs_path(const char *base, const struct dentry *dentry,
 /*
  * Determine the mount path as a string
  */
-static inline char *nfs4_path(const struct dentry *dentry, char *buffer, ssize_t buflen)
+static inline char *
+nfs4_path(const struct dentry *dentry, char *buffer, ssize_t buflen)
 {
+#ifdef CONFIG_NFS_V4
 	return nfs_path(NFS_SB(dentry->d_sb)->mnt_path, dentry, buffer, buflen);
+#else
+	return NULL;
+#endif
 }
 
 /*
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index 67391eef6b93..3b939e055a08 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -25,6 +25,8 @@
 #include <linux/nfs_fs.h>
 #include "internal.h"
 
+#include "internal.h"
+
 #define NFSDBG_FACILITY		NFSDBG_XDR
 /* #define NFS_PARANOIA 1 */
 
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 152798949113..0a1740b2532e 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -317,8 +317,6 @@ extern struct vfsmount *nfs_do_submount(const struct vfsmount *mnt_parent,
 					const struct dentry *dentry,
 					struct nfs_fh *fh,
 					struct nfs_fattr *fattr);
-extern struct vfsmount *nfs_do_refmount(const struct vfsmount *mnt_parent,
-					struct dentry *dentry);
 
 /* linux/net/ipv4/ipconfig.c: trims ip addr off front of name, too. */
 extern u32 root_nfs_parse_addr(char *name); /*__init*/
-- 
cgit v1.2.3


From fb1bb34d45400f12e0a33f8c487b3795674908a7 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 25 Jun 2006 05:46:43 -0700
Subject: [PATCH] remove for_each_cpu()

Convert a few stragglers over to for_each_possible_cpu(), remove
for_each_cpu().

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/cpu/cpufreq/acpi-cpufreq.c       | 6 +++---
 arch/i386/kernel/cpu/cpufreq/speedstep-centrino.c | 6 +++---
 arch/ia64/kernel/topology.c                       | 2 +-
 arch/powerpc/platforms/cell/interrupt.c           | 2 +-
 include/linux/cpumask.h                           | 1 -
 net/ipv4/netfilter/ip_tables.c                    | 2 +-
 6 files changed, 9 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/arch/i386/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/i386/kernel/cpu/cpufreq/acpi-cpufreq.c
index 05668e3598c0..5fd65325b81a 100644
--- a/arch/i386/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/i386/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -371,11 +371,11 @@ static int acpi_cpufreq_early_init_acpi(void)
 
 	dprintk("acpi_cpufreq_early_init\n");
 
-	for_each_cpu(i) {
+	for_each_possible_cpu(i) {
 		data = kzalloc(sizeof(struct acpi_processor_performance), 
 			GFP_KERNEL);
 		if (!data) {
-			for_each_cpu(j) {
+			for_each_possible_cpu(j) {
 				kfree(acpi_perf_data[j]);
 				acpi_perf_data[j] = NULL;
 			}
@@ -584,7 +584,7 @@ acpi_cpufreq_exit (void)
 
 	cpufreq_unregister_driver(&acpi_cpufreq_driver);
 
-	for_each_cpu(i) {
+	for_each_possible_cpu(i) {
 		kfree(acpi_perf_data[i]);
 		acpi_perf_data[i] = NULL;
 	}
diff --git a/arch/i386/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/i386/kernel/cpu/cpufreq/speedstep-centrino.c
index 31c3a5baaa7f..f7e4356f6820 100644
--- a/arch/i386/kernel/cpu/cpufreq/speedstep-centrino.c
+++ b/arch/i386/kernel/cpu/cpufreq/speedstep-centrino.c
@@ -361,11 +361,11 @@ static int centrino_cpu_early_init_acpi(void)
 	unsigned int	i, j;
 	struct acpi_processor_performance	*data;
 
-	for_each_cpu(i) {
+	for_each_possible_cpu(i) {
 		data = kzalloc(sizeof(struct acpi_processor_performance), 
 				GFP_KERNEL);
 		if (!data) {
-			for_each_cpu(j) {
+			for_each_possible_cpu(j) {
 				kfree(acpi_perf_data[j]);
 				acpi_perf_data[j] = NULL;
 			}
@@ -805,7 +805,7 @@ static void __exit centrino_exit(void)
 	cpufreq_unregister_driver(&centrino_driver);
 
 #ifdef CONFIG_X86_SPEEDSTEP_CENTRINO_ACPI
-	for_each_cpu(j) {
+	for_each_possible_cpu(j) {
 		kfree(acpi_perf_data[j]);
 		acpi_perf_data[j] = NULL;
 	}
diff --git a/arch/ia64/kernel/topology.c b/arch/ia64/kernel/topology.c
index 4f3a16b37f8f..879edb51d1e0 100644
--- a/arch/ia64/kernel/topology.c
+++ b/arch/ia64/kernel/topology.c
@@ -166,7 +166,7 @@ static void cache_shared_cpu_map_setup( unsigned int cpu,
 
 	num_shared = (int) csi.num_shared;
 	do {
-		for_each_cpu(j)
+		for_each_possible_cpu(j)
 			if (cpu_data(cpu)->socket_id == cpu_data(j)->socket_id
 				&& cpu_data(j)->core_id == csi.log1_cid
 				&& cpu_data(j)->thread_id == csi.log1_tid)
diff --git a/arch/powerpc/platforms/cell/interrupt.c b/arch/powerpc/platforms/cell/interrupt.c
index f4e2d8805c9e..1bbf822b4efc 100644
--- a/arch/powerpc/platforms/cell/interrupt.c
+++ b/arch/powerpc/platforms/cell/interrupt.c
@@ -180,7 +180,7 @@ static int setup_iic_hardcoded(void)
 	unsigned long regs;
 	struct iic *iic;
 
-	for_each_cpu(cpu) {
+	for_each_possible_cpu(cpu) {
 		iic = &per_cpu(iic, cpu);
 		nodeid = cpu/2;
 
diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index 9cbb781d6f80..fb5b761e3444 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -405,7 +405,6 @@ int __any_online_cpu(const cpumask_t *mask);
 #define any_online_cpu(mask)		0
 #endif
 
-#define for_each_cpu(cpu)  for_each_cpu_mask((cpu), cpu_possible_map)
 #define for_each_possible_cpu(cpu)  for_each_cpu_mask((cpu), cpu_possible_map)
 #define for_each_online_cpu(cpu)  for_each_cpu_mask((cpu), cpu_online_map)
 #define for_each_present_cpu(cpu) for_each_cpu_mask((cpu), cpu_present_map)
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index cee3397ec277..706c0025ec5e 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -1761,7 +1761,7 @@ translate_compat_table(const char *name,
 		goto free_newinfo;
 
 	/* And one copy for every other CPU */
-	for_each_cpu(i)
+	for_each_possible_cpu(i)
 		if (newinfo->entries[i] && newinfo->entries[i] != entry1)
 			memcpy(newinfo->entries[i], entry1, newinfo->size);
 
-- 
cgit v1.2.3


From 68402ddc677005ed1b1359bbc1f279548cfc0928 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Sun, 25 Jun 2006 05:46:47 -0700
Subject: [PATCH] mm: remove VM_LOCKED before remap_pfn_range and drop VM_SHM

Remove VM_LOCKED before remap_pfn range from device drivers and get rid of
VM_SHM.

remap_pfn_range() already sets VM_IO.  There is no need to set VM_SHM since
it does nothing.  VM_LOCKED is of no use since the remap_pfn_range does not
place pages on the LRU.  The pages are therefore never subject to swap
anyways.  Remove all the vm_flags settings before calling remap_pfn_range.

After removing all the vm_flag settings no use of VM_SHM is left.  Drop it.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Acked-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/arm/kernel/bios32.c              |  1 -
 arch/cris/arch-v32/drivers/pci/bios.c |  2 --
 arch/i386/pci/i386.c                  |  2 --
 arch/ia64/pci/pci.c                   |  3 ---
 arch/powerpc/kernel/pci_32.c          |  1 -
 arch/powerpc/kernel/pci_64.c          |  1 -
 arch/powerpc/kernel/proc_ppc64.c      |  2 --
 arch/ppc/kernel/pci.c                 |  1 -
 arch/xtensa/kernel/pci.c              | 12 ------------
 drivers/char/mmtimer.c                |  1 -
 drivers/sbus/char/flash.c             |  1 -
 drivers/sbus/char/vfc_dev.c           |  2 +-
 drivers/video/igafb.c                 |  3 ---
 include/linux/mm.h                    |  1 -
 14 files changed, 1 insertion(+), 32 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/kernel/bios32.c b/arch/arm/kernel/bios32.c
index de606dfa8db9..302fc1401547 100644
--- a/arch/arm/kernel/bios32.c
+++ b/arch/arm/kernel/bios32.c
@@ -702,7 +702,6 @@ int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
 	/*
 	 * Mark this as IO
 	 */
-	vma->vm_flags |= VM_SHM | VM_LOCKED | VM_IO;
 	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
 
 	if (remap_pfn_range(vma, vma->vm_start, phys,
diff --git a/arch/cris/arch-v32/drivers/pci/bios.c b/arch/cris/arch-v32/drivers/pci/bios.c
index 24bc149889b6..1e9d062103ae 100644
--- a/arch/cris/arch-v32/drivers/pci/bios.c
+++ b/arch/cris/arch-v32/drivers/pci/bios.c
@@ -27,8 +27,6 @@ int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
 	/* Leave vm_pgoff as-is, the PCI space address is the physical
 	 * address on this platform.
 	 */
-	vma->vm_flags |= (VM_SHM | VM_LOCKED | VM_IO);
-
 	prot = pgprot_val(vma->vm_page_prot);
 	vma->vm_page_prot = __pgprot(prot);
 
diff --git a/arch/i386/pci/i386.c b/arch/i386/pci/i386.c
index 7852827a599b..a151f7a99f5e 100644
--- a/arch/i386/pci/i386.c
+++ b/arch/i386/pci/i386.c
@@ -285,8 +285,6 @@ int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
 	/* Leave vm_pgoff as-is, the PCI space address is the physical
 	 * address on this platform.
 	 */
-	vma->vm_flags |= (VM_SHM | VM_LOCKED | VM_IO);
-
 	prot = pgprot_val(vma->vm_page_prot);
 	if (boot_cpu_data.x86 > 3)
 		prot |= _PAGE_PCD | _PAGE_PWT;
diff --git a/arch/ia64/pci/pci.c b/arch/ia64/pci/pci.c
index 61dd8608da4f..77375a55da31 100644
--- a/arch/ia64/pci/pci.c
+++ b/arch/ia64/pci/pci.c
@@ -602,8 +602,6 @@ pci_mmap_page_range (struct pci_dev *dev, struct vm_area_struct *vma,
 	 * Leave vm_pgoff as-is, the PCI space address is the physical
 	 * address on this platform.
 	 */
-	vma->vm_flags |= (VM_SHM | VM_RESERVED | VM_IO);
-
 	if (write_combine && efi_range_is_wc(vma->vm_start,
 					     vma->vm_end - vma->vm_start))
 		vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
@@ -666,7 +664,6 @@ pci_mmap_legacy_page_range(struct pci_bus *bus, struct vm_area_struct *vma)
 
 	vma->vm_pgoff += (unsigned long)addr >> PAGE_SHIFT;
 	vma->vm_page_prot = prot;
-	vma->vm_flags |= (VM_SHM | VM_RESERVED | VM_IO);
 
 	if (remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
 			    size, vma->vm_page_prot))
diff --git a/arch/powerpc/kernel/pci_32.c b/arch/powerpc/kernel/pci_32.c
index c858eb4bef17..b5431ccf1147 100644
--- a/arch/powerpc/kernel/pci_32.c
+++ b/arch/powerpc/kernel/pci_32.c
@@ -1654,7 +1654,6 @@ int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
 		return -EINVAL;
 
 	vma->vm_pgoff = offset >> PAGE_SHIFT;
-	vma->vm_flags |= VM_SHM | VM_LOCKED | VM_IO;
 	vma->vm_page_prot = __pci_mmap_set_pgprot(dev, rp,
 						  vma->vm_page_prot,
 						  mmap_state, write_combine);
diff --git a/arch/powerpc/kernel/pci_64.c b/arch/powerpc/kernel/pci_64.c
index 5ad87c426bed..247937dd8b73 100644
--- a/arch/powerpc/kernel/pci_64.c
+++ b/arch/powerpc/kernel/pci_64.c
@@ -877,7 +877,6 @@ int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
 		return -EINVAL;
 
 	vma->vm_pgoff = offset >> PAGE_SHIFT;
-	vma->vm_flags |= VM_SHM | VM_LOCKED | VM_IO;
 	vma->vm_page_prot = __pci_mmap_set_pgprot(dev, rp,
 						  vma->vm_page_prot,
 						  mmap_state, write_combine);
diff --git a/arch/powerpc/kernel/proc_ppc64.c b/arch/powerpc/kernel/proc_ppc64.c
index 2b87f82df135..2ab8f2be911e 100644
--- a/arch/powerpc/kernel/proc_ppc64.c
+++ b/arch/powerpc/kernel/proc_ppc64.c
@@ -115,8 +115,6 @@ static int page_map_mmap( struct file *file, struct vm_area_struct *vma )
 {
 	struct proc_dir_entry *dp = PDE(file->f_dentry->d_inode);
 
-	vma->vm_flags |= VM_SHM | VM_LOCKED;
-
 	if ((vma->vm_end - vma->vm_start) > dp->size)
 		return -EINVAL;
 
diff --git a/arch/ppc/kernel/pci.c b/arch/ppc/kernel/pci.c
index 809673a36f7a..d20accf9650d 100644
--- a/arch/ppc/kernel/pci.c
+++ b/arch/ppc/kernel/pci.c
@@ -1032,7 +1032,6 @@ int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
 		return -EINVAL;
 
 	vma->vm_pgoff = offset >> PAGE_SHIFT;
-	vma->vm_flags |= VM_SHM | VM_LOCKED | VM_IO;
 	vma->vm_page_prot = __pci_mmap_set_pgprot(dev, rp,
 						  vma->vm_page_prot,
 						  mmap_state, write_combine);
diff --git a/arch/xtensa/kernel/pci.c b/arch/xtensa/kernel/pci.c
index de19501aa809..c6f471b9eaa0 100644
--- a/arch/xtensa/kernel/pci.c
+++ b/arch/xtensa/kernel/pci.c
@@ -349,17 +349,6 @@ __pci_mmap_make_offset(struct pci_dev *dev, struct vm_area_struct *vma,
 	return -EINVAL;
 }
 
-/*
- * Set vm_flags of VMA, as appropriate for this architecture, for a pci device
- * mapping.
- */
-static __inline__ void
-__pci_mmap_set_flags(struct pci_dev *dev, struct vm_area_struct *vma,
-		     enum pci_mmap_state mmap_state)
-{
-	vma->vm_flags |= VM_SHM | VM_LOCKED | VM_IO;
-}
-
 /*
  * Set vm_page_prot of VMA, as appropriate for this architecture, for a pci
  * device mapping.
@@ -399,7 +388,6 @@ int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
 	if (ret < 0)
 		return ret;
 
-	__pci_mmap_set_flags(dev, vma, mmap_state);
 	__pci_mmap_set_pgprot(dev, vma, mmap_state, write_combine);
 
 	ret = io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
diff --git a/drivers/char/mmtimer.c b/drivers/char/mmtimer.c
index 1b05fa688996..d65b3109318a 100644
--- a/drivers/char/mmtimer.c
+++ b/drivers/char/mmtimer.c
@@ -329,7 +329,6 @@ static int mmtimer_mmap(struct file *file, struct vm_area_struct *vma)
 	if (PAGE_SIZE > (1 << 16))
 		return -ENOSYS;
 
-	vma->vm_flags |= (VM_IO | VM_SHM | VM_LOCKED );
 	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
 
 	mmtimer_addr = __pa(RTC_COUNTER_ADDR);
diff --git a/drivers/sbus/char/flash.c b/drivers/sbus/char/flash.c
index 5ae684c011f8..31b8a5f6116f 100644
--- a/drivers/sbus/char/flash.c
+++ b/drivers/sbus/char/flash.c
@@ -71,7 +71,6 @@ flash_mmap(struct file *file, struct vm_area_struct *vma)
 	if (vma->vm_end - (vma->vm_start + (vma->vm_pgoff << PAGE_SHIFT)) > size)
 		size = vma->vm_end - (vma->vm_start + (vma->vm_pgoff << PAGE_SHIFT));
 
-	vma->vm_flags |= (VM_SHM | VM_LOCKED);
 	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
 
 	if (io_remap_pfn_range(vma, vma->vm_start, addr, size, vma->vm_page_prot))
diff --git a/drivers/sbus/char/vfc_dev.c b/drivers/sbus/char/vfc_dev.c
index dfdd6be551f3..ddcd330b9e89 100644
--- a/drivers/sbus/char/vfc_dev.c
+++ b/drivers/sbus/char/vfc_dev.c
@@ -623,7 +623,7 @@ static int vfc_mmap(struct file *file, struct vm_area_struct *vma)
 		map_size = sizeof(struct vfc_regs);
 
 	vma->vm_flags |=
-		(VM_SHM | VM_LOCKED | VM_IO | VM_MAYREAD | VM_MAYWRITE | VM_MAYSHARE);
+		(VM_MAYREAD | VM_MAYWRITE | VM_MAYSHARE);
 	map_offset = (unsigned int) (long)dev->phys_regs;
 	ret = io_remap_pfn_range(vma, vma->vm_start,
 				  MK_IOSPACE_PFN(dev->which_io,
diff --git a/drivers/video/igafb.c b/drivers/video/igafb.c
index 6b88050d21bf..8a0c2d3d3805 100644
--- a/drivers/video/igafb.c
+++ b/drivers/video/igafb.c
@@ -232,9 +232,6 @@ static int igafb_mmap(struct fb_info *info,
 
 	size = vma->vm_end - vma->vm_start;
 
-	/* To stop the swapper from even considering these pages. */
-	vma->vm_flags |= (VM_SHM | VM_LOCKED);
-
 	/* Each page, see which map applies */
 	for (page = 0; page < size; ) {
 		map_size = 0;
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 3b09444121d9..71c5d2f667ed 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -145,7 +145,6 @@ extern unsigned int kobjsize(const void *objp);
 
 #define VM_GROWSDOWN	0x00000100	/* general info on the segment */
 #define VM_GROWSUP	0x00000200
-#define VM_SHM		0x00000000	/* Means nothing: delete it later */
 #define VM_PFNMAP	0x00000400	/* Page-ranges managed without "struct page", just pure PFN */
 #define VM_DENYWRITE	0x00000800	/* ETXTBSY on write attempts.. */
 
-- 
cgit v1.2.3


From 7b2259b3e53f128c10a9fded0965e69d4a949847 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Sun, 25 Jun 2006 05:46:48 -0700
Subject: [PATCH] page migration: Support a vma migration function

Hooks for calling vma specific migration functions

With this patch a vma may define a vma->vm_ops->migrate function.  That
function may perform page migration on its own (some vmas may not contain page
structs and therefore cannot be handled by regular page migration.  Pages in a
vma may require special preparatory treatment before migration is possible
etc) .  Only mmap_sem is held when the migration function is called.  The
migrate() function gets passed two sets of nodemasks describing the source and
the target of the migration.  The flags parameter either contains

MPOL_MF_MOVE	which means that only pages used exclusively by
		the specified mm should be moved

or

MPOL_MF_MOVE_ALL which means that pages shared with other processes
		should also be moved.

The migration function returns 0 on success or an error condition.  An error
condition will prevent regular page migration from occurring.

On its own this patch cannot be included since there are no users for this
functionality.  But it seems that the uncached allocator will need this
functionality at some point.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: Andi Kleen <ak@muc.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/migrate.h | 11 ++++++++++-
 include/linux/mm.h      |  2 ++
 mm/mempolicy.c          |  6 +++++-
 mm/migrate.c            | 20 ++++++++++++++++++++
 4 files changed, 37 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 5dba23a1c0d0..48148e0cdbd1 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -16,7 +16,9 @@ extern int fail_migrate_page(struct address_space *,
 			struct page *, struct page *);
 
 extern int migrate_prep(void);
-
+extern int migrate_vmas(struct mm_struct *mm,
+		const nodemask_t *from, const nodemask_t *to,
+		unsigned long flags);
 #else
 
 static inline int isolate_lru_page(struct page *p, struct list_head *list)
@@ -30,6 +32,13 @@ static inline int migrate_pages_to(struct list_head *pagelist,
 
 static inline int migrate_prep(void) { return -ENOSYS; }
 
+static inline int migrate_vmas(struct mm_struct *mm,
+		const nodemask_t *from, const nodemask_t *to,
+		unsigned long flags)
+{
+	return -ENOSYS;
+}
+
 /* Possible settings for the migrate_page() method in address_operations */
 #define migrate_page NULL
 #define fail_migrate_page NULL
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 71c5d2f667ed..a929ea197e48 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -206,6 +206,8 @@ struct vm_operations_struct {
 	int (*set_policy)(struct vm_area_struct *vma, struct mempolicy *new);
 	struct mempolicy *(*get_policy)(struct vm_area_struct *vma,
 					unsigned long addr);
+	int (*migrate)(struct vm_area_struct *vma, const nodemask_t *from,
+		const nodemask_t *to, unsigned long flags);
 #endif
 };
 
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index ec4a1a950df9..73e0f23b7f51 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -632,6 +632,10 @@ int do_migrate_pages(struct mm_struct *mm,
 
   	down_read(&mm->mmap_sem);
 
+	err = migrate_vmas(mm, from_nodes, to_nodes, flags);
+	if (err)
+		goto out;
+
 /*
  * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
  * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
@@ -691,7 +695,7 @@ int do_migrate_pages(struct mm_struct *mm,
 		if (err < 0)
 			break;
 	}
-
+out:
 	up_read(&mm->mmap_sem);
 	if (err < 0)
 		return err;
diff --git a/mm/migrate.c b/mm/migrate.c
index 1c2a71aa05cd..0576c0535988 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -976,3 +976,23 @@ out2:
 }
 #endif
 
+/*
+ * Call migration functions in the vma_ops that may prepare
+ * memory in a vm for migration. migration functions may perform
+ * the migration for vmas that do not have an underlying page struct.
+ */
+int migrate_vmas(struct mm_struct *mm, const nodemask_t *to,
+	const nodemask_t *from, unsigned long flags)
+{
+ 	struct vm_area_struct *vma;
+ 	int err = 0;
+
+ 	for(vma = mm->mmap; vma->vm_next && !err; vma = vma->vm_next) {
+ 		if (vma->vm_ops && vma->vm_ops->migrate) {
+ 			err = vma->vm_ops->migrate(vma, to, from, flags);
+ 			if (err)
+ 				break;
+ 		}
+ 	}
+ 	return err;
+}
-- 
cgit v1.2.3


From 5474c120aafe78ca54bf272f7a01107c42da2b21 Mon Sep 17 00:00:00 2001
From: Michael Hanselmann <linux-kernel@hansmi.ch>
Date: Sun, 25 Jun 2006 05:47:08 -0700
Subject: [PATCH] Rewritten backlight infrastructure for portable Apple
 computers

This patch contains a total rewrite of the backlight infrastructure for
portable Apple computers.  Backward compatibility is retained.  A sysfs
interface allows userland to control the brightness with more steps than
before.  Userland is allowed to upload a brightness curve for different
monitors, similar to Mac OS X.

[akpm@osdl.org: add needed exports]
Signed-off-by: Michael Hanselmann <linux-kernel@hansmi.ch>
Acked-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Richard Purdie <rpurdie@rpsys.net>
Cc: "Antonino A. Daplas" <adaplas@pol.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/powerpc/kernel/traps.c                 |  15 +-
 arch/powerpc/platforms/powermac/backlight.c | 270 ++++++++++-------------
 arch/powerpc/xmon/xmon.c                    |   3 -
 drivers/macintosh/Kconfig                   |  19 +-
 drivers/macintosh/Makefile                  |   1 +
 drivers/macintosh/adbhid.c                  |  28 +--
 drivers/macintosh/via-pmu-backlight.c       | 150 +++++++++++++
 drivers/macintosh/via-pmu.c                 | 120 +++--------
 drivers/video/Kconfig                       |  56 +++++
 drivers/video/aty/Makefile                  |   1 +
 drivers/video/aty/aty128fb.c                | 322 +++++++++++++++++++---------
 drivers/video/aty/atyfb.h                   |   1 +
 drivers/video/aty/atyfb_base.c              | 178 ++++++++++++---
 drivers/video/aty/radeon_backlight.c        | 247 +++++++++++++++++++++
 drivers/video/aty/radeon_base.c             | 140 +-----------
 drivers/video/aty/radeonfb.h                |   9 +
 drivers/video/chipsfb.c                     |  30 ++-
 drivers/video/fbsysfs.c                     |  88 ++++++++
 drivers/video/nvidia/Makefile               |   3 +-
 drivers/video/nvidia/nv_backlight.c         | 175 +++++++++++++++
 drivers/video/nvidia/nv_proto.h             |  10 +
 drivers/video/nvidia/nvidia.c               |  95 ++------
 drivers/video/riva/fbdev.c                  | 222 +++++++++++++------
 include/asm-powerpc/backlight.h             |  30 +--
 include/linux/fb.h                          |  23 ++
 include/linux/pmu.h                         |   4 +
 26 files changed, 1529 insertions(+), 711 deletions(-)
 create mode 100644 drivers/macintosh/via-pmu-backlight.c
 create mode 100644 drivers/video/aty/radeon_backlight.c
 create mode 100644 drivers/video/nvidia/nv_backlight.c

(limited to 'include/linux')

diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index 91a6e04d9741..52f5659534f4 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -32,6 +32,7 @@
 #include <linux/delay.h>
 #include <linux/kprobes.h>
 #include <linux/kexec.h>
+#include <linux/backlight.h>
 
 #include <asm/kdebug.h>
 #include <asm/pgtable.h>
@@ -105,10 +106,18 @@ int die(const char *str, struct pt_regs *regs, long err)
 	spin_lock_irq(&die_lock);
 	bust_spinlocks(1);
 #ifdef CONFIG_PMAC_BACKLIGHT
-	if (machine_is(powermac)) {
-		set_backlight_enable(1);
-		set_backlight_level(BACKLIGHT_MAX);
+	mutex_lock(&pmac_backlight_mutex);
+	if (machine_is(powermac) && pmac_backlight) {
+		struct backlight_properties *props;
+
+		down(&pmac_backlight->sem);
+		props = pmac_backlight->props;
+		props->brightness = props->max_brightness;
+		props->power = FB_BLANK_UNBLANK;
+		props->update_status(pmac_backlight);
+		up(&pmac_backlight->sem);
 	}
+	mutex_unlock(&pmac_backlight_mutex);
 #endif
 	printk("Oops: %s, sig: %ld [#%d]\n", str, err, ++die_counter);
 #ifdef CONFIG_PREEMPT
diff --git a/arch/powerpc/platforms/powermac/backlight.c b/arch/powerpc/platforms/powermac/backlight.c
index 8be2f7d071f0..498b042e1837 100644
--- a/arch/powerpc/platforms/powermac/backlight.c
+++ b/arch/powerpc/platforms/powermac/backlight.c
@@ -3,200 +3,148 @@
  * Contains support for the backlight.
  *
  *   Copyright (C) 2000 Benjamin Herrenschmidt
+ *   Copyright (C) 2006 Michael Hanselmann <linux-kernel@hansmi.ch>
  *
  */
 
 #include <linux/config.h>
 #include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/stddef.h>
-#include <linux/reboot.h>
-#include <linux/nvram.h>
-#include <linux/console.h>
-#include <asm/sections.h>
-#include <asm/ptrace.h>
-#include <asm/io.h>
-#include <asm/pgtable.h>
-#include <asm/system.h>
+#include <linux/fb.h>
+#include <linux/backlight.h>
 #include <asm/prom.h>
-#include <asm/machdep.h>
-#include <asm/nvram.h>
 #include <asm/backlight.h>
 
-#include <linux/adb.h>
-#include <linux/pmu.h>
+#define OLD_BACKLIGHT_MAX 15
 
-static struct backlight_controller *backlighter;
-static void* backlighter_data;
-static int backlight_autosave;
-static int backlight_level = BACKLIGHT_MAX;
-static int backlight_enabled = 1;
-static int backlight_req_level = -1;
-static int backlight_req_enable = -1;
+/* Protect the pmac_backlight variable */
+DEFINE_MUTEX(pmac_backlight_mutex);
 
-static void backlight_callback(void *);
-static DECLARE_WORK(backlight_work, backlight_callback, NULL);
+/* Main backlight storage
+ *
+ * Backlight drivers in this variable are required to have the "props"
+ * attribute set and to have an update_status function.
+ *
+ * We can only store one backlight here, but since Apple laptops have only one
+ * internal display, it doesn't matter. Other backlight drivers can be used
+ * independently.
+ *
+ * Lock ordering:
+ * pmac_backlight_mutex (global, main backlight)
+ *   pmac_backlight->sem (backlight class)
+ */
+struct backlight_device *pmac_backlight;
 
-void register_backlight_controller(struct backlight_controller *ctrler,
-					  void *data, char *type)
+int pmac_has_backlight_type(const char *type)
 {
-	struct device_node* bk_node;
-	char *prop;
-	int valid = 0;
-
-	/* There's already a matching controller, bail out */
-	if (backlighter != NULL)
-		return;
-
-	bk_node = find_devices("backlight");
-
-#ifdef CONFIG_ADB_PMU
-	/* Special case for the old PowerBook since I can't test on it */
-	backlight_autosave = machine_is_compatible("AAPL,3400/2400")
-		|| machine_is_compatible("AAPL,3500");
-	if ((backlight_autosave
-	     || machine_is_compatible("AAPL,PowerBook1998")
-	     || machine_is_compatible("PowerBook1,1"))
-	    && !strcmp(type, "pmu"))
-		valid = 1;
-#endif
+	struct device_node* bk_node = find_devices("backlight");
+
 	if (bk_node) {
-		prop = get_property(bk_node, "backlight-control", NULL);
-		if (prop && !strncmp(prop, type, strlen(type)))
-			valid = 1;
-	}
-	if (!valid)
-		return;
-	backlighter = ctrler;
-	backlighter_data = data;
-
-	if (bk_node && !backlight_autosave)
-		prop = get_property(bk_node, "bklt", NULL);
-	else
-		prop = NULL;
-	if (prop) {
-		backlight_level = ((*prop)+1) >> 1;
-		if (backlight_level > BACKLIGHT_MAX)
-			backlight_level = BACKLIGHT_MAX;
+		char *prop = get_property(bk_node, "backlight-control", NULL);
+		if (prop && strncmp(prop, type, strlen(type)) == 0)
+			return 1;
 	}
 
-#ifdef CONFIG_ADB_PMU
-	if (backlight_autosave) {
-		struct adb_request req;
-		pmu_request(&req, NULL, 2, 0xd9, 0);
-		while (!req.complete)
-			pmu_poll();
-		backlight_level = req.reply[0] >> 4;
-	}
-#endif
-	acquire_console_sem();
-	if (!backlighter->set_enable(1, backlight_level, data))
-		backlight_enabled = 1;
-	release_console_sem();
-
-	printk(KERN_INFO "Registered \"%s\" backlight controller,"
-	       "level: %d/15\n", type, backlight_level);
+	return 0;
 }
-EXPORT_SYMBOL(register_backlight_controller);
 
-void unregister_backlight_controller(struct backlight_controller
-					    *ctrler, void *data)
+int pmac_backlight_curve_lookup(struct fb_info *info, int value)
 {
-	/* We keep the current backlight level (for now) */
-	if (ctrler == backlighter && data == backlighter_data)
-		backlighter = NULL;
+	int level = (FB_BACKLIGHT_LEVELS - 1);
+
+	if (info && info->bl_dev) {
+		int i, max = 0;
+
+		/* Look for biggest value */
+		for (i = 0; i < FB_BACKLIGHT_LEVELS; i++)
+			max = max((int)info->bl_curve[i], max);
+
+		/* Look for nearest value */
+		for (i = 0; i < FB_BACKLIGHT_LEVELS; i++) {
+			int diff = abs(info->bl_curve[i] - value);
+			if (diff < max) {
+				max = diff;
+				level = i;
+			}
+		}
+
+	}
+
+	return level;
 }
-EXPORT_SYMBOL(unregister_backlight_controller);
 
-static int __set_backlight_enable(int enable)
+static void pmac_backlight_key(int direction)
 {
-	int rc;
-
-	if (!backlighter)
-		return -ENODEV;
-	acquire_console_sem();
-	rc = backlighter->set_enable(enable, backlight_level,
-				     backlighter_data);
-	if (!rc)
-		backlight_enabled = enable;
-	release_console_sem();
-	return rc;
+	mutex_lock(&pmac_backlight_mutex);
+	if (pmac_backlight) {
+		struct backlight_properties *props;
+		int brightness;
+
+		down(&pmac_backlight->sem);
+		props = pmac_backlight->props;
+
+		brightness = props->brightness +
+			((direction?-1:1) * (props->max_brightness / 15));
+
+		if (brightness < 0)
+			brightness = 0;
+		else if (brightness > props->max_brightness)
+			brightness = props->max_brightness;
+
+		props->brightness = brightness;
+		props->update_status(pmac_backlight);
+
+		up(&pmac_backlight->sem);
+	}
+	mutex_unlock(&pmac_backlight_mutex);
 }
-int set_backlight_enable(int enable)
+
+void pmac_backlight_key_up()
 {
-	if (!backlighter)
-		return -ENODEV;
-	backlight_req_enable = enable;
-	schedule_work(&backlight_work);
-	return 0;
+	pmac_backlight_key(0);
 }
 
-EXPORT_SYMBOL(set_backlight_enable);
-
-int get_backlight_enable(void)
+void pmac_backlight_key_down()
 {
-	if (!backlighter)
-		return -ENODEV;
-	return backlight_enabled;
+	pmac_backlight_key(1);
 }
-EXPORT_SYMBOL(get_backlight_enable);
 
-static int __set_backlight_level(int level)
+int pmac_backlight_set_legacy_brightness(int brightness)
 {
-	int rc = 0;
-
-	if (!backlighter)
-		return -ENODEV;
-	if (level < BACKLIGHT_MIN)
-		level = BACKLIGHT_OFF;
-	if (level > BACKLIGHT_MAX)
-		level = BACKLIGHT_MAX;
-	acquire_console_sem();
-	if (backlight_enabled)
-		rc = backlighter->set_level(level, backlighter_data);
-	if (!rc)
-		backlight_level = level;
-	release_console_sem();
-	if (!rc && !backlight_autosave) {
-		level <<=1;
-		if (level & 0x10)
-			level |= 0x01;
-		// -- todo: save to property "bklt"
+	int error = -ENXIO;
+
+	mutex_lock(&pmac_backlight_mutex);
+	if (pmac_backlight) {
+		struct backlight_properties *props;
+
+		down(&pmac_backlight->sem);
+		props = pmac_backlight->props;
+		props->brightness = brightness *
+			props->max_brightness / OLD_BACKLIGHT_MAX;
+		props->update_status(pmac_backlight);
+		up(&pmac_backlight->sem);
+
+		error = 0;
 	}
-	return rc;
+	mutex_unlock(&pmac_backlight_mutex);
+
+	return error;
 }
-int set_backlight_level(int level)
+
+int pmac_backlight_get_legacy_brightness()
 {
-	if (!backlighter)
-		return -ENODEV;
-	backlight_req_level = level;
-	schedule_work(&backlight_work);
-	return 0;
-}
+	int result = -ENXIO;
 
-EXPORT_SYMBOL(set_backlight_level);
+	mutex_lock(&pmac_backlight_mutex);
+	if (pmac_backlight) {
+		struct backlight_properties *props;
 
-int get_backlight_level(void)
-{
-	if (!backlighter)
-		return -ENODEV;
-	return backlight_level;
-}
-EXPORT_SYMBOL(get_backlight_level);
+		down(&pmac_backlight->sem);
+		props = pmac_backlight->props;
+		result = props->brightness *
+			OLD_BACKLIGHT_MAX / props->max_brightness;
+		up(&pmac_backlight->sem);
+	}
+	mutex_unlock(&pmac_backlight_mutex);
 
-static void backlight_callback(void *dummy)
-{
-	int level, enable;
-
-	do {
-		level = backlight_req_level;
-		enable = backlight_req_enable;
-		mb();
-
-		if (level >= 0)
-			__set_backlight_level(level);
-		if (enable >= 0)
-			__set_backlight_enable(enable);
-	} while(cmpxchg(&backlight_req_level, level, -1) != level ||
-		cmpxchg(&backlight_req_enable, enable, -1) != enable);
+	return result;
 }
diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index 4735b41c113c..0741df8c41b7 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -26,9 +26,6 @@
 #include <asm/prom.h>
 #include <asm/machdep.h>
 #include <asm/xmon.h>
-#ifdef CONFIG_PMAC_BACKLIGHT
-#include <asm/backlight.h>
-#endif
 #include <asm/processor.h>
 #include <asm/pgtable.h>
 #include <asm/mmu.h>
diff --git a/drivers/macintosh/Kconfig b/drivers/macintosh/Kconfig
index ccf5df44cde4..37cd6ee4586b 100644
--- a/drivers/macintosh/Kconfig
+++ b/drivers/macintosh/Kconfig
@@ -99,17 +99,22 @@ config PMAC_MEDIABAY
 	  devices are not fully supported in the bay as I never had one to
 	  try with
 
-# made a separate option since backlight may end up beeing used
-# on non-powerbook machines (but only on PMU based ones AFAIK)
 config PMAC_BACKLIGHT
 	bool "Backlight control for LCD screens"
 	depends on ADB_PMU && (BROKEN || !PPC64)
 	help
-	  Say Y here to build in code to manage the LCD backlight on a
-	  Macintosh PowerBook.  With this code, the backlight will be turned
-	  on and off appropriately on power-management and lid-open/lid-closed
-	  events; also, the PowerBook button device will be enabled so you can
-	  change the screen brightness.
+	  Say Y here to enable Macintosh specific extensions of the generic
+	  backlight code. With this enabled, the brightness keys on older
+	  PowerBooks will be enabled so you can change the screen brightness.
+	  Newer models should use an userspace daemon like pbbuttonsd.
+
+config PMAC_BACKLIGHT_LEGACY
+	bool "Provide legacy ioctl's on /dev/pmu for the backlight"
+	depends on PMAC_BACKLIGHT && (BROKEN || !PPC64)
+	help
+	  Say Y if you want to enable legacy ioctl's on /dev/pmu. This is for
+	  programs which use this old interface. New and updated programs
+	  should use the backlight classes in sysfs.
 
 config ADB_MACIO
 	bool "Include MacIO (CHRP) ADB driver"
diff --git a/drivers/macintosh/Makefile b/drivers/macintosh/Makefile
index 6081acdea404..8972e53d2dcb 100644
--- a/drivers/macintosh/Makefile
+++ b/drivers/macintosh/Makefile
@@ -12,6 +12,7 @@ obj-$(CONFIG_INPUT_ADBHID)	+= adbhid.o
 obj-$(CONFIG_ANSLCD)		+= ans-lcd.o
 
 obj-$(CONFIG_ADB_PMU)		+= via-pmu.o
+obj-$(CONFIG_PMAC_BACKLIGHT)	+= via-pmu-backlight.o
 obj-$(CONFIG_ADB_CUDA)		+= via-cuda.o
 obj-$(CONFIG_PMAC_APM_EMU)	+= apm_emu.o
 obj-$(CONFIG_PMAC_SMU)		+= smu.o
diff --git a/drivers/macintosh/adbhid.c b/drivers/macintosh/adbhid.c
index 394334ec5765..c26e1236b275 100644
--- a/drivers/macintosh/adbhid.c
+++ b/drivers/macintosh/adbhid.c
@@ -503,9 +503,7 @@ adbhid_buttons_input(unsigned char *data, int nb, struct pt_regs *regs, int auto
 	case 0x1f: /* Powerbook button device */
 	  {
 		int down = (data[1] == (data[1] & 0xf));
-#ifdef CONFIG_PMAC_BACKLIGHT
-		int backlight = get_backlight_level();
-#endif
+
 		/*
 		 * XXX: Where is the contrast control for the passive?
 		 *  -- Cort
@@ -530,29 +528,17 @@ adbhid_buttons_input(unsigned char *data, int nb, struct pt_regs *regs, int auto
 
 		case 0xa:	/* brightness decrease */
 #ifdef CONFIG_PMAC_BACKLIGHT
-			if (!disable_kernel_backlight) {
-				if (down && backlight >= 0) {
-					if (backlight > BACKLIGHT_OFF)
-						set_backlight_level(backlight-1);
-					else
-						set_backlight_level(BACKLIGHT_OFF);
-				}
-			}
-#endif /* CONFIG_PMAC_BACKLIGHT */
+			if (!disable_kernel_backlight && down)
+				pmac_backlight_key_down();
+#endif
 			input_report_key(adbhid[id]->input, KEY_BRIGHTNESSDOWN, down);
 			break;
 
 		case 0x9:	/* brightness increase */
 #ifdef CONFIG_PMAC_BACKLIGHT
-			if (!disable_kernel_backlight) {
-				if (down && backlight >= 0) {
-					if (backlight < BACKLIGHT_MAX)
-						set_backlight_level(backlight+1);
-					else 
-						set_backlight_level(BACKLIGHT_MAX);
-				}
-			}
-#endif /* CONFIG_PMAC_BACKLIGHT */
+			if (!disable_kernel_backlight && down)
+				pmac_backlight_key_up();
+#endif
 			input_report_key(adbhid[id]->input, KEY_BRIGHTNESSUP, down);
 			break;
 
diff --git a/drivers/macintosh/via-pmu-backlight.c b/drivers/macintosh/via-pmu-backlight.c
new file mode 100644
index 000000000000..b42d05f2aaff
--- /dev/null
+++ b/drivers/macintosh/via-pmu-backlight.c
@@ -0,0 +1,150 @@
+/*
+ * Backlight code for via-pmu
+ *
+ * Copyright (C) 1998 Paul Mackerras and Fabio Riccardi.
+ * Copyright (C) 2001-2002 Benjamin Herrenschmidt
+ * Copyright (C) 2006      Michael Hanselmann <linux-kernel@hansmi.ch>
+ *
+ */
+
+#include <asm/ptrace.h>
+#include <linux/adb.h>
+#include <linux/pmu.h>
+#include <asm/backlight.h>
+#include <asm/prom.h>
+
+#define MAX_PMU_LEVEL 0xFF
+
+static struct device_node *vias;
+static struct backlight_properties pmu_backlight_data;
+
+static int pmu_backlight_get_level_brightness(struct fb_info *info,
+		int level)
+{
+	int pmulevel;
+
+	/* Get and convert the value */
+	mutex_lock(&info->bl_mutex);
+	pmulevel = info->bl_curve[level] * FB_BACKLIGHT_MAX / MAX_PMU_LEVEL;
+	mutex_unlock(&info->bl_mutex);
+
+	if (pmulevel < 0)
+		pmulevel = 0;
+	else if (pmulevel > MAX_PMU_LEVEL)
+		pmulevel = MAX_PMU_LEVEL;
+
+	return pmulevel;
+}
+
+static int pmu_backlight_update_status(struct backlight_device *bd)
+{
+	struct fb_info *info = class_get_devdata(&bd->class_dev);
+	struct adb_request req;
+	int pmulevel, level = bd->props->brightness;
+
+	if (vias == NULL)
+		return -ENODEV;
+
+	if (bd->props->power != FB_BLANK_UNBLANK ||
+	    bd->props->fb_blank != FB_BLANK_UNBLANK)
+		level = 0;
+
+	pmulevel = pmu_backlight_get_level_brightness(info, level);
+
+	pmu_request(&req, NULL, 2, PMU_BACKLIGHT_BRIGHT, pmulevel);
+	pmu_wait_complete(&req);
+
+	pmu_request(&req, NULL, 2, PMU_POWER_CTRL,
+		PMU_POW_BACKLIGHT | (level > 0 ? PMU_POW_ON : PMU_POW_OFF));
+	pmu_wait_complete(&req);
+
+	return 0;
+}
+
+static int pmu_backlight_get_brightness(struct backlight_device *bd)
+{
+	return bd->props->brightness;
+}
+
+static struct backlight_properties pmu_backlight_data = {
+	.owner		= THIS_MODULE,
+	.get_brightness	= pmu_backlight_get_brightness,
+	.update_status	= pmu_backlight_update_status,
+	.max_brightness	= (FB_BACKLIGHT_LEVELS - 1),
+};
+
+void __init pmu_backlight_init(struct device_node *in_vias)
+{
+	struct backlight_device *bd;
+	struct fb_info *info;
+	char name[10];
+	int level, autosave;
+
+	vias = in_vias;
+
+	/* Special case for the old PowerBook since I can't test on it */
+	autosave =
+		machine_is_compatible("AAPL,3400/2400") ||
+		machine_is_compatible("AAPL,3500");
+
+	if (!autosave &&
+	    !pmac_has_backlight_type("pmu") &&
+	    !machine_is_compatible("AAPL,PowerBook1998") &&
+	    !machine_is_compatible("PowerBook1,1"))
+		return;
+
+	/* Actually, this is a hack, but I don't know of a better way
+	 * to get the first framebuffer device.
+	 */
+	info = registered_fb[0];
+	if (!info) {
+		printk("pmubl: No framebuffer found\n");
+		goto error;
+	}
+
+	snprintf(name, sizeof(name), "pmubl%d", info->node);
+
+	bd = backlight_device_register(name, info, &pmu_backlight_data);
+	if (IS_ERR(bd)) {
+		printk("pmubl: Backlight registration failed\n");
+		goto error;
+	}
+
+	mutex_lock(&info->bl_mutex);
+	info->bl_dev = bd;
+	fb_bl_default_curve(info, 0x7F, 0x46, 0x0E);
+	mutex_unlock(&info->bl_mutex);
+
+	level = pmu_backlight_data.max_brightness;
+
+	if (autosave) {
+		/* read autosaved value if available */
+		struct adb_request req;
+		pmu_request(&req, NULL, 2, 0xd9, 0);
+		pmu_wait_complete(&req);
+
+		mutex_lock(&info->bl_mutex);
+		level = pmac_backlight_curve_lookup(info,
+				(req.reply[0] >> 4) *
+				pmu_backlight_data.max_brightness / 15);
+		mutex_unlock(&info->bl_mutex);
+	}
+
+	up(&bd->sem);
+	bd->props->brightness = level;
+	bd->props->power = FB_BLANK_UNBLANK;
+	bd->props->update_status(bd);
+	down(&bd->sem);
+
+	mutex_lock(&pmac_backlight_mutex);
+	if (!pmac_backlight)
+		pmac_backlight = bd;
+	mutex_unlock(&pmac_backlight_mutex);
+
+	printk("pmubl: Backlight initialized (%s)\n", name);
+
+	return;
+
+error:
+	return;
+}
diff --git a/drivers/macintosh/via-pmu.c b/drivers/macintosh/via-pmu.c
index c63d4e7984be..2a355ae59562 100644
--- a/drivers/macintosh/via-pmu.c
+++ b/drivers/macintosh/via-pmu.c
@@ -144,7 +144,6 @@ static int data_index;
 static int data_len;
 static volatile int adb_int_pending;
 static volatile int disable_poll;
-static struct adb_request bright_req_1, bright_req_2;
 static struct device_node *vias;
 static int pmu_kind = PMU_UNKNOWN;
 static int pmu_fully_inited = 0;
@@ -161,7 +160,7 @@ static int drop_interrupts;
 #if defined(CONFIG_PM) && defined(CONFIG_PPC32)
 static int option_lid_wakeup = 1;
 #endif /* CONFIG_PM && CONFIG_PPC32 */
-#if (defined(CONFIG_PM)&&defined(CONFIG_PPC32))||defined(CONFIG_PMAC_BACKLIGHT)
+#if (defined(CONFIG_PM)&&defined(CONFIG_PPC32))||defined(CONFIG_PMAC_BACKLIGHT_LEGACY)
 static int sleep_in_progress;
 #endif
 static unsigned long async_req_locks;
@@ -208,10 +207,6 @@ static int proc_get_info(char *page, char **start, off_t off,
 			  int count, int *eof, void *data);
 static int proc_get_irqstats(char *page, char **start, off_t off,
 			  int count, int *eof, void *data);
-#ifdef CONFIG_PMAC_BACKLIGHT
-static int pmu_set_backlight_level(int level, void* data);
-static int pmu_set_backlight_enable(int on, int level, void* data);
-#endif /* CONFIG_PMAC_BACKLIGHT */
 static void pmu_pass_intr(unsigned char *data, int len);
 static int proc_get_batt(char *page, char **start, off_t off,
 			int count, int *eof, void *data);
@@ -292,13 +287,6 @@ static char *pbook_type[] = {
 	"Core99"
 };
 
-#ifdef CONFIG_PMAC_BACKLIGHT
-static struct backlight_controller pmu_backlight_controller = {
-	pmu_set_backlight_enable,
-	pmu_set_backlight_level
-};
-#endif /* CONFIG_PMAC_BACKLIGHT */
-
 int __init find_via_pmu(void)
 {
 	u64 taddr;
@@ -417,8 +405,6 @@ static int __init via_pmu_start(void)
 	if (vias == NULL)
 		return -ENODEV;
 
-	bright_req_1.complete = 1;
-	bright_req_2.complete = 1;
 	batt_req.complete = 1;
 
 #ifndef CONFIG_PPC_MERGE
@@ -483,9 +469,9 @@ static int __init via_pmu_dev_init(void)
 		return -ENODEV;
 
 #ifdef CONFIG_PMAC_BACKLIGHT
-	/* Enable backlight */
-	register_backlight_controller(&pmu_backlight_controller, NULL, "pmu");
-#endif /* CONFIG_PMAC_BACKLIGHT */
+	/* Initialize backlight */
+	pmu_backlight_init(vias);
+#endif
 
 #ifdef CONFIG_PPC32
   	if (machine_is_compatible("AAPL,3400/2400") ||
@@ -1424,7 +1410,7 @@ next:
 #ifdef CONFIG_INPUT_ADBHID
 			if (!disable_kernel_backlight)
 #endif /* CONFIG_INPUT_ADBHID */
-				set_backlight_level(data[1] >> 4);
+				pmac_backlight_set_legacy_brightness(data[1] >> 4);
 #endif /* CONFIG_PMAC_BACKLIGHT */
 	}
 	/* Tick interrupt */
@@ -1674,61 +1660,6 @@ gpio1_interrupt(int irq, void *arg, struct pt_regs *regs)
 	return IRQ_NONE;
 }
 
-#ifdef CONFIG_PMAC_BACKLIGHT
-static int backlight_to_bright[] = {
-	0x7f, 0x46, 0x42, 0x3e, 0x3a, 0x36, 0x32, 0x2e,
-	0x2a, 0x26, 0x22, 0x1e, 0x1a, 0x16, 0x12, 0x0e
-};
- 
-static int
-pmu_set_backlight_enable(int on, int level, void* data)
-{
-	struct adb_request req;
-	
-	if (vias == NULL)
-		return -ENODEV;
-
-	if (on) {
-		pmu_request(&req, NULL, 2, PMU_BACKLIGHT_BRIGHT,
-			    backlight_to_bright[level]);
-		pmu_wait_complete(&req);
-	}
-	pmu_request(&req, NULL, 2, PMU_POWER_CTRL,
-		    PMU_POW_BACKLIGHT | (on ? PMU_POW_ON : PMU_POW_OFF));
-       	pmu_wait_complete(&req);
-
-	return 0;
-}
-
-static void
-pmu_bright_complete(struct adb_request *req)
-{
-	if (req == &bright_req_1)
-		clear_bit(1, &async_req_locks);
-	if (req == &bright_req_2)
-		clear_bit(2, &async_req_locks);
-}
-
-static int
-pmu_set_backlight_level(int level, void* data)
-{
-	if (vias == NULL)
-		return -ENODEV;
-
-	if (test_and_set_bit(1, &async_req_locks))
-		return -EAGAIN;
-	pmu_request(&bright_req_1, pmu_bright_complete, 2, PMU_BACKLIGHT_BRIGHT,
-		backlight_to_bright[level]);
-	if (test_and_set_bit(2, &async_req_locks))
-		return -EAGAIN;
-	pmu_request(&bright_req_2, pmu_bright_complete, 2, PMU_POWER_CTRL,
-		    PMU_POW_BACKLIGHT | (level > BACKLIGHT_OFF ?
-					 PMU_POW_ON : PMU_POW_OFF));
-
-	return 0;
-}
-#endif /* CONFIG_PMAC_BACKLIGHT */
-
 void
 pmu_enable_irled(int on)
 {
@@ -2145,9 +2076,8 @@ pmac_suspend_devices(void)
 		return -EBUSY;
 	}
 
-	/* Wait for completion of async backlight requests */
-	while (!bright_req_1.complete || !bright_req_2.complete ||
-			!batt_req.complete)
+	/* Wait for completion of async requests */
+	while (!batt_req.complete)
 		pmu_poll();
 
 	/* Giveup the lazy FPU & vec so we don't have to back them
@@ -2678,26 +2608,34 @@ pmu_ioctl(struct inode * inode, struct file *filp,
 			return put_user(1, argp);
 #endif /* CONFIG_PM && CONFIG_PPC32 */
 
-#ifdef CONFIG_PMAC_BACKLIGHT
-	/* Backlight should have its own device or go via
-	 * the fbdev
-	 */
+#ifdef CONFIG_PMAC_BACKLIGHT_LEGACY
+	/* Compatibility ioctl's for backlight */
 	case PMU_IOC_GET_BACKLIGHT:
+	{
+		int brightness;
+
 		if (sleep_in_progress)
 			return -EBUSY;
-		error = get_backlight_level();
-		if (error < 0)
-			return error;
-		return put_user(error, argp);
+
+		brightness = pmac_backlight_get_legacy_brightness();
+		if (brightness < 0)
+			return brightness;
+		else
+			return put_user(brightness, argp);
+
+	}
 	case PMU_IOC_SET_BACKLIGHT:
 	{
-		__u32 value;
+		int brightness;
+
 		if (sleep_in_progress)
 			return -EBUSY;
-		error = get_user(value, argp);
-		if (!error)
-			error = set_backlight_level(value);
-		break;
+
+		error = get_user(brightness, argp);
+		if (error)
+			return error;
+
+		return pmac_backlight_set_legacy_brightness(brightness);
 	}
 #ifdef CONFIG_INPUT_ADBHID
 	case PMU_IOC_GRAB_BACKLIGHT: {
@@ -2713,7 +2651,7 @@ pmu_ioctl(struct inode * inode, struct file *filp,
 		return 0;
 	}
 #endif /* CONFIG_INPUT_ADBHID */
-#endif /* CONFIG_PMAC_BACKLIGHT */
+#endif /* CONFIG_PMAC_BACKLIGHT_LEGACY */
 	case PMU_IOC_GET_MODEL:
 	    	return put_user(pmu_kind, argp);
 	case PMU_IOC_HAS_ADB:
diff --git a/drivers/video/Kconfig b/drivers/video/Kconfig
index 5a2840aeb547..168ede7902bd 100644
--- a/drivers/video/Kconfig
+++ b/drivers/video/Kconfig
@@ -86,6 +86,11 @@ config FB_FIRMWARE_EDID
 	 combination with certain motherboards and monitors are known to
 	 suffer from this problem.
 
+config FB_BACKLIGHT
+       bool
+       depends on FB
+       default n
+
 config FB_MODE_HELPERS
         bool "Enable Video Mode Handling Helpers"
         depends on FB
@@ -717,6 +722,16 @@ config FB_NVIDIA_I2C
 	  independently validate video mode parameters, you should say Y
 	  here.
 
+config FB_NVIDIA_BACKLIGHT
+	bool "Support for backlight control"
+	depends on FB_NVIDIA && PPC_PMAC
+	select FB_BACKLIGHT
+	select BACKLIGHT_LCD_SUPPORT
+	select BACKLIGHT_CLASS_DEVICE
+	default y
+	help
+	  Say Y here if you want to control the backlight of your display.
+
 config FB_RIVA
 	tristate "nVidia Riva support"
 	depends on FB && PCI
@@ -755,6 +770,16 @@ config FB_RIVA_DEBUG
 	  of debugging informations to provide to the maintainer when
 	  something goes wrong.
 
+config FB_RIVA_BACKLIGHT
+	bool "Support for backlight control"
+	depends on FB_RIVA && PPC_PMAC
+	select FB_BACKLIGHT
+	select BACKLIGHT_LCD_SUPPORT
+	select BACKLIGHT_CLASS_DEVICE
+	default y
+	help
+	  Say Y here if you want to control the backlight of your display.
+
 config FB_I810
 	tristate "Intel 810/815 support (EXPERIMENTAL)"
 	depends on FB && EXPERIMENTAL && PCI && X86_32
@@ -993,6 +1018,7 @@ config FB_RADEON
 
 	  There is a product page at
 	  http://apps.ati.com/ATIcompare/
+
 config FB_RADEON_I2C
 	bool "DDC/I2C for ATI Radeon support"
 	depends on FB_RADEON
@@ -1000,6 +1026,16 @@ config FB_RADEON_I2C
 	help
 	  Say Y here if you want DDC/I2C support for your Radeon board. 
 
+config FB_RADEON_BACKLIGHT
+	bool "Support for backlight control"
+	depends on FB_RADEON && PPC_PMAC
+	select FB_BACKLIGHT
+	select BACKLIGHT_LCD_SUPPORT
+	select BACKLIGHT_CLASS_DEVICE
+	default y
+	help
+	  Say Y here if you want to control the backlight of your display.
+
 config FB_RADEON_DEBUG
 	bool "Lots of debug output from Radeon driver"
 	depends on FB_RADEON
@@ -1024,6 +1060,16 @@ config FB_ATY128
 	  To compile this driver as a module, choose M here: the
 	  module will be called aty128fb.
 
+config FB_ATY128_BACKLIGHT
+	bool "Support for backlight control"
+	depends on FB_ATY128 && PPC_PMAC
+	select FB_BACKLIGHT
+	select BACKLIGHT_LCD_SUPPORT
+	select BACKLIGHT_CLASS_DEVICE
+	default y
+	help
+	  Say Y here if you want to control the backlight of your display.
+
 config FB_ATY
 	tristate "ATI Mach64 display support" if PCI || ATARI
 	depends on FB && !SPARC32
@@ -1066,6 +1112,16 @@ config FB_ATY_GX
 	  is at
 	  <http://support.ati.com/products/pc/mach64/graphics_xpression.html>.
 
+config FB_ATY_BACKLIGHT
+	bool "Support for backlight control"
+	depends on FB_ATY && PPC_PMAC
+	select FB_BACKLIGHT
+	select BACKLIGHT_LCD_SUPPORT
+	select BACKLIGHT_CLASS_DEVICE
+	default y
+	help
+	  Say Y here if you want to control the backlight of your display.
+
 config FB_S3TRIO
 	bool "S3 Trio display support"
 	depends on (FB = y) && PPC && BROKEN
diff --git a/drivers/video/aty/Makefile b/drivers/video/aty/Makefile
index 18521397a6e3..a6cc0e9ec790 100644
--- a/drivers/video/aty/Makefile
+++ b/drivers/video/aty/Makefile
@@ -10,5 +10,6 @@ atyfb-objs			:= $(atyfb-y)
 
 radeonfb-y			:= radeon_base.o radeon_pm.o radeon_monitor.o radeon_accel.o
 radeonfb-$(CONFIG_FB_RADEON_I2C)	+= radeon_i2c.o
+radeonfb-$(CONFIG_FB_RADEON_BACKLIGHT)	+= radeon_backlight.o
 radeonfb-objs			:= $(radeonfb-y)
 
diff --git a/drivers/video/aty/aty128fb.c b/drivers/video/aty/aty128fb.c
index f7bbff4ddc6a..db878fd55fb2 100644
--- a/drivers/video/aty/aty128fb.c
+++ b/drivers/video/aty/aty128fb.c
@@ -64,6 +64,7 @@
 #include <linux/pci.h>
 #include <linux/ioport.h>
 #include <linux/console.h>
+#include <linux/backlight.h>
 #include <asm/io.h>
 
 #ifdef CONFIG_PPC_PMAC
@@ -480,16 +481,6 @@ static struct fb_ops aty128fb_ops = {
 	.fb_imageblit	= cfb_imageblit,
 };
 
-#ifdef CONFIG_PMAC_BACKLIGHT
-static int aty128_set_backlight_enable(int on, int level, void* data);
-static int aty128_set_backlight_level(int level, void* data);
-
-static struct backlight_controller aty128_backlight_controller = {
-	aty128_set_backlight_enable,
-	aty128_set_backlight_level
-};
-#endif /* CONFIG_PMAC_BACKLIGHT */
-
     /*
      * Functions to read from/write to the mmio registers
      *	- endian conversions may possibly be avoided by
@@ -1258,19 +1249,35 @@ static void aty128_set_crt_enable(struct aty128fb_par *par, int on)
 static void aty128_set_lcd_enable(struct aty128fb_par *par, int on)
 {
 	u32 reg;
+#ifdef CONFIG_FB_ATY128_BACKLIGHT
+	struct fb_info *info = pci_get_drvdata(par->pdev);
+#endif
 
 	if (on) {
 		reg = aty_ld_le32(LVDS_GEN_CNTL);
 		reg |= LVDS_ON | LVDS_EN | LVDS_BLON | LVDS_DIGION;
 		reg &= ~LVDS_DISPLAY_DIS;
 		aty_st_le32(LVDS_GEN_CNTL, reg);
-#ifdef CONFIG_PMAC_BACKLIGHT
-		aty128_set_backlight_enable(get_backlight_enable(),
-					    get_backlight_level(), par);
+#ifdef CONFIG_FB_ATY128_BACKLIGHT
+		mutex_lock(&info->bl_mutex);
+		if (info->bl_dev) {
+			down(&info->bl_dev->sem);
+			info->bl_dev->props->update_status(info->bl_dev);
+			up(&info->bl_dev->sem);
+		}
+		mutex_unlock(&info->bl_mutex);
 #endif	
 	} else {
-#ifdef CONFIG_PMAC_BACKLIGHT
-		aty128_set_backlight_enable(0, 0, par);
+#ifdef CONFIG_FB_ATY128_BACKLIGHT
+		mutex_lock(&info->bl_mutex);
+		if (info->bl_dev) {
+			down(&info->bl_dev->sem);
+			info->bl_dev->props->brightness = 0;
+			info->bl_dev->props->power = FB_BLANK_POWERDOWN;
+			info->bl_dev->props->update_status(info->bl_dev);
+			up(&info->bl_dev->sem);
+		}
+		mutex_unlock(&info->bl_mutex);
 #endif	
 		reg = aty_ld_le32(LVDS_GEN_CNTL);
 		reg |= LVDS_DISPLAY_DIS;
@@ -1691,6 +1698,184 @@ static int __init aty128fb_setup(char *options)
 }
 #endif  /*  MODULE  */
 
+/* Backlight */
+#ifdef CONFIG_FB_ATY128_BACKLIGHT
+#define MAX_LEVEL 0xFF
+
+static struct backlight_properties aty128_bl_data;
+
+static int aty128_bl_get_level_brightness(struct aty128fb_par *par,
+		int level)
+{
+	struct fb_info *info = pci_get_drvdata(par->pdev);
+	int atylevel;
+
+	/* Get and convert the value */
+	mutex_lock(&info->bl_mutex);
+	atylevel = MAX_LEVEL -
+		(info->bl_curve[level] * FB_BACKLIGHT_MAX / MAX_LEVEL);
+	mutex_unlock(&info->bl_mutex);
+
+	if (atylevel < 0)
+		atylevel = 0;
+	else if (atylevel > MAX_LEVEL)
+		atylevel = MAX_LEVEL;
+
+	return atylevel;
+}
+
+/* We turn off the LCD completely instead of just dimming the backlight.
+ * This provides greater power saving and the display is useless without
+ * backlight anyway
+ */
+#define BACKLIGHT_LVDS_OFF
+/* That one prevents proper CRT output with LCD off */
+#undef BACKLIGHT_DAC_OFF
+
+static int aty128_bl_update_status(struct backlight_device *bd)
+{
+	struct aty128fb_par *par = class_get_devdata(&bd->class_dev);
+	unsigned int reg = aty_ld_le32(LVDS_GEN_CNTL);
+	int level;
+
+	if (bd->props->power != FB_BLANK_UNBLANK ||
+	    bd->props->fb_blank != FB_BLANK_UNBLANK ||
+	    !par->lcd_on)
+		level = 0;
+	else
+		level = bd->props->brightness;
+
+	reg |= LVDS_BL_MOD_EN | LVDS_BLON;
+	if (level > 0) {
+		reg |= LVDS_DIGION;
+		if (!(reg & LVDS_ON)) {
+			reg &= ~LVDS_BLON;
+			aty_st_le32(LVDS_GEN_CNTL, reg);
+			aty_ld_le32(LVDS_GEN_CNTL);
+			mdelay(10);
+			reg |= LVDS_BLON;
+			aty_st_le32(LVDS_GEN_CNTL, reg);
+		}
+		reg &= ~LVDS_BL_MOD_LEVEL_MASK;
+		reg |= (aty128_bl_get_level_brightness(par, level) << LVDS_BL_MOD_LEVEL_SHIFT);
+#ifdef BACKLIGHT_LVDS_OFF
+		reg |= LVDS_ON | LVDS_EN;
+		reg &= ~LVDS_DISPLAY_DIS;
+#endif
+		aty_st_le32(LVDS_GEN_CNTL, reg);
+#ifdef BACKLIGHT_DAC_OFF
+		aty_st_le32(DAC_CNTL, aty_ld_le32(DAC_CNTL) & (~DAC_PDWN));
+#endif
+	} else {
+		reg &= ~LVDS_BL_MOD_LEVEL_MASK;
+		reg |= (aty128_bl_get_level_brightness(par, 0) << LVDS_BL_MOD_LEVEL_SHIFT);
+#ifdef BACKLIGHT_LVDS_OFF
+		reg |= LVDS_DISPLAY_DIS;
+		aty_st_le32(LVDS_GEN_CNTL, reg);
+		aty_ld_le32(LVDS_GEN_CNTL);
+		udelay(10);
+		reg &= ~(LVDS_ON | LVDS_EN | LVDS_BLON | LVDS_DIGION);
+#endif
+		aty_st_le32(LVDS_GEN_CNTL, reg);
+#ifdef BACKLIGHT_DAC_OFF
+		aty_st_le32(DAC_CNTL, aty_ld_le32(DAC_CNTL) | DAC_PDWN);
+#endif
+	}
+
+	return 0;
+}
+
+static int aty128_bl_get_brightness(struct backlight_device *bd)
+{
+	return bd->props->brightness;
+}
+
+static struct backlight_properties aty128_bl_data = {
+	.owner		= THIS_MODULE,
+	.get_brightness	= aty128_bl_get_brightness,
+	.update_status	= aty128_bl_update_status,
+	.max_brightness	= (FB_BACKLIGHT_LEVELS - 1),
+};
+
+static void aty128_bl_init(struct aty128fb_par *par)
+{
+	struct fb_info *info = pci_get_drvdata(par->pdev);
+	struct backlight_device *bd;
+	char name[12];
+
+	/* Could be extended to Rage128Pro LVDS output too */
+	if (par->chip_gen != rage_M3)
+		return;
+
+#ifdef CONFIG_PMAC_BACKLIGHT
+	if (!pmac_has_backlight_type("ati"))
+		return;
+#endif
+
+	snprintf(name, sizeof(name), "aty128bl%d", info->node);
+
+	bd = backlight_device_register(name, par, &aty128_bl_data);
+	if (IS_ERR(bd)) {
+		info->bl_dev = NULL;
+		printk("aty128: Backlight registration failed\n");
+		goto error;
+	}
+
+	mutex_lock(&info->bl_mutex);
+	info->bl_dev = bd;
+	fb_bl_default_curve(info, 0,
+		 63 * FB_BACKLIGHT_MAX / MAX_LEVEL,
+		219 * FB_BACKLIGHT_MAX / MAX_LEVEL);
+	mutex_unlock(&info->bl_mutex);
+
+	up(&bd->sem);
+	bd->props->brightness = aty128_bl_data.max_brightness;
+	bd->props->power = FB_BLANK_UNBLANK;
+	bd->props->update_status(bd);
+	down(&bd->sem);
+
+#ifdef CONFIG_PMAC_BACKLIGHT
+	mutex_lock(&pmac_backlight_mutex);
+	if (!pmac_backlight)
+		pmac_backlight = bd;
+	mutex_unlock(&pmac_backlight_mutex);
+#endif
+
+	printk("aty128: Backlight initialized (%s)\n", name);
+
+	return;
+
+error:
+	return;
+}
+
+static void aty128_bl_exit(struct aty128fb_par *par)
+{
+	struct fb_info *info = pci_get_drvdata(par->pdev);
+
+#ifdef CONFIG_PMAC_BACKLIGHT
+	mutex_lock(&pmac_backlight_mutex);
+#endif
+
+	mutex_lock(&info->bl_mutex);
+	if (info->bl_dev) {
+#ifdef CONFIG_PMAC_BACKLIGHT
+		if (pmac_backlight == info->bl_dev)
+			pmac_backlight = NULL;
+#endif
+
+		backlight_device_unregister(info->bl_dev);
+		info->bl_dev = NULL;
+
+		printk("aty128: Backlight unloaded\n");
+	}
+	mutex_unlock(&info->bl_mutex);
+
+#ifdef CONFIG_PMAC_BACKLIGHT
+	mutex_unlock(&pmac_backlight_mutex);
+#endif
+}
+#endif /* CONFIG_FB_ATY128_BACKLIGHT */
 
 /*
  *  Initialisation
@@ -1835,17 +2020,15 @@ static int __init aty128_init(struct pci_dev *pdev, const struct pci_device_id *
 	if (register_framebuffer(info) < 0)
 		return 0;
 
-#ifdef CONFIG_PMAC_BACKLIGHT
-	/* Could be extended to Rage128Pro LVDS output too */
-	if (par->chip_gen == rage_M3)
-		register_backlight_controller(&aty128_backlight_controller, par, "ati");
-#endif /* CONFIG_PMAC_BACKLIGHT */
-
 	par->pm_reg = pci_find_capability(pdev, PCI_CAP_ID_PM);
 	par->pdev = pdev;
 	par->asleep = 0;
 	par->lock_blank = 0;
-	
+
+#ifdef CONFIG_FB_ATY128_BACKLIGHT
+	aty128_bl_init(par);
+#endif
+
 	printk(KERN_INFO "fb%d: %s frame buffer device on %s\n",
 	       info->node, info->fix.id, video_card);
 
@@ -1981,6 +2164,10 @@ static void __devexit aty128_remove(struct pci_dev *pdev)
 
 	par = info->par;
 
+#ifdef CONFIG_FB_ATY128_BACKLIGHT
+	aty128_bl_exit(par);
+#endif
+
 	unregister_framebuffer(info);
 #ifdef CONFIG_MTRR
 	if (par->mtrr.vram_valid)
@@ -2011,10 +2198,14 @@ static int aty128fb_blank(int blank, struct fb_info *fb)
 	if (par->lock_blank || par->asleep)
 		return 0;
 
-#ifdef CONFIG_PMAC_BACKLIGHT
-	if (machine_is(powermac) && blank)
-		set_backlight_enable(0);
-#endif /* CONFIG_PMAC_BACKLIGHT */
+#ifdef CONFIG_FB_ATY128_BACKLIGHT
+	if (machine_is(powermac) && blank) {
+		down(&fb->bl_dev->sem);
+		fb->bl_dev->props->power = FB_BLANK_POWERDOWN;
+		fb->bl_dev->props->update_status(fb->bl_dev);
+		up(&fb->bl_dev->sem);
+	}
+#endif
 
 	if (blank & FB_BLANK_VSYNC_SUSPEND)
 		state |= 2;
@@ -2029,10 +2220,14 @@ static int aty128fb_blank(int blank, struct fb_info *fb)
 		aty128_set_crt_enable(par, par->crt_on && !blank);
 		aty128_set_lcd_enable(par, par->lcd_on && !blank);
 	}
-#ifdef CONFIG_PMAC_BACKLIGHT
-	if (machine_is(powermac) && !blank)
-		set_backlight_enable(1);
-#endif /* CONFIG_PMAC_BACKLIGHT */
+#ifdef CONFIG_FB_ATY128_BACKLIGHT
+	if (machine_is(powermac) && !blank) {
+		down(&fb->bl_dev->sem);
+		fb->bl_dev->props->power = FB_BLANK_UNBLANK;
+		fb->bl_dev->props->update_status(fb->bl_dev);
+		up(&fb->bl_dev->sem);
+	}
+#endif
 	return 0;
 }
 
@@ -2138,73 +2333,6 @@ static int aty128fb_ioctl(struct fb_info *info, u_int cmd, u_long arg)
 	return -EINVAL;
 }
 
-#ifdef CONFIG_PMAC_BACKLIGHT
-static int backlight_conv[] = {
-	0xff, 0xc0, 0xb5, 0xaa, 0x9f, 0x94, 0x89, 0x7e,
-	0x73, 0x68, 0x5d, 0x52, 0x47, 0x3c, 0x31, 0x24
-};
-
-/* We turn off the LCD completely instead of just dimming the backlight.
- * This provides greater power saving and the display is useless without
- * backlight anyway
- */
-#define BACKLIGHT_LVDS_OFF
-/* That one prevents proper CRT output with LCD off */
-#undef BACKLIGHT_DAC_OFF
-
-static int aty128_set_backlight_enable(int on, int level, void *data)
-{
-	struct aty128fb_par *par = data;
-	unsigned int reg = aty_ld_le32(LVDS_GEN_CNTL);
-
-	if (!par->lcd_on)
-		on = 0;
-	reg |= LVDS_BL_MOD_EN | LVDS_BLON;
-	if (on && level > BACKLIGHT_OFF) {
-		reg |= LVDS_DIGION;
-		if (!(reg & LVDS_ON)) {
-			reg &= ~LVDS_BLON;
-			aty_st_le32(LVDS_GEN_CNTL, reg);
-			(void)aty_ld_le32(LVDS_GEN_CNTL);
-			mdelay(10);
-			reg |= LVDS_BLON;
-			aty_st_le32(LVDS_GEN_CNTL, reg);
-		}
-		reg &= ~LVDS_BL_MOD_LEVEL_MASK;
-		reg |= (backlight_conv[level] << LVDS_BL_MOD_LEVEL_SHIFT);
-#ifdef BACKLIGHT_LVDS_OFF
-		reg |= LVDS_ON | LVDS_EN;
-		reg &= ~LVDS_DISPLAY_DIS;
-#endif
-		aty_st_le32(LVDS_GEN_CNTL, reg);
-#ifdef BACKLIGHT_DAC_OFF
-		aty_st_le32(DAC_CNTL, aty_ld_le32(DAC_CNTL) & (~DAC_PDWN));
-#endif		
-	} else {
-		reg &= ~LVDS_BL_MOD_LEVEL_MASK;
-		reg |= (backlight_conv[0] << LVDS_BL_MOD_LEVEL_SHIFT);
-#ifdef BACKLIGHT_LVDS_OFF
-		reg |= LVDS_DISPLAY_DIS;
-		aty_st_le32(LVDS_GEN_CNTL, reg);
-		(void)aty_ld_le32(LVDS_GEN_CNTL);
-		udelay(10);
-		reg &= ~(LVDS_ON | LVDS_EN | LVDS_BLON | LVDS_DIGION);
-#endif		
-		aty_st_le32(LVDS_GEN_CNTL, reg);
-#ifdef BACKLIGHT_DAC_OFF
-		aty_st_le32(DAC_CNTL, aty_ld_le32(DAC_CNTL) | DAC_PDWN);
-#endif		
-	}
-
-	return 0;
-}
-
-static int aty128_set_backlight_level(int level, void* data)
-{
-	return aty128_set_backlight_enable(1, level, data);
-}
-#endif /* CONFIG_PMAC_BACKLIGHT */
-
 #if 0
     /*
      *  Accelerated functions
diff --git a/drivers/video/aty/atyfb.h b/drivers/video/aty/atyfb.h
index e9b7a64c1ac4..43d2cb58af87 100644
--- a/drivers/video/aty/atyfb.h
+++ b/drivers/video/aty/atyfb.h
@@ -151,6 +151,7 @@ struct atyfb_par {
 	int lock_blank;
 	unsigned long res_start;
 	unsigned long res_size;
+	struct pci_dev *pdev;
 #ifdef __sparc__
 	struct pci_mmap_map *mmap_map;
 	u8 mmaped;
diff --git a/drivers/video/aty/atyfb_base.c b/drivers/video/aty/atyfb_base.c
index c054bb28b1c4..c5185f7cf4ba 100644
--- a/drivers/video/aty/atyfb_base.c
+++ b/drivers/video/aty/atyfb_base.c
@@ -66,6 +66,7 @@
 #include <linux/interrupt.h>
 #include <linux/spinlock.h>
 #include <linux/wait.h>
+#include <linux/backlight.h>
 
 #include <asm/io.h>
 #include <asm/uaccess.h>
@@ -2115,45 +2116,142 @@ static int atyfb_pci_resume(struct pci_dev *pdev)
 
 #endif /*  defined(CONFIG_PM) && defined(CONFIG_PCI) */
 
-#ifdef CONFIG_PMAC_BACKLIGHT
+/* Backlight */
+#ifdef CONFIG_FB_ATY_BACKLIGHT
+#define MAX_LEVEL 0xFF
 
-    /*
-     *   LCD backlight control
-     */
+static struct backlight_properties aty_bl_data;
 
-static int backlight_conv[] = {
-	0x00, 0x3f, 0x4c, 0x59, 0x66, 0x73, 0x80, 0x8d,
-	0x9a, 0xa7, 0xb4, 0xc1, 0xcf, 0xdc, 0xe9, 0xff
-};
+static int aty_bl_get_level_brightness(struct atyfb_par *par, int level)
+{
+	struct fb_info *info = pci_get_drvdata(par->pdev);
+	int atylevel;
+
+	/* Get and convert the value */
+	mutex_lock(&info->bl_mutex);
+	atylevel = info->bl_curve[level] * FB_BACKLIGHT_MAX / MAX_LEVEL;
+	mutex_unlock(&info->bl_mutex);
+
+	if (atylevel < 0)
+		atylevel = 0;
+	else if (atylevel > MAX_LEVEL)
+		atylevel = MAX_LEVEL;
 
-static int aty_set_backlight_enable(int on, int level, void *data)
+	return atylevel;
+}
+
+static int aty_bl_update_status(struct backlight_device *bd)
 {
-	struct fb_info *info = (struct fb_info *) data;
-	struct atyfb_par *par = (struct atyfb_par *) info->par;
+	struct atyfb_par *par = class_get_devdata(&bd->class_dev);
 	unsigned int reg = aty_ld_lcd(LCD_MISC_CNTL, par);
+	int level;
+
+	if (bd->props->power != FB_BLANK_UNBLANK ||
+	    bd->props->fb_blank != FB_BLANK_UNBLANK)
+		level = 0;
+	else
+		level = bd->props->brightness;
 
 	reg |= (BLMOD_EN | BIASMOD_EN);
-	if (on && level > BACKLIGHT_OFF) {
+	if (level > 0) {
 		reg &= ~BIAS_MOD_LEVEL_MASK;
-		reg |= (backlight_conv[level] << BIAS_MOD_LEVEL_SHIFT);
+		reg |= (aty_bl_get_level_brightness(par, level) << BIAS_MOD_LEVEL_SHIFT);
 	} else {
 		reg &= ~BIAS_MOD_LEVEL_MASK;
-		reg |= (backlight_conv[0] << BIAS_MOD_LEVEL_SHIFT);
+		reg |= (aty_bl_get_level_brightness(par, 0) << BIAS_MOD_LEVEL_SHIFT);
 	}
 	aty_st_lcd(LCD_MISC_CNTL, reg, par);
+
 	return 0;
 }
 
-static int aty_set_backlight_level(int level, void *data)
+static int aty_bl_get_brightness(struct backlight_device *bd)
 {
-	return aty_set_backlight_enable(1, level, data);
+	return bd->props->brightness;
 }
 
-static struct backlight_controller aty_backlight_controller = {
-	aty_set_backlight_enable,
-	aty_set_backlight_level
+static struct backlight_properties aty_bl_data = {
+	.owner	  = THIS_MODULE,
+	.get_brightness = aty_bl_get_brightness,
+	.update_status	= aty_bl_update_status,
+	.max_brightness = (FB_BACKLIGHT_LEVELS - 1),
 };
-#endif /* CONFIG_PMAC_BACKLIGHT */
+
+static void aty_bl_init(struct atyfb_par *par)
+{
+	struct fb_info *info = pci_get_drvdata(par->pdev);
+	struct backlight_device *bd;
+	char name[12];
+
+#ifdef CONFIG_PMAC_BACKLIGHT
+	if (!pmac_has_backlight_type("ati"))
+		return;
+#endif
+
+	snprintf(name, sizeof(name), "atybl%d", info->node);
+
+	bd = backlight_device_register(name, par, &aty_bl_data);
+	if (IS_ERR(bd)) {
+		info->bl_dev = NULL;
+		printk("aty: Backlight registration failed\n");
+		goto error;
+	}
+
+	mutex_lock(&info->bl_mutex);
+	info->bl_dev = bd;
+	fb_bl_default_curve(info, 0,
+		0x3F * FB_BACKLIGHT_MAX / MAX_LEVEL,
+		0xFF * FB_BACKLIGHT_MAX / MAX_LEVEL);
+	mutex_unlock(&info->bl_mutex);
+
+	up(&bd->sem);
+	bd->props->brightness = aty_bl_data.max_brightness;
+	bd->props->power = FB_BLANK_UNBLANK;
+	bd->props->update_status(bd);
+	down(&bd->sem);
+
+#ifdef CONFIG_PMAC_BACKLIGHT
+	mutex_lock(&pmac_backlight_mutex);
+	if (!pmac_backlight)
+		pmac_backlight = bd;
+	mutex_unlock(&pmac_backlight_mutex);
+#endif
+
+	printk("aty: Backlight initialized (%s)\n", name);
+
+	return;
+
+error:
+	return;
+}
+
+static void aty_bl_exit(struct atyfb_par *par)
+{
+	struct fb_info *info = pci_get_drvdata(par->pdev);
+
+#ifdef CONFIG_PMAC_BACKLIGHT
+	mutex_lock(&pmac_backlight_mutex);
+#endif
+
+	mutex_lock(&info->bl_mutex);
+	if (info->bl_dev) {
+#ifdef CONFIG_PMAC_BACKLIGHT
+		if (pmac_backlight == info->bl_dev)
+			pmac_backlight = NULL;
+#endif
+
+		backlight_device_unregister(info->bl_dev);
+
+		printk("aty: Backlight unloaded\n");
+	}
+	mutex_unlock(&info->bl_mutex);
+
+#ifdef CONFIG_PMAC_BACKLIGHT
+	mutex_unlock(&pmac_backlight_mutex);
+#endif
+}
+
+#endif /* CONFIG_FB_ATY_BACKLIGHT */
 
 static void __init aty_calc_mem_refresh(struct atyfb_par *par, int xclk)
 {
@@ -2513,9 +2611,13 @@ static int __init aty_init(struct fb_info *info, const char *name)
 		/* these bits let the 101 powerbook wake up from sleep -- paulus */
 		aty_st_lcd(POWER_MANAGEMENT, aty_ld_lcd(POWER_MANAGEMENT, par)
 			   | (USE_F32KHZ | TRISTATE_MEM_EN), par);
-	} else if (M64_HAS(MOBIL_BUS))
-		register_backlight_controller(&aty_backlight_controller, info, "ati");
-#endif /* CONFIG_PMAC_BACKLIGHT */
+	} else
+#endif
+	if (M64_HAS(MOBIL_BUS)) {
+#ifdef CONFIG_FB_ATY_BACKLIGHT
+		aty_bl_init (par);
+#endif
+	}
 
 	memset(&var, 0, sizeof(var));
 #ifdef CONFIG_PPC
@@ -2674,8 +2776,16 @@ static int atyfb_blank(int blank, struct fb_info *info)
 		return 0;
 
 #ifdef CONFIG_PMAC_BACKLIGHT
-	if (machine_is(powermac) && blank > FB_BLANK_NORMAL)
-		set_backlight_enable(0);
+	if (machine_is(powermac) && blank > FB_BLANK_NORMAL) {
+		mutex_lock(&info->bl_mutex);
+		if (info->bl_dev) {
+			down(&info->bl_dev->sem);
+			info->bl_dev->props->power = FB_BLANK_POWERDOWN;
+			info->bl_dev->props->update_status(info->bl_dev);
+			up(&info->bl_dev->sem);
+		}
+		mutex_unlock(&info->bl_mutex);
+	}
 #elif defined(CONFIG_FB_ATY_GENERIC_LCD)
 	if (par->lcd_table && blank > FB_BLANK_NORMAL &&
 	    (aty_ld_lcd(LCD_GEN_CNTL, par) & LCD_ON)) {
@@ -2706,8 +2816,16 @@ static int atyfb_blank(int blank, struct fb_info *info)
 	aty_st_le32(CRTC_GEN_CNTL, gen_cntl, par);
 
 #ifdef CONFIG_PMAC_BACKLIGHT
-	if (machine_is(powermac) && blank <= FB_BLANK_NORMAL)
-		set_backlight_enable(1);
+	if (machine_is(powermac) && blank <= FB_BLANK_NORMAL) {
+		mutex_lock(&info->bl_mutex);
+		if (info->bl_dev) {
+			down(&info->bl_dev->sem);
+			info->bl_dev->props->power = FB_BLANK_UNBLANK;
+			info->bl_dev->props->update_status(info->bl_dev);
+			up(&info->bl_dev->sem);
+		}
+		mutex_unlock(&info->bl_mutex);
+	}
 #elif defined(CONFIG_FB_ATY_GENERIC_LCD)
 	if (par->lcd_table && blank <= FB_BLANK_NORMAL &&
 	    (aty_ld_lcd(LCD_GEN_CNTL, par) & LCD_ON)) {
@@ -3440,6 +3558,7 @@ static int __devinit atyfb_pci_probe(struct pci_dev *pdev, const struct pci_devi
 	par->res_start = res_start;
 	par->res_size = res_size;
 	par->irq = pdev->irq;
+	par->pdev = pdev;
 
 	/* Setup "info" structure */
 #ifdef __sparc__
@@ -3571,6 +3690,11 @@ static void __devexit atyfb_remove(struct fb_info *info)
 	aty_set_crtc(par, &saved_crtc);
 	par->pll_ops->set_pll(info, &saved_pll);
 
+#ifdef CONFIG_FB_ATY_BACKLIGHT
+	if (M64_HAS(MOBIL_BUS))
+		aty_bl_exit(par);
+#endif
+
 	unregister_framebuffer(info);
 
 #ifdef CONFIG_MTRR
diff --git a/drivers/video/aty/radeon_backlight.c b/drivers/video/aty/radeon_backlight.c
new file mode 100644
index 000000000000..7de66b855d4e
--- /dev/null
+++ b/drivers/video/aty/radeon_backlight.c
@@ -0,0 +1,247 @@
+/*
+ * Backlight code for ATI Radeon based graphic cards
+ *
+ * Copyright (c) 2000 Ani Joshi <ajoshi@kernel.crashing.org>
+ * Copyright (c) 2003 Benjamin Herrenschmidt <benh@kernel.crashing.org>
+ * Copyright (c) 2006 Michael Hanselmann <linux-kernel@hansmi.ch>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include "radeonfb.h"
+#include <linux/backlight.h>
+
+#ifdef CONFIG_PMAC_BACKLIGHT
+#include <asm/backlight.h>
+#endif
+
+#define MAX_RADEON_LEVEL 0xFF
+
+static struct backlight_properties radeon_bl_data;
+
+struct radeon_bl_privdata {
+	struct radeonfb_info *rinfo;
+	uint8_t negative;
+};
+
+static int radeon_bl_get_level_brightness(struct radeon_bl_privdata *pdata,
+		int level)
+{
+	struct fb_info *info = pdata->rinfo->info;
+	int rlevel;
+
+	mutex_lock(&info->bl_mutex);
+
+	/* Get and convert the value */
+	rlevel = pdata->rinfo->info->bl_curve[level] *
+		 FB_BACKLIGHT_MAX / MAX_RADEON_LEVEL;
+
+	mutex_unlock(&info->bl_mutex);
+
+	if (pdata->negative)
+		rlevel = MAX_RADEON_LEVEL - rlevel;
+
+	if (rlevel < 0)
+		rlevel = 0;
+	else if (rlevel > MAX_RADEON_LEVEL)
+		rlevel = MAX_RADEON_LEVEL;
+
+	return rlevel;
+}
+
+static int radeon_bl_update_status(struct backlight_device *bd)
+{
+	struct radeon_bl_privdata *pdata = class_get_devdata(&bd->class_dev);
+	struct radeonfb_info *rinfo = pdata->rinfo;
+	u32 lvds_gen_cntl, tmpPixclksCntl;
+	int level;
+
+	if (rinfo->mon1_type != MT_LCD)
+		return 0;
+
+	/* We turn off the LCD completely instead of just dimming the
+	 * backlight. This provides some greater power saving and the display
+	 * is useless without backlight anyway.
+	 */
+        if (bd->props->power != FB_BLANK_UNBLANK ||
+	    bd->props->fb_blank != FB_BLANK_UNBLANK)
+		level = 0;
+	else
+		level = bd->props->brightness;
+
+	del_timer_sync(&rinfo->lvds_timer);
+	radeon_engine_idle();
+
+	lvds_gen_cntl = INREG(LVDS_GEN_CNTL);
+	if (level > 0) {
+		lvds_gen_cntl &= ~LVDS_DISPLAY_DIS;
+		if (!(lvds_gen_cntl & LVDS_BLON) || !(lvds_gen_cntl & LVDS_ON)) {
+			lvds_gen_cntl |= (rinfo->init_state.lvds_gen_cntl & LVDS_DIGON);
+			lvds_gen_cntl |= LVDS_BLON | LVDS_EN;
+			OUTREG(LVDS_GEN_CNTL, lvds_gen_cntl);
+			lvds_gen_cntl &= ~LVDS_BL_MOD_LEVEL_MASK;
+			lvds_gen_cntl |=
+				(radeon_bl_get_level_brightness(pdata, level) <<
+				 LVDS_BL_MOD_LEVEL_SHIFT);
+			lvds_gen_cntl |= LVDS_ON;
+			lvds_gen_cntl |= (rinfo->init_state.lvds_gen_cntl & LVDS_BL_MOD_EN);
+			rinfo->pending_lvds_gen_cntl = lvds_gen_cntl;
+			mod_timer(&rinfo->lvds_timer,
+				  jiffies + msecs_to_jiffies(rinfo->panel_info.pwr_delay));
+		} else {
+			lvds_gen_cntl &= ~LVDS_BL_MOD_LEVEL_MASK;
+			lvds_gen_cntl |=
+				(radeon_bl_get_level_brightness(pdata, level) <<
+				 LVDS_BL_MOD_LEVEL_SHIFT);
+			OUTREG(LVDS_GEN_CNTL, lvds_gen_cntl);
+		}
+		rinfo->init_state.lvds_gen_cntl &= ~LVDS_STATE_MASK;
+		rinfo->init_state.lvds_gen_cntl |= rinfo->pending_lvds_gen_cntl
+			& LVDS_STATE_MASK;
+	} else {
+		/* Asic bug, when turning off LVDS_ON, we have to make sure
+		   RADEON_PIXCLK_LVDS_ALWAYS_ON bit is off
+		*/
+		tmpPixclksCntl = INPLL(PIXCLKS_CNTL);
+		if (rinfo->is_mobility || rinfo->is_IGP)
+			OUTPLLP(PIXCLKS_CNTL, 0, ~PIXCLK_LVDS_ALWAYS_ONb);
+		lvds_gen_cntl &= ~(LVDS_BL_MOD_LEVEL_MASK | LVDS_BL_MOD_EN);
+		lvds_gen_cntl |= (radeon_bl_get_level_brightness(pdata, 0) <<
+				  LVDS_BL_MOD_LEVEL_SHIFT);
+		lvds_gen_cntl |= LVDS_DISPLAY_DIS;
+		OUTREG(LVDS_GEN_CNTL, lvds_gen_cntl);
+		udelay(100);
+		lvds_gen_cntl &= ~(LVDS_ON | LVDS_EN);
+		OUTREG(LVDS_GEN_CNTL, lvds_gen_cntl);
+		lvds_gen_cntl &= ~(LVDS_DIGON);
+		rinfo->pending_lvds_gen_cntl = lvds_gen_cntl;
+		mod_timer(&rinfo->lvds_timer,
+			  jiffies + msecs_to_jiffies(rinfo->panel_info.pwr_delay));
+		if (rinfo->is_mobility || rinfo->is_IGP)
+			OUTPLL(PIXCLKS_CNTL, tmpPixclksCntl);
+	}
+	rinfo->init_state.lvds_gen_cntl &= ~LVDS_STATE_MASK;
+	rinfo->init_state.lvds_gen_cntl |= (lvds_gen_cntl & LVDS_STATE_MASK);
+
+	return 0;
+}
+
+static int radeon_bl_get_brightness(struct backlight_device *bd)
+{
+	return bd->props->brightness;
+}
+
+static struct backlight_properties radeon_bl_data = {
+	.owner		= THIS_MODULE,
+	.get_brightness = radeon_bl_get_brightness,
+	.update_status	= radeon_bl_update_status,
+	.max_brightness = (FB_BACKLIGHT_LEVELS - 1),
+};
+
+void radeonfb_bl_init(struct radeonfb_info *rinfo)
+{
+	struct backlight_device *bd;
+	struct radeon_bl_privdata *pdata;
+	char name[12];
+
+	if (rinfo->mon1_type != MT_LCD)
+		return;
+
+#ifdef CONFIG_PMAC_BACKLIGHT
+	if (!pmac_has_backlight_type("ati") &&
+	    !pmac_has_backlight_type("mnca"))
+		return;
+#endif
+
+	pdata = kmalloc(sizeof(struct radeon_bl_privdata), GFP_KERNEL);
+	if (!pdata) {
+		printk("radeonfb: Memory allocation failed\n");
+		goto error;
+	}
+
+	snprintf(name, sizeof(name), "radeonbl%d", rinfo->info->node);
+
+	bd = backlight_device_register(name, pdata, &radeon_bl_data);
+	if (IS_ERR(bd)) {
+		rinfo->info->bl_dev = NULL;
+		printk("radeonfb: Backlight registration failed\n");
+		goto error;
+	}
+
+	pdata->rinfo = rinfo;
+
+	/* Pardon me for that hack... maybe some day we can figure out in what
+	 * direction backlight should work on a given panel?
+	 */
+	pdata->negative =
+		(rinfo->family != CHIP_FAMILY_RV200 &&
+		 rinfo->family != CHIP_FAMILY_RV250 &&
+		 rinfo->family != CHIP_FAMILY_RV280 &&
+		 rinfo->family != CHIP_FAMILY_RV350);
+
+#ifdef CONFIG_PMAC_BACKLIGHT
+	pdata->negative = pdata->negative ||
+		machine_is_compatible("PowerBook4,3") ||
+		machine_is_compatible("PowerBook6,3") ||
+		machine_is_compatible("PowerBook6,5");
+#endif
+
+	mutex_lock(&rinfo->info->bl_mutex);
+	rinfo->info->bl_dev = bd;
+	fb_bl_default_curve(rinfo->info, 0,
+		 63 * FB_BACKLIGHT_MAX / MAX_RADEON_LEVEL,
+		217 * FB_BACKLIGHT_MAX / MAX_RADEON_LEVEL);
+	mutex_unlock(&rinfo->info->bl_mutex);
+
+	up(&bd->sem);
+	bd->props->brightness = radeon_bl_data.max_brightness;
+	bd->props->power = FB_BLANK_UNBLANK;
+	bd->props->update_status(bd);
+	down(&bd->sem);
+
+#ifdef CONFIG_PMAC_BACKLIGHT
+	mutex_lock(&pmac_backlight_mutex);
+	if (!pmac_backlight)
+		pmac_backlight = bd;
+	mutex_unlock(&pmac_backlight_mutex);
+#endif
+
+	printk("radeonfb: Backlight initialized (%s)\n", name);
+
+	return;
+
+error:
+	kfree(pdata);
+	return;
+}
+
+void radeonfb_bl_exit(struct radeonfb_info *rinfo)
+{
+#ifdef CONFIG_PMAC_BACKLIGHT
+	mutex_lock(&pmac_backlight_mutex);
+#endif
+
+	mutex_lock(&rinfo->info->bl_mutex);
+	if (rinfo->info->bl_dev) {
+		struct radeon_bl_privdata *pdata;
+
+#ifdef CONFIG_PMAC_BACKLIGHT
+		if (pmac_backlight == rinfo->info->bl_dev)
+			pmac_backlight = NULL;
+#endif
+
+		pdata = class_get_devdata(&rinfo->info->bl_dev->class_dev);
+		backlight_device_unregister(rinfo->info->bl_dev);
+		kfree(pdata);
+		rinfo->info->bl_dev = NULL;
+
+		printk("radeonfb: Backlight unloaded\n");
+	}
+	mutex_unlock(&rinfo->info->bl_mutex);
+
+#ifdef CONFIG_PMAC_BACKLIGHT
+	mutex_unlock(&pmac_backlight_mutex);
+#endif
+}
diff --git a/drivers/video/aty/radeon_base.c b/drivers/video/aty/radeon_base.c
index 387a18a47ac2..c5ecbb02e01d 100644
--- a/drivers/video/aty/radeon_base.c
+++ b/drivers/video/aty/radeon_base.c
@@ -78,10 +78,6 @@
 #include <asm/pci-bridge.h>
 #include "../macmodes.h"
 
-#ifdef CONFIG_PMAC_BACKLIGHT
-#include <asm/backlight.h>
-#endif
-
 #ifdef CONFIG_BOOTX_TEXT
 #include <asm/btext.h>
 #endif
@@ -277,20 +273,6 @@ static int nomtrr = 0;
  * prototypes
  */
 
-
-#ifdef CONFIG_PPC_OF
-
-#ifdef CONFIG_PMAC_BACKLIGHT
-static int radeon_set_backlight_enable(int on, int level, void *data);
-static int radeon_set_backlight_level(int level, void *data);
-static struct backlight_controller radeon_backlight_controller = {
-	radeon_set_backlight_enable,
-	radeon_set_backlight_level
-};
-#endif /* CONFIG_PMAC_BACKLIGHT */
-
-#endif /* CONFIG_PPC_OF */
-
 static void radeon_unmap_ROM(struct radeonfb_info *rinfo, struct pci_dev *dev)
 {
 	if (!rinfo->bios_seg)
@@ -1913,116 +1895,6 @@ static int __devinit radeon_set_fbinfo (struct radeonfb_info *rinfo)
         return 0;
 }
 
-
-#ifdef CONFIG_PMAC_BACKLIGHT
-
-/* TODO: Dbl check these tables, we don't go up to full ON backlight
- * in these, possibly because we noticed MacOS doesn't, but I'd prefer
- * having some more official numbers from ATI
- */
-static int backlight_conv_m6[] = {
-	0xff, 0xc0, 0xb5, 0xaa, 0x9f, 0x94, 0x89, 0x7e,
-	0x73, 0x68, 0x5d, 0x52, 0x47, 0x3c, 0x31, 0x24
-};
-static int backlight_conv_m7[] = {
-	0x00, 0x3f, 0x4a, 0x55, 0x60, 0x6b, 0x76, 0x81,
-	0x8c, 0x97, 0xa2, 0xad, 0xb8, 0xc3, 0xce, 0xd9
-};
-
-#define BACKLIGHT_LVDS_OFF
-#undef BACKLIGHT_DAC_OFF
-
-/* We turn off the LCD completely instead of just dimming the backlight.
- * This provides some greater power saving and the display is useless
- * without backlight anyway.
- */
-static int radeon_set_backlight_enable(int on, int level, void *data)
-{
-	struct radeonfb_info *rinfo = (struct radeonfb_info *)data;
-	u32 lvds_gen_cntl, tmpPixclksCntl;
-	int* conv_table;
-
-	if (rinfo->mon1_type != MT_LCD)
-		return 0;
-
-	/* Pardon me for that hack... maybe some day we can figure
-	 * out in what direction backlight should work on a given
-	 * panel ?
-	 */
-	if ((rinfo->family == CHIP_FAMILY_RV200 ||
-	     rinfo->family == CHIP_FAMILY_RV250 ||
-	     rinfo->family == CHIP_FAMILY_RV280 ||
-	     rinfo->family == CHIP_FAMILY_RV350) &&
-	    !machine_is_compatible("PowerBook4,3") &&
-	    !machine_is_compatible("PowerBook6,3") &&
-	    !machine_is_compatible("PowerBook6,5"))
-		conv_table = backlight_conv_m7;
-	else
-		conv_table = backlight_conv_m6;
-
-	del_timer_sync(&rinfo->lvds_timer);
-	radeon_engine_idle();
-
-	lvds_gen_cntl = INREG(LVDS_GEN_CNTL);
-	if (on && (level > BACKLIGHT_OFF)) {
-		lvds_gen_cntl &= ~LVDS_DISPLAY_DIS;
-		if (!(lvds_gen_cntl & LVDS_BLON) || !(lvds_gen_cntl & LVDS_ON)) {
-			lvds_gen_cntl |= (rinfo->init_state.lvds_gen_cntl & LVDS_DIGON);
-			lvds_gen_cntl |= LVDS_BLON | LVDS_EN;
-			OUTREG(LVDS_GEN_CNTL, lvds_gen_cntl);
-			lvds_gen_cntl &= ~LVDS_BL_MOD_LEVEL_MASK;
-			lvds_gen_cntl |= (conv_table[level] <<
-					  LVDS_BL_MOD_LEVEL_SHIFT);
-			lvds_gen_cntl |= LVDS_ON;
-			lvds_gen_cntl |= (rinfo->init_state.lvds_gen_cntl & LVDS_BL_MOD_EN);
-			rinfo->pending_lvds_gen_cntl = lvds_gen_cntl;
-			mod_timer(&rinfo->lvds_timer,
-				  jiffies + msecs_to_jiffies(rinfo->panel_info.pwr_delay));
-		} else {
-			lvds_gen_cntl &= ~LVDS_BL_MOD_LEVEL_MASK;
-			lvds_gen_cntl |= (conv_table[level] <<
-					  LVDS_BL_MOD_LEVEL_SHIFT);
-			OUTREG(LVDS_GEN_CNTL, lvds_gen_cntl);
-		}
-		rinfo->init_state.lvds_gen_cntl &= ~LVDS_STATE_MASK;
-		rinfo->init_state.lvds_gen_cntl |= rinfo->pending_lvds_gen_cntl
-			& LVDS_STATE_MASK;
-	} else {
-		/* Asic bug, when turning off LVDS_ON, we have to make sure
-		   RADEON_PIXCLK_LVDS_ALWAYS_ON bit is off
-		*/
-		tmpPixclksCntl = INPLL(PIXCLKS_CNTL);
-		if (rinfo->is_mobility || rinfo->is_IGP)
-			OUTPLLP(PIXCLKS_CNTL, 0, ~PIXCLK_LVDS_ALWAYS_ONb);
-		lvds_gen_cntl &= ~(LVDS_BL_MOD_LEVEL_MASK | LVDS_BL_MOD_EN);
-		lvds_gen_cntl |= (conv_table[0] <<
-				  LVDS_BL_MOD_LEVEL_SHIFT);
-		lvds_gen_cntl |= LVDS_DISPLAY_DIS;
-		OUTREG(LVDS_GEN_CNTL, lvds_gen_cntl);
-		udelay(100);
-		lvds_gen_cntl &= ~(LVDS_ON | LVDS_EN);
-		OUTREG(LVDS_GEN_CNTL, lvds_gen_cntl);
-		lvds_gen_cntl &= ~(LVDS_DIGON);
-		rinfo->pending_lvds_gen_cntl = lvds_gen_cntl;
-		mod_timer(&rinfo->lvds_timer,
-			  jiffies + msecs_to_jiffies(rinfo->panel_info.pwr_delay));
-		if (rinfo->is_mobility || rinfo->is_IGP)
-			OUTPLL(PIXCLKS_CNTL, tmpPixclksCntl);
-	}
-	rinfo->init_state.lvds_gen_cntl &= ~LVDS_STATE_MASK;
-	rinfo->init_state.lvds_gen_cntl |= (lvds_gen_cntl & LVDS_STATE_MASK);
-
-	return 0;
-}
-
-
-static int radeon_set_backlight_level(int level, void *data)
-{
-	return radeon_set_backlight_enable(1, level, data);
-}
-#endif /* CONFIG_PMAC_BACKLIGHT */
-
-
 /*
  * This reconfigure the card's internal memory map. In theory, we'd like
  * to setup the card's memory at the same address as it's PCI bus address,
@@ -2477,14 +2349,7 @@ static int __devinit radeonfb_pci_register (struct pci_dev *pdev,
 						 MTRR_TYPE_WRCOMB, 1);
 #endif
 
-#ifdef CONFIG_PMAC_BACKLIGHT
-	if (rinfo->mon1_type == MT_LCD) {
-		register_backlight_controller(&radeon_backlight_controller,
-					      rinfo, "ati");
-		register_backlight_controller(&radeon_backlight_controller,
-					      rinfo, "mnca");
-	}
-#endif
+	radeonfb_bl_init(rinfo);
 
 	printk ("radeonfb (%s): %s\n", pci_name(rinfo->pdev), rinfo->name);
 
@@ -2528,7 +2393,8 @@ static void __devexit radeonfb_pci_unregister (struct pci_dev *pdev)
  
         if (!rinfo)
                 return;
- 
+
+	radeonfb_bl_exit(rinfo);
 	radeonfb_pm_exit(rinfo);
 
 	if (rinfo->mon1_EDID)
diff --git a/drivers/video/aty/radeonfb.h b/drivers/video/aty/radeonfb.h
index 217e00ab4a2d..1645943b1123 100644
--- a/drivers/video/aty/radeonfb.h
+++ b/drivers/video/aty/radeonfb.h
@@ -625,4 +625,13 @@ extern int radeon_screen_blank(struct radeonfb_info *rinfo, int blank, int mode_
 extern void radeon_write_mode (struct radeonfb_info *rinfo, struct radeon_regs *mode,
 			       int reg_only);
 
+/* Backlight functions */
+#ifdef CONFIG_FB_RADEON_BACKLIGHT
+extern void radeonfb_bl_init(struct radeonfb_info *rinfo);
+extern void radeonfb_bl_exit(struct radeonfb_info *rinfo);
+#else
+static inline void radeonfb_bl_init(struct radeonfb_info *rinfo) {}
+static inline void radeonfb_bl_exit(struct radeonfb_info *rinfo) {}
+#endif
+
 #endif /* __RADEONFB_H__ */
diff --git a/drivers/video/chipsfb.c b/drivers/video/chipsfb.c
index 72ff6bf75e5e..d76bbfac92cc 100644
--- a/drivers/video/chipsfb.c
+++ b/drivers/video/chipsfb.c
@@ -148,9 +148,24 @@ static int chipsfb_set_par(struct fb_info *info)
 static int chipsfb_blank(int blank, struct fb_info *info)
 {
 #ifdef CONFIG_PMAC_BACKLIGHT
-	// used to disable backlight only for blank > 1, but it seems
-	// useful at blank = 1 too (saves battery, extends backlight life)
-	set_backlight_enable(!blank);
+	mutex_lock(&pmac_backlight_mutex);
+
+	if (pmac_backlight) {
+		down(&pmac_backlight->sem);
+
+		/* used to disable backlight only for blank > 1, but it seems
+		 * useful at blank = 1 too (saves battery, extends backlight
+		 * life)
+	 	 */
+		if (blank)
+			pmac_backlight->props->power = FB_BLANK_POWERDOWN;
+		else
+			pmac_backlight->props->power = FB_BLANK_UNBLANK;
+		pmac_backlight->props->update_status(pmac_backlight);
+		up(&pmac_backlight->sem);
+	}
+
+	mutex_unlock(&pmac_backlight_mutex);
 #endif /* CONFIG_PMAC_BACKLIGHT */
 
 	return 1;	/* get fb_blank to set the colormap to all black */
@@ -401,7 +416,14 @@ chipsfb_pci_init(struct pci_dev *dp, const struct pci_device_id *ent)
 
 #ifdef CONFIG_PMAC_BACKLIGHT
 	/* turn on the backlight */
-	set_backlight_enable(1);
+	mutex_lock(&pmac_backlight_mutex);
+	if (pmac_backlight) {
+		down(&pmac_backlight->sem);
+		pmac_backlight->props->power = FB_BLANK_UNBLANK;
+		pmac_backlight->props->update_status(pmac_backlight);
+		up(&pmac_backlight->sem);
+	}
+	mutex_unlock(&pmac_backlight_mutex);
 #endif /* CONFIG_PMAC_BACKLIGHT */
 
 #ifdef CONFIG_PPC
diff --git a/drivers/video/fbsysfs.c b/drivers/video/fbsysfs.c
index 34e07399756b..3ceb8c1b392e 100644
--- a/drivers/video/fbsysfs.c
+++ b/drivers/video/fbsysfs.c
@@ -18,6 +18,7 @@
 #include <linux/kernel.h>
 #include <linux/fb.h>
 #include <linux/console.h>
+#include <linux/module.h>
 
 /**
  * framebuffer_alloc - creates a new frame buffer info structure
@@ -55,6 +56,10 @@ struct fb_info *framebuffer_alloc(size_t size, struct device *dev)
 
 	info->device = dev;
 
+#ifdef CONFIG_FB_BACKLIGHT
+	mutex_init(&info->bl_mutex);
+#endif
+
 	return info;
 #undef PADDING
 #undef BYTES_PER_LONG
@@ -414,6 +419,65 @@ static ssize_t show_fbstate(struct class_device *class_device, char *buf)
 	return snprintf(buf, PAGE_SIZE, "%d\n", fb_info->state);
 }
 
+#ifdef CONFIG_FB_BACKLIGHT
+static ssize_t store_bl_curve(struct class_device *class_device,
+		const char *buf, size_t count)
+{
+	struct fb_info *fb_info = class_get_devdata(class_device);
+	u8 tmp_curve[FB_BACKLIGHT_LEVELS];
+	unsigned int i;
+
+	if (count != (FB_BACKLIGHT_LEVELS / 8 * 24))
+		return -EINVAL;
+
+	for (i = 0; i < (FB_BACKLIGHT_LEVELS / 8); ++i)
+		if (sscanf(&buf[i * 24],
+			"%2hhx %2hhx %2hhx %2hhx %2hhx %2hhx %2hhx %2hhx\n",
+			&tmp_curve[i * 8 + 0],
+			&tmp_curve[i * 8 + 1],
+			&tmp_curve[i * 8 + 2],
+			&tmp_curve[i * 8 + 3],
+			&tmp_curve[i * 8 + 4],
+			&tmp_curve[i * 8 + 5],
+			&tmp_curve[i * 8 + 6],
+			&tmp_curve[i * 8 + 7]) != 8)
+			return -EINVAL;
+
+	/* If there has been an error in the input data, we won't
+	 * reach this loop.
+	 */
+	mutex_lock(&fb_info->bl_mutex);
+	for (i = 0; i < FB_BACKLIGHT_LEVELS; ++i)
+		fb_info->bl_curve[i] = tmp_curve[i];
+	mutex_unlock(&fb_info->bl_mutex);
+
+	return count;
+}
+
+static ssize_t show_bl_curve(struct class_device *class_device, char *buf)
+{
+	struct fb_info *fb_info = class_get_devdata(class_device);
+	ssize_t len = 0;
+	unsigned int i;
+
+	mutex_lock(&fb_info->bl_mutex);
+	for (i = 0; i < FB_BACKLIGHT_LEVELS; i += 8)
+		len += snprintf(&buf[len], PAGE_SIZE,
+				"%02x %02x %02x %02x %02x %02x %02x %02x\n",
+				fb_info->bl_curve[i + 0],
+				fb_info->bl_curve[i + 1],
+				fb_info->bl_curve[i + 2],
+				fb_info->bl_curve[i + 3],
+				fb_info->bl_curve[i + 4],
+				fb_info->bl_curve[i + 5],
+				fb_info->bl_curve[i + 6],
+				fb_info->bl_curve[i + 7]);
+	mutex_unlock(&fb_info->bl_mutex);
+
+	return len;
+}
+#endif
+
 /* When cmap is added back in it should be a binary attribute
  * not a text one. Consideration should also be given to converting
  * fbdev to use configfs instead of sysfs */
@@ -432,6 +496,9 @@ static struct class_device_attribute class_device_attrs[] = {
 	__ATTR(con_rotate, S_IRUGO|S_IWUSR, show_con_rotate, store_con_rotate),
 	__ATTR(con_rotate_all, S_IWUSR, NULL, store_con_rotate_all),
 	__ATTR(state, S_IRUGO|S_IWUSR, show_fbstate, store_fbstate),
+#ifdef CONFIG_FB_BACKLIGHT
+	__ATTR(bl_curve, S_IRUGO|S_IWUSR, show_bl_curve, store_bl_curve),
+#endif
 };
 
 int fb_init_class_device(struct fb_info *fb_info)
@@ -454,4 +521,25 @@ void fb_cleanup_class_device(struct fb_info *fb_info)
 					 &class_device_attrs[i]);
 }
 
+#ifdef CONFIG_FB_BACKLIGHT
+/* This function generates a linear backlight curve
+ *
+ *     0: off
+ *   1-7: min
+ * 8-127: linear from min to max
+ */
+void fb_bl_default_curve(struct fb_info *fb_info, u8 off, u8 min, u8 max)
+{
+	unsigned int i, flat, count, range = (max - min);
+
+	fb_info->bl_curve[0] = off;
 
+	for (flat = 1; flat < (FB_BACKLIGHT_LEVELS / 16); ++flat)
+		fb_info->bl_curve[flat] = min;
+
+	count = FB_BACKLIGHT_LEVELS * 15 / 16;
+	for (i = 0; i < count; ++i)
+		fb_info->bl_curve[flat + i] = min + (range * (i + 1) / count);
+}
+EXPORT_SYMBOL_GPL(fb_bl_default_curve);
+#endif
diff --git a/drivers/video/nvidia/Makefile b/drivers/video/nvidia/Makefile
index 690d37e8de5b..ca47432113e0 100644
--- a/drivers/video/nvidia/Makefile
+++ b/drivers/video/nvidia/Makefile
@@ -7,6 +7,7 @@ obj-$(CONFIG_FB_NVIDIA)          += nvidiafb.o
 nvidiafb-y                       := nvidia.o nv_hw.o nv_setup.o \
 			            nv_accel.o
 nvidiafb-$(CONFIG_FB_NVIDIA_I2C) += nv_i2c.o
+nvidiafb-$(CONFIG_FB_NVIDIA_BACKLIGHT)  += nv_backlight.o
 nvidiafb-$(CONFIG_PPC_OF)	 += nv_of.o
 
-nvidiafb-objs                    := $(nvidiafb-y)
\ No newline at end of file
+nvidiafb-objs                    := $(nvidiafb-y)
diff --git a/drivers/video/nvidia/nv_backlight.c b/drivers/video/nvidia/nv_backlight.c
new file mode 100644
index 000000000000..1c1c10c699c5
--- /dev/null
+++ b/drivers/video/nvidia/nv_backlight.c
@@ -0,0 +1,175 @@
+/*
+ * Backlight code for nVidia based graphic cards
+ *
+ * Copyright 2004 Antonino Daplas <adaplas@pol.net>
+ * Copyright (c) 2006 Michael Hanselmann <linux-kernel@hansmi.ch>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/backlight.h>
+#include <linux/fb.h>
+#include <linux/pci.h>
+#include "nv_local.h"
+#include "nv_type.h"
+#include "nv_proto.h"
+
+#ifdef CONFIG_PMAC_BACKLIGHT
+#include <asm/backlight.h>
+#include <asm/machdep.h>
+#endif
+
+/* We do not have any information about which values are allowed, thus
+ * we used safe values.
+ */
+#define MIN_LEVEL 0x158
+#define MAX_LEVEL 0x534
+
+static struct backlight_properties nvidia_bl_data;
+
+static int nvidia_bl_get_level_brightness(struct nvidia_par *par,
+		int level)
+{
+	struct fb_info *info = pci_get_drvdata(par->pci_dev);
+	int nlevel;
+
+	/* Get and convert the value */
+	mutex_lock(&info->bl_mutex);
+	nlevel = info->bl_curve[level] * FB_BACKLIGHT_MAX / MAX_LEVEL;
+	mutex_unlock(&info->bl_mutex);
+
+	if (nlevel < 0)
+		nlevel = 0;
+	else if (nlevel < MIN_LEVEL)
+		nlevel = MIN_LEVEL;
+	else if (nlevel > MAX_LEVEL)
+		nlevel = MAX_LEVEL;
+
+	return nlevel;
+}
+
+static int nvidia_bl_update_status(struct backlight_device *bd)
+{
+	struct nvidia_par *par = class_get_devdata(&bd->class_dev);
+	u32 tmp_pcrt, tmp_pmc, fpcontrol;
+	int level;
+
+	if (!par->FlatPanel)
+		return 0;
+
+	if (bd->props->power != FB_BLANK_UNBLANK ||
+	    bd->props->fb_blank != FB_BLANK_UNBLANK)
+		level = 0;
+	else
+		level = bd->props->brightness;
+
+	tmp_pmc = NV_RD32(par->PMC, 0x10F0) & 0x0000FFFF;
+	tmp_pcrt = NV_RD32(par->PCRTC0, 0x081C) & 0xFFFFFFFC;
+	fpcontrol = NV_RD32(par->PRAMDAC, 0x0848) & 0xCFFFFFCC;
+
+	if (level > 0) {
+		tmp_pcrt |= 0x1;
+		tmp_pmc |= (1 << 31); /* backlight bit */
+		tmp_pmc |= nvidia_bl_get_level_brightness(par, level) << 16;
+		fpcontrol |= par->fpSyncs;
+	} else
+		fpcontrol |= 0x20000022;
+
+	NV_WR32(par->PCRTC0, 0x081C, tmp_pcrt);
+	NV_WR32(par->PMC, 0x10F0, tmp_pmc);
+	NV_WR32(par->PRAMDAC, 0x848, fpcontrol);
+
+	return 0;
+}
+
+static int nvidia_bl_get_brightness(struct backlight_device *bd)
+{
+	return bd->props->brightness;
+}
+
+static struct backlight_properties nvidia_bl_data = {
+	.owner		= THIS_MODULE,
+	.get_brightness = nvidia_bl_get_brightness,
+	.update_status	= nvidia_bl_update_status,
+	.max_brightness = (FB_BACKLIGHT_LEVELS - 1),
+};
+
+void nvidia_bl_init(struct nvidia_par *par)
+{
+	struct fb_info *info = pci_get_drvdata(par->pci_dev);
+	struct backlight_device *bd;
+	char name[12];
+
+	if (!par->FlatPanel)
+		return;
+
+#ifdef CONFIG_PMAC_BACKLIGHT
+	if (!machine_is(powermac) ||
+	    !pmac_has_backlight_type("mnca"))
+		return;
+#endif
+
+	snprintf(name, sizeof(name), "nvidiabl%d", info->node);
+
+	bd = backlight_device_register(name, par, &nvidia_bl_data);
+	if (IS_ERR(bd)) {
+		info->bl_dev = NULL;
+		printk("nvidia: Backlight registration failed\n");
+		goto error;
+	}
+
+	mutex_lock(&info->bl_mutex);
+	info->bl_dev = bd;
+	fb_bl_default_curve(info, 0,
+		0x158 * FB_BACKLIGHT_MAX / MAX_LEVEL,
+		0x534 * FB_BACKLIGHT_MAX / MAX_LEVEL);
+	mutex_unlock(&info->bl_mutex);
+
+	up(&bd->sem);
+	bd->props->brightness = nvidia_bl_data.max_brightness;
+	bd->props->power = FB_BLANK_UNBLANK;
+	bd->props->update_status(bd);
+	down(&bd->sem);
+
+#ifdef CONFIG_PMAC_BACKLIGHT
+	mutex_lock(&pmac_backlight_mutex);
+	if (!pmac_backlight)
+		pmac_backlight = bd;
+	mutex_unlock(&pmac_backlight_mutex);
+#endif
+
+	printk("nvidia: Backlight initialized (%s)\n", name);
+
+	return;
+
+error:
+	return;
+}
+
+void nvidia_bl_exit(struct nvidia_par *par)
+{
+	struct fb_info *info = pci_get_drvdata(par->pci_dev);
+
+#ifdef CONFIG_PMAC_BACKLIGHT
+	mutex_lock(&pmac_backlight_mutex);
+#endif
+
+	mutex_lock(&info->bl_mutex);
+	if (info->bl_dev) {
+#ifdef CONFIG_PMAC_BACKLIGHT
+		if (pmac_backlight == info->bl_dev)
+			pmac_backlight = NULL;
+#endif
+
+		backlight_device_unregister(info->bl_dev);
+
+		printk("nvidia: Backlight unloaded\n");
+	}
+	mutex_unlock(&info->bl_mutex);
+
+#ifdef CONFIG_PMAC_BACKLIGHT
+	mutex_unlock(&pmac_backlight_mutex);
+#endif
+}
diff --git a/drivers/video/nvidia/nv_proto.h b/drivers/video/nvidia/nv_proto.h
index b149a690ee0f..6fba656cd56b 100644
--- a/drivers/video/nvidia/nv_proto.h
+++ b/drivers/video/nvidia/nv_proto.h
@@ -63,4 +63,14 @@ extern void nvidiafb_imageblit(struct fb_info *info,
 			       const struct fb_image *image);
 extern int nvidiafb_sync(struct fb_info *info);
 extern u8 byte_rev[256];
+
+/* in nv_backlight.h */
+#ifdef CONFIG_FB_NVIDIA_BACKLIGHT
+extern void nvidia_bl_init(struct nvidia_par *par);
+extern void nvidia_bl_exit(struct nvidia_par *par);
+#else
+static inline void nvidia_bl_init(struct nvidia_par *par) {}
+static inline void nvidia_bl_exit(struct nvidia_par *par) {}
+#endif
+
 #endif				/* __NV_PROTO_H__ */
diff --git a/drivers/video/nvidia/nvidia.c b/drivers/video/nvidia/nvidia.c
index 093ab9977c7c..03a7c1e9ce38 100644
--- a/drivers/video/nvidia/nvidia.c
+++ b/drivers/video/nvidia/nvidia.c
@@ -22,6 +22,7 @@
 #include <linux/init.h>
 #include <linux/pci.h>
 #include <linux/console.h>
+#include <linux/backlight.h>
 #ifdef CONFIG_MTRR
 #include <asm/mtrr.h>
 #endif
@@ -29,10 +30,6 @@
 #include <asm/prom.h>
 #include <asm/pci-bridge.h>
 #endif
-#ifdef CONFIG_PMAC_BACKLIGHT
-#include <asm/machdep.h>
-#include <asm/backlight.h>
-#endif
 
 #include "nv_local.h"
 #include "nv_type.h"
@@ -470,75 +467,6 @@ static struct fb_var_screeninfo __devinitdata nvidiafb_default_var = {
 	.vmode = FB_VMODE_NONINTERLACED
 };
 
-/*
- * Backlight control
- */
-#ifdef CONFIG_PMAC_BACKLIGHT
-
-static int nvidia_backlight_levels[] = {
-	0x158,
-	0x192,
-	0x1c6,
-	0x200,
-	0x234,
-	0x268,
-	0x2a2,
-	0x2d6,
-	0x310,
-	0x344,
-	0x378,
-	0x3b2,
-	0x3e6,
-	0x41a,
-	0x454,
-	0x534,
-};
-
-/* ------------------------------------------------------------------------- *
- *
- * Backlight operations
- *
- * ------------------------------------------------------------------------- */
-
-static int nvidia_set_backlight_enable(int on, int level, void *data)
-{
-	struct nvidia_par *par = data;
-	u32 tmp_pcrt, tmp_pmc, fpcontrol;
-
-	tmp_pmc = NV_RD32(par->PMC, 0x10F0) & 0x0000FFFF;
-	tmp_pcrt = NV_RD32(par->PCRTC0, 0x081C) & 0xFFFFFFFC;
-	fpcontrol = NV_RD32(par->PRAMDAC, 0x0848) & 0xCFFFFFCC;
-
-	if (on && (level > BACKLIGHT_OFF)) {
-		tmp_pcrt |= 0x1;
-		tmp_pmc |= (1 << 31);	// backlight bit
-		tmp_pmc |= nvidia_backlight_levels[level - 1] << 16;
-	}
-
-	if (on)
-		fpcontrol |= par->fpSyncs;
-	else
-		fpcontrol |= 0x20000022;
-
-	NV_WR32(par->PCRTC0, 0x081C, tmp_pcrt);
-	NV_WR32(par->PMC, 0x10F0, tmp_pmc);
-	NV_WR32(par->PRAMDAC, 0x848, fpcontrol);
-
-	return 0;
-}
-
-static int nvidia_set_backlight_level(int level, void *data)
-{
-	return nvidia_set_backlight_enable(1, level, data);
-}
-
-static struct backlight_controller nvidia_backlight_controller = {
-	nvidia_set_backlight_enable,
-	nvidia_set_backlight_level
-};
-
-#endif				/* CONFIG_PMAC_BACKLIGHT */
-
 static void nvidiafb_load_cursor_image(struct nvidia_par *par, u8 * data8,
 				       u16 bg, u16 fg, u32 w, u32 h)
 {
@@ -1355,10 +1283,15 @@ static int nvidiafb_blank(int blank, struct fb_info *info)
 	NVWriteSeq(par, 0x01, tmp);
 	NVWriteCrtc(par, 0x1a, vesa);
 
-#ifdef CONFIG_PMAC_BACKLIGHT
-	if (par->FlatPanel && machine_is(powermac)) {
-		set_backlight_enable(!blank);
+#ifdef CONFIG_FB_NVIDIA_BACKLIGHT
+	mutex_lock(&info->bl_mutex);
+	if (info->bl_dev) {
+		down(&info->bl_dev->sem);
+		info->bl_dev->props->power = blank;
+		info->bl_dev->props->update_status(info->bl_dev);
+		up(&info->bl_dev->sem);
 	}
+	mutex_unlock(&info->bl_mutex);
 #endif
 
 	NVTRACE_LEAVE();
@@ -1741,11 +1674,9 @@ static int __devinit nvidiafb_probe(struct pci_dev *pd,
 	       "PCI nVidia %s framebuffer (%dMB @ 0x%lX)\n",
 	       info->fix.id,
 	       par->FbMapSize / (1024 * 1024), info->fix.smem_start);
-#ifdef CONFIG_PMAC_BACKLIGHT
-	if (par->FlatPanel && machine_is(powermac))
-		register_backlight_controller(&nvidia_backlight_controller,
-					      par, "mnca");
-#endif
+
+	nvidia_bl_init(par);
+
 	NVTRACE_LEAVE();
 	return 0;
 
@@ -1775,6 +1706,8 @@ static void __exit nvidiafb_remove(struct pci_dev *pd)
 
 	NVTRACE_ENTER();
 
+	nvidia_bl_exit(par);
+
 	unregister_framebuffer(info);
 #ifdef CONFIG_MTRR
 	if (par->mtrr.vram_valid)
diff --git a/drivers/video/riva/fbdev.c b/drivers/video/riva/fbdev.c
index 3e9308f0f165..d4384ab1df65 100644
--- a/drivers/video/riva/fbdev.c
+++ b/drivers/video/riva/fbdev.c
@@ -41,6 +41,7 @@
 #include <linux/fb.h>
 #include <linux/init.h>
 #include <linux/pci.h>
+#include <linux/backlight.h>
 #ifdef CONFIG_MTRR
 #include <asm/mtrr.h>
 #endif
@@ -272,34 +273,154 @@ static const struct riva_regs reg_template = {
 /*
  * Backlight control
  */
-#ifdef CONFIG_PMAC_BACKLIGHT
+#ifdef CONFIG_FB_RIVA_BACKLIGHT
+/* We do not have any information about which values are allowed, thus
+ * we used safe values.
+ */
+#define MIN_LEVEL 0x158
+#define MAX_LEVEL 0x534
 
-static int riva_backlight_levels[] = {
-    0x158,
-    0x192,
-    0x1c6,
-    0x200,
-    0x234,
-    0x268,
-    0x2a2,
-    0x2d6,
-    0x310,
-    0x344,
-    0x378,
-    0x3b2,
-    0x3e6,
-    0x41a,
-    0x454,
-    0x534,
-};
+static struct backlight_properties riva_bl_data;
+
+static int riva_bl_get_level_brightness(struct riva_par *par,
+		int level)
+{
+	struct fb_info *info = pci_get_drvdata(par->pdev);
+	int nlevel;
+
+	/* Get and convert the value */
+	mutex_lock(&info->bl_mutex);
+	nlevel = info->bl_curve[level] * FB_BACKLIGHT_MAX / MAX_LEVEL;
+	mutex_unlock(&info->bl_mutex);
+
+	if (nlevel < 0)
+		nlevel = 0;
+	else if (nlevel < MIN_LEVEL)
+		nlevel = MIN_LEVEL;
+	else if (nlevel > MAX_LEVEL)
+		nlevel = MAX_LEVEL;
+
+	return nlevel;
+}
+
+static int riva_bl_update_status(struct backlight_device *bd)
+{
+	struct riva_par *par = class_get_devdata(&bd->class_dev);
+	U032 tmp_pcrt, tmp_pmc;
+	int level;
+
+	if (bd->props->power != FB_BLANK_UNBLANK ||
+	    bd->props->fb_blank != FB_BLANK_UNBLANK)
+		level = 0;
+	else
+		level = bd->props->brightness;
+
+	tmp_pmc = par->riva.PMC[0x10F0/4] & 0x0000FFFF;
+	tmp_pcrt = par->riva.PCRTC0[0x081C/4] & 0xFFFFFFFC;
+	if(level > 0) {
+		tmp_pcrt |= 0x1;
+		tmp_pmc |= (1 << 31); /* backlight bit */
+		tmp_pmc |= riva_bl_get_level_brightness(par, level) << 16; /* level */
+	}
+	par->riva.PCRTC0[0x081C/4] = tmp_pcrt;
+	par->riva.PMC[0x10F0/4] = tmp_pmc;
+
+	return 0;
+}
+
+static int riva_bl_get_brightness(struct backlight_device *bd)
+{
+	return bd->props->brightness;
+}
 
-static int riva_set_backlight_enable(int on, int level, void *data);
-static int riva_set_backlight_level(int level, void *data);
-static struct backlight_controller riva_backlight_controller = {
-	riva_set_backlight_enable,
-	riva_set_backlight_level
+static struct backlight_properties riva_bl_data = {
+	.owner    = THIS_MODULE,
+	.get_brightness = riva_bl_get_brightness,
+	.update_status	= riva_bl_update_status,
+	.max_brightness = (FB_BACKLIGHT_LEVELS - 1),
 };
-#endif /* CONFIG_PMAC_BACKLIGHT */
+
+static void riva_bl_init(struct riva_par *par)
+{
+	struct fb_info *info = pci_get_drvdata(par->pdev);
+	struct backlight_device *bd;
+	char name[12];
+
+	if (!par->FlatPanel)
+		return;
+
+#ifdef CONFIG_PMAC_BACKLIGHT
+	if (!machine_is(powermac) ||
+	    !pmac_has_backlight_type("mnca"))
+		return;
+#endif
+
+	snprintf(name, sizeof(name), "rivabl%d", info->node);
+
+	bd = backlight_device_register(name, par, &riva_bl_data);
+	if (IS_ERR(bd)) {
+		info->bl_dev = NULL;
+		printk("riva: Backlight registration failed\n");
+		goto error;
+	}
+
+	mutex_lock(&info->bl_mutex);
+	info->bl_dev = bd;
+	fb_bl_default_curve(info, 0,
+		0x158 * FB_BACKLIGHT_MAX / MAX_LEVEL,
+		0x534 * FB_BACKLIGHT_MAX / MAX_LEVEL);
+	mutex_unlock(&info->bl_mutex);
+
+	up(&bd->sem);
+	bd->props->brightness = riva_bl_data.max_brightness;
+	bd->props->power = FB_BLANK_UNBLANK;
+	bd->props->update_status(bd);
+	down(&bd->sem);
+
+#ifdef CONFIG_PMAC_BACKLIGHT
+	mutex_lock(&pmac_backlight_mutex);
+	if (!pmac_backlight)
+		pmac_backlight = bd;
+	mutex_unlock(&pmac_backlight_mutex);
+#endif
+
+	printk("riva: Backlight initialized (%s)\n", name);
+
+	return;
+
+error:
+	return;
+}
+
+static void riva_bl_exit(struct riva_par *par)
+{
+	struct fb_info *info = pci_get_drvdata(par->pdev);
+
+#ifdef CONFIG_PMAC_BACKLIGHT
+	mutex_lock(&pmac_backlight_mutex);
+#endif
+
+	mutex_lock(&info->bl_mutex);
+	if (info->bl_dev) {
+#ifdef CONFIG_PMAC_BACKLIGHT
+		if (pmac_backlight == info->bl_dev)
+			pmac_backlight = NULL;
+#endif
+
+		backlight_device_unregister(info->bl_dev);
+
+		printk("riva: Backlight unloaded\n");
+	}
+	mutex_unlock(&info->bl_mutex);
+
+#ifdef CONFIG_PMAC_BACKLIGHT
+	mutex_unlock(&pmac_backlight_mutex);
+#endif
+}
+#else
+static inline void riva_bl_init(struct riva_par *par) {}
+static inline void riva_bl_exit(struct riva_par *par) {}
+#endif /* CONFIG_FB_RIVA_BACKLIGHT */
 
 /* ------------------------------------------------------------------------- *
  *
@@ -971,36 +1092,6 @@ static int riva_get_cmap_len(const struct fb_var_screeninfo *var)
 	return rc;
 }
 
-/* ------------------------------------------------------------------------- *
- *
- * Backlight operations
- *
- * ------------------------------------------------------------------------- */
-
-#ifdef CONFIG_PMAC_BACKLIGHT
-static int riva_set_backlight_enable(int on, int level, void *data)
-{
-	struct riva_par *par = data;
-	U032 tmp_pcrt, tmp_pmc;
-
-	tmp_pmc = par->riva.PMC[0x10F0/4] & 0x0000FFFF;
-	tmp_pcrt = par->riva.PCRTC0[0x081C/4] & 0xFFFFFFFC;
-	if(on && (level > BACKLIGHT_OFF)) {
-		tmp_pcrt |= 0x1;
-		tmp_pmc |= (1 << 31); // backlight bit
-		tmp_pmc |= riva_backlight_levels[level-1] << 16; // level
-	}
-	par->riva.PCRTC0[0x081C/4] = tmp_pcrt;
-	par->riva.PMC[0x10F0/4] = tmp_pmc;
-	return 0;
-}
-
-static int riva_set_backlight_level(int level, void *data)
-{
-	return riva_set_backlight_enable(1, level, data);
-}
-#endif /* CONFIG_PMAC_BACKLIGHT */
-
 /* ------------------------------------------------------------------------- *
  *
  * framebuffer operations
@@ -1247,10 +1338,15 @@ static int rivafb_blank(int blank, struct fb_info *info)
 	SEQout(par, 0x01, tmp);
 	CRTCout(par, 0x1a, vesa);
 
-#ifdef CONFIG_PMAC_BACKLIGHT
-	if ( par->FlatPanel && machine_is(powermac)) {
-		set_backlight_enable(!blank);
+#ifdef CONFIG_FB_RIVA_BACKLIGHT
+	mutex_lock(&info->bl_mutex);
+	if (info->bl_dev) {
+		down(&info->bl_dev->sem);
+		info->bl_dev->props->power = blank;
+		info->bl_dev->props->update_status(info->bl_dev);
+		up(&info->bl_dev->sem);
 	}
+	mutex_unlock(&info->bl_mutex);
 #endif
 
 	NVTRACE_LEAVE();
@@ -2037,11 +2133,9 @@ static int __devinit rivafb_probe(struct pci_dev *pd,
 		RIVAFB_VERSION,
 		info->fix.smem_len / (1024 * 1024),
 		info->fix.smem_start);
-#ifdef CONFIG_PMAC_BACKLIGHT
-	if (default_par->FlatPanel && machine_is(powermac))
-		register_backlight_controller(&riva_backlight_controller,
-					      default_par, "mnca");
-#endif
+
+	riva_bl_init(info->par);
+
 	NVTRACE_LEAVE();
 	return 0;
 
@@ -2074,6 +2168,8 @@ static void __exit rivafb_remove(struct pci_dev *pd)
 	
 	NVTRACE_ENTER();
 
+	riva_bl_exit(par);
+
 #ifdef CONFIG_FB_RIVA_I2C
 	riva_delete_i2c_busses(par);
 	kfree(par->EDID);
diff --git a/include/asm-powerpc/backlight.h b/include/asm-powerpc/backlight.h
index 1ba1f27a0b63..a5e9e656e332 100644
--- a/include/asm-powerpc/backlight.h
+++ b/include/asm-powerpc/backlight.h
@@ -2,30 +2,30 @@
  * Routines for handling backlight control on PowerBooks
  *
  * For now, implementation resides in
- * arch/powerpc/platforms/powermac/pmac_support.c
+ * arch/powerpc/platforms/powermac/backlight.c
  *
  */
 #ifndef __ASM_POWERPC_BACKLIGHT_H
 #define __ASM_POWERPC_BACKLIGHT_H
 #ifdef __KERNEL__
 
-/* Abstract values */
-#define BACKLIGHT_OFF	0
-#define BACKLIGHT_MIN	1
-#define BACKLIGHT_MAX	0xf
+#include <linux/fb.h>
+#include <linux/mutex.h>
 
-struct backlight_controller {
-	int (*set_enable)(int enable, int level, void *data);
-	int (*set_level)(int level, void *data);
-};
+/* For locking instructions, see the implementation file */
+extern struct backlight_device *pmac_backlight;
+extern struct mutex pmac_backlight_mutex;
 
-extern void register_backlight_controller(struct backlight_controller *ctrler, void *data, char *type);
-extern void unregister_backlight_controller(struct backlight_controller *ctrler, void *data);
+extern void pmac_backlight_calc_curve(struct fb_info*);
+extern int pmac_backlight_curve_lookup(struct fb_info *info, int value);
 
-extern int set_backlight_enable(int enable);
-extern int get_backlight_enable(void);
-extern int set_backlight_level(int level);
-extern int get_backlight_level(void);
+extern int pmac_has_backlight_type(const char *type);
+
+extern void pmac_backlight_key_up(void);
+extern void pmac_backlight_key_down(void);
+
+extern int pmac_backlight_set_legacy_brightness(int brightness);
+extern int pmac_backlight_get_legacy_brightness(void);
 
 #endif /* __KERNEL__ */
 #endif
diff --git a/include/linux/fb.h b/include/linux/fb.h
index 315d89740ddf..f1281687e549 100644
--- a/include/linux/fb.h
+++ b/include/linux/fb.h
@@ -1,6 +1,7 @@
 #ifndef _LINUX_FB_H
 #define _LINUX_FB_H
 
+#include <linux/backlight.h>
 #include <asm/types.h>
 
 /* Definitions of frame buffers						*/
@@ -366,6 +367,12 @@ struct fb_cursor {
 	struct fb_image	image;	/* Cursor image */
 };
 
+#ifdef CONFIG_FB_BACKLIGHT
+/* Settings for the generic backlight code */
+#define FB_BACKLIGHT_LEVELS	128
+#define FB_BACKLIGHT_MAX	0xFF
+#endif
+
 #ifdef __KERNEL__
 
 #include <linux/fs.h>
@@ -756,6 +763,21 @@ struct fb_info {
 	struct fb_cmap cmap;		/* Current cmap */
 	struct list_head modelist;      /* mode list */
 	struct fb_videomode *mode;	/* current mode */
+
+#ifdef CONFIG_FB_BACKLIGHT
+	/* Lock ordering:
+	 * bl_mutex (protects bl_dev and bl_curve)
+	 *   bl_dev->sem (backlight class)
+	 */
+	struct mutex bl_mutex;
+
+	/* assigned backlight device */
+	struct backlight_device *bl_dev;
+
+	/* Backlight level curve */
+	u8 bl_curve[FB_BACKLIGHT_LEVELS];
+#endif
+
 	struct fb_ops *fbops;
 	struct device *device;
 	struct class_device *class_device; /* sysfs per device attrs */
@@ -895,6 +917,7 @@ extern struct fb_info *framebuffer_alloc(size_t size, struct device *dev);
 extern void framebuffer_release(struct fb_info *info);
 extern int fb_init_class_device(struct fb_info *fb_info);
 extern void fb_cleanup_class_device(struct fb_info *head);
+extern void fb_bl_default_curve(struct fb_info *fb_info, u8 off, u8 min, u8 max);
 
 /* drivers/video/fbmon.c */
 #define FB_MAXTIMINGS		0
diff --git a/include/linux/pmu.h b/include/linux/pmu.h
index ecce5912f4d6..2ed807ddc08c 100644
--- a/include/linux/pmu.h
+++ b/include/linux/pmu.h
@@ -230,4 +230,8 @@ extern int pmu_battery_count;
 extern struct pmu_battery_info pmu_batteries[PMU_MAX_BATTERIES];
 extern unsigned int pmu_power_flags;
 
+/* Backlight */
+extern int disable_kernel_backlight;
+extern void pmu_backlight_init(struct device_node*);
+
 #endif	/* __KERNEL__ */
-- 
cgit v1.2.3


From 6ef4d6bf86a82965896eaa1a189177239ec2bbab Mon Sep 17 00:00:00 2001
From: Evgeniy Dushistov <dushistov@mail.ru>
Date: Sun, 25 Jun 2006 05:47:20 -0700
Subject: [PATCH] ufs: change block number on the fly

First of all some necessary notes about UFS by it self: To avoid waste of disk
space the tail of file consists not from blocks (which is ordinary big enough,
16K usually), it consists from fragments(which is ordinary 2K).  When file is
growing its tail occupy 1 fragment, 2 fragments...  At some stage decision to
allocate whole block is made and all fragments are moved to one block.

How this situation was handled before:

  ufs_prepare_write
  ->block_prepare_write
    ->ufs_getfrag_block
      ->...
        ->ufs_new_fragments:

	bh = sb_bread
	bh->b_blocknr = result + i;
	mark_buffer_dirty (bh);

This is wrong solution, because:

- it didn't take into consideration that there is another cache: "inode page
  cache"

- because of sb_getblk uses not b_blocknr, (it uses page->index) to find
  certain block, this breaks sb_getblk.

How this situation is handled now: we go though all "page inode cache", if
there are no such page in cache we load it into cache, and change b_blocknr.

Signed-off-by: Evgeniy Dushistov <dushistov@mail.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/ufs/balloc.c        | 137 ++++++++++++++++++++++++++++++++++++++++---------
 fs/ufs/inode.c         |  44 +++++++++-------
 include/linux/ufs_fs.h |   3 +-
 3 files changed, 140 insertions(+), 44 deletions(-)

(limited to 'include/linux')

diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c
index cc0c8f15d8fd..06f970d02e3d 100644
--- a/fs/ufs/balloc.c
+++ b/fs/ufs/balloc.c
@@ -39,7 +39,8 @@ static void ufs_clusteracct(struct super_block *, struct ufs_cg_private_info *,
 /*
  * Free 'count' fragments from fragment number 'fragment'
  */
-void ufs_free_fragments (struct inode * inode, unsigned fragment, unsigned count) {
+void ufs_free_fragments(struct inode *inode, unsigned fragment, unsigned count)
+{
 	struct super_block * sb;
 	struct ufs_sb_private_info * uspi;
 	struct ufs_super_block_first * usb1;
@@ -134,7 +135,8 @@ failed:
 /*
  * Free 'count' fragments from fragment number 'fragment' (free whole blocks)
  */
-void ufs_free_blocks (struct inode * inode, unsigned fragment, unsigned count) {
+void ufs_free_blocks(struct inode *inode, unsigned fragment, unsigned count)
+{
 	struct super_block * sb;
 	struct ufs_sb_private_info * uspi;
 	struct ufs_super_block_first * usb1;
@@ -222,15 +224,118 @@ failed:
 	return;
 }
 
+static struct page *ufs_get_locked_page(struct address_space *mapping,
+				  unsigned long index)
+{
+	struct page *page;
+
+try_again:
+	page = find_lock_page(mapping, index);
+	if (!page) {
+		page = read_cache_page(mapping, index,
+				       (filler_t*)mapping->a_ops->readpage,
+				       NULL);
+		if (IS_ERR(page)) {
+			printk(KERN_ERR "ufs_change_blocknr: "
+			       "read_cache_page error: ino %lu, index: %lu\n",
+			       mapping->host->i_ino, index);
+			goto out;
+		}
+
+		lock_page(page);
+
+		if (!PageUptodate(page) || PageError(page)) {
+			unlock_page(page);
+			page_cache_release(page);
+
+			printk(KERN_ERR "ufs_change_blocknr: "
+			       "can not read page: ino %lu, index: %lu\n",
+			       mapping->host->i_ino, index);
+
+			page = ERR_PTR(-EIO);
+			goto out;
+		}
+	}
+
+	if (unlikely(!page->mapping || !page_has_buffers(page))) {
+		unlock_page(page);
+		page_cache_release(page);
+		goto try_again;/*we really need these buffers*/
+	}
+out:
+	return page;
+}
+
+/*
+ * Modify inode page cache in such way:
+ * have - blocks with b_blocknr equal to oldb...oldb+count-1
+ * get - blocks with b_blocknr equal to newb...newb+count-1
+ * also we suppose that oldb...oldb+count-1 blocks
+ * situated at the end of file.
+ *
+ * We can come here from ufs_writepage or ufs_prepare_write,
+ * locked_page is argument of these functions, so we already lock it.
+ */
+static void ufs_change_blocknr(struct inode *inode, unsigned int count,
+			       unsigned int oldb, unsigned int newb,
+			       struct page *locked_page)
+{
+	unsigned int blk_per_page = 1 << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+	sector_t baseblk;
+	struct address_space *mapping = inode->i_mapping;
+	pgoff_t index, cur_index = locked_page->index;
+	unsigned int i, j;
+	struct page *page;
+	struct buffer_head *head, *bh;
+
+	baseblk = ((i_size_read(inode) - 1) >> inode->i_blkbits) + 1 - count;
+
+	UFSD(("ENTER, ino %lu, count %u, oldb %u, newb %u\n",
+	      inode->i_ino, count, oldb, newb));
+
+	BUG_ON(!PageLocked(locked_page));
+
+	for (i = 0; i < count; i += blk_per_page) {
+		index = (baseblk+i) >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
+
+		if (likely(cur_index != index)) {
+			page = ufs_get_locked_page(mapping, index);
+			if (IS_ERR(page))
+				continue;
+		} else
+			page = locked_page;
+
+		j = i;
+		head = page_buffers(page);
+		bh = head;
+		do {
+			if (likely(bh->b_blocknr == j + oldb && j < count)) {
+				unmap_underlying_metadata(bh->b_bdev,
+							  bh->b_blocknr);
+				bh->b_blocknr = newb + j++;
+				mark_buffer_dirty(bh);
+			}
+
+			bh = bh->b_this_page;
+		} while (bh != head);
+
+		set_page_dirty(page);
 
-unsigned ufs_new_fragments (struct inode * inode, __fs32 * p, unsigned fragment,
-	unsigned goal, unsigned count, int * err )
+		if (likely(cur_index != index)) {
+			unlock_page(page);
+			page_cache_release(page);
+		}
+ 	}
+	UFSD(("EXIT\n"));
+}
+
+unsigned ufs_new_fragments(struct inode * inode, __fs32 * p, unsigned fragment,
+			   unsigned goal, unsigned count, int * err, struct page *locked_page)
 {
 	struct super_block * sb;
 	struct ufs_sb_private_info * uspi;
 	struct ufs_super_block_first * usb1;
-	struct buffer_head * bh;
-	unsigned cgno, oldcount, newcount, tmp, request, i, result;
+	unsigned cgno, oldcount, newcount, tmp, request, result;
 	
 	UFSD(("ENTER, ino %lu, fragment %u, goal %u, count %u\n", inode->i_ino, fragment, goal, count))
 	
@@ -343,24 +448,8 @@ unsigned ufs_new_fragments (struct inode * inode, __fs32 * p, unsigned fragment,
 	}
 	result = ufs_alloc_fragments (inode, cgno, goal, request, err);
 	if (result) {
-		for (i = 0; i < oldcount; i++) {
-			bh = sb_bread(sb, tmp + i);
-			if(bh)
-			{
-				clear_buffer_dirty(bh);
-				bh->b_blocknr = result + i;
-				mark_buffer_dirty (bh);
-				if (IS_SYNC(inode))
-					sync_dirty_buffer(bh);
-				brelse (bh);
-			}
-			else
-			{
-				printk(KERN_ERR "ufs_new_fragments: bread fail\n");
-				unlock_super(sb);
-				return 0;
-			}
-		}
+		ufs_change_blocknr(inode, oldcount, tmp, result, locked_page);
+
 		*p = cpu_to_fs32(sb, result);
 		*err = 0;
 		inode->i_blocks += count << uspi->s_nspfshift;
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index 2b2366360e5a..ea2267316a72 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -172,9 +172,10 @@ static void ufs_clear_block(struct inode *inode, struct buffer_head *bh)
 		sync_dirty_buffer(bh);
 }
 
-static struct buffer_head * ufs_inode_getfrag (struct inode *inode,
-	unsigned int fragment, unsigned int new_fragment,
-	unsigned int required, int *err, int metadata, long *phys, int *new)
+static struct buffer_head *ufs_inode_getfrag(struct inode *inode,
+					     unsigned int fragment, unsigned int new_fragment,
+					     unsigned int required, int *err, int metadata,
+					     long *phys, int *new, struct page *locked_page)
 {
 	struct ufs_inode_info *ufsi = UFS_I(inode);
 	struct super_block * sb;
@@ -232,7 +233,8 @@ repeat:
 		if (lastblockoff) {
 			p2 = ufsi->i_u1.i_data + lastblock;
 			tmp = ufs_new_fragments (inode, p2, lastfrag, 
-				fs32_to_cpu(sb, *p2), uspi->s_fpb - lastblockoff, err);
+						 fs32_to_cpu(sb, *p2), uspi->s_fpb - lastblockoff,
+						 err, locked_page);
 			if (!tmp) {
 				if (lastfrag != ufsi->i_lastfrag)
 					goto repeat;
@@ -244,14 +246,16 @@ repeat:
 		}
 		goal = fs32_to_cpu(sb, ufsi->i_u1.i_data[lastblock]) + uspi->s_fpb;
 		tmp = ufs_new_fragments (inode, p, fragment - blockoff, 
-			goal, required + blockoff, err);
+					 goal, required + blockoff,
+					 err, locked_page);
 	}
 	/*
 	 * We will extend last allocated block
 	 */
 	else if (lastblock == block) {
-		tmp = ufs_new_fragments (inode, p, fragment - (blockoff - lastblockoff),
-			fs32_to_cpu(sb, *p), required +  (blockoff - lastblockoff), err);
+		tmp = ufs_new_fragments(inode, p, fragment - (blockoff - lastblockoff),
+					fs32_to_cpu(sb, *p), required +  (blockoff - lastblockoff),
+					err, locked_page);
 	}
 	/*
 	 * We will allocate new block before last allocated block
@@ -259,8 +263,8 @@ repeat:
 	else /* (lastblock > block) */ {
 		if (lastblock && (tmp = fs32_to_cpu(sb, ufsi->i_u1.i_data[lastblock-1])))
 			goal = tmp + uspi->s_fpb;
-		tmp = ufs_new_fragments (inode, p, fragment - blockoff, 
-			goal, uspi->s_fpb, err);
+		tmp = ufs_new_fragments(inode, p, fragment - blockoff,
+					goal, uspi->s_fpb, err, locked_page);
 	}
 	if (!tmp) {
 		if ((!blockoff && *p) || 
@@ -303,9 +307,10 @@ repeat2:
      */
 }
 
-static struct buffer_head * ufs_block_getfrag (struct inode *inode,
-	struct buffer_head *bh, unsigned int fragment, unsigned int new_fragment, 
-	unsigned int blocksize, int * err, int metadata, long *phys, int *new)
+static struct buffer_head *ufs_block_getfrag(struct inode *inode, struct buffer_head *bh,
+					     unsigned int fragment, unsigned int new_fragment,
+					     unsigned int blocksize, int * err, int metadata,
+					     long *phys, int *new, struct page *locked_page)
 {
 	struct super_block * sb;
 	struct ufs_sb_private_info * uspi;
@@ -350,7 +355,8 @@ repeat:
 		goal = tmp + uspi->s_fpb;
 	else
 		goal = bh->b_blocknr + uspi->s_fpb;
-	tmp = ufs_new_fragments (inode, p, ufs_blknum(new_fragment), goal, uspi->s_fpb, err);
+	tmp = ufs_new_fragments(inode, p, ufs_blknum(new_fragment), goal,
+				uspi->s_fpb, err, locked_page);
 	if (!tmp) {
 		if (fs32_to_cpu(sb, *p))
 			goto repeat;
@@ -424,15 +430,15 @@ int ufs_getfrag_block (struct inode *inode, sector_t fragment, struct buffer_hea
 	 * it much more readable:
 	 */
 #define GET_INODE_DATABLOCK(x) \
-		ufs_inode_getfrag(inode, x, fragment, 1, &err, 0, &phys, &new)
+	ufs_inode_getfrag(inode, x, fragment, 1, &err, 0, &phys, &new, bh_result->b_page)
 #define GET_INODE_PTR(x) \
-		ufs_inode_getfrag(inode, x, fragment, uspi->s_fpb, &err, 1, NULL, NULL)
+	ufs_inode_getfrag(inode, x, fragment, uspi->s_fpb, &err, 1, NULL, NULL, bh_result->b_page)
 #define GET_INDIRECT_DATABLOCK(x) \
-		ufs_block_getfrag(inode, bh, x, fragment, sb->s_blocksize, \
-				  &err, 0, &phys, &new);
+	ufs_block_getfrag(inode, bh, x, fragment, sb->s_blocksize,	\
+			  &err, 0, &phys, &new, bh_result->b_page);
 #define GET_INDIRECT_PTR(x) \
-		ufs_block_getfrag(inode, bh, x, fragment, sb->s_blocksize, \
-				  &err, 1, NULL, NULL);
+	ufs_block_getfrag(inode, bh, x, fragment, sb->s_blocksize,	\
+			  &err, 1, NULL, NULL, bh_result->b_page);
 
 	if (ptr < UFS_NDIR_FRAGMENT) {
 		bh = GET_INODE_DATABLOCK(ptr);
diff --git a/include/linux/ufs_fs.h b/include/linux/ufs_fs.h
index 86b5b4271b5a..ed5053f5cd71 100644
--- a/include/linux/ufs_fs.h
+++ b/include/linux/ufs_fs.h
@@ -875,7 +875,8 @@ struct ufs_super_block_third {
 /* balloc.c */
 extern void ufs_free_fragments (struct inode *, unsigned, unsigned);
 extern void ufs_free_blocks (struct inode *, unsigned, unsigned);
-extern unsigned ufs_new_fragments (struct inode *, __fs32 *, unsigned, unsigned, unsigned, int *);
+extern unsigned ufs_new_fragments(struct inode *, __fs32 *, unsigned, unsigned,
+				  unsigned, int *, struct page *);
 
 /* cylinder.c */
 extern struct ufs_cg_private_info * ufs_load_cylinder (struct super_block *, unsigned);
-- 
cgit v1.2.3


From b71034e5e67d1577424cebe7bbb7d0ce134a4cd8 Mon Sep 17 00:00:00 2001
From: Evgeniy Dushistov <dushistov@mail.ru>
Date: Sun, 25 Jun 2006 05:47:22 -0700
Subject: [PATCH] ufs: directory and page cache: from blocks to pages

Change function in fs/ufs/dir.c and fs/ufs/namei.c to work with pages
instead of straight work with blocks.  It fixed such bugs:

* for i in `seq 1 1000`; do touch $i; done - crash system
* mkdir create directory without "." and ".." entries

Signed-off-by: Evgeniy Dushistov <dushistov@mail.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/ufs/dir.c           | 989 +++++++++++++++++++++++++------------------------
 fs/ufs/namei.c         |  62 ++--
 include/linux/ufs_fs.h |   9 +-
 3 files changed, 552 insertions(+), 508 deletions(-)

(limited to 'include/linux')

diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c
index 1a561202d3f4..9473df5bff51 100644
--- a/fs/ufs/dir.c
+++ b/fs/ufs/dir.c
@@ -11,13 +11,15 @@
  * 4.4BSD (FreeBSD) support added on February 1st 1998 by
  * Niels Kristian Bech Jensen <nkbj@image.dk> partially based
  * on code by Martin von Loewis <martin@mira.isdn.cs.tu-berlin.de>.
+ *
+ * Migration to usage of "page cache" on May 2006 by
+ * Evgeniy Dushistov <dushistov@mail.ru> based on ext2 code base.
  */
 
 #include <linux/time.h>
 #include <linux/fs.h>
 #include <linux/ufs_fs.h>
 #include <linux/smp_lock.h>
-#include <linux/buffer_head.h>
 #include <linux/sched.h>
 
 #include "swab.h"
@@ -31,11 +33,6 @@
 #define UFSD(x)
 #endif
 
-static int
-ufs_check_dir_entry (const char *, struct inode *, struct ufs_dir_entry *,
-		     struct buffer_head *, unsigned long);
-
-
 /*
  * NOTE! unlike strncmp, ufs_match returns 1 for success, 0 for failure.
  *
@@ -51,495 +48,540 @@ static inline int ufs_match(struct super_block *sb, int len,
 	return !memcmp(name, de->d_name, len);
 }
 
-/*
- * This is blatantly stolen from ext2fs
- */
-static int
-ufs_readdir (struct file * filp, void * dirent, filldir_t filldir)
+static int ufs_commit_chunk(struct page *page, unsigned from, unsigned to)
 {
-	struct inode *inode = filp->f_dentry->d_inode;
-	int error = 0;
-	unsigned long offset, lblk;
-	int i, stored;
-	struct buffer_head * bh;
-	struct ufs_dir_entry * de;
-	struct super_block * sb;
-	int de_reclen;
-	unsigned flags;
-	u64     blk= 0L;
-
-	lock_kernel();
-
-	sb = inode->i_sb;
-	flags = UFS_SB(sb)->s_flags;
-
-	UFSD(("ENTER, ino %lu  f_pos %lu\n", inode->i_ino, (unsigned long) filp->f_pos))
-
-	stored = 0;
-	bh = NULL;
-	offset = filp->f_pos & (sb->s_blocksize - 1);
-
-	while (!error && !stored && filp->f_pos < inode->i_size) {
-		lblk = (filp->f_pos) >> sb->s_blocksize_bits;
-		blk = ufs_frag_map(inode, lblk);
-		if (!blk || !(bh = sb_bread(sb, blk))) {
-			/* XXX - error - skip to the next block */
-			printk("ufs_readdir: "
-			       "dir inode %lu has a hole at offset %lu\n",
-			       inode->i_ino, (unsigned long int)filp->f_pos);
-			filp->f_pos += sb->s_blocksize - offset;
-			continue;
-		}
-
-revalidate:
-		/* If the dir block has changed since the last call to
-		 * readdir(2), then we might be pointing to an invalid
-		 * dirent right now.  Scan from the start of the block
-		 * to make sure. */
-		if (filp->f_version != inode->i_version) {
-			for (i = 0; i < sb->s_blocksize && i < offset; ) {
-				de = (struct ufs_dir_entry *)(bh->b_data + i);
-				/* It's too expensive to do a full
-				 * dirent test each time round this
-				 * loop, but we do have to test at
-				 * least that it is non-zero.  A
-				 * failure will be detected in the
-				 * dirent test below. */
-				de_reclen = fs16_to_cpu(sb, de->d_reclen);
-				if (de_reclen < 1)
-					break;
-				i += de_reclen;
-			}
-			offset = i;
-			filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1))
-				| offset;
-			filp->f_version = inode->i_version;
-		}
+	struct inode *dir = page->mapping->host;
+	int err = 0;
+	dir->i_version++;
+	page->mapping->a_ops->commit_write(NULL, page, from, to);
+	if (IS_DIRSYNC(dir))
+		err = write_one_page(page, 1);
+	else
+		unlock_page(page);
+	return err;
+}
 
-		while (!error && filp->f_pos < inode->i_size
-		       && offset < sb->s_blocksize) {
-			de = (struct ufs_dir_entry *) (bh->b_data + offset);
-			/* XXX - put in a real ufs_check_dir_entry() */
-			if ((de->d_reclen == 0) || (ufs_get_de_namlen(sb, de) == 0)) {
-				filp->f_pos = (filp->f_pos &
-				              (sb->s_blocksize - 1)) +
-				               sb->s_blocksize;
-				brelse(bh);
-				unlock_kernel();
-				return stored;
-			}
-			if (!ufs_check_dir_entry ("ufs_readdir", inode, de,
-						   bh, offset)) {
-				/* On error, skip the f_pos to the
-				   next block. */
-				filp->f_pos = (filp->f_pos |
-				              (sb->s_blocksize - 1)) +
-					       1;
-				brelse (bh);
-				unlock_kernel();
-				return stored;
-			}
-			offset += fs16_to_cpu(sb, de->d_reclen);
-			if (de->d_ino) {
-				/* We might block in the next section
-				 * if the data destination is
-				 * currently swapped out.  So, use a
-				 * version stamp to detect whether or
-				 * not the directory has been modified
-				 * during the copy operation. */
-				unsigned long version = filp->f_version;
-				unsigned char d_type = DT_UNKNOWN;
+static inline void ufs_put_page(struct page *page)
+{
+	kunmap(page);
+	page_cache_release(page);
+}
 
-				UFSD(("filldir(%s,%u)\n", de->d_name,
-							fs32_to_cpu(sb, de->d_ino)))
-				UFSD(("namlen %u\n", ufs_get_de_namlen(sb, de)))
+static inline unsigned long ufs_dir_pages(struct inode *inode)
+{
+	return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT;
+}
 
-				if ((flags & UFS_DE_MASK) == UFS_DE_44BSD)
-					d_type = de->d_u.d_44.d_type;
-				error = filldir(dirent, de->d_name,
-						ufs_get_de_namlen(sb, de), filp->f_pos,
-						fs32_to_cpu(sb, de->d_ino), d_type);
-				if (error)
-					break;
-				if (version != filp->f_version)
-					goto revalidate;
-				stored ++;
-			}
-			filp->f_pos += fs16_to_cpu(sb, de->d_reclen);
-		}
-		offset = 0;
-		brelse (bh);
+ino_t ufs_inode_by_name(struct inode *dir, struct dentry *dentry)
+{
+	ino_t res = 0;
+	struct ufs_dir_entry *de;
+	struct page *page;
+	
+	de = ufs_find_entry(dir, dentry, &page);
+	if (de) {
+		res = fs32_to_cpu(dir->i_sb, de->d_ino);
+		ufs_put_page(page);
 	}
-	unlock_kernel();
-	return 0;
+	return res;
 }
 
-/*
- * define how far ahead to read directories while searching them.
- */
-#define NAMEI_RA_CHUNKS  2
-#define NAMEI_RA_BLOCKS  4
-#define NAMEI_RA_SIZE        (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
-#define NAMEI_RA_INDEX(c,b)  (((c) * NAMEI_RA_BLOCKS) + (b))
 
-/*
- *	ufs_find_entry()
- *
- * finds an entry in the specified directory with the wanted name. It
- * returns the cache buffer in which the entry was found, and the entry
- * itself (as a parameter - res_bh). It does NOT read the inode of the
- * entry - you'll have to do that yourself if you want to.
- */
-struct ufs_dir_entry * ufs_find_entry (struct dentry *dentry,
-	struct buffer_head ** res_bh)
+/* Releases the page */
+void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de,
+		  struct page *page, struct inode *inode)
 {
-	struct super_block * sb;
-	struct buffer_head * bh_use[NAMEI_RA_SIZE];
-	struct buffer_head * bh_read[NAMEI_RA_SIZE];
-	unsigned long offset;
-	int block, toread, i, err;
-	struct inode *dir = dentry->d_parent->d_inode;
-	const char *name = dentry->d_name.name;
-	int namelen = dentry->d_name.len;
+	unsigned from = (char *) de - (char *) page_address(page);
+	unsigned to = from + fs16_to_cpu(dir->i_sb, de->d_reclen);
+	int err;
 
-	UFSD(("ENTER, dir_ino %lu, name %s, namlen %u\n", dir->i_ino, name, namelen))
-	
-	*res_bh = NULL;
-	
-	sb = dir->i_sb;
-	
-	if (namelen > UFS_MAXNAMLEN)
-		return NULL;
+	lock_page(page);
+	err = page->mapping->a_ops->prepare_write(NULL, page, from, to);
+	BUG_ON(err);
+	de->d_ino = cpu_to_fs32(dir->i_sb, inode->i_ino);
+	ufs_set_de_type(dir->i_sb, de, inode->i_mode);
+	err = ufs_commit_chunk(page, from, to);
+	ufs_put_page(page);
+	dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
+	mark_inode_dirty(dir);
+}
 
-	memset (bh_use, 0, sizeof (bh_use));
-	toread = 0;
-	for (block = 0; block < NAMEI_RA_SIZE; ++block) {
-		struct buffer_head * bh;
 
-		if ((block << sb->s_blocksize_bits) >= dir->i_size)
-			break;
-		bh = ufs_getfrag (dir, block, 0, &err);
-		bh_use[block] = bh;
-		if (bh && !buffer_uptodate(bh))
-			bh_read[toread++] = bh;
+static void ufs_check_page(struct page *page)
+{
+	struct inode *dir = page->mapping->host;
+	struct super_block *sb = dir->i_sb;
+	char *kaddr = page_address(page);
+	unsigned offs, rec_len;
+	unsigned limit = PAGE_CACHE_SIZE;
+	struct ufs_dir_entry *p;
+	char *error;
+
+	if ((dir->i_size >> PAGE_CACHE_SHIFT) == page->index) {
+		limit = dir->i_size & ~PAGE_CACHE_MASK;
+		if (limit & (UFS_SECTOR_SIZE - 1))
+			goto Ebadsize;
+		if (!limit)
+			goto out;
 	}
+	for (offs = 0; offs <= limit - UFS_DIR_REC_LEN(1); offs += rec_len) {
+		p = (struct ufs_dir_entry *)(kaddr + offs);
+		rec_len = fs16_to_cpu(sb, p->d_reclen);
+
+		if (rec_len < UFS_DIR_REC_LEN(1))
+			goto Eshort;
+		if (rec_len & 3)
+			goto Ealign;
+		if (rec_len < UFS_DIR_REC_LEN(ufs_get_de_namlen(sb, p)))
+			goto Enamelen;
+		if (((offs + rec_len - 1) ^ offs) & ~(UFS_SECTOR_SIZE-1))
+			goto Espan;
+		if (fs32_to_cpu(sb, p->d_ino) > (UFS_SB(sb)->s_uspi->s_ipg *
+						  UFS_SB(sb)->s_uspi->s_ncg))
+			goto Einumber;
+	}
+	if (offs != limit)
+		goto Eend;
+out:
+	SetPageChecked(page);
+	return;
+
+	/* Too bad, we had an error */
+
+Ebadsize:
+	ufs_error(sb, "ufs_check_page",
+		  "size of directory #%lu is not a multiple of chunk size",
+		  dir->i_ino
+	);
+	goto fail;
+Eshort:
+	error = "rec_len is smaller than minimal";
+	goto bad_entry;
+Ealign:
+	error = "unaligned directory entry";
+	goto bad_entry;
+Enamelen:
+	error = "rec_len is too small for name_len";
+	goto bad_entry;
+Espan:
+	error = "directory entry across blocks";
+	goto bad_entry;
+Einumber:
+	error = "inode out of bounds";
+bad_entry:
+	ufs_error (sb, "ufs_check_page", "bad entry in directory #%lu: %s - "
+		   "offset=%lu, rec_len=%d, name_len=%d",
+		   dir->i_ino, error, (page->index<<PAGE_CACHE_SHIFT)+offs,
+		   rec_len, ufs_get_de_namlen(sb, p));
+	goto fail;
+Eend:
+	p = (struct ufs_dir_entry *)(kaddr + offs);
+	ufs_error (sb, "ext2_check_page",
+		   "entry in directory #%lu spans the page boundary"
+		   "offset=%lu",
+		   dir->i_ino, (page->index<<PAGE_CACHE_SHIFT)+offs);
+fail:
+	SetPageChecked(page);
+	SetPageError(page);
+}
 
-	for (block = 0, offset = 0; offset < dir->i_size; block++) {
-		struct buffer_head * bh;
-		struct ufs_dir_entry * de;
-		char * dlimit;
-
-		if ((block % NAMEI_RA_BLOCKS) == 0 && toread) {
-			ll_rw_block (READ, toread, bh_read);
-			toread = 0;
-		}
-		bh = bh_use[block % NAMEI_RA_SIZE];
-		if (!bh) {
-			ufs_error (sb, "ufs_find_entry", 
-				"directory #%lu contains a hole at offset %lu",
-				dir->i_ino, offset);
-			offset += sb->s_blocksize;
-			continue;
-		}
-		wait_on_buffer (bh);
-		if (!buffer_uptodate(bh)) {
-			/*
-			 * read error: all bets are off
-			 */
-			break;
-		}
-
-		de = (struct ufs_dir_entry *) bh->b_data;
-		dlimit = bh->b_data + sb->s_blocksize;
-		while ((char *) de < dlimit && offset < dir->i_size) {
-			/* this code is executed quadratically often */
-			/* do minimal checking by hand */
-			int de_len;
-
-			if ((char *) de + namelen <= dlimit &&
-			    ufs_match(sb, namelen, name, de)) {
-				/* found a match -
-				just to be sure, do a full check */
-				if (!ufs_check_dir_entry("ufs_find_entry",
-				    dir, de, bh, offset))
-					goto failed;
-				for (i = 0; i < NAMEI_RA_SIZE; ++i) {
-					if (bh_use[i] != bh)
-						brelse (bh_use[i]);
-				}
-				*res_bh = bh;
-				return de;
-			}
-                        /* prevent looping on a bad block */
-			de_len = fs16_to_cpu(sb, de->d_reclen);
-			if (de_len <= 0)
-				goto failed;
-			offset += de_len;
-			de = (struct ufs_dir_entry *) ((char *) de + de_len);
-		}
-
-		brelse (bh);
-		if (((block + NAMEI_RA_SIZE) << sb->s_blocksize_bits ) >=
-		    dir->i_size)
-			bh = NULL;
-		else
-			bh = ufs_getfrag (dir, block + NAMEI_RA_SIZE, 0, &err);
-		bh_use[block % NAMEI_RA_SIZE] = bh;
-		if (bh && !buffer_uptodate(bh))
-			bh_read[toread++] = bh;
+static struct page *ufs_get_page(struct inode *dir, unsigned long n)
+{
+	struct address_space *mapping = dir->i_mapping;
+	struct page *page = read_cache_page(mapping, n,
+				(filler_t*)mapping->a_ops->readpage, NULL);
+	if (!IS_ERR(page)) {
+		wait_on_page_locked(page);
+		kmap(page);
+		if (!PageUptodate(page))
+			goto fail;
+		if (!PageChecked(page))
+			ufs_check_page(page);
+		if (PageError(page))
+			goto fail;
 	}
+	return page;
 
-failed:
-	for (i = 0; i < NAMEI_RA_SIZE; ++i) brelse (bh_use[i]);
-	UFSD(("EXIT\n"))
-	return NULL;
+fail:
+	ufs_put_page(page);
+	return ERR_PTR(-EIO);
 }
 
-static int
-ufs_check_dir_entry (const char *function, struct inode *dir,
-		     struct ufs_dir_entry *de, struct buffer_head *bh,
-		     unsigned long offset)
+/*
+ * Return the offset into page `page_nr' of the last valid
+ * byte in that page, plus one.
+ */
+static unsigned
+ufs_last_byte(struct inode *inode, unsigned long page_nr)
 {
-	struct super_block *sb = dir->i_sb;
-	const char *error_msg = NULL;
-	int rlen = fs16_to_cpu(sb, de->d_reclen);
-
-	if (rlen < UFS_DIR_REC_LEN(1))
-		error_msg = "reclen is smaller than minimal";
-	else if (rlen % 4 != 0)
-		error_msg = "reclen % 4 != 0";
-	else if (rlen < UFS_DIR_REC_LEN(ufs_get_de_namlen(sb, de)))
-		error_msg = "reclen is too small for namlen";
-	else if (((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize)
-		error_msg = "directory entry across blocks";
-	else if (fs32_to_cpu(sb, de->d_ino) > (UFS_SB(sb)->s_uspi->s_ipg *
-				      UFS_SB(sb)->s_uspi->s_ncg))
-		error_msg = "inode out of bounds";
-
-	if (error_msg != NULL)
-		ufs_error (sb, function, "bad entry in directory #%lu, size %Lu: %s - "
-			    "offset=%lu, inode=%lu, reclen=%d, namlen=%d",
-			    dir->i_ino, dir->i_size, error_msg, offset,
-			    (unsigned long)fs32_to_cpu(sb, de->d_ino),
-			    rlen, ufs_get_de_namlen(sb, de));
-	
-	return (error_msg == NULL ? 1 : 0);
+	unsigned last_byte = inode->i_size;
+
+	last_byte -= page_nr << PAGE_CACHE_SHIFT;
+	if (last_byte > PAGE_CACHE_SIZE)
+		last_byte = PAGE_CACHE_SIZE;
+	return last_byte;
 }
 
-struct ufs_dir_entry *ufs_dotdot(struct inode *dir, struct buffer_head **p)
+static inline struct ufs_dir_entry *
+ufs_next_entry(struct super_block *sb, struct ufs_dir_entry *p)
 {
-	int err;
-	struct buffer_head *bh = ufs_bread (dir, 0, 0, &err);
-	struct ufs_dir_entry *res = NULL;
-
-	if (bh) {
-		res = (struct ufs_dir_entry *) bh->b_data;
-		res = (struct ufs_dir_entry *)((char *)res +
-			fs16_to_cpu(dir->i_sb, res->d_reclen));
-	}
-	*p = bh;
-	return res;
+	return (struct ufs_dir_entry *)((char *)p +
+					fs16_to_cpu(sb, p->d_reclen));
 }
-ino_t ufs_inode_by_name(struct inode * dir, struct dentry *dentry)
+
+struct ufs_dir_entry *ufs_dotdot(struct inode *dir, struct page **p)
 {
-	ino_t res = 0;
-	struct ufs_dir_entry * de;
-	struct buffer_head *bh;
+	struct page *page = ufs_get_page(dir, 0);
+	struct ufs_dir_entry *de = NULL;
 
-	de = ufs_find_entry (dentry, &bh);
-	if (de) {
-		res = fs32_to_cpu(dir->i_sb, de->d_ino);
-		brelse(bh);
+	if (!IS_ERR(page)) {
+		de = ufs_next_entry(dir->i_sb,
+				    (struct ufs_dir_entry *)page_address(page));
+		*p = page;
 	}
-	return res;
+	return de;
 }
 
-void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de,
-		struct buffer_head *bh, struct inode *inode)
+/*
+ *	ufs_find_entry()
+ *
+ * finds an entry in the specified directory with the wanted name. It
+ * returns the page in which the entry was found, and the entry itself
+ * (as a parameter - res_dir). Page is returned mapped and unlocked.
+ * Entry is guaranteed to be valid.
+ */
+struct ufs_dir_entry *ufs_find_entry(struct inode *dir, struct dentry *dentry,
+				     struct page **res_page)
 {
-	dir->i_version++;
-	de->d_ino = cpu_to_fs32(dir->i_sb, inode->i_ino);
-	mark_buffer_dirty(bh);
-	if (IS_DIRSYNC(dir))
-		sync_dirty_buffer(bh);
-	brelse (bh);
+	struct super_block *sb = dir->i_sb;
+	const char *name = dentry->d_name.name;
+	int namelen = dentry->d_name.len;
+	unsigned reclen = UFS_DIR_REC_LEN(namelen);
+	unsigned long start, n;
+	unsigned long npages = ufs_dir_pages(dir);
+	struct page *page = NULL;
+	struct ufs_dir_entry *de;
+
+	UFSD(("ENTER, dir_ino %lu, name %s, namlen %u\n", dir->i_ino, name, namelen));
+
+	if (npages == 0 || namelen > UFS_MAXNAMLEN)
+		goto out;
+
+	/* OFFSET_CACHE */
+	*res_page = NULL;
+
+	/* start = ei->i_dir_start_lookup; */
+	start = 0;
+	if (start >= npages)
+		start = 0;
+	n = start;
+	do {
+		char *kaddr;
+		page = ufs_get_page(dir, n);
+		if (!IS_ERR(page)) {
+			kaddr = page_address(page);
+			de = (struct ufs_dir_entry *) kaddr;
+			kaddr += ufs_last_byte(dir, n) - reclen;
+			while ((char *) de <= kaddr) {
+				if (de->d_reclen == 0) {
+					ufs_error(dir->i_sb, __FUNCTION__,
+						  "zero-length directory entry");
+					ufs_put_page(page);
+					goto out;
+				}
+				if (ufs_match(sb, namelen, name, de))
+					goto found;
+				de = ufs_next_entry(sb, de);
+			}
+			ufs_put_page(page);
+		}
+		if (++n >= npages)
+			n = 0;
+	} while (n != start);
+out:
+	return NULL;
+
+found:
+	*res_page = page;
+	/* ei->i_dir_start_lookup = n; */
+	return de;
 }
 
 /*
- *	ufs_add_entry()
- *
- * adds a file entry to the specified directory, using the same
- * semantics as ufs_find_entry(). It returns NULL if it failed.
+ *	Parent is locked.
  */
 int ufs_add_link(struct dentry *dentry, struct inode *inode)
 {
-	struct super_block * sb;
-	struct ufs_sb_private_info * uspi;
-	unsigned long offset;
-	unsigned fragoff;
-	unsigned short rec_len;
-	struct buffer_head * bh;
-	struct ufs_dir_entry * de, * de1;
 	struct inode *dir = dentry->d_parent->d_inode;
 	const char *name = dentry->d_name.name;
 	int namelen = dentry->d_name.len;
+	struct super_block *sb = dir->i_sb;
+	unsigned reclen = UFS_DIR_REC_LEN(namelen);
+	unsigned short rec_len, name_len;
+	struct page *page = NULL;
+	struct ufs_dir_entry *de;
+	unsigned long npages = ufs_dir_pages(dir);
+	unsigned long n;
+	char *kaddr;
+	unsigned from, to;
 	int err;
 
-	UFSD(("ENTER, name %s, namelen %u\n", name, namelen))
-	
-	sb = dir->i_sb;
-	uspi = UFS_SB(sb)->s_uspi;
-
-	if (!namelen)
-		return -EINVAL;
-	bh = ufs_bread (dir, 0, 0, &err);
-	if (!bh)
-		return err;
-	rec_len = UFS_DIR_REC_LEN(namelen);
-	offset = 0;
-	de = (struct ufs_dir_entry *) bh->b_data;
-	while (1) {
-		if ((char *)de >= UFS_SECTOR_SIZE + bh->b_data) {
-			fragoff = offset & ~uspi->s_fmask;
-			if (fragoff != 0 && fragoff != UFS_SECTOR_SIZE)
-				ufs_error (sb, "ufs_add_entry", "internal error"
-					" fragoff %u", fragoff);
-			if (!fragoff) {
-				brelse (bh);
-				bh = ufs_bread (dir, offset >> sb->s_blocksize_bits, 1, &err);
-				if (!bh)
-					return err;
-			}
-			if (dir->i_size <= offset) {
-				if (dir->i_size == 0) {
-					brelse(bh);
-					return -ENOENT;
-				}
-				de = (struct ufs_dir_entry *) (bh->b_data + fragoff);
-				de->d_ino = 0;
+	UFSD(("ENTER, name %s, namelen %u\n", name, namelen));
+
+	/*
+	 * We take care of directory expansion in the same loop.
+	 * This code plays outside i_size, so it locks the page
+	 * to protect that region.
+	 */
+	for (n = 0; n <= npages; n++) {
+		char *dir_end;
+
+		page = ufs_get_page(dir, n);
+		err = PTR_ERR(page);
+		if (IS_ERR(page))
+			goto out;
+		lock_page(page);
+		kaddr = page_address(page);
+		dir_end = kaddr + ufs_last_byte(dir, n);
+		de = (struct ufs_dir_entry *)kaddr;
+		kaddr += PAGE_CACHE_SIZE - reclen;
+		while ((char *)de <= kaddr) {
+			if ((char *)de == dir_end) {
+				/* We hit i_size */
+				name_len = 0;
+				rec_len = UFS_SECTOR_SIZE;
 				de->d_reclen = cpu_to_fs16(sb, UFS_SECTOR_SIZE);
-				ufs_set_de_namlen(sb, de, 0);
-				dir->i_size = offset + UFS_SECTOR_SIZE;
-				mark_inode_dirty(dir);
-			} else {
-				de = (struct ufs_dir_entry *) bh->b_data;
+				de->d_ino = 0;
+				goto got_it;
 			}
+			if (de->d_reclen == 0) {
+				ufs_error(dir->i_sb, __FUNCTION__,
+					  "zero-length directory entry");
+				err = -EIO;
+				goto out_unlock;
+			}
+			err = -EEXIST;
+			if (ufs_match(sb, namelen, name, de))
+				goto out_unlock;
+			name_len = UFS_DIR_REC_LEN(ufs_get_de_namlen(sb, de));
+			rec_len = fs16_to_cpu(sb, de->d_reclen);
+			if (!de->d_ino && rec_len >= reclen)
+				goto got_it;
+			if (rec_len >= name_len + reclen)
+				goto got_it;
+			de = (struct ufs_dir_entry *) ((char *) de + rec_len);
 		}
-		if (!ufs_check_dir_entry ("ufs_add_entry", dir, de, bh, offset)) {
-			brelse (bh);
-			return -ENOENT;
-		}
-		if (ufs_match(sb, namelen, name, de)) {
-			brelse (bh);
-			return -EEXIST;
-		}
-		if (de->d_ino == 0 && fs16_to_cpu(sb, de->d_reclen) >= rec_len)
-			break;
-			
-		if (fs16_to_cpu(sb, de->d_reclen) >=
-		     UFS_DIR_REC_LEN(ufs_get_de_namlen(sb, de)) + rec_len)
-			break;
-		offset += fs16_to_cpu(sb, de->d_reclen);
-		de = (struct ufs_dir_entry *) ((char *) de + fs16_to_cpu(sb, de->d_reclen));
+		unlock_page(page);
+		ufs_put_page(page);
 	}
-
+	BUG();
+	return -EINVAL;
+
+got_it:
+	from = (char*)de - (char*)page_address(page);
+	to = from + rec_len;
+	err = page->mapping->a_ops->prepare_write(NULL, page, from, to);
+	if (err)
+		goto out_unlock;
 	if (de->d_ino) {
-		de1 = (struct ufs_dir_entry *) ((char *) de +
-			UFS_DIR_REC_LEN(ufs_get_de_namlen(sb, de)));
-		de1->d_reclen =
-			cpu_to_fs16(sb, fs16_to_cpu(sb, de->d_reclen) -
-				UFS_DIR_REC_LEN(ufs_get_de_namlen(sb, de)));
-		de->d_reclen =
-			cpu_to_fs16(sb, UFS_DIR_REC_LEN(ufs_get_de_namlen(sb, de)));
+		struct ufs_dir_entry *de1 =
+			(struct ufs_dir_entry *) ((char *) de + name_len);
+		de1->d_reclen = cpu_to_fs16(sb, rec_len - name_len);
+		de->d_reclen = cpu_to_fs16(sb, name_len);
+
 		de = de1;
 	}
-	de->d_ino = 0;
+
 	ufs_set_de_namlen(sb, de, namelen);
-	memcpy (de->d_name, name, namelen + 1);
+	memcpy(de->d_name, name, namelen + 1);
 	de->d_ino = cpu_to_fs32(sb, inode->i_ino);
 	ufs_set_de_type(sb, de, inode->i_mode);
-	mark_buffer_dirty(bh);
-	if (IS_DIRSYNC(dir))
-		sync_dirty_buffer(bh);
-	brelse (bh);
+
+	err = ufs_commit_chunk(page, from, to);
 	dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
-	dir->i_version++;
+
 	mark_inode_dirty(dir);
+	/* OFFSET_CACHE */
+out_put:
+	ufs_put_page(page);
+out:
+	return err;
+out_unlock:
+	unlock_page(page);
+	goto out_put;
+}
+
+static inline unsigned
+ufs_validate_entry(struct super_block *sb, char *base,
+		   unsigned offset, unsigned mask)
+{
+	struct ufs_dir_entry *de = (struct ufs_dir_entry*)(base + offset);
+	struct ufs_dir_entry *p = (struct ufs_dir_entry*)(base + (offset&mask));
+	while ((char*)p < (char*)de) {
+		if (p->d_reclen == 0)
+			break;
+		p = ufs_next_entry(sb, p);
+	}
+	return (char *)p - base;
+}
 
-	UFSD(("EXIT\n"))
+
+/*
+ * This is blatantly stolen from ext2fs
+ */
+static int
+ufs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+{
+	loff_t pos = filp->f_pos;
+	struct inode *inode = filp->f_dentry->d_inode;
+	struct super_block *sb = inode->i_sb;
+	unsigned int offset = pos & ~PAGE_CACHE_MASK;
+	unsigned long n = pos >> PAGE_CACHE_SHIFT;
+	unsigned long npages = ufs_dir_pages(inode);
+	unsigned chunk_mask = ~(UFS_SECTOR_SIZE - 1);
+	int need_revalidate = filp->f_version != inode->i_version;
+	unsigned flags = UFS_SB(sb)->s_flags;
+
+	UFSD(("BEGIN"));
+
+	if (pos > inode->i_size - UFS_DIR_REC_LEN(1))
+		return 0;
+
+	for ( ; n < npages; n++, offset = 0) {
+		char *kaddr, *limit;
+		struct ufs_dir_entry *de;
+
+		struct page *page = ufs_get_page(inode, n);
+
+		if (IS_ERR(page)) {
+			ufs_error(sb, __FUNCTION__,
+				  "bad page in #%lu",
+				  inode->i_ino);
+			filp->f_pos += PAGE_CACHE_SIZE - offset;
+			return -EIO;
+		}
+		kaddr = page_address(page);
+		if (unlikely(need_revalidate)) {
+			if (offset) {
+				offset = ufs_validate_entry(sb, kaddr, offset, chunk_mask);
+				filp->f_pos = (n<<PAGE_CACHE_SHIFT) + offset;
+			}
+			filp->f_version = inode->i_version;
+			need_revalidate = 0;
+		}
+		de = (struct ufs_dir_entry *)(kaddr+offset);
+		limit = kaddr + ufs_last_byte(inode, n) - UFS_DIR_REC_LEN(1);
+		for ( ;(char*)de <= limit; de = ufs_next_entry(sb, de)) {
+			if (de->d_reclen == 0) {
+				ufs_error(sb, __FUNCTION__,
+					"zero-length directory entry");
+				ufs_put_page(page);
+				return -EIO;
+			}
+			if (de->d_ino) {
+				int over;
+				unsigned char d_type = DT_UNKNOWN;
+
+				offset = (char *)de - kaddr;
+
+				UFSD(("filldir(%s,%u)\n", de->d_name,
+				      fs32_to_cpu(sb, de->d_ino)));
+				UFSD(("namlen %u\n", ufs_get_de_namlen(sb, de)));
+
+				if ((flags & UFS_DE_MASK) == UFS_DE_44BSD)
+					d_type = de->d_u.d_44.d_type;
+
+				over = filldir(dirent, de->d_name,
+					       ufs_get_de_namlen(sb, de),
+						(n<<PAGE_CACHE_SHIFT) | offset,
+					       fs32_to_cpu(sb, de->d_ino), d_type);
+				if (over) {
+					ufs_put_page(page);
+					return 0;
+				}
+			}
+			filp->f_pos += fs16_to_cpu(sb, de->d_reclen);
+		}
+		ufs_put_page(page);
+	}
 	return 0;
 }
 
+
 /*
  * ufs_delete_entry deletes a directory entry by merging it with the
  * previous entry.
  */
-int ufs_delete_entry (struct inode * inode, struct ufs_dir_entry * dir,
-	struct buffer_head * bh )
-	
+int ufs_delete_entry(struct inode *inode, struct ufs_dir_entry *dir,
+		     struct page * page)
 {
-	struct super_block * sb;
-	struct ufs_dir_entry * de, * pde;
-	unsigned i;
-	
-	UFSD(("ENTER\n"))
+	struct super_block *sb = inode->i_sb;
+	struct address_space *mapping = page->mapping;
+	char *kaddr = page_address(page);
+	unsigned from = ((char*)dir - kaddr) & ~(UFS_SECTOR_SIZE - 1);
+	unsigned to = ((char*)dir - kaddr) + fs16_to_cpu(sb, dir->d_reclen);
+	struct ufs_dir_entry *pde = NULL;
+	struct ufs_dir_entry *de = (struct ufs_dir_entry *) (kaddr + from);
+	int err;
 
-	sb = inode->i_sb;
-	i = 0;
-	pde = NULL;
-	de = (struct ufs_dir_entry *) bh->b_data;
-	
-	UFSD(("ino %u, reclen %u, namlen %u, name %s\n",
-		fs32_to_cpu(sb, de->d_ino),
-		fs16_to_cpu(sb, de->d_reclen),
-		ufs_get_de_namlen(sb, de), de->d_name))
+	UFSD(("ENTER\n"));
 
-	while (i < bh->b_size) {
-		if (!ufs_check_dir_entry ("ufs_delete_entry", inode, de, bh, i)) {
-			brelse(bh);
-			return -EIO;
-		}
-		if (de == dir)  {
-			if (pde)
-				fs16_add(sb, &pde->d_reclen,
-					fs16_to_cpu(sb, dir->d_reclen));
-			dir->d_ino = 0;
-			inode->i_version++;
-			inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC;
-			mark_inode_dirty(inode);
-			mark_buffer_dirty(bh);
-			if (IS_DIRSYNC(inode))
-				sync_dirty_buffer(bh);
-			brelse(bh);
-			UFSD(("EXIT\n"))
-			return 0;
+	UFSD(("ino %u, reclen %u, namlen %u, name %s\n",
+	      fs32_to_cpu(sb, de->d_ino),
+	      fs16_to_cpu(sb, de->d_reclen),
+	      ufs_get_de_namlen(sb, de), de->d_name));
+
+	while ((char*)de < (char*)dir) {
+		if (de->d_reclen == 0) {
+			ufs_error(inode->i_sb, __FUNCTION__,
+				  "zero-length directory entry");
+			err = -EIO;
+			goto out;
 		}
-		i += fs16_to_cpu(sb, de->d_reclen);
-		if (i == UFS_SECTOR_SIZE) pde = NULL;
-		else pde = de;
-		de = (struct ufs_dir_entry *)
-		    ((char *) de + fs16_to_cpu(sb, de->d_reclen));
-		if (i == UFS_SECTOR_SIZE && de->d_reclen == 0)
-			break;
+		pde = de;
+		de = ufs_next_entry(sb, de);
 	}
-	UFSD(("EXIT\n"))
-	brelse(bh);
-	return -ENOENT;
+	if (pde)
+		from = (char*)pde - (char*)page_address(page);
+	lock_page(page);
+	err = mapping->a_ops->prepare_write(NULL, page, from, to);
+	BUG_ON(err);
+	if (pde)
+		pde->d_reclen = cpu_to_fs16(sb, to-from);
+	dir->d_ino = 0;
+	err = ufs_commit_chunk(page, from, to);
+	inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC;
+	mark_inode_dirty(inode);
+out:
+	ufs_put_page(page);
+	UFSD(("EXIT\n"));
+	return err;
 }
 
 int ufs_make_empty(struct inode * inode, struct inode *dir)
 {
 	struct super_block * sb = dir->i_sb;
-	struct buffer_head * dir_block;
+	struct address_space *mapping = inode->i_mapping;
+	struct page *page = grab_cache_page(mapping, 0);
 	struct ufs_dir_entry * de;
+	char *base;
 	int err;
 
-	dir_block = ufs_bread (inode, 0, 1, &err);
-	if (!dir_block)
-		return err;
+	if (!page)
+		return -ENOMEM;
+	kmap(page);
+	err = mapping->a_ops->prepare_write(NULL, page, 0, UFS_SECTOR_SIZE);
+	if (err) {
+		unlock_page(page);
+		goto fail;
+	}
+
+
+	base = (char*)page_address(page);
+	memset(base, 0, PAGE_CACHE_SIZE);
+
+	de = (struct ufs_dir_entry *) base;
 
-	inode->i_blocks = sb->s_blocksize / UFS_SECTOR_SIZE;
-	de = (struct ufs_dir_entry *) dir_block->b_data;
 	de->d_ino = cpu_to_fs32(sb, inode->i_ino);
 	ufs_set_de_type(sb, de, inode->i_mode);
 	ufs_set_de_namlen(sb, de, 1);
@@ -552,72 +594,65 @@ int ufs_make_empty(struct inode * inode, struct inode *dir)
 	de->d_reclen = cpu_to_fs16(sb, UFS_SECTOR_SIZE - UFS_DIR_REC_LEN(1));
 	ufs_set_de_namlen(sb, de, 2);
 	strcpy (de->d_name, "..");
-	mark_buffer_dirty(dir_block);
-	brelse (dir_block);
-	mark_inode_dirty(inode);
-	return 0;
+
+	err = ufs_commit_chunk(page, 0, UFS_SECTOR_SIZE);
+fail:
+	kunmap(page);
+	page_cache_release(page);
+	return err;
 }
 
 /*
  * routine to check that the specified directory is empty (for rmdir)
  */
-int ufs_empty_dir (struct inode * inode)
+int ufs_empty_dir(struct inode * inode)
 {
-	struct super_block * sb;
-	unsigned long offset;
-	struct buffer_head * bh;
-	struct ufs_dir_entry * de, * de1;
-	int err;
-	
-	sb = inode->i_sb;
-
-	if (inode->i_size < UFS_DIR_REC_LEN(1) + UFS_DIR_REC_LEN(2) ||
-	    !(bh = ufs_bread (inode, 0, 0, &err))) {
-	    	ufs_warning (inode->i_sb, "empty_dir",
-			      "bad directory (dir #%lu) - no data block",
-			      inode->i_ino);
-		return 1;
-	}
-	de = (struct ufs_dir_entry *) bh->b_data;
-	de1 = (struct ufs_dir_entry *)
-		((char *)de + fs16_to_cpu(sb, de->d_reclen));
-	if (fs32_to_cpu(sb, de->d_ino) != inode->i_ino || de1->d_ino == 0 ||
-	     strcmp (".", de->d_name) || strcmp ("..", de1->d_name)) {
-	    	ufs_warning (inode->i_sb, "empty_dir",
-			      "bad directory (dir #%lu) - no `.' or `..'",
-			      inode->i_ino);
-		return 1;
-	}
-	offset = fs16_to_cpu(sb, de->d_reclen) + fs16_to_cpu(sb, de1->d_reclen);
-	de = (struct ufs_dir_entry *)
-		((char *)de1 + fs16_to_cpu(sb, de1->d_reclen));
-	while (offset < inode->i_size ) {
-		if (!bh || (void *) de >= (void *) (bh->b_data + sb->s_blocksize)) {
-			brelse (bh);
-			bh = ufs_bread (inode, offset >> sb->s_blocksize_bits, 1, &err);
-	 		if (!bh) {
-				ufs_error (sb, "empty_dir",
-					    "directory #%lu contains a hole at offset %lu",
-					    inode->i_ino, offset);
-				offset += sb->s_blocksize;
-				continue;
+	struct super_block *sb = inode->i_sb;
+	struct page *page = NULL;
+	unsigned long i, npages = ufs_dir_pages(inode);
+
+	for (i = 0; i < npages; i++) {
+		char *kaddr;
+		struct ufs_dir_entry *de;
+		page = ufs_get_page(inode, i);
+
+		if (IS_ERR(page))
+			continue;
+
+		kaddr = page_address(page);
+		de = (struct ufs_dir_entry *)kaddr;
+		kaddr += ufs_last_byte(inode, i) - UFS_DIR_REC_LEN(1);
+
+		while ((char *)de <= kaddr) {
+			if (de->d_reclen == 0) {
+				ufs_error(inode->i_sb, __FUNCTION__,
+					"zero-length directory entry: "
+					"kaddr=%p, de=%p\n", kaddr, de);
+				goto not_empty;
 			}
-			de = (struct ufs_dir_entry *) bh->b_data;
-		}
-		if (!ufs_check_dir_entry ("empty_dir", inode, de, bh, offset)) {
-			brelse (bh);
-			return 1;
-		}
-		if (de->d_ino) {
-			brelse (bh);
-			return 0;
+			if (de->d_ino) {
+				u16 namelen=ufs_get_de_namlen(sb, de);
+				/* check for . and .. */
+				if (de->d_name[0] != '.')
+					goto not_empty;
+				if (namelen > 2)
+					goto not_empty;
+				if (namelen < 2) {
+					if (inode->i_ino !=
+					    fs32_to_cpu(sb, de->d_ino))
+						goto not_empty;
+				} else if (de->d_name[1] != '.')
+					goto not_empty;
+			}
+			de = ufs_next_entry(sb, de);
 		}
-		offset += fs16_to_cpu(sb, de->d_reclen);
-		de = (struct ufs_dir_entry *)
-			((char *)de + fs16_to_cpu(sb, de->d_reclen));
+		ufs_put_page(page);
 	}
-	brelse (bh);
 	return 1;
+
+not_empty:
+	ufs_put_page(page);
+	return 0;
 }
 
 const struct file_operations ufs_dir_operations = {
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index 51f702700308..364bb92b0917 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -1,6 +1,9 @@
 /*
  * linux/fs/ufs/namei.c
  *
+ * Migration to usage of "page cache" on May 2006 by
+ * Evgeniy Dushistov <dushistov@mail.ru> based on ext2 code base.
+ *
  * Copyright (C) 1998
  * Daniel Pirkl <daniel.pirkl@email.cz>
  * Charles University, Faculty of Mathematics and Physics
@@ -28,7 +31,6 @@
 #include <linux/fs.h>
 #include <linux/ufs_fs.h>
 #include <linux/smp_lock.h>
-#include <linux/buffer_head.h>
 #include "swab.h"	/* will go away - see comment in mknod() */
 #include "util.h"
 
@@ -232,19 +234,18 @@ out_dir:
 	goto out;
 }
 
-static int ufs_unlink(struct inode * dir, struct dentry *dentry)
+static int ufs_unlink(struct inode *dir, struct dentry *dentry)
 {
 	struct inode * inode = dentry->d_inode;
-	struct buffer_head * bh;
-	struct ufs_dir_entry * de;
+	struct ufs_dir_entry *de;
+	struct page *page;
 	int err = -ENOENT;
 
-	lock_kernel();
-	de = ufs_find_entry (dentry, &bh);
+	de = ufs_find_entry(dir, dentry, &page);
 	if (!de)
 		goto out;
 
-	err = ufs_delete_entry (dir, de, bh);
+	err = ufs_delete_entry(dir, de, page);
 	if (err)
 		goto out;
 
@@ -252,7 +253,6 @@ static int ufs_unlink(struct inode * dir, struct dentry *dentry)
 	inode_dec_link_count(inode);
 	err = 0;
 out:
-	unlock_kernel();
 	return err;
 }
 
@@ -274,42 +274,42 @@ static int ufs_rmdir (struct inode * dir, struct dentry *dentry)
 	return err;
 }
 
-static int ufs_rename (struct inode * old_dir, struct dentry * old_dentry,
-	struct inode * new_dir,	struct dentry * new_dentry )
+static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry,
+		      struct inode *new_dir, struct dentry *new_dentry)
 {
 	struct inode *old_inode = old_dentry->d_inode;
 	struct inode *new_inode = new_dentry->d_inode;
-	struct buffer_head *dir_bh = NULL;
-	struct ufs_dir_entry *dir_de = NULL;
-	struct buffer_head *old_bh;
+	struct page *dir_page = NULL;
+	struct ufs_dir_entry * dir_de = NULL;
+	struct page *old_page;
 	struct ufs_dir_entry *old_de;
 	int err = -ENOENT;
 
-	lock_kernel();
-	old_de = ufs_find_entry (old_dentry, &old_bh);
+	old_de = ufs_find_entry(old_dir, old_dentry, &old_page);
 	if (!old_de)
 		goto out;
 
 	if (S_ISDIR(old_inode->i_mode)) {
 		err = -EIO;
-		dir_de = ufs_dotdot(old_inode, &dir_bh);
+		dir_de = ufs_dotdot(old_inode, &dir_page);
 		if (!dir_de)
 			goto out_old;
 	}
 
 	if (new_inode) {
-		struct buffer_head *new_bh;
+		struct page *new_page;
 		struct ufs_dir_entry *new_de;
 
 		err = -ENOTEMPTY;
-		if (dir_de && !ufs_empty_dir (new_inode))
+		if (dir_de && !ufs_empty_dir(new_inode))
 			goto out_dir;
+
 		err = -ENOENT;
-		new_de = ufs_find_entry (new_dentry, &new_bh);
+		new_de = ufs_find_entry(new_dir, new_dentry, &new_page);
 		if (!new_de)
 			goto out_dir;
 		inode_inc_link_count(old_inode);
-		ufs_set_link(new_dir, new_de, new_bh, old_inode);
+		ufs_set_link(new_dir, new_de, new_page, old_inode);
 		new_inode->i_ctime = CURRENT_TIME_SEC;
 		if (dir_de)
 			new_inode->i_nlink--;
@@ -330,24 +330,32 @@ static int ufs_rename (struct inode * old_dir, struct dentry * old_dentry,
 			inode_inc_link_count(new_dir);
 	}
 
-	ufs_delete_entry (old_dir, old_de, old_bh);
+	/*
+	 * Like most other Unix systems, set the ctime for inodes on a
+ 	 * rename.
+	 * inode_dec_link_count() will mark the inode dirty.
+	 */
+	old_inode->i_ctime = CURRENT_TIME_SEC;
 
+	ufs_delete_entry(old_dir, old_de, old_page);
 	inode_dec_link_count(old_inode);
 
 	if (dir_de) {
-		ufs_set_link(old_inode, dir_de, dir_bh, new_dir);
+		ufs_set_link(old_inode, dir_de, dir_page, new_dir);
 		inode_dec_link_count(old_dir);
 	}
-	unlock_kernel();
 	return 0;
 
+
 out_dir:
-	if (dir_de)
-		brelse(dir_bh);
+	if (dir_de) {
+		kunmap(dir_page);
+		page_cache_release(dir_page);
+	}
 out_old:
-	brelse (old_bh);
+	kunmap(old_page);
+	page_cache_release(old_page);
 out:
-	unlock_kernel();
 	return err;
 }
 
diff --git a/include/linux/ufs_fs.h b/include/linux/ufs_fs.h
index ed5053f5cd71..9d2b519700e7 100644
--- a/include/linux/ufs_fs.h
+++ b/include/linux/ufs_fs.h
@@ -887,11 +887,12 @@ extern struct inode_operations ufs_dir_inode_operations;
 extern int ufs_add_link (struct dentry *, struct inode *);
 extern ino_t ufs_inode_by_name(struct inode *, struct dentry *);
 extern int ufs_make_empty(struct inode *, struct inode *);
-extern struct ufs_dir_entry * ufs_find_entry (struct dentry *, struct buffer_head **);
-extern int ufs_delete_entry (struct inode *, struct ufs_dir_entry *, struct buffer_head *);
+extern struct ufs_dir_entry *ufs_find_entry(struct inode *, struct dentry *, struct page **);
+extern int ufs_delete_entry(struct inode *, struct ufs_dir_entry *, struct page *);
 extern int ufs_empty_dir (struct inode *);
-extern struct ufs_dir_entry * ufs_dotdot (struct inode *, struct buffer_head **);
-extern void ufs_set_link(struct inode *, struct ufs_dir_entry *, struct buffer_head *, struct inode *);
+extern struct ufs_dir_entry *ufs_dotdot(struct inode *, struct page **);
+extern void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de,
+			 struct page *page, struct inode *inode);
 
 /* file.c */
 extern struct inode_operations ufs_file_inode_operations;
-- 
cgit v1.2.3


From 9695ef16ed4e00b59303f39f9a4a422a2c6a3b89 Mon Sep 17 00:00:00 2001
From: Evgeniy Dushistov <dushistov@mail.ru>
Date: Sun, 25 Jun 2006 05:47:22 -0700
Subject: [PATCH] ufs: wrong type cast

There are two ugly macros in ufs code:
#define UCPI_UBH ((struct ufs_buffer_head *)ucpi)
#define USPI_UBH ((struct ufs_buffer_head *)uspi)
when uspi looks like
struct {
struct ufs_buffer_head ;
}
and USPI_UBH has some sence,
ucpi looks like
struct {
struct not_ufs_buffer_head;
}

To prevent bugs in future, this patch convert macros to inline function and
fix "ucpi" structure.

Signed-off-by: Evgeniy Dushistov <dushistov@mail.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/ufs/balloc.c        | 84 +++++++++++++++++++++++++-------------------------
 fs/ufs/cylinder.c      | 18 +++++------
 fs/ufs/ialloc.c        | 28 ++++++++---------
 fs/ufs/super.c         |  8 ++---
 fs/ufs/util.c          | 20 ++++++------
 fs/ufs/util.h          | 16 +++++++---
 include/linux/ufs_fs.h |  2 +-
 7 files changed, 91 insertions(+), 85 deletions(-)

(limited to 'include/linux')

diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c
index 06f970d02e3d..68de1312e4b6 100644
--- a/fs/ufs/balloc.c
+++ b/fs/ufs/balloc.c
@@ -69,7 +69,7 @@ void ufs_free_fragments(struct inode *inode, unsigned fragment, unsigned count)
 	ucpi = ufs_load_cylinder (sb, cgno);
 	if (!ucpi) 
 		goto failed;
-	ucg = ubh_get_ucg (UCPI_UBH);
+	ucg = ubh_get_ucg (UCPI_UBH(ucpi));
 	if (!ufs_cg_chkmagic(sb, ucg)) {
 		ufs_panic (sb, "ufs_free_fragments", "internal error, bad magic number on cg %u", cgno);
 		goto failed;
@@ -77,11 +77,11 @@ void ufs_free_fragments(struct inode *inode, unsigned fragment, unsigned count)
 
 	end_bit = bit + count;
 	bbase = ufs_blknum (bit);
-	blkmap = ubh_blkmap (UCPI_UBH, ucpi->c_freeoff, bbase);
+	blkmap = ubh_blkmap (UCPI_UBH(ucpi), ucpi->c_freeoff, bbase);
 	ufs_fragacct (sb, blkmap, ucg->cg_frsum, -1);
 	for (i = bit; i < end_bit; i++) {
-		if (ubh_isclr (UCPI_UBH, ucpi->c_freeoff, i))
-			ubh_setbit (UCPI_UBH, ucpi->c_freeoff, i);
+		if (ubh_isclr (UCPI_UBH(ucpi), ucpi->c_freeoff, i))
+			ubh_setbit (UCPI_UBH(ucpi), ucpi->c_freeoff, i);
 		else 
 			ufs_error (sb, "ufs_free_fragments",
 				   "bit already cleared for fragment %u", i);
@@ -93,14 +93,14 @@ void ufs_free_fragments(struct inode *inode, unsigned fragment, unsigned count)
 	fs32_add(sb, &ucg->cg_cs.cs_nffree, count);
 	fs32_add(sb, &usb1->fs_cstotal.cs_nffree, count);
 	fs32_add(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, count);
-	blkmap = ubh_blkmap (UCPI_UBH, ucpi->c_freeoff, bbase);
+	blkmap = ubh_blkmap (UCPI_UBH(ucpi), ucpi->c_freeoff, bbase);
 	ufs_fragacct(sb, blkmap, ucg->cg_frsum, 1);
 
 	/*
 	 * Trying to reassemble free fragments into block
 	 */
 	blkno = ufs_fragstoblks (bbase);
-	if (ubh_isblockset(UCPI_UBH, ucpi->c_freeoff, blkno)) {
+	if (ubh_isblockset(UCPI_UBH(ucpi), ucpi->c_freeoff, blkno)) {
 		fs32_sub(sb, &ucg->cg_cs.cs_nffree, uspi->s_fpb);
 		fs32_sub(sb, &usb1->fs_cstotal.cs_nffree, uspi->s_fpb);
 		fs32_sub(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, uspi->s_fpb);
@@ -114,11 +114,11 @@ void ufs_free_fragments(struct inode *inode, unsigned fragment, unsigned count)
 		fs32_add(sb, &ubh_cg_blktot(ucpi, cylno), 1);
 	}
 	
-	ubh_mark_buffer_dirty (USPI_UBH);
-	ubh_mark_buffer_dirty (UCPI_UBH);
+	ubh_mark_buffer_dirty (USPI_UBH(uspi));
+	ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
 	if (sb->s_flags & MS_SYNCHRONOUS) {
 		ubh_ll_rw_block (SWRITE, 1, (struct ufs_buffer_head **)&ucpi);
-		ubh_wait_on_buffer (UCPI_UBH);
+		ubh_wait_on_buffer (UCPI_UBH(ucpi));
 	}
 	sb->s_dirt = 1;
 	
@@ -176,7 +176,7 @@ do_more:
 	ucpi = ufs_load_cylinder (sb, cgno);
 	if (!ucpi) 
 		goto failed;
-	ucg = ubh_get_ucg (UCPI_UBH);
+	ucg = ubh_get_ucg (UCPI_UBH(ucpi));
 	if (!ufs_cg_chkmagic(sb, ucg)) {
 		ufs_panic (sb, "ufs_free_blocks", "internal error, bad magic number on cg %u", cgno);
 		goto failed;
@@ -184,10 +184,10 @@ do_more:
 
 	for (i = bit; i < end_bit; i += uspi->s_fpb) {
 		blkno = ufs_fragstoblks(i);
-		if (ubh_isblockset(UCPI_UBH, ucpi->c_freeoff, blkno)) {
+		if (ubh_isblockset(UCPI_UBH(ucpi), ucpi->c_freeoff, blkno)) {
 			ufs_error(sb, "ufs_free_blocks", "freeing free fragment");
 		}
-		ubh_setblock(UCPI_UBH, ucpi->c_freeoff, blkno);
+		ubh_setblock(UCPI_UBH(ucpi), ucpi->c_freeoff, blkno);
 		if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD)
 			ufs_clusteracct (sb, ucpi, blkno, 1);
 		DQUOT_FREE_BLOCK(inode, uspi->s_fpb);
@@ -200,11 +200,11 @@ do_more:
 		fs32_add(sb, &ubh_cg_blktot(ucpi, cylno), 1);
 	}
 
-	ubh_mark_buffer_dirty (USPI_UBH);
-	ubh_mark_buffer_dirty (UCPI_UBH);
+	ubh_mark_buffer_dirty (USPI_UBH(uspi));
+	ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
 	if (sb->s_flags & MS_SYNCHRONOUS) {
 		ubh_ll_rw_block (SWRITE, 1, (struct ufs_buffer_head **)&ucpi);
-		ubh_wait_on_buffer (UCPI_UBH);
+		ubh_wait_on_buffer (UCPI_UBH(ucpi));
 	}
 
 	if (overflow) {
@@ -493,7 +493,7 @@ ufs_add_fragments (struct inode * inode, unsigned fragment,
 	ucpi = ufs_load_cylinder (sb, cgno);
 	if (!ucpi)
 		return 0;
-	ucg = ubh_get_ucg (UCPI_UBH);
+	ucg = ubh_get_ucg (UCPI_UBH(ucpi));
 	if (!ufs_cg_chkmagic(sb, ucg)) {
 		ufs_panic (sb, "ufs_add_fragments",
 			"internal error, bad magic number on cg %u", cgno);
@@ -503,14 +503,14 @@ ufs_add_fragments (struct inode * inode, unsigned fragment,
 	fragno = ufs_dtogd (fragment);
 	fragoff = ufs_fragnum (fragno);
 	for (i = oldcount; i < newcount; i++)
-		if (ubh_isclr (UCPI_UBH, ucpi->c_freeoff, fragno + i))
+		if (ubh_isclr (UCPI_UBH(ucpi), ucpi->c_freeoff, fragno + i))
 			return 0;
 	/*
 	 * Block can be extended
 	 */
 	ucg->cg_time = cpu_to_fs32(sb, get_seconds());
 	for (i = newcount; i < (uspi->s_fpb - fragoff); i++)
-		if (ubh_isclr (UCPI_UBH, ucpi->c_freeoff, fragno + i))
+		if (ubh_isclr (UCPI_UBH(ucpi), ucpi->c_freeoff, fragno + i))
 			break;
 	fragsize = i - oldcount;
 	if (!fs32_to_cpu(sb, ucg->cg_frsum[fragsize]))
@@ -520,7 +520,7 @@ ufs_add_fragments (struct inode * inode, unsigned fragment,
 	if (fragsize != count)
 		fs32_add(sb, &ucg->cg_frsum[fragsize - count], 1);
 	for (i = oldcount; i < newcount; i++)
-		ubh_clrbit (UCPI_UBH, ucpi->c_freeoff, fragno + i);
+		ubh_clrbit (UCPI_UBH(ucpi), ucpi->c_freeoff, fragno + i);
 	if(DQUOT_ALLOC_BLOCK(inode, count)) {
 		*err = -EDQUOT;
 		return 0;
@@ -530,11 +530,11 @@ ufs_add_fragments (struct inode * inode, unsigned fragment,
 	fs32_sub(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, count);
 	fs32_sub(sb, &usb1->fs_cstotal.cs_nffree, count);
 	
-	ubh_mark_buffer_dirty (USPI_UBH);
-	ubh_mark_buffer_dirty (UCPI_UBH);
+	ubh_mark_buffer_dirty (USPI_UBH(uspi));
+	ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
 	if (sb->s_flags & MS_SYNCHRONOUS) {
 		ubh_ll_rw_block (SWRITE, 1, (struct ufs_buffer_head **)&ucpi);
-		ubh_wait_on_buffer (UCPI_UBH);
+		ubh_wait_on_buffer (UCPI_UBH(ucpi));
 	}
 	sb->s_dirt = 1;
 
@@ -602,7 +602,7 @@ cg_found:
 	ucpi = ufs_load_cylinder (sb, cgno);
 	if (!ucpi)
 		return 0;
-	ucg = ubh_get_ucg (UCPI_UBH);
+	ucg = ubh_get_ucg (UCPI_UBH(ucpi));
 	if (!ufs_cg_chkmagic(sb, ucg)) 
 		ufs_panic (sb, "ufs_alloc_fragments",
 			"internal error, bad magic number on cg %u", cgno);
@@ -625,7 +625,7 @@ cg_found:
 			return 0;
 		goal = ufs_dtogd (result);
 		for (i = count; i < uspi->s_fpb; i++)
-			ubh_setbit (UCPI_UBH, ucpi->c_freeoff, goal + i);
+			ubh_setbit (UCPI_UBH(ucpi), ucpi->c_freeoff, goal + i);
 		i = uspi->s_fpb - count;
 		DQUOT_FREE_BLOCK(inode, i);
 
@@ -644,7 +644,7 @@ cg_found:
 		return 0;
 	}
 	for (i = 0; i < count; i++)
-		ubh_clrbit (UCPI_UBH, ucpi->c_freeoff, result + i);
+		ubh_clrbit (UCPI_UBH(ucpi), ucpi->c_freeoff, result + i);
 	
 	fs32_sub(sb, &ucg->cg_cs.cs_nffree, count);
 	fs32_sub(sb, &usb1->fs_cstotal.cs_nffree, count);
@@ -655,11 +655,11 @@ cg_found:
 		fs32_add(sb, &ucg->cg_frsum[allocsize - count], 1);
 
 succed:
-	ubh_mark_buffer_dirty (USPI_UBH);
-	ubh_mark_buffer_dirty (UCPI_UBH);
+	ubh_mark_buffer_dirty (USPI_UBH(uspi));
+	ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
 	if (sb->s_flags & MS_SYNCHRONOUS) {
 		ubh_ll_rw_block (SWRITE, 1, (struct ufs_buffer_head **)&ucpi);
-		ubh_wait_on_buffer (UCPI_UBH);
+		ubh_wait_on_buffer (UCPI_UBH(ucpi));
 	}
 	sb->s_dirt = 1;
 
@@ -682,7 +682,7 @@ static unsigned ufs_alloccg_block (struct inode * inode,
 	sb = inode->i_sb;
 	uspi = UFS_SB(sb)->s_uspi;
 	usb1 = ubh_get_usb_first(uspi);
-	ucg = ubh_get_ucg(UCPI_UBH);
+	ucg = ubh_get_ucg(UCPI_UBH(ucpi));
 
 	if (goal == 0) {
 		goal = ucpi->c_rotor;
@@ -694,7 +694,7 @@ static unsigned ufs_alloccg_block (struct inode * inode,
 	/*
 	 * If the requested block is available, use it.
 	 */
-	if (ubh_isblockset(UCPI_UBH, ucpi->c_freeoff, ufs_fragstoblks(goal))) {
+	if (ubh_isblockset(UCPI_UBH(ucpi), ucpi->c_freeoff, ufs_fragstoblks(goal))) {
 		result = goal;
 		goto gotit;
 	}
@@ -706,7 +706,7 @@ norot:
 	ucpi->c_rotor = result;
 gotit:
 	blkno = ufs_fragstoblks(result);
-	ubh_clrblock (UCPI_UBH, ucpi->c_freeoff, blkno);
+	ubh_clrblock (UCPI_UBH(ucpi), ucpi->c_freeoff, blkno);
 	if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD)
 		ufs_clusteracct (sb, ucpi, blkno, -1);
 	if(DQUOT_ALLOC_BLOCK(inode, uspi->s_fpb)) {
@@ -739,7 +739,7 @@ static unsigned ufs_bitmap_search (struct super_block * sb,
 
 	uspi = UFS_SB(sb)->s_uspi;
 	usb1 = ubh_get_usb_first (uspi);
-	ucg = ubh_get_ucg(UCPI_UBH);
+	ucg = ubh_get_ucg(UCPI_UBH(ucpi));
 
 	if (goal)
 		start = ufs_dtogd(goal) >> 3;
@@ -747,12 +747,12 @@ static unsigned ufs_bitmap_search (struct super_block * sb,
 		start = ucpi->c_frotor >> 3;
 		
 	length = ((uspi->s_fpg + 7) >> 3) - start;
-	location = ubh_scanc(UCPI_UBH, ucpi->c_freeoff + start, length,
+	location = ubh_scanc(UCPI_UBH(ucpi), ucpi->c_freeoff + start, length,
 		(uspi->s_fpb == 8) ? ufs_fragtable_8fpb : ufs_fragtable_other,
 		1 << (count - 1 + (uspi->s_fpb & 7))); 
 	if (location == 0) {
 		length = start + 1;
-		location = ubh_scanc(UCPI_UBH, ucpi->c_freeoff, length, 
+		location = ubh_scanc(UCPI_UBH(ucpi), ucpi->c_freeoff, length,
 			(uspi->s_fpb == 8) ? ufs_fragtable_8fpb : ufs_fragtable_other,
 			1 << (count - 1 + (uspi->s_fpb & 7)));
 		if (location == 0) {
@@ -769,7 +769,7 @@ static unsigned ufs_bitmap_search (struct super_block * sb,
 	/*
 	 * found the byte in the map
 	 */
-	blockmap = ubh_blkmap(UCPI_UBH, ucpi->c_freeoff, result);
+	blockmap = ubh_blkmap(UCPI_UBH(ucpi), ucpi->c_freeoff, result);
 	fragsize = 0;
 	for (possition = 0, mask = 1; possition < 8; possition++, mask <<= 1) {
 		if (blockmap & mask) {
@@ -808,9 +808,9 @@ static void ufs_clusteracct(struct super_block * sb,
 		return;
 
 	if (cnt > 0)
-		ubh_setbit(UCPI_UBH, ucpi->c_clusteroff, blkno);
+		ubh_setbit(UCPI_UBH(ucpi), ucpi->c_clusteroff, blkno);
 	else
-		ubh_clrbit(UCPI_UBH, ucpi->c_clusteroff, blkno);
+		ubh_clrbit(UCPI_UBH(ucpi), ucpi->c_clusteroff, blkno);
 
 	/*
 	 * Find the size of the cluster going forward.
@@ -819,7 +819,7 @@ static void ufs_clusteracct(struct super_block * sb,
 	end = start + uspi->s_contigsumsize;
 	if ( end >= ucpi->c_nclusterblks)
 		end = ucpi->c_nclusterblks;
-	i = ubh_find_next_zero_bit (UCPI_UBH, ucpi->c_clusteroff, end, start);
+	i = ubh_find_next_zero_bit (UCPI_UBH(ucpi), ucpi->c_clusteroff, end, start);
 	if (i > end)
 		i = end;
 	forw = i - start;
@@ -831,7 +831,7 @@ static void ufs_clusteracct(struct super_block * sb,
 	end = start - uspi->s_contigsumsize;
 	if (end < 0 ) 
 		end = -1;
-	i = ubh_find_last_zero_bit (UCPI_UBH, ucpi->c_clusteroff, start, end);
+	i = ubh_find_last_zero_bit (UCPI_UBH(ucpi), ucpi->c_clusteroff, start, end);
 	if ( i < end) 
 		i = end;
 	back = start - i;
@@ -843,11 +843,11 @@ static void ufs_clusteracct(struct super_block * sb,
 	i = back + forw + 1;
 	if (i > uspi->s_contigsumsize)
 		i = uspi->s_contigsumsize;
-	fs32_add(sb, (__fs32*)ubh_get_addr(UCPI_UBH, ucpi->c_clustersumoff + (i << 2)), cnt);
+	fs32_add(sb, (__fs32*)ubh_get_addr(UCPI_UBH(ucpi), ucpi->c_clustersumoff + (i << 2)), cnt);
 	if (back > 0)
-		fs32_sub(sb, (__fs32*)ubh_get_addr(UCPI_UBH, ucpi->c_clustersumoff + (back << 2)), cnt);
+		fs32_sub(sb, (__fs32*)ubh_get_addr(UCPI_UBH(ucpi), ucpi->c_clustersumoff + (back << 2)), cnt);
 	if (forw > 0)
-		fs32_sub(sb, (__fs32*)ubh_get_addr(UCPI_UBH, ucpi->c_clustersumoff + (forw << 2)), cnt);
+		fs32_sub(sb, (__fs32*)ubh_get_addr(UCPI_UBH(ucpi), ucpi->c_clustersumoff + (forw << 2)), cnt);
 }
 
 
diff --git a/fs/ufs/cylinder.c b/fs/ufs/cylinder.c
index 14abb8b835f7..65fe06810172 100644
--- a/fs/ufs/cylinder.c
+++ b/fs/ufs/cylinder.c
@@ -47,14 +47,14 @@ static void ufs_read_cylinder (struct super_block * sb,
 	ucpi = sbi->s_ucpi[bitmap_nr];
 	ucg = (struct ufs_cylinder_group *)sbi->s_ucg[cgno]->b_data;
 
-	UCPI_UBH->fragment = ufs_cgcmin(cgno);
-	UCPI_UBH->count = uspi->s_cgsize >> sb->s_blocksize_bits;
+	UCPI_UBH(ucpi)->fragment = ufs_cgcmin(cgno);
+	UCPI_UBH(ucpi)->count = uspi->s_cgsize >> sb->s_blocksize_bits;
 	/*
 	 * We have already the first fragment of cylinder group block in buffer
 	 */
-	UCPI_UBH->bh[0] = sbi->s_ucg[cgno];
-	for (i = 1; i < UCPI_UBH->count; i++)
-		if (!(UCPI_UBH->bh[i] = sb_bread(sb, UCPI_UBH->fragment + i)))
+	UCPI_UBH(ucpi)->bh[0] = sbi->s_ucg[cgno];
+	for (i = 1; i < UCPI_UBH(ucpi)->count; i++)
+		if (!(UCPI_UBH(ucpi)->bh[i] = sb_bread(sb, UCPI_UBH(ucpi)->fragment + i)))
 			goto failed;
 	sbi->s_cgno[bitmap_nr] = cgno;
 			
@@ -103,7 +103,7 @@ void ufs_put_cylinder (struct super_block * sb, unsigned bitmap_nr)
 		return;
 	}
 	ucpi = sbi->s_ucpi[bitmap_nr];
-	ucg = ubh_get_ucg(UCPI_UBH);
+	ucg = ubh_get_ucg(UCPI_UBH(ucpi));
 
 	if (uspi->s_ncg > UFS_MAX_GROUP_LOADED && bitmap_nr >= sbi->s_cg_loaded) {
 		ufs_panic (sb, "ufs_put_cylinder", "internal error");
@@ -116,9 +116,9 @@ void ufs_put_cylinder (struct super_block * sb, unsigned bitmap_nr)
 	ucg->cg_rotor = cpu_to_fs32(sb, ucpi->c_rotor);
 	ucg->cg_frotor = cpu_to_fs32(sb, ucpi->c_frotor);
 	ucg->cg_irotor = cpu_to_fs32(sb, ucpi->c_irotor);
-	ubh_mark_buffer_dirty (UCPI_UBH);
-	for (i = 1; i < UCPI_UBH->count; i++) {
-		brelse (UCPI_UBH->bh[i]);
+	ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
+	for (i = 1; i < UCPI_UBH(ucpi)->count; i++) {
+		brelse (UCPI_UBH(ucpi)->bh[i]);
 	}
 
 	sbi->s_cgno[bitmap_nr] = UFS_CGNO_EMPTY;
diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c
index c7a47ed4f430..2da0ffda82cc 100644
--- a/fs/ufs/ialloc.c
+++ b/fs/ufs/ialloc.c
@@ -91,7 +91,7 @@ void ufs_free_inode (struct inode * inode)
 		unlock_super (sb);
 		return;
 	}
-	ucg = ubh_get_ucg(UCPI_UBH);
+	ucg = ubh_get_ucg(UCPI_UBH(ucpi));
 	if (!ufs_cg_chkmagic(sb, ucg))
 		ufs_panic (sb, "ufs_free_fragments", "internal error, bad cg magic number");
 
@@ -104,10 +104,10 @@ void ufs_free_inode (struct inode * inode)
 
 	clear_inode (inode);
 
-	if (ubh_isclr (UCPI_UBH, ucpi->c_iusedoff, bit))
+	if (ubh_isclr (UCPI_UBH(ucpi), ucpi->c_iusedoff, bit))
 		ufs_error(sb, "ufs_free_inode", "bit already cleared for inode %u", ino);
 	else {
-		ubh_clrbit (UCPI_UBH, ucpi->c_iusedoff, bit);
+		ubh_clrbit (UCPI_UBH(ucpi), ucpi->c_iusedoff, bit);
 		if (ino < ucpi->c_irotor)
 			ucpi->c_irotor = ino;
 		fs32_add(sb, &ucg->cg_cs.cs_nifree, 1);
@@ -121,11 +121,11 @@ void ufs_free_inode (struct inode * inode)
 		}
 	}
 
-	ubh_mark_buffer_dirty (USPI_UBH);
-	ubh_mark_buffer_dirty (UCPI_UBH);
+	ubh_mark_buffer_dirty (USPI_UBH(uspi));
+	ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
 	if (sb->s_flags & MS_SYNCHRONOUS) {
 		ubh_ll_rw_block (SWRITE, 1, (struct ufs_buffer_head **) &ucpi);
-		ubh_wait_on_buffer (UCPI_UBH);
+		ubh_wait_on_buffer (UCPI_UBH(ucpi));
 	}
 	
 	sb->s_dirt = 1;
@@ -213,14 +213,14 @@ cg_found:
 	ucpi = ufs_load_cylinder (sb, cg);
 	if (!ucpi)
 		goto failed;
-	ucg = ubh_get_ucg(UCPI_UBH);
+	ucg = ubh_get_ucg(UCPI_UBH(ucpi));
 	if (!ufs_cg_chkmagic(sb, ucg)) 
 		ufs_panic (sb, "ufs_new_inode", "internal error, bad cg magic number");
 
 	start = ucpi->c_irotor;
-	bit = ubh_find_next_zero_bit (UCPI_UBH, ucpi->c_iusedoff, uspi->s_ipg, start);
+	bit = ubh_find_next_zero_bit (UCPI_UBH(ucpi), ucpi->c_iusedoff, uspi->s_ipg, start);
 	if (!(bit < uspi->s_ipg)) {
-		bit = ubh_find_first_zero_bit (UCPI_UBH, ucpi->c_iusedoff, start);
+		bit = ubh_find_first_zero_bit (UCPI_UBH(ucpi), ucpi->c_iusedoff, start);
 		if (!(bit < start)) {
 			ufs_error (sb, "ufs_new_inode",
 			    "cylinder group %u corrupted - error in inode bitmap\n", cg);
@@ -228,8 +228,8 @@ cg_found:
 		}
 	}
 	UFSD(("start = %u, bit = %u, ipg = %u\n", start, bit, uspi->s_ipg))
-	if (ubh_isclr (UCPI_UBH, ucpi->c_iusedoff, bit))
-		ubh_setbit (UCPI_UBH, ucpi->c_iusedoff, bit);
+	if (ubh_isclr (UCPI_UBH(ucpi), ucpi->c_iusedoff, bit))
+		ubh_setbit (UCPI_UBH(ucpi), ucpi->c_iusedoff, bit);
 	else {
 		ufs_panic (sb, "ufs_new_inode", "internal error");
 		goto failed;
@@ -245,11 +245,11 @@ cg_found:
 		fs32_add(sb, &sbi->fs_cs(cg).cs_ndir, 1);
 	}
 
-	ubh_mark_buffer_dirty (USPI_UBH);
-	ubh_mark_buffer_dirty (UCPI_UBH);
+	ubh_mark_buffer_dirty (USPI_UBH(uspi));
+	ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
 	if (sb->s_flags & MS_SYNCHRONOUS) {
 		ubh_ll_rw_block (SWRITE, 1, (struct ufs_buffer_head **) &ucpi);
-		ubh_wait_on_buffer (UCPI_UBH);
+		ubh_wait_on_buffer (UCPI_UBH(ucpi));
 	}
 	sb->s_dirt = 1;
 
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index fe5ab2aa2899..c00d1e741529 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -225,7 +225,7 @@ void ufs_error (struct super_block * sb, const char * function,
 	
 	if (!(sb->s_flags & MS_RDONLY)) {
 		usb1->fs_clean = UFS_FSBAD;
-		ubh_mark_buffer_dirty(USPI_UBH);
+		ubh_mark_buffer_dirty(USPI_UBH(uspi));
 		sb->s_dirt = 1;
 		sb->s_flags |= MS_RDONLY;
 	}
@@ -257,7 +257,7 @@ void ufs_panic (struct super_block * sb, const char * function,
 	
 	if (!(sb->s_flags & MS_RDONLY)) {
 		usb1->fs_clean = UFS_FSBAD;
-		ubh_mark_buffer_dirty(USPI_UBH);
+		ubh_mark_buffer_dirty(USPI_UBH(uspi));
 		sb->s_dirt = 1;
 	}
 	va_start (args, fmt);
@@ -1014,7 +1014,7 @@ static void ufs_write_super (struct super_block *sb) {
 		  || (flags & UFS_ST_MASK) == UFS_ST_SUNx86)
 			ufs_set_fs_state(sb, usb1, usb3,
 					UFS_FSOK - fs32_to_cpu(sb, usb1->fs_time));
-		ubh_mark_buffer_dirty (USPI_UBH);
+		ubh_mark_buffer_dirty (USPI_UBH(uspi));
 	}
 	sb->s_dirt = 0;
 	UFSD(("EXIT\n"))
@@ -1083,7 +1083,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
 		  || (flags & UFS_ST_MASK) == UFS_ST_SUNx86) 
 			ufs_set_fs_state(sb, usb1, usb3,
 				UFS_FSOK - fs32_to_cpu(sb, usb1->fs_time));
-		ubh_mark_buffer_dirty (USPI_UBH);
+		ubh_mark_buffer_dirty (USPI_UBH(uspi));
 		sb->s_dirt = 0;
 		sb->s_flags |= MS_RDONLY;
 	}
diff --git a/fs/ufs/util.c b/fs/ufs/util.c
index 72f91cc84bfe..f9556bc484ef 100644
--- a/fs/ufs/util.c
+++ b/fs/ufs/util.c
@@ -63,17 +63,17 @@ struct ufs_buffer_head * ubh_bread_uspi (struct ufs_sb_private_info * uspi,
 	count = size >> uspi->s_fshift;
 	if (count <= 0 || count > UFS_MAXFRAG)
 		return NULL;
-	USPI_UBH->fragment = fragment;
-	USPI_UBH->count = count;
+	USPI_UBH(uspi)->fragment = fragment;
+	USPI_UBH(uspi)->count = count;
 	for (i = 0; i < count; i++)
-		if (!(USPI_UBH->bh[i] = sb_bread(sb, fragment + i)))
+		if (!(USPI_UBH(uspi)->bh[i] = sb_bread(sb, fragment + i)))
 			goto failed;
 	for (; i < UFS_MAXFRAG; i++)
-		USPI_UBH->bh[i] = NULL;
-	return USPI_UBH;
+		USPI_UBH(uspi)->bh[i] = NULL;
+	return USPI_UBH(uspi);
 failed:
 	for (j = 0; j < i; j++)
-		brelse (USPI_UBH->bh[j]);
+		brelse (USPI_UBH(uspi)->bh[j]);
 	return NULL;
 }
 
@@ -90,11 +90,11 @@ void ubh_brelse (struct ufs_buffer_head * ubh)
 void ubh_brelse_uspi (struct ufs_sb_private_info * uspi)
 {
 	unsigned i;
-	if (!USPI_UBH)
+	if (!USPI_UBH(uspi))
 		return;
-	for ( i = 0; i < USPI_UBH->count; i++ ) {
-		brelse (USPI_UBH->bh[i]);
-		USPI_UBH->bh[i] = NULL;
+	for ( i = 0; i < USPI_UBH(uspi)->count; i++ ) {
+		brelse (USPI_UBH(uspi)->bh[i]);
+		USPI_UBH(uspi)->bh[i] = NULL;
 	}
 }
 
diff --git a/fs/ufs/util.h b/fs/ufs/util.h
index e10362d8f456..6a0b48cf9cef 100644
--- a/fs/ufs/util.h
+++ b/fs/ufs/util.h
@@ -17,10 +17,16 @@
 #define in_range(b,first,len)	((b)>=(first)&&(b)<(first)+(len))
 
 /*
- * macros used for retyping
+ * functions used for retyping
  */
-#define UCPI_UBH ((struct ufs_buffer_head *)ucpi)
-#define USPI_UBH ((struct ufs_buffer_head *)uspi)
+static inline struct ufs_buffer_head *UCPI_UBH(struct ufs_cg_private_info *cpi)
+{
+	return &cpi->c_ubh;
+}
+static inline struct ufs_buffer_head *USPI_UBH(struct ufs_sb_private_info *spi)
+{
+	return &spi->s_ubh;
+}
 
 
@@ -326,10 +332,10 @@ static inline void *get_usb_offset(struct ufs_sb_private_info *uspi,
  * Macros to access cylinder group array structures
  */
 #define ubh_cg_blktot(ucpi,cylno) \
-	(*((__fs32*)ubh_get_addr(UCPI_UBH, (ucpi)->c_btotoff + ((cylno) << 2))))
+	(*((__fs32*)ubh_get_addr(UCPI_UBH(ucpi), (ucpi)->c_btotoff + ((cylno) << 2))))
 
 #define ubh_cg_blks(ucpi,cylno,rpos) \
-	(*((__fs16*)ubh_get_addr(UCPI_UBH, \
+	(*((__fs16*)ubh_get_addr(UCPI_UBH(ucpi), \
 	(ucpi)->c_boff + (((cylno) * uspi->s_nrpos + (rpos)) << 1 ))))
 
 /*
diff --git a/include/linux/ufs_fs.h b/include/linux/ufs_fs.h
index 9d2b519700e7..48394dae225d 100644
--- a/include/linux/ufs_fs.h
+++ b/include/linux/ufs_fs.h
@@ -666,7 +666,7 @@ struct ufs_buffer_head {
 };
 
 struct ufs_cg_private_info {
-	struct ufs_cylinder_group ucg;
+	struct ufs_buffer_head c_ubh;
 	__u32	c_cgx;		/* number of cylidner group */
 	__u16	c_ncyl;		/* number of cyl's this cg */
 	__u16	c_niblk;	/* number of inode blocks this cg */
-- 
cgit v1.2.3


From abf5d15fd2e52517dd56a17a846d5a1f900b7db4 Mon Sep 17 00:00:00 2001
From: Evgeniy Dushistov <dushistov@mail.ru>
Date: Sun, 25 Jun 2006 05:47:24 -0700
Subject: [PATCH] ufs: easy debug

Currently to turn on debug mode "user" has to edit ~10 files, to turn off he
has to do it again.

This patch introduce such changes:
1)turn on(off) debug messages via ".config"
2)remove unnecessary duplication of code
3)make "UFSD" macros more similar to function
4)fix some compiler warnings

Signed-off-by: Evgeniy Dushistov <dushistov@mail.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/Kconfig             |   8 ++++
 fs/ufs/balloc.c        |  62 ++++++++++++---------------
 fs/ufs/cylinder.c      |  31 +++++---------
 fs/ufs/dir.c           |  28 +++++-------
 fs/ufs/ialloc.c        |  22 +++-------
 fs/ufs/inode.c         |  47 +++++++++-----------
 fs/ufs/namei.c         |  21 ++++-----
 fs/ufs/super.c         | 114 +++++++++++++++++++++++--------------------------
 fs/ufs/truncate.c      |  30 +++++--------
 fs/ufs/util.c          |   9 ----
 include/linux/ufs_fs.h |  13 ++++++
 11 files changed, 168 insertions(+), 217 deletions(-)

(limited to 'include/linux')

diff --git a/fs/Kconfig b/fs/Kconfig
index c0afaccad609..ea60e83e7fed 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -1375,6 +1375,14 @@ config UFS_FS_WRITE
 	  Say Y here if you want to try writing to UFS partitions. This is
 	  experimental, so you should back up your UFS partitions beforehand.
 
+config UFS_DEBUG
+	bool "UFS debugging"
+	depends on UFS_FS
+	help
+	  If you are experiencing any problems with the UFS filesystem, say
+	  Y here.  This will result in _many_ additional debugging messages to be
+	  written to the system log.
+
 endmenu
 
 menu "Network File Systems"
diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c
index 343eaf4542f8..7a4735f591bc 100644
--- a/fs/ufs/balloc.c
+++ b/fs/ufs/balloc.c
@@ -21,14 +21,6 @@
 #include "swab.h"
 #include "util.h"
 
-#undef UFS_BALLOC_DEBUG
-
-#ifdef UFS_BALLOC_DEBUG
-#define UFSD(x) printk("(%s, %d), %s:", __FILE__, __LINE__, __FUNCTION__); printk x;
-#else
-#define UFSD(x)
-#endif
-
 static unsigned ufs_add_fragments (struct inode *, unsigned, unsigned, unsigned, int *);
 static unsigned ufs_alloc_fragments (struct inode *, unsigned, unsigned, unsigned, int *);
 static unsigned ufs_alloccg_block (struct inode *, struct ufs_cg_private_info *, unsigned, int *);
@@ -52,7 +44,7 @@ void ufs_free_fragments(struct inode *inode, unsigned fragment, unsigned count)
 	uspi = UFS_SB(sb)->s_uspi;
 	usb1 = ubh_get_usb_first(uspi);
 	
-	UFSD(("ENTER, fragment %u, count %u\n", fragment, count))
+	UFSD("ENTER, fragment %u, count %u\n", fragment, count);
 	
 	if (ufs_fragnum(fragment) + count > uspi->s_fpg)
 		ufs_error (sb, "ufs_free_fragments", "internal error");
@@ -123,12 +115,12 @@ void ufs_free_fragments(struct inode *inode, unsigned fragment, unsigned count)
 	sb->s_dirt = 1;
 	
 	unlock_super (sb);
-	UFSD(("EXIT\n"))
+	UFSD("EXIT\n");
 	return;
 
 failed:
 	unlock_super (sb);
-	UFSD(("EXIT (FAILED)\n"))
+	UFSD("EXIT (FAILED)\n");
 	return;
 }
 
@@ -148,7 +140,7 @@ void ufs_free_blocks(struct inode *inode, unsigned fragment, unsigned count)
 	uspi = UFS_SB(sb)->s_uspi;
 	usb1 = ubh_get_usb_first(uspi);
 
-	UFSD(("ENTER, fragment %u, count %u\n", fragment, count))
+	UFSD("ENTER, fragment %u, count %u\n", fragment, count);
 	
 	if ((fragment & uspi->s_fpbmask) || (count & uspi->s_fpbmask)) {
 		ufs_error (sb, "ufs_free_blocks", "internal error, "
@@ -215,12 +207,12 @@ do_more:
 
 	sb->s_dirt = 1;
 	unlock_super (sb);
-	UFSD(("EXIT\n"))
+	UFSD("EXIT\n");
 	return;
 
 failed:
 	unlock_super (sb);
-	UFSD(("EXIT (FAILED)\n"))
+	UFSD("EXIT (FAILED)\n");
 	return;
 }
 
@@ -290,8 +282,8 @@ static void ufs_change_blocknr(struct inode *inode, unsigned int count,
 
 	baseblk = ((i_size_read(inode) - 1) >> inode->i_blkbits) + 1 - count;
 
-	UFSD(("ENTER, ino %lu, count %u, oldb %u, newb %u\n",
-	      inode->i_ino, count, oldb, newb));
+	UFSD("ENTER, ino %lu, count %u, oldb %u, newb %u\n",
+	      inode->i_ino, count, oldb, newb);
 
 	BUG_ON(!PageLocked(locked_page));
 
@@ -326,7 +318,7 @@ static void ufs_change_blocknr(struct inode *inode, unsigned int count,
 			page_cache_release(page);
 		}
  	}
-	UFSD(("EXIT\n"));
+	UFSD("EXIT\n");
 }
 
 unsigned ufs_new_fragments(struct inode * inode, __fs32 * p, unsigned fragment,
@@ -337,7 +329,7 @@ unsigned ufs_new_fragments(struct inode * inode, __fs32 * p, unsigned fragment,
 	struct ufs_super_block_first * usb1;
 	unsigned cgno, oldcount, newcount, tmp, request, result;
 	
-	UFSD(("ENTER, ino %lu, fragment %u, goal %u, count %u\n", inode->i_ino, fragment, goal, count))
+	UFSD("ENTER, ino %lu, fragment %u, goal %u, count %u\n", inode->i_ino, fragment, goal, count);
 	
 	sb = inode->i_sb;
 	uspi = UFS_SB(sb)->s_uspi;
@@ -366,14 +358,14 @@ unsigned ufs_new_fragments(struct inode * inode, __fs32 * p, unsigned fragment,
 			return (unsigned)-1;
 		}
 		if (fragment < UFS_I(inode)->i_lastfrag) {
-			UFSD(("EXIT (ALREADY ALLOCATED)\n"))
+			UFSD("EXIT (ALREADY ALLOCATED)\n");
 			unlock_super (sb);
 			return 0;
 		}
 	}
 	else {
 		if (tmp) {
-			UFSD(("EXIT (ALREADY ALLOCATED)\n"))
+			UFSD("EXIT (ALREADY ALLOCATED)\n");
 			unlock_super(sb);
 			return 0;
 		}
@@ -384,7 +376,7 @@ unsigned ufs_new_fragments(struct inode * inode, __fs32 * p, unsigned fragment,
 	 */
 	if (!capable(CAP_SYS_RESOURCE) && ufs_freespace(usb1, UFS_MINFREE) <= 0) {
 		unlock_super (sb);
-		UFSD(("EXIT (FAILED)\n"))
+		UFSD("EXIT (FAILED)\n");
 		return 0;
 	}
 
@@ -407,7 +399,7 @@ unsigned ufs_new_fragments(struct inode * inode, __fs32 * p, unsigned fragment,
 			UFS_I(inode)->i_lastfrag = max_t(u32, UFS_I(inode)->i_lastfrag, fragment + count);
 		}
 		unlock_super(sb);
-		UFSD(("EXIT, result %u\n", result))
+		UFSD("EXIT, result %u\n", result);
 		return result;
 	}
 
@@ -420,7 +412,7 @@ unsigned ufs_new_fragments(struct inode * inode, __fs32 * p, unsigned fragment,
 		inode->i_blocks += count << uspi->s_nspfshift;
 		UFS_I(inode)->i_lastfrag = max_t(u32, UFS_I(inode)->i_lastfrag, fragment + count);
 		unlock_super(sb);
-		UFSD(("EXIT, result %u\n", result))
+		UFSD("EXIT, result %u\n", result);
 		return result;
 	}
 
@@ -458,12 +450,12 @@ unsigned ufs_new_fragments(struct inode * inode, __fs32 * p, unsigned fragment,
 		if (newcount < request)
 			ufs_free_fragments (inode, result + newcount, request - newcount);
 		ufs_free_fragments (inode, tmp, oldcount);
-		UFSD(("EXIT, result %u\n", result))
+		UFSD("EXIT, result %u\n", result);
 		return result;
 	}
 
 	unlock_super(sb);
-	UFSD(("EXIT (FAILED)\n"))
+	UFSD("EXIT (FAILED)\n");
 	return 0;
 }		
 
@@ -478,7 +470,7 @@ ufs_add_fragments (struct inode * inode, unsigned fragment,
 	struct ufs_cylinder_group * ucg;
 	unsigned cgno, fragno, fragoff, count, fragsize, i;
 	
-	UFSD(("ENTER, fragment %u, oldcount %u, newcount %u\n", fragment, oldcount, newcount))
+	UFSD("ENTER, fragment %u, oldcount %u, newcount %u\n", fragment, oldcount, newcount);
 	
 	sb = inode->i_sb;
 	uspi = UFS_SB(sb)->s_uspi;
@@ -538,7 +530,7 @@ ufs_add_fragments (struct inode * inode, unsigned fragment,
 	}
 	sb->s_dirt = 1;
 
-	UFSD(("EXIT, fragment %u\n", fragment))
+	UFSD("EXIT, fragment %u\n", fragment);
 	
 	return fragment;
 }
@@ -561,7 +553,7 @@ static unsigned ufs_alloc_fragments (struct inode * inode, unsigned cgno,
 	struct ufs_cylinder_group * ucg;
 	unsigned oldcg, i, j, k, result, allocsize;
 	
-	UFSD(("ENTER, ino %lu, cgno %u, goal %u, count %u\n", inode->i_ino, cgno, goal, count))
+	UFSD("ENTER, ino %lu, cgno %u, goal %u, count %u\n", inode->i_ino, cgno, goal, count);
 
 	sb = inode->i_sb;
 	uspi = UFS_SB(sb)->s_uspi;
@@ -595,7 +587,7 @@ static unsigned ufs_alloc_fragments (struct inode * inode, unsigned cgno,
 		UFS_TEST_FREE_SPACE_CG
 	}
 	
-	UFSD(("EXIT (FAILED)\n"))
+	UFSD("EXIT (FAILED)\n");
 	return 0;
 
 cg_found:
@@ -664,7 +656,7 @@ succed:
 	sb->s_dirt = 1;
 
 	result += cgno * uspi->s_fpg;
-	UFSD(("EXIT3, result %u\n", result))
+	UFSD("EXIT3, result %u\n", result);
 	return result;
 }
 
@@ -677,7 +669,7 @@ static unsigned ufs_alloccg_block (struct inode * inode,
 	struct ufs_cylinder_group * ucg;
 	unsigned result, cylno, blkno;
 
-	UFSD(("ENTER, goal %u\n", goal))
+	UFSD("ENTER, goal %u\n", goal);
 
 	sb = inode->i_sb;
 	uspi = UFS_SB(sb)->s_uspi;
@@ -721,7 +713,7 @@ gotit:
 	fs16_sub(sb, &ubh_cg_blks(ucpi, cylno, ufs_cbtorpos(result)), 1);
 	fs32_sub(sb, &ubh_cg_blktot(ucpi, cylno), 1);
 	
-	UFSD(("EXIT, result %u\n", result))
+	UFSD("EXIT, result %u\n", result);
 
 	return result;
 }
@@ -781,7 +773,7 @@ static unsigned ufs_bitmap_search(struct super_block *sb,
 	unsigned start, length, loc, result;
 	unsigned pos, want, blockmap, mask, end;
 
-	UFSD(("ENTER, cg %u, goal %u, count %u\n", ucpi->c_cgx, goal, count));
+	UFSD("ENTER, cg %u, goal %u, count %u\n", ucpi->c_cgx, goal, count);
 
 	usb1 = ubh_get_usb_first (uspi);
 	ucg = ubh_get_ucg(UCPI_UBH(ucpi));
@@ -825,7 +817,7 @@ static unsigned ufs_bitmap_search(struct super_block *sb,
 		want = want_arr[count];
 		for (pos = 0; pos <= uspi->s_fpb - count; pos++) {
 			if ((blockmap & mask) == want) {
-				UFSD(("EXIT, result %u\n", result));
+				UFSD("EXIT, result %u\n", result);
 				return result + pos;
  			}
 			mask <<= 1;
@@ -835,7 +827,7 @@ static unsigned ufs_bitmap_search(struct super_block *sb,
 
 	ufs_error(sb, "ufs_bitmap_search", "block not in map on cg %u\n",
 		  ucpi->c_cgx);
-	UFSD(("EXIT (FAILED)\n"));
+	UFSD("EXIT (FAILED)\n");
 	return (unsigned)-1;
 }
 
diff --git a/fs/ufs/cylinder.c b/fs/ufs/cylinder.c
index 65fe06810172..09c39e5e6386 100644
--- a/fs/ufs/cylinder.c
+++ b/fs/ufs/cylinder.c
@@ -20,15 +20,6 @@
 #include "swab.h"
 #include "util.h"
 
-#undef UFS_CYLINDER_DEBUG
-
-#ifdef UFS_CYLINDER_DEBUG
-#define UFSD(x) printk("(%s, %d), %s:", __FILE__, __LINE__, __FUNCTION__); printk x;
-#else
-#define UFSD(x)
-#endif
-
-
 /*
  * Read cylinder group into cache. The memory space for ufs_cg_private_info
  * structure is already allocated during ufs_read_super.
@@ -42,7 +33,7 @@ static void ufs_read_cylinder (struct super_block * sb,
 	struct ufs_cylinder_group * ucg;
 	unsigned i, j;
 
-	UFSD(("ENTER, cgno %u, bitmap_nr %u\n", cgno, bitmap_nr))
+	UFSD("ENTER, cgno %u, bitmap_nr %u\n", cgno, bitmap_nr);
 	uspi = sbi->s_uspi;
 	ucpi = sbi->s_ucpi[bitmap_nr];
 	ucg = (struct ufs_cylinder_group *)sbi->s_ucg[cgno]->b_data;
@@ -73,7 +64,7 @@ static void ufs_read_cylinder (struct super_block * sb,
 	ucpi->c_clustersumoff = fs32_to_cpu(sb, ucg->cg_u.cg_44.cg_clustersumoff);
 	ucpi->c_clusteroff = fs32_to_cpu(sb, ucg->cg_u.cg_44.cg_clusteroff);
 	ucpi->c_nclusterblks = fs32_to_cpu(sb, ucg->cg_u.cg_44.cg_nclusterblks);
-	UFSD(("EXIT\n"))
+	UFSD("EXIT\n");
 	return;	
 	
 failed:
@@ -95,11 +86,11 @@ void ufs_put_cylinder (struct super_block * sb, unsigned bitmap_nr)
 	struct ufs_cylinder_group * ucg;
 	unsigned i;
 
-	UFSD(("ENTER, bitmap_nr %u\n", bitmap_nr))
+	UFSD("ENTER, bitmap_nr %u\n", bitmap_nr);
 
 	uspi = sbi->s_uspi;
 	if (sbi->s_cgno[bitmap_nr] == UFS_CGNO_EMPTY) {
-		UFSD(("EXIT\n"))
+		UFSD("EXIT\n");
 		return;
 	}
 	ucpi = sbi->s_ucpi[bitmap_nr];
@@ -122,7 +113,7 @@ void ufs_put_cylinder (struct super_block * sb, unsigned bitmap_nr)
 	}
 
 	sbi->s_cgno[bitmap_nr] = UFS_CGNO_EMPTY;
-	UFSD(("EXIT\n"))
+	UFSD("EXIT\n");
 }
 
 /*
@@ -139,7 +130,7 @@ struct ufs_cg_private_info * ufs_load_cylinder (
 	struct ufs_cg_private_info * ucpi;
 	unsigned cg, i, j;
 
-	UFSD(("ENTER, cgno %u\n", cgno))
+	UFSD("ENTER, cgno %u\n", cgno);
 
 	uspi = sbi->s_uspi;
 	if (cgno >= uspi->s_ncg) {
@@ -150,7 +141,7 @@ struct ufs_cg_private_info * ufs_load_cylinder (
 	 * Cylinder group number cg it in cache and it was last used
 	 */
 	if (sbi->s_cgno[0] == cgno) {
-		UFSD(("EXIT\n"))
+		UFSD("EXIT\n");
 		return sbi->s_ucpi[0];
 	}
 	/*
@@ -160,16 +151,16 @@ struct ufs_cg_private_info * ufs_load_cylinder (
 		if (sbi->s_cgno[cgno] != UFS_CGNO_EMPTY) {
 			if (sbi->s_cgno[cgno] != cgno) {
 				ufs_panic (sb, "ufs_load_cylinder", "internal error, wrong number of cg in cache");
-				UFSD(("EXIT (FAILED)\n"))
+				UFSD("EXIT (FAILED)\n");
 				return NULL;
 			}
 			else {
-				UFSD(("EXIT\n"))
+				UFSD("EXIT\n");
 				return sbi->s_ucpi[cgno];
 			}
 		} else {
 			ufs_read_cylinder (sb, cgno, cgno);
-			UFSD(("EXIT\n"))
+			UFSD("EXIT\n");
 			return sbi->s_ucpi[cgno];
 		}
 	}
@@ -204,6 +195,6 @@ struct ufs_cg_private_info * ufs_load_cylinder (
 		sbi->s_ucpi[0] = ucpi;
 		ufs_read_cylinder (sb, cgno, 0);
 	}
-	UFSD(("EXIT\n"))
+	UFSD("EXIT\n");
 	return sbi->s_ucpi[0];
 }
diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c
index 9473df5bff51..732c3fd2b6f2 100644
--- a/fs/ufs/dir.c
+++ b/fs/ufs/dir.c
@@ -25,14 +25,6 @@
 #include "swab.h"
 #include "util.h"
 
-#undef UFS_DIR_DEBUG
-
-#ifdef UFS_DIR_DEBUG
-#define UFSD(x) printk("(%s, %d), %s: ", __FILE__, __LINE__, __FUNCTION__); printk x;
-#else
-#define UFSD(x)
-#endif
-
 /*
  * NOTE! unlike strncmp, ufs_match returns 1 for success, 0 for failure.
  *
@@ -262,7 +254,7 @@ struct ufs_dir_entry *ufs_find_entry(struct inode *dir, struct dentry *dentry,
 	struct page *page = NULL;
 	struct ufs_dir_entry *de;
 
-	UFSD(("ENTER, dir_ino %lu, name %s, namlen %u\n", dir->i_ino, name, namelen));
+	UFSD("ENTER, dir_ino %lu, name %s, namlen %u\n", dir->i_ino, name, namelen);
 
 	if (npages == 0 || namelen > UFS_MAXNAMLEN)
 		goto out;
@@ -326,7 +318,7 @@ int ufs_add_link(struct dentry *dentry, struct inode *inode)
 	unsigned from, to;
 	int err;
 
-	UFSD(("ENTER, name %s, namelen %u\n", name, namelen));
+	UFSD("ENTER, name %s, namelen %u\n", name, namelen);
 
 	/*
 	 * We take care of directory expansion in the same loop.
@@ -442,7 +434,7 @@ ufs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	int need_revalidate = filp->f_version != inode->i_version;
 	unsigned flags = UFS_SB(sb)->s_flags;
 
-	UFSD(("BEGIN"));
+	UFSD("BEGIN\n");
 
 	if (pos > inode->i_size - UFS_DIR_REC_LEN(1))
 		return 0;
@@ -484,9 +476,9 @@ ufs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 
 				offset = (char *)de - kaddr;
 
-				UFSD(("filldir(%s,%u)\n", de->d_name,
-				      fs32_to_cpu(sb, de->d_ino)));
-				UFSD(("namlen %u\n", ufs_get_de_namlen(sb, de)));
+				UFSD("filldir(%s,%u)\n", de->d_name,
+				      fs32_to_cpu(sb, de->d_ino));
+				UFSD("namlen %u\n", ufs_get_de_namlen(sb, de));
 
 				if ((flags & UFS_DE_MASK) == UFS_DE_44BSD)
 					d_type = de->d_u.d_44.d_type;
@@ -524,12 +516,12 @@ int ufs_delete_entry(struct inode *inode, struct ufs_dir_entry *dir,
 	struct ufs_dir_entry *de = (struct ufs_dir_entry *) (kaddr + from);
 	int err;
 
-	UFSD(("ENTER\n"));
+	UFSD("ENTER\n");
 
-	UFSD(("ino %u, reclen %u, namlen %u, name %s\n",
+	UFSD("ino %u, reclen %u, namlen %u, name %s\n",
 	      fs32_to_cpu(sb, de->d_ino),
 	      fs16_to_cpu(sb, de->d_reclen),
-	      ufs_get_de_namlen(sb, de), de->d_name));
+	      ufs_get_de_namlen(sb, de), de->d_name);
 
 	while ((char*)de < (char*)dir) {
 		if (de->d_reclen == 0) {
@@ -554,7 +546,7 @@ int ufs_delete_entry(struct inode *inode, struct ufs_dir_entry *dir,
 	mark_inode_dirty(inode);
 out:
 	ufs_put_page(page);
-	UFSD(("EXIT\n"));
+	UFSD("EXIT\n");
 	return err;
 }
 
diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c
index 2da0ffda82cc..ad017fa2dd21 100644
--- a/fs/ufs/ialloc.c
+++ b/fs/ufs/ialloc.c
@@ -34,14 +34,6 @@
 #include "swab.h"
 #include "util.h"
 
-#undef UFS_IALLOC_DEBUG
-
-#ifdef UFS_IALLOC_DEBUG
-#define UFSD(x) printk("(%s, %d), %s: ", __FILE__, __LINE__, __FUNCTION__); printk x;
-#else
-#define UFSD(x)
-#endif
-
 /*
  * NOTE! When we get the inode, we're the only people
  * that have access to it, and as such there are no
@@ -68,7 +60,7 @@ void ufs_free_inode (struct inode * inode)
 	int is_directory;
 	unsigned ino, cg, bit;
 	
-	UFSD(("ENTER, ino %lu\n", inode->i_ino))
+	UFSD("ENTER, ino %lu\n", inode->i_ino);
 
 	sb = inode->i_sb;
 	uspi = UFS_SB(sb)->s_uspi;
@@ -130,7 +122,7 @@ void ufs_free_inode (struct inode * inode)
 	
 	sb->s_dirt = 1;
 	unlock_super (sb);
-	UFSD(("EXIT\n"))
+	UFSD("EXIT\n");
 }
 
 /*
@@ -155,7 +147,7 @@ struct inode * ufs_new_inode(struct inode * dir, int mode)
 	unsigned cg, bit, i, j, start;
 	struct ufs_inode_info *ufsi;
 
-	UFSD(("ENTER\n"))
+	UFSD("ENTER\n");
 	
 	/* Cannot create files in a deleted directory */
 	if (!dir || !dir->i_nlink)
@@ -227,7 +219,7 @@ cg_found:
 			goto failed;
 		}
 	}
-	UFSD(("start = %u, bit = %u, ipg = %u\n", start, bit, uspi->s_ipg))
+	UFSD("start = %u, bit = %u, ipg = %u\n", start, bit, uspi->s_ipg);
 	if (ubh_isclr (UCPI_UBH(ucpi), ucpi->c_iusedoff, bit))
 		ubh_setbit (UCPI_UBH(ucpi), ucpi->c_iusedoff, bit);
 	else {
@@ -287,14 +279,14 @@ cg_found:
 		return ERR_PTR(-EDQUOT);
 	}
 
-	UFSD(("allocating inode %lu\n", inode->i_ino))
-	UFSD(("EXIT\n"))
+	UFSD("allocating inode %lu\n", inode->i_ino);
+	UFSD("EXIT\n");
 	return inode;
 
 failed:
 	unlock_super (sb);
 	make_bad_inode(inode);
 	iput (inode);
-	UFSD(("EXIT (FAILED)\n"))
+	UFSD("EXIT (FAILED)\n");
 	return ERR_PTR(-ENOSPC);
 }
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index c28b7522c9e7..01f754462341 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -41,15 +41,6 @@
 #include "swab.h"
 #include "util.h"
 
-#undef UFS_INODE_DEBUG
-#undef UFS_INODE_DEBUG_MORE
-
-#ifdef UFS_INODE_DEBUG
-#define UFSD(x) printk("(%s, %d), %s: ", __FILE__, __LINE__, __FUNCTION__); printk x;
-#else
-#define UFSD(x)
-#endif
-
 static int ufs_block_to_path(struct inode *inode, sector_t i_block, sector_t offsets[4])
 {
 	struct ufs_sb_private_info *uspi = UFS_SB(inode->i_sb)->s_uspi;
@@ -61,7 +52,7 @@ static int ufs_block_to_path(struct inode *inode, sector_t i_block, sector_t off
 	int n = 0;
 
 
-	UFSD(("ptrs=uspi->s_apb = %d,double_blocks=%ld \n",ptrs,double_blocks));
+	UFSD("ptrs=uspi->s_apb = %d,double_blocks=%ld \n",ptrs,double_blocks);
 	if (i_block < 0) {
 		ufs_warning(inode->i_sb, "ufs_block_to_path", "block < 0");
 	} else if (i_block < direct_blocks) {
@@ -104,8 +95,8 @@ u64  ufs_frag_map(struct inode *inode, sector_t frag)
 	unsigned flags = UFS_SB(sb)->s_flags;
 	u64 temp = 0L;
 
-	UFSD((": frag = %llu  depth = %d\n", (unsigned long long)frag, depth));
-	UFSD((": uspi->s_fpbshift = %d ,uspi->s_apbmask = %x, mask=%llx\n",uspi->s_fpbshift,uspi->s_apbmask,mask));
+	UFSD(": frag = %llu  depth = %d\n", (unsigned long long)frag, depth);
+	UFSD(": uspi->s_fpbshift = %d ,uspi->s_apbmask = %x, mask=%llx\n",uspi->s_fpbshift,uspi->s_apbmask,mask);
 
 	if (depth == 0)
 		return 0;
@@ -186,8 +177,8 @@ static struct buffer_head *ufs_inode_getfrag(struct inode *inode,
 	__fs32 * p, * p2;
 	unsigned flags = 0;
 
-	UFSD(("ENTER, ino %lu, fragment %u, new_fragment %u, required %u\n",
-		inode->i_ino, fragment, new_fragment, required))         
+	UFSD("ENTER, ino %lu, fragment %u, new_fragment %u, required %u\n",
+		inode->i_ino, fragment, new_fragment, required);
 
 	sb = inode->i_sb;
 	uspi = UFS_SB(sb)->s_uspi;
@@ -210,7 +201,7 @@ repeat:
 		if (metadata) {
 			result = sb_getblk(sb, uspi->s_sbbase + tmp + blockoff);
 			if (tmp == fs32_to_cpu(sb, *p)) {
-				UFSD(("EXIT, result %u\n", tmp + blockoff))
+				UFSD("EXIT, result %u\n", tmp + blockoff);
 				return result;
 			}
 			brelse (result);
@@ -288,7 +279,7 @@ repeat:
 	if (IS_SYNC(inode))
 		ufs_sync_inode (inode);
 	mark_inode_dirty(inode);
-	UFSD(("EXIT, result %u\n", tmp + blockoff))
+	UFSD("EXIT, result %u\n", tmp + blockoff);
 	return result;
 
      /* This part : To be implemented ....
@@ -323,7 +314,7 @@ static struct buffer_head *ufs_block_getfrag(struct inode *inode, struct buffer_
 	block = ufs_fragstoblks (fragment);
 	blockoff = ufs_fragnum (fragment);
 
-	UFSD(("ENTER, ino %lu, fragment %u, new_fragment %u\n", inode->i_ino, fragment, new_fragment))	
+	UFSD("ENTER, ino %lu, fragment %u, new_fragment %u\n", inode->i_ino, fragment, new_fragment);
 
 	result = NULL;
 	if (!bh)
@@ -377,10 +368,10 @@ repeat:
 		sync_dirty_buffer(bh);
 	inode->i_ctime = CURRENT_TIME_SEC;
 	mark_inode_dirty(inode);
-	UFSD(("result %u\n", tmp + blockoff));
+	UFSD("result %u\n", tmp + blockoff);
 out:
 	brelse (bh);
-	UFSD(("EXIT\n"));
+	UFSD("EXIT\n");
 	return result;
 }
 
@@ -399,7 +390,7 @@ int ufs_getfrag_block (struct inode *inode, sector_t fragment, struct buffer_hea
 	
 	if (!create) {
 		phys64 = ufs_frag_map(inode, fragment);
-		UFSD(("phys64 = %llu \n",phys64));
+		UFSD("phys64 = %llu \n",phys64);
 		if (phys64)
 			map_bh(bh_result, sb, phys64);
 		return 0;
@@ -414,7 +405,7 @@ int ufs_getfrag_block (struct inode *inode, sector_t fragment, struct buffer_hea
 
 	lock_kernel();
 
-	UFSD(("ENTER, ino %lu, fragment %llu\n", inode->i_ino, (unsigned long long)fragment))
+	UFSD("ENTER, ino %lu, fragment %llu\n", inode->i_ino, (unsigned long long)fragment);
 	if (fragment < 0)
 		goto abort_negative;
 	if (fragment >
@@ -514,7 +505,7 @@ struct buffer_head * ufs_bread (struct inode * inode, unsigned fragment,
 {
 	struct buffer_head * bh;
 
-	UFSD(("ENTER, ino %lu, fragment %u\n", inode->i_ino, fragment))
+	UFSD("ENTER, ino %lu, fragment %u\n", inode->i_ino, fragment);
 	bh = ufs_getfrag (inode, fragment, create, err);
 	if (!bh || buffer_uptodate(bh)) 		
 		return bh;
@@ -586,7 +577,7 @@ void ufs_read_inode (struct inode * inode)
 	unsigned i;
 	unsigned flags;
 	
-	UFSD(("ENTER, ino %lu\n", inode->i_ino))
+	UFSD("ENTER, ino %lu\n", inode->i_ino);
 	
 	sb = inode->i_sb;
 	uspi = UFS_SB(sb)->s_uspi;
@@ -652,7 +643,7 @@ void ufs_read_inode (struct inode * inode)
 
 	brelse (bh);
 
-	UFSD(("EXIT\n"))
+	UFSD("EXIT\n");
 	return;
 
 bad_inode:
@@ -660,7 +651,7 @@ bad_inode:
 	return;
 
 ufs2_inode :
-	UFSD(("Reading ufs2 inode, ino %lu\n", inode->i_ino))
+	UFSD("Reading ufs2 inode, ino %lu\n", inode->i_ino);
 
 	ufs2_inode = (struct ufs2_inode *)(bh->b_data + sizeof(struct ufs2_inode) * ufs_inotofsbo(inode->i_ino));
 
@@ -712,7 +703,7 @@ ufs2_inode :
 
 	brelse(bh);
 
-	UFSD(("EXIT\n"))
+	UFSD("EXIT\n");
 	return;
 }
 
@@ -726,7 +717,7 @@ static int ufs_update_inode(struct inode * inode, int do_sync)
 	unsigned i;
 	unsigned flags;
 
-	UFSD(("ENTER, ino %lu\n", inode->i_ino))
+	UFSD("ENTER, ino %lu\n", inode->i_ino);
 
 	sb = inode->i_sb;
 	uspi = UFS_SB(sb)->s_uspi;
@@ -787,7 +778,7 @@ static int ufs_update_inode(struct inode * inode, int do_sync)
 		sync_dirty_buffer(bh);
 	brelse (bh);
 	
-	UFSD(("EXIT\n"))
+	UFSD("EXIT\n");
 	return 0;
 }
 
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index 364bb92b0917..abd5f23a426d 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -34,17 +34,6 @@
 #include "swab.h"	/* will go away - see comment in mknod() */
 #include "util.h"
 
-/*
-#undef UFS_NAMEI_DEBUG
-*/
-#define UFS_NAMEI_DEBUG
-
-#ifdef UFS_NAMEI_DEBUG
-#define UFSD(x) printk("(%s, %d), %s: ", __FILE__, __LINE__, __FUNCTION__); printk x;
-#else
-#define UFSD(x)
-#endif
-
 static inline int ufs_add_nondir(struct dentry *dentry, struct inode *inode)
 {
 	int err = ufs_add_link(dentry, inode);
@@ -90,8 +79,13 @@ static struct dentry *ufs_lookup(struct inode * dir, struct dentry *dentry, stru
 static int ufs_create (struct inode * dir, struct dentry * dentry, int mode,
 		struct nameidata *nd)
 {
-	struct inode * inode = ufs_new_inode(dir, mode);
-	int err = PTR_ERR(inode);
+	struct inode *inode;
+	int err;
+
+	UFSD("BEGIN\n");
+	inode = ufs_new_inode(dir, mode);
+	err = PTR_ERR(inode);
+
 	if (!IS_ERR(inode)) {
 		inode->i_op = &ufs_file_inode_operations;
 		inode->i_fop = &ufs_file_operations;
@@ -101,6 +95,7 @@ static int ufs_create (struct inode * dir, struct dentry * dentry, int mode,
 		err = ufs_add_nondir(dentry, inode);
 		unlock_kernel();
 	}
+	UFSD("END: err=%d\n", err);
 	return err;
 }
 
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index c00d1e741529..42425999d2d3 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -90,18 +90,7 @@
 #include "swab.h"
 #include "util.h"
 
-#undef UFS_SUPER_DEBUG
-#undef UFS_SUPER_DEBUG_MORE
-
-
-#undef UFS_SUPER_DEBUG_MORE
-#ifdef UFS_SUPER_DEBUG
-#define UFSD(x) printk("(%s, %d), %s: ", __FILE__, __LINE__, __FUNCTION__); printk x;
-#else
-#define UFSD(x)
-#endif
-
-#ifdef UFS_SUPER_DEBUG_MORE
+#ifdef CONFIG_UFS_DEBUG
 /*
  * Print contents of ufs_super_block, useful for debugging
  */
@@ -157,18 +146,23 @@ void ufs2_print_super_stuff(
 	printk("ufs_print_super_stuff\n");
 	printk("size of usb:     %u\n", sizeof(struct ufs_super_block));
 	printk("  magic:         0x%x\n", fs32_to_cpu(sb, usb->fs_magic));
-	printk("  fs_size:   %u\n",fs64_to_cpu(sb, usb->fs_u11.fs_u2.fs_size));
-	printk("  fs_dsize:  %u\n",fs64_to_cpu(sb, usb->fs_u11.fs_u2.fs_dsize));
-	printk("  bsize:         %u\n", fs32_to_cpu(usb, usb->fs_bsize));
-	printk("  fsize:         %u\n", fs32_to_cpu(usb, usb->fs_fsize));
+	printk("  fs_size:   %llu\n",
+	       (unsigned long long)fs64_to_cpu(sb, usb->fs_u11.fs_u2.fs_size));
+	printk("  fs_dsize:  %llu\n",
+	       (unsigned long long)fs64_to_cpu(sb, usb->fs_u11.fs_u2.fs_dsize));
+	printk("  bsize:         %u\n", fs32_to_cpu(sb, usb->fs_bsize));
+	printk("  fsize:         %u\n", fs32_to_cpu(sb, usb->fs_fsize));
 	printk("  fs_volname:  %s\n", usb->fs_u11.fs_u2.fs_volname);
 	printk("  fs_fsmnt:  %s\n", usb->fs_u11.fs_u2.fs_fsmnt);
-	printk("  fs_sblockloc: %u\n",fs64_to_cpu(sb,
-			usb->fs_u11.fs_u2.fs_sblockloc));
-	printk("  cs_ndir(No of dirs):  %u\n",fs64_to_cpu(sb,
-			usb->fs_u11.fs_u2.fs_cstotal.cs_ndir));
-	printk("  cs_nbfree(No of free blocks):  %u\n",fs64_to_cpu(sb,
-			usb->fs_u11.fs_u2.fs_cstotal.cs_nbfree));
+	printk("  fs_sblockloc: %llu\n",
+	       (unsigned long long)fs64_to_cpu(sb,
+					       usb->fs_u11.fs_u2.fs_sblockloc));
+	printk("  cs_ndir(No of dirs):  %llu\n",
+	       (unsigned long long)fs64_to_cpu(sb,
+				         usb->fs_u11.fs_u2.fs_cstotal.cs_ndir));
+	printk("  cs_nbfree(No of free blocks):  %llu\n",
+	       (unsigned long long)fs64_to_cpu(sb,
+				       usb->fs_u11.fs_u2.fs_cstotal.cs_nbfree));
 	printk("\n");
 }
 
@@ -207,7 +201,7 @@ void ufs_print_cylinder_stuff(struct super_block *sb, struct ufs_cylinder_group
 	printk("  nclusterblks  %u\n", fs32_to_cpu(sb, cg->cg_u.cg_44.cg_nclusterblks));
 	printk("\n");
 }
-#endif /* UFS_SUPER_DEBUG_MORE */
+#endif /* CONFIG_UFS_DEBUG */
 
 static struct super_operations ufs_super_ops;
 
@@ -309,7 +303,7 @@ static int ufs_parse_options (char * options, unsigned * mount_options)
 {
 	char * p;
 	
-	UFSD(("ENTER\n"))
+	UFSD("ENTER\n");
 	
 	if (!options)
 		return 1;
@@ -398,7 +392,7 @@ static int ufs_read_cylinder_structures (struct super_block *sb)
 	unsigned size, blks, i;
 	unsigned flags = 0;
 	
-	UFSD(("ENTER\n"))
+	UFSD("ENTER\n");
 	
 	uspi = sbi->s_uspi;
 
@@ -451,12 +445,12 @@ static int ufs_read_cylinder_structures (struct super_block *sb)
 		sbi->s_cgno[i] = UFS_CGNO_EMPTY;
 	}
 	for (i = 0; i < uspi->s_ncg; i++) {
-		UFSD(("read cg %u\n", i))
+		UFSD("read cg %u\n", i);
 		if (!(sbi->s_ucg[i] = sb_bread(sb, ufs_cgcmin(i))))
 			goto failed;
 		if (!ufs_cg_chkmagic (sb, (struct ufs_cylinder_group *) sbi->s_ucg[i]->b_data))
 			goto failed;
-#ifdef UFS_SUPER_DEBUG_MORE
+#ifdef CONFIG_UFS_DEBUG
 		ufs_print_cylinder_stuff(sb, (struct ufs_cylinder_group *) sbi->s_ucg[i]->b_data);
 #endif
 	}
@@ -466,7 +460,7 @@ static int ufs_read_cylinder_structures (struct super_block *sb)
 		sbi->s_cgno[i] = UFS_CGNO_EMPTY;
 	}
 	sbi->s_cg_loaded = 0;
-	UFSD(("EXIT\n"))
+	UFSD("EXIT\n");
 	return 1;
 
 failed:
@@ -479,7 +473,7 @@ failed:
 		for (i = 0; i < UFS_MAX_GROUP_LOADED; i++)
 			kfree (sbi->s_ucpi[i]);
 	}
-	UFSD(("EXIT (FAILED)\n"))
+	UFSD("EXIT (FAILED)\n");
 	return 0;
 }
 
@@ -495,7 +489,7 @@ static void ufs_put_cylinder_structures (struct super_block *sb)
 	unsigned char * base, * space;
 	unsigned blks, size, i;
 	
-	UFSD(("ENTER\n"))
+	UFSD("ENTER\n");
 	
 	uspi = sbi->s_uspi;
 
@@ -523,7 +517,7 @@ static void ufs_put_cylinder_structures (struct super_block *sb)
 		brelse (sbi->s_ucg[i]);
 	kfree (sbi->s_ucg);
 	kfree (base);
-	UFSD(("EXIT\n"))
+	UFSD("EXIT\n");
 }
 
 static int ufs_fill_super(struct super_block *sb, void *data, int silent)
@@ -544,7 +538,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
 	ubh = NULL;
 	flags = 0;
 	
-	UFSD(("ENTER\n"))
+	UFSD("ENTER\n");
 		
 	sbi = kmalloc(sizeof(struct ufs_sb_info), GFP_KERNEL);
 	if (!sbi)
@@ -552,7 +546,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
 	sb->s_fs_info = sbi;
 	memset(sbi, 0, sizeof(struct ufs_sb_info));
 
-	UFSD(("flag %u\n", (int)(sb->s_flags & MS_RDONLY)))
+	UFSD("flag %u\n", (int)(sb->s_flags & MS_RDONLY));
 	
 #ifndef CONFIG_UFS_FS_WRITE
 	if (!(sb->s_flags & MS_RDONLY)) {
@@ -593,7 +587,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
 	   the rules */
 	switch (sbi->s_mount_opt & UFS_MOUNT_UFSTYPE) {
 	case UFS_MOUNT_UFSTYPE_44BSD:
-		UFSD(("ufstype=44bsd\n"))
+		UFSD("ufstype=44bsd\n");
 		uspi->s_fsize = block_size = 512;
 		uspi->s_fmask = ~(512 - 1);
 		uspi->s_fshift = 9;
@@ -602,7 +596,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
 		flags |= UFS_DE_44BSD | UFS_UID_44BSD | UFS_ST_44BSD | UFS_CG_44BSD;
 		break;
 	case UFS_MOUNT_UFSTYPE_UFS2:
-		UFSD(("ufstype=ufs2\n"));
+		UFSD("ufstype=ufs2\n");
 		super_block_offset=SBLOCK_UFS2;
 		uspi->s_fsize = block_size = 512;
 		uspi->s_fmask = ~(512 - 1);
@@ -617,7 +611,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
 		break;
 		
 	case UFS_MOUNT_UFSTYPE_SUN:
-		UFSD(("ufstype=sun\n"))
+		UFSD("ufstype=sun\n");
 		uspi->s_fsize = block_size = 1024;
 		uspi->s_fmask = ~(1024 - 1);
 		uspi->s_fshift = 10;
@@ -628,7 +622,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
 		break;
 
 	case UFS_MOUNT_UFSTYPE_SUNx86:
-		UFSD(("ufstype=sunx86\n"))
+		UFSD("ufstype=sunx86\n");
 		uspi->s_fsize = block_size = 1024;
 		uspi->s_fmask = ~(1024 - 1);
 		uspi->s_fshift = 10;
@@ -639,7 +633,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
 		break;
 
 	case UFS_MOUNT_UFSTYPE_OLD:
-		UFSD(("ufstype=old\n"))
+		UFSD("ufstype=old\n");
 		uspi->s_fsize = block_size = 1024;
 		uspi->s_fmask = ~(1024 - 1);
 		uspi->s_fshift = 10;
@@ -654,7 +648,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
 		break;
 	
 	case UFS_MOUNT_UFSTYPE_NEXTSTEP:
-		UFSD(("ufstype=nextstep\n"))
+		UFSD("ufstype=nextstep\n");
 		uspi->s_fsize = block_size = 1024;
 		uspi->s_fmask = ~(1024 - 1);
 		uspi->s_fshift = 10;
@@ -669,7 +663,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
 		break;
 	
 	case UFS_MOUNT_UFSTYPE_NEXTSTEP_CD:
-		UFSD(("ufstype=nextstep-cd\n"))
+		UFSD("ufstype=nextstep-cd\n");
 		uspi->s_fsize = block_size = 2048;
 		uspi->s_fmask = ~(2048 - 1);
 		uspi->s_fshift = 11;
@@ -684,7 +678,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
 		break;
 	
 	case UFS_MOUNT_UFSTYPE_OPENSTEP:
-		UFSD(("ufstype=openstep\n"))
+		UFSD("ufstype=openstep\n");
 		uspi->s_fsize = block_size = 1024;
 		uspi->s_fmask = ~(1024 - 1);
 		uspi->s_fshift = 10;
@@ -699,7 +693,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
 		break;
 	
 	case UFS_MOUNT_UFSTYPE_HP:
-		UFSD(("ufstype=hp\n"))
+		UFSD("ufstype=hp\n");
 		uspi->s_fsize = block_size = 1024;
 		uspi->s_fmask = ~(1024 - 1);
 		uspi->s_fshift = 10;
@@ -820,11 +814,11 @@ magic_found:
 		ubh = NULL;
 		block_size = uspi->s_fsize;
 		super_block_size = uspi->s_sbsize;
-		UFSD(("another value of block_size or super_block_size %u, %u\n", block_size, super_block_size))
+		UFSD("another value of block_size or super_block_size %u, %u\n", block_size, super_block_size);
 		goto again;
 	}
 
-#ifdef UFS_SUPER_DEBUG_MORE
+#ifdef CONFIG_UFS_DEBUG
         if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2)
 		ufs2_print_super_stuff(sb,usb);
         else
@@ -842,13 +836,13 @@ magic_found:
 	  (ufs_get_fs_state(sb, usb1, usb3) == (UFS_FSOK - fs32_to_cpu(sb, usb1->fs_time))))) {
 		switch(usb1->fs_clean) {
 		case UFS_FSCLEAN:
-			UFSD(("fs is clean\n"))
+			UFSD("fs is clean\n");
 			break;
 		case UFS_FSSTABLE:
-			UFSD(("fs is stable\n"))
+			UFSD("fs is stable\n");
 			break;
 		case UFS_FSOSF1:
-			UFSD(("fs is DEC OSF/1\n"))
+			UFSD("fs is DEC OSF/1\n");
 			break;
 		case UFS_FSACTIVE:
 			printk("ufs_read_super: fs is active\n");
@@ -901,8 +895,8 @@ magic_found:
 	uspi->s_fmask = fs32_to_cpu(sb, usb1->fs_fmask);
 	uspi->s_bshift = fs32_to_cpu(sb, usb1->fs_bshift);
 	uspi->s_fshift = fs32_to_cpu(sb, usb1->fs_fshift);
-	UFSD(("uspi->s_bshift = %d,uspi->s_fshift = %d", uspi->s_bshift,
-		uspi->s_fshift));
+	UFSD("uspi->s_bshift = %d,uspi->s_fshift = %d", uspi->s_bshift,
+		uspi->s_fshift);
 	uspi->s_fpbshift = fs32_to_cpu(sb, usb1->fs_fragshift);
 	uspi->s_fsbtodb = fs32_to_cpu(sb, usb1->fs_fsbtodb);
 	/* s_sbsize already set */
@@ -935,12 +929,11 @@ magic_found:
 	 * Compute another frequently used values
 	 */
 	uspi->s_fpbmask = uspi->s_fpb - 1;
-	if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) {
+	if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2)
 		uspi->s_apbshift = uspi->s_bshift - 3;
-	}
-	else {
+	else
 		uspi->s_apbshift = uspi->s_bshift - 2;
-	}
+
 	uspi->s_2apbshift = uspi->s_apbshift * 2;
 	uspi->s_3apbshift = uspi->s_apbshift * 3;
 	uspi->s_apb = 1 << uspi->s_apbshift;
@@ -975,7 +968,7 @@ magic_found:
 		if (!ufs_read_cylinder_structures(sb))
 			goto failed;
 
-	UFSD(("EXIT\n"))
+	UFSD("EXIT\n");
 	return 0;
 
 dalloc_failed:
@@ -986,11 +979,11 @@ failed:
 	kfree (uspi);
 	kfree(sbi);
 	sb->s_fs_info = NULL;
-	UFSD(("EXIT (FAILED)\n"))
+	UFSD("EXIT (FAILED)\n");
 	return -EINVAL;
 
 failed_nomem:
-	UFSD(("EXIT (NOMEM)\n"))
+	UFSD("EXIT (NOMEM)\n");
 	return -ENOMEM;
 }
 
@@ -1002,7 +995,7 @@ static void ufs_write_super (struct super_block *sb) {
 
 	lock_kernel();
 
-	UFSD(("ENTER\n"))
+	UFSD("ENTER\n");
 	flags = UFS_SB(sb)->s_flags;
 	uspi = UFS_SB(sb)->s_uspi;
 	usb1 = ubh_get_usb_first(uspi);
@@ -1017,15 +1010,15 @@ static void ufs_write_super (struct super_block *sb) {
 		ubh_mark_buffer_dirty (USPI_UBH(uspi));
 	}
 	sb->s_dirt = 0;
-	UFSD(("EXIT\n"))
+	UFSD("EXIT\n");
 	unlock_kernel();
 }
 
-static void ufs_put_super (struct super_block *sb)
+static void ufs_put_super(struct super_block *sb)
 {
 	struct ufs_sb_info * sbi = UFS_SB(sb);
 		
-	UFSD(("ENTER\n"))
+	UFSD("ENTER\n");
 
 	if (!(sb->s_flags & MS_RDONLY))
 		ufs_put_cylinder_structures (sb);
@@ -1034,6 +1027,7 @@ static void ufs_put_super (struct super_block *sb)
 	kfree (sbi->s_uspi);
 	kfree (sbi);
 	sb->s_fs_info = NULL;
+UFSD("EXIT\n");
 	return;
 }
 
diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c
index 29c66e1e24df..716183d834e7 100644
--- a/fs/ufs/truncate.c
+++ b/fs/ufs/truncate.c
@@ -49,14 +49,6 @@
 #include "swab.h"
 #include "util.h"
 
-#undef UFS_TRUNCATE_DEBUG
-
-#ifdef UFS_TRUNCATE_DEBUG
-#define UFSD(x) printk("(%s, %d), %s: ", __FILE__, __LINE__, __FUNCTION__); printk x;
-#else
-#define UFSD(x)
-#endif
- 
 /*
  * Secure deletion currently doesn't work. It interacts very badly
  * with buffers shared with memory mappings, and for that reason
@@ -82,7 +74,7 @@ static int ufs_trunc_direct (struct inode * inode)
 	unsigned i, tmp;
 	int retry;
 	
-	UFSD(("ENTER\n"))
+	UFSD("ENTER\n");
 
 	sb = inode->i_sb;
 	uspi = UFS_SB(sb)->s_uspi;
@@ -105,7 +97,7 @@ static int ufs_trunc_direct (struct inode * inode)
 		block2 = ufs_fragstoblks (frag3);
 	}
 
-	UFSD(("frag1 %u, frag2 %u, block1 %u, block2 %u, frag3 %u, frag4 %u\n", frag1, frag2, block1, block2, frag3, frag4))
+	UFSD("frag1 %u, frag2 %u, block1 %u, block2 %u, frag3 %u, frag4 %u\n", frag1, frag2, block1, block2, frag3, frag4);
 
 	if (frag1 >= frag2)
 		goto next1;		
@@ -171,7 +163,7 @@ next1:
 	ufs_free_fragments (inode, tmp, frag4);
  next3:
 
-	UFSD(("EXIT\n"))
+	UFSD("EXIT\n");
 	return retry;
 }
 
@@ -186,7 +178,7 @@ static int ufs_trunc_indirect (struct inode * inode, unsigned offset, __fs32 *p)
 	unsigned frag_to_free, free_count;
 	int retry;
 
-	UFSD(("ENTER\n"))
+	UFSD("ENTER\n");
 		
 	sb = inode->i_sb;
 	uspi = UFS_SB(sb)->s_uspi;
@@ -252,7 +244,7 @@ static int ufs_trunc_indirect (struct inode * inode, unsigned offset, __fs32 *p)
 	}
 	ubh_brelse (ind_ubh);
 	
-	UFSD(("EXIT\n"))
+	UFSD("EXIT\n");
 	
 	return retry;
 }
@@ -266,7 +258,7 @@ static int ufs_trunc_dindirect (struct inode *inode, unsigned offset, __fs32 *p)
 	__fs32 * dind;
 	int retry = 0;
 	
-	UFSD(("ENTER\n"))
+	UFSD("ENTER\n");
 	
 	sb = inode->i_sb;
 	uspi = UFS_SB(sb)->s_uspi;
@@ -315,7 +307,7 @@ static int ufs_trunc_dindirect (struct inode *inode, unsigned offset, __fs32 *p)
 	}
 	ubh_brelse (dind_bh);
 	
-	UFSD(("EXIT\n"))
+	UFSD("EXIT\n");
 	
 	return retry;
 }
@@ -330,7 +322,7 @@ static int ufs_trunc_tindirect (struct inode * inode)
 	__fs32 * tind, * p;
 	int retry;
 	
-	UFSD(("ENTER\n"))
+	UFSD("ENTER\n");
 
 	sb = inode->i_sb;
 	uspi = UFS_SB(sb)->s_uspi;
@@ -375,7 +367,7 @@ static int ufs_trunc_tindirect (struct inode * inode)
 	}
 	ubh_brelse (tind_bh);
 	
-	UFSD(("EXIT\n"))
+	UFSD("EXIT\n");
 	return retry;
 }
 		
@@ -386,7 +378,7 @@ void ufs_truncate (struct inode * inode)
 	struct ufs_sb_private_info * uspi;
 	int retry;
 	
-	UFSD(("ENTER\n"))
+	UFSD("ENTER\n");
 	sb = inode->i_sb;
 	uspi = UFS_SB(sb)->s_uspi;
 
@@ -417,5 +409,5 @@ void ufs_truncate (struct inode * inode)
 	ufsi->i_lastfrag = DIRECT_FRAGMENT;
 	unlock_kernel();
 	mark_inode_dirty(inode);
-	UFSD(("EXIT\n"))
+	UFSD("EXIT\n");
 }
diff --git a/fs/ufs/util.c b/fs/ufs/util.c
index f9556bc484ef..4685f7cb70b2 100644
--- a/fs/ufs/util.c
+++ b/fs/ufs/util.c
@@ -14,15 +14,6 @@
 #include "swab.h"
 #include "util.h"
 
-#undef UFS_UTILS_DEBUG
-
-#ifdef UFS_UTILS_DEBUG
-#define UFSD(x) printk("(%s, %d), %s: ", __FILE__, __LINE__, __FUNCTION__); printk x;
-#else
-#define UFSD(x)
-#endif
-
-
 struct ufs_buffer_head * _ubh_bread_ (struct ufs_sb_private_info * uspi,
 	struct super_block *sb, u64 fragment, u64 size)
 {
diff --git a/include/linux/ufs_fs.h b/include/linux/ufs_fs.h
index 48394dae225d..28b31591f688 100644
--- a/include/linux/ufs_fs.h
+++ b/include/linux/ufs_fs.h
@@ -220,6 +220,19 @@ typedef __u16 __bitwise __fs16;
  */
 #define UFS_MINFREE         5
 #define UFS_DEFAULTOPT      UFS_OPTTIME
+
+/*
+ * Debug code
+ */
+#ifdef CONFIG_UFS_DEBUG
+#	define UFSD(f, a...)	{					\
+		printk ("UFSD (%s, %d): %s:",				\
+			__FILE__, __LINE__, __FUNCTION__);		\
+		printk (f, ## a);					\
+	}
+#else
+#	define UFSD(f, a...)	/**/
+#endif
             
 /*
  * Turn file system block numbers into disk block addresses.
-- 
cgit v1.2.3


From dd187a2603d9904ddc410441348f0cfc558a5233 Mon Sep 17 00:00:00 2001
From: Evgeniy Dushistov <dushistov@mail.ru>
Date: Sun, 25 Jun 2006 05:47:25 -0700
Subject: [PATCH] ufs: little directory lookup optimization

This patch make little optimization of ufs_find_entry like "ext2" does.  Save
number of page and reuse it again in the next call.

Signed-off-by: Evgeniy Dushistov <dushistov@mail.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/ufs/dir.c             | 7 ++++---
 fs/ufs/ialloc.c          | 1 +
 fs/ufs/inode.c           | 4 ++--
 include/linux/ufs_fs_i.h | 1 +
 4 files changed, 8 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c
index 732c3fd2b6f2..7f0a0aa63584 100644
--- a/fs/ufs/dir.c
+++ b/fs/ufs/dir.c
@@ -252,6 +252,7 @@ struct ufs_dir_entry *ufs_find_entry(struct inode *dir, struct dentry *dentry,
 	unsigned long start, n;
 	unsigned long npages = ufs_dir_pages(dir);
 	struct page *page = NULL;
+	struct ufs_inode_info *ui = UFS_I(dir);
 	struct ufs_dir_entry *de;
 
 	UFSD("ENTER, dir_ino %lu, name %s, namlen %u\n", dir->i_ino, name, namelen);
@@ -262,8 +263,8 @@ struct ufs_dir_entry *ufs_find_entry(struct inode *dir, struct dentry *dentry,
 	/* OFFSET_CACHE */
 	*res_page = NULL;
 
-	/* start = ei->i_dir_start_lookup; */
-	start = 0;
+	start = ui->i_dir_start_lookup;
+
 	if (start >= npages)
 		start = 0;
 	n = start;
@@ -295,7 +296,7 @@ out:
 
 found:
 	*res_page = page;
-	/* ei->i_dir_start_lookup = n; */
+	ui->i_dir_start_lookup = n;
 	return de;
 }
 
diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c
index ad017fa2dd21..c684aaad9998 100644
--- a/fs/ufs/ialloc.c
+++ b/fs/ufs/ialloc.c
@@ -264,6 +264,7 @@ cg_found:
 	ufsi->i_shadow = 0;
 	ufsi->i_osync = 0;
 	ufsi->i_oeftflag = 0;
+	ufsi->i_dir_start_lookup = 0;
 	memset(&ufsi->i_u1, 0, sizeof(ufsi->i_u1));
 
 	insert_inode_hash(inode);
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index 01f754462341..c57612d443d0 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -628,12 +628,12 @@ void ufs_read_inode (struct inode * inode)
 	ufsi->i_shadow = fs32_to_cpu(sb, ufs_inode->ui_u3.ui_sun.ui_shadow);
 	ufsi->i_oeftflag = fs32_to_cpu(sb, ufs_inode->ui_u3.ui_sun.ui_oeftflag);
 	ufsi->i_lastfrag = (inode->i_size + uspi->s_fsize - 1) >> uspi->s_fshift;
+	ufsi->i_dir_start_lookup = 0;
 	
 	if (S_ISCHR(mode) || S_ISBLK(mode) || inode->i_blocks) {
 		for (i = 0; i < (UFS_NDADDR + UFS_NINDIR); i++)
 			ufsi->i_u1.i_data[i] = ufs_inode->ui_u2.ui_addr.ui_db[i];
-	}
-	else {
+	} else {
 		for (i = 0; i < (UFS_NDADDR + UFS_NINDIR) * 4; i++)
 			ufsi->i_u1.i_symlink[i] = ufs_inode->ui_u2.ui_symlink[i];
 	}
diff --git a/include/linux/ufs_fs_i.h b/include/linux/ufs_fs_i.h
index 21665a953978..f50ce3b0cd52 100644
--- a/include/linux/ufs_fs_i.h
+++ b/include/linux/ufs_fs_i.h
@@ -27,6 +27,7 @@ struct ufs_inode_info {
 	__u32	i_oeftflag;
 	__u16	i_osync;
 	__u32	i_lastfrag;
+	__u32   i_dir_start_lookup;
 	struct inode vfs_inode;
 };
 
-- 
cgit v1.2.3


From 647b7e87b56f594daf648f44abfbeeb5eb6a9457 Mon Sep 17 00:00:00 2001
From: Evgeniy Dushistov <dushistov@mail.ru>
Date: Sun, 25 Jun 2006 05:47:29 -0700
Subject: [PATCH] ufs: one way to access super block

Super block of UFS usually has size >512, because of fragment size may be 512,
this cause some problems.

Currently, there are two methods to work with ufs super block:

1) split structure which describes ufs super blocks into structures with
   size <=512

2) use one structure which describes ufs super block, and hope that array
   of "buffer_head" which holds "super block", has such construction:

	bh[n]->b_data + bh[n]->b_size == bh[n + 1]->b_data

The second variant may cause some problems in the future, and usage of two
variants cause unnecessary code duplication.

This patch remove the second variant.  Also patch contains some CodingStyle
fixes.

Signed-off-by: Evgeniy Dushistov <dushistov@mail.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/ufs/super.c         | 217 ++++++++++++++++++++++++-------------------------
 fs/ufs/util.h          |  52 ++++--------
 include/linux/ufs_fs.h |  61 ++++++++++++--
 3 files changed, 173 insertions(+), 157 deletions(-)

(limited to 'include/linux')

diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index c8339e566380..3aadbd3167a6 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -94,82 +94,77 @@
 /*
  * Print contents of ufs_super_block, useful for debugging
  */
-void ufs_print_super_stuff(struct super_block *sb,
-	struct ufs_super_block_first * usb1,
-	struct ufs_super_block_second * usb2, 
-	struct ufs_super_block_third * usb3)
+static void ufs_print_super_stuff(struct super_block *sb, unsigned flags,
+				  struct ufs_super_block_first *usb1,
+				  struct ufs_super_block_second *usb2,
+				  struct ufs_super_block_third *usb3)
 {
 	printk("ufs_print_super_stuff\n");
-	printk("size of usb:     %zu\n", sizeof(struct ufs_super_block));
-	printk("  magic:         0x%x\n", fs32_to_cpu(sb, usb3->fs_magic));
-	printk("  sblkno:        %u\n", fs32_to_cpu(sb, usb1->fs_sblkno));
-	printk("  cblkno:        %u\n", fs32_to_cpu(sb, usb1->fs_cblkno));
-	printk("  iblkno:        %u\n", fs32_to_cpu(sb, usb1->fs_iblkno));
-	printk("  dblkno:        %u\n", fs32_to_cpu(sb, usb1->fs_dblkno));
-	printk("  cgoffset:      %u\n", fs32_to_cpu(sb, usb1->fs_cgoffset));
-	printk("  ~cgmask:       0x%x\n", ~fs32_to_cpu(sb, usb1->fs_cgmask));
-	printk("  size:          %u\n", fs32_to_cpu(sb, usb1->fs_size));
-	printk("  dsize:         %u\n", fs32_to_cpu(sb, usb1->fs_dsize));
-	printk("  ncg:           %u\n", fs32_to_cpu(sb, usb1->fs_ncg));
-	printk("  bsize:         %u\n", fs32_to_cpu(sb, usb1->fs_bsize));
-	printk("  fsize:         %u\n", fs32_to_cpu(sb, usb1->fs_fsize));
-	printk("  frag:          %u\n", fs32_to_cpu(sb, usb1->fs_frag));
-	printk("  fragshift:     %u\n", fs32_to_cpu(sb, usb1->fs_fragshift));
-	printk("  ~fmask:        %u\n", ~fs32_to_cpu(sb, usb1->fs_fmask));
-	printk("  fshift:        %u\n", fs32_to_cpu(sb, usb1->fs_fshift));
-	printk("  sbsize:        %u\n", fs32_to_cpu(sb, usb1->fs_sbsize));
-	printk("  spc:           %u\n", fs32_to_cpu(sb, usb1->fs_spc));
-	printk("  cpg:           %u\n", fs32_to_cpu(sb, usb1->fs_cpg));
-	printk("  ipg:           %u\n", fs32_to_cpu(sb, usb1->fs_ipg));
-	printk("  fpg:           %u\n", fs32_to_cpu(sb, usb1->fs_fpg));
-	printk("  csaddr:        %u\n", fs32_to_cpu(sb, usb1->fs_csaddr));
-	printk("  cssize:        %u\n", fs32_to_cpu(sb, usb1->fs_cssize));
-	printk("  cgsize:        %u\n", fs32_to_cpu(sb, usb1->fs_cgsize));
-	printk("  fstodb:        %u\n", fs32_to_cpu(sb, usb1->fs_fsbtodb));
-	printk("  contigsumsize: %d\n", fs32_to_cpu(sb, usb3->fs_u2.fs_44.fs_contigsumsize));
-	printk("  postblformat:  %u\n", fs32_to_cpu(sb, usb3->fs_postblformat));
-	printk("  nrpos:         %u\n", fs32_to_cpu(sb, usb3->fs_nrpos));
-	printk("  ndir           %u\n", fs32_to_cpu(sb, usb1->fs_cstotal.cs_ndir));
-	printk("  nifree         %u\n", fs32_to_cpu(sb, usb1->fs_cstotal.cs_nifree));
-	printk("  nbfree         %u\n", fs32_to_cpu(sb, usb1->fs_cstotal.cs_nbfree));
-	printk("  nffree         %u\n", fs32_to_cpu(sb, usb1->fs_cstotal.cs_nffree));
-	printk("\n");
-}
-
-/*
- * Print contents of ufs2 ufs_super_block, useful for debugging
- */
-void ufs2_print_super_stuff(
-     struct super_block *sb,
-      struct ufs_super_block *usb)
-{
-	printk("ufs_print_super_stuff\n");
-	printk("size of usb:     %zu\n", sizeof(struct ufs_super_block));
-	printk("  magic:         0x%x\n", fs32_to_cpu(sb, usb->fs_magic));
-	printk("  fs_size:   %llu\n",
-	       (unsigned long long)fs64_to_cpu(sb, usb->fs_u11.fs_u2.fs_size));
-	printk("  fs_dsize:  %llu\n",
-	       (unsigned long long)fs64_to_cpu(sb, usb->fs_u11.fs_u2.fs_dsize));
-	printk("  bsize:         %u\n", fs32_to_cpu(sb, usb->fs_bsize));
-	printk("  fsize:         %u\n", fs32_to_cpu(sb, usb->fs_fsize));
-	printk("  fs_volname:  %s\n", usb->fs_u11.fs_u2.fs_volname);
-	printk("  fs_fsmnt:  %s\n", usb->fs_u11.fs_u2.fs_fsmnt);
-	printk("  fs_sblockloc: %llu\n",
-	       (unsigned long long)fs64_to_cpu(sb,
-					       usb->fs_u11.fs_u2.fs_sblockloc));
-	printk("  cs_ndir(No of dirs):  %llu\n",
-	       (unsigned long long)fs64_to_cpu(sb,
-				         usb->fs_u11.fs_u2.fs_cstotal.cs_ndir));
-	printk("  cs_nbfree(No of free blocks):  %llu\n",
-	       (unsigned long long)fs64_to_cpu(sb,
-				       usb->fs_u11.fs_u2.fs_cstotal.cs_nbfree));
+	printk("  magic:     0x%x\n", fs32_to_cpu(sb, usb3->fs_magic));
+	if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) {
+		printk("  fs_size:   %llu\n", (unsigned long long)
+		       fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_size));
+		printk("  fs_dsize:  %llu\n", (unsigned long long)
+		       fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_dsize));
+		printk("  bsize:         %u\n",
+		       fs32_to_cpu(sb, usb1->fs_bsize));
+		printk("  fsize:         %u\n",
+		       fs32_to_cpu(sb, usb1->fs_fsize));
+		printk("  fs_volname:  %s\n", usb2->fs_un.fs_u2.fs_volname);
+		printk("  fs_sblockloc: %llu\n", (unsigned long long)
+		       fs64_to_cpu(sb, usb2->fs_un.fs_u2.fs_sblockloc));
+		printk("  cs_ndir(No of dirs):  %llu\n", (unsigned long long)
+		       fs64_to_cpu(sb, usb2->fs_un.fs_u2.cs_ndir));
+		printk("  cs_nbfree(No of free blocks):  %llu\n",
+		       (unsigned long long)
+		       fs64_to_cpu(sb, usb2->fs_un.fs_u2.cs_nbfree));
+	} else {
+		printk(" sblkno:      %u\n", fs32_to_cpu(sb, usb1->fs_sblkno));
+		printk(" cblkno:      %u\n", fs32_to_cpu(sb, usb1->fs_cblkno));
+		printk(" iblkno:      %u\n", fs32_to_cpu(sb, usb1->fs_iblkno));
+		printk(" dblkno:      %u\n", fs32_to_cpu(sb, usb1->fs_dblkno));
+		printk(" cgoffset:    %u\n",
+		       fs32_to_cpu(sb, usb1->fs_cgoffset));
+		printk(" ~cgmask:     0x%x\n",
+		       ~fs32_to_cpu(sb, usb1->fs_cgmask));
+		printk(" size:        %u\n", fs32_to_cpu(sb, usb1->fs_size));
+		printk(" dsize:       %u\n", fs32_to_cpu(sb, usb1->fs_dsize));
+		printk(" ncg:         %u\n", fs32_to_cpu(sb, usb1->fs_ncg));
+		printk(" bsize:       %u\n", fs32_to_cpu(sb, usb1->fs_bsize));
+		printk(" fsize:       %u\n", fs32_to_cpu(sb, usb1->fs_fsize));
+		printk(" frag:        %u\n", fs32_to_cpu(sb, usb1->fs_frag));
+		printk(" fragshift:   %u\n",
+		       fs32_to_cpu(sb, usb1->fs_fragshift));
+		printk(" ~fmask:      %u\n", ~fs32_to_cpu(sb, usb1->fs_fmask));
+		printk(" fshift:      %u\n", fs32_to_cpu(sb, usb1->fs_fshift));
+		printk(" sbsize:      %u\n", fs32_to_cpu(sb, usb1->fs_sbsize));
+		printk(" spc:         %u\n", fs32_to_cpu(sb, usb1->fs_spc));
+		printk(" cpg:         %u\n", fs32_to_cpu(sb, usb1->fs_cpg));
+		printk(" ipg:         %u\n", fs32_to_cpu(sb, usb1->fs_ipg));
+		printk(" fpg:         %u\n", fs32_to_cpu(sb, usb1->fs_fpg));
+		printk(" csaddr:      %u\n", fs32_to_cpu(sb, usb1->fs_csaddr));
+		printk(" cssize:      %u\n", fs32_to_cpu(sb, usb1->fs_cssize));
+		printk(" cgsize:      %u\n", fs32_to_cpu(sb, usb1->fs_cgsize));
+		printk(" fstodb:      %u\n",
+		       fs32_to_cpu(sb, usb1->fs_fsbtodb));
+		printk(" nrpos:       %u\n", fs32_to_cpu(sb, usb3->fs_nrpos));
+		printk(" ndir         %u\n",
+		       fs32_to_cpu(sb, usb1->fs_cstotal.cs_ndir));
+		printk(" nifree       %u\n",
+		       fs32_to_cpu(sb, usb1->fs_cstotal.cs_nifree));
+		printk(" nbfree       %u\n",
+		       fs32_to_cpu(sb, usb1->fs_cstotal.cs_nbfree));
+		printk(" nffree       %u\n",
+		       fs32_to_cpu(sb, usb1->fs_cstotal.cs_nffree));
+	}
 	printk("\n");
 }
 
 /*
  * Print contents of ufs_cylinder_group, useful for debugging
  */
-void ufs_print_cylinder_stuff(struct super_block *sb, struct ufs_cylinder_group *cg)
+static void ufs_print_cylinder_stuff(struct super_block *sb,
+				     struct ufs_cylinder_group *cg)
 {
 	printk("\nufs_print_cylinder_stuff\n");
 	printk("size of ucg: %zu\n", sizeof(struct ufs_cylinder_group));
@@ -196,11 +191,17 @@ void ufs_print_cylinder_stuff(struct super_block *sb, struct ufs_cylinder_group
 	printk("  iuseoff:      %u\n", fs32_to_cpu(sb, cg->cg_iusedoff));
 	printk("  freeoff:      %u\n", fs32_to_cpu(sb, cg->cg_freeoff));
 	printk("  nextfreeoff:  %u\n", fs32_to_cpu(sb, cg->cg_nextfreeoff));
-	printk("  clustersumoff %u\n", fs32_to_cpu(sb, cg->cg_u.cg_44.cg_clustersumoff));
-	printk("  clusteroff    %u\n", fs32_to_cpu(sb, cg->cg_u.cg_44.cg_clusteroff));
-	printk("  nclusterblks  %u\n", fs32_to_cpu(sb, cg->cg_u.cg_44.cg_nclusterblks));
+	printk("  clustersumoff %u\n",
+	       fs32_to_cpu(sb, cg->cg_u.cg_44.cg_clustersumoff));
+	printk("  clusteroff    %u\n",
+	       fs32_to_cpu(sb, cg->cg_u.cg_44.cg_clusteroff));
+	printk("  nclusterblks  %u\n",
+	       fs32_to_cpu(sb, cg->cg_u.cg_44.cg_nclusterblks));
 	printk("\n");
 }
+#else
+#  define ufs_print_super_stuff(sb, flags, usb1, usb2, usb3) /**/
+#  define ufs_print_cylinder_stuff(sb, cg) /**/
 #endif /* CONFIG_UFS_DEBUG */
 
 static struct super_operations ufs_super_ops;
@@ -384,9 +385,9 @@ static int ufs_parse_options (char * options, unsigned * mount_options)
  */
 static int ufs_read_cylinder_structures (struct super_block *sb)
 {
-	struct ufs_sb_info * sbi = UFS_SB(sb);
-	struct ufs_sb_private_info * uspi;
-	struct ufs_super_block *usb;
+	struct ufs_sb_info *sbi = UFS_SB(sb);
+	struct ufs_sb_private_info *uspi = sbi->s_uspi;
+	struct ufs_super_block_third *usb3;
 	struct ufs_buffer_head * ubh;
 	unsigned char * base, * space;
 	unsigned size, blks, i;
@@ -394,10 +395,7 @@ static int ufs_read_cylinder_structures (struct super_block *sb)
 	
 	UFSD("ENTER\n");
 	
-	uspi = sbi->s_uspi;
-
-	usb  = (struct ufs_super_block *)
-		((struct ufs_buffer_head *)uspi)->bh[0]->b_data;
+	usb3 = ubh_get_usb_third(uspi);
 
         flags = UFS_SB(sb)->s_flags;
 	
@@ -418,7 +416,7 @@ static int ufs_read_cylinder_structures (struct super_block *sb)
 
 		if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) 
 			ubh = ubh_bread(sb,
-				fs64_to_cpu(sb, usb->fs_u11.fs_u2.fs_csaddr) + i, size);
+				fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_csaddr) + i, size);
 		else 
 			ubh = ubh_bread(sb, uspi->s_csaddr + i, size);
 		
@@ -450,9 +448,8 @@ static int ufs_read_cylinder_structures (struct super_block *sb)
 			goto failed;
 		if (!ufs_cg_chkmagic (sb, (struct ufs_cylinder_group *) sbi->s_ucg[i]->b_data))
 			goto failed;
-#ifdef CONFIG_UFS_DEBUG
+
 		ufs_print_cylinder_stuff(sb, (struct ufs_cylinder_group *) sbi->s_ucg[i]->b_data);
-#endif
 	}
 	for (i = 0; i < UFS_MAX_GROUP_LOADED; i++) {
 		if (!(sbi->s_ucpi[i] = kmalloc (sizeof(struct ufs_cg_private_info), GFP_KERNEL)))
@@ -818,12 +815,8 @@ magic_found:
 		goto again;
 	}
 
-#ifdef CONFIG_UFS_DEBUG
-        if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2)
-		ufs2_print_super_stuff(sb,usb);
-        else
-		ufs_print_super_stuff(sb, usb1, usb2, usb3);
-#endif
+
+	ufs_print_super_stuff(sb, flags, usb1, usb2, usb3);
 
 	/*
 	 * Check, if file system was correctly unmounted.
@@ -878,10 +871,9 @@ magic_found:
 	uspi->s_cgmask = fs32_to_cpu(sb, usb1->fs_cgmask);
 
 	if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) {
-		uspi->s_u2_size  = fs64_to_cpu(sb, usb->fs_u11.fs_u2.fs_size);
-		uspi->s_u2_dsize = fs64_to_cpu(sb, usb->fs_u11.fs_u2.fs_dsize);
-	}
-	else {
+		uspi->s_u2_size  = fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_size);
+		uspi->s_u2_dsize = fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_dsize);
+	} else {
 		uspi->s_size  =  fs32_to_cpu(sb, usb1->fs_size);
 		uspi->s_dsize =  fs32_to_cpu(sb, usb1->fs_dsize);
 	}
@@ -916,8 +908,8 @@ magic_found:
 	uspi->s_spc = fs32_to_cpu(sb, usb1->fs_spc);
 	uspi->s_ipg = fs32_to_cpu(sb, usb1->fs_ipg);
 	uspi->s_fpg = fs32_to_cpu(sb, usb1->fs_fpg);
-	uspi->s_cpc = fs32_to_cpu(sb, usb2->fs_cpc);
-	uspi->s_contigsumsize = fs32_to_cpu(sb, usb3->fs_u2.fs_44.fs_contigsumsize);
+	uspi->s_cpc = fs32_to_cpu(sb, usb2->fs_un.fs_u1.fs_cpc);
+	uspi->s_contigsumsize = fs32_to_cpu(sb, usb3->fs_un2.fs_44.fs_contigsumsize);
 	uspi->s_qbmask = ufs_get_fs_qbmask(sb, usb3);
 	uspi->s_qfmask = ufs_get_fs_qfmask(sb, usb3);
 	uspi->s_postblformat = fs32_to_cpu(sb, usb3->fs_postblformat);
@@ -949,7 +941,7 @@ magic_found:
 	if ((sbi->s_mount_opt & UFS_MOUNT_UFSTYPE) ==
 	    UFS_MOUNT_UFSTYPE_44BSD)
 		uspi->s_maxsymlinklen =
-		    fs32_to_cpu(sb, usb3->fs_u2.fs_44.fs_maxsymlinklen);
+		    fs32_to_cpu(sb, usb3->fs_un2.fs_44.fs_maxsymlinklen);
 	
 	sbi->s_flags = flags;
 
@@ -987,7 +979,8 @@ failed_nomem:
 	return -ENOMEM;
 }
 
-static void ufs_write_super (struct super_block *sb) {
+static void ufs_write_super(struct super_block *sb)
+{
 	struct ufs_sb_private_info * uspi;
 	struct ufs_super_block_first * usb1;
 	struct ufs_super_block_third * usb3;
@@ -1027,7 +1020,7 @@ static void ufs_put_super(struct super_block *sb)
 	kfree (sbi->s_uspi);
 	kfree (sbi);
 	sb->s_fs_info = NULL;
-UFSD("EXIT\n");
+	UFSD("EXIT\n");
 	return;
 }
 
@@ -1107,31 +1100,29 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
 	return 0;
 }
 
-static int ufs_statfs (struct dentry *dentry, struct kstatfs *buf)
+static int ufs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	struct super_block *sb = dentry->d_sb;
-	struct ufs_sb_private_info * uspi;
-	struct ufs_super_block_first * usb1;
-	struct ufs_super_block * usb;
-	unsigned  flags = 0;
+	struct ufs_sb_private_info *uspi= UFS_SB(sb)->s_uspi;
+	unsigned  flags = UFS_SB(sb)->s_flags;
+	struct ufs_super_block_first *usb1;
+	struct ufs_super_block_second *usb2;
+	struct ufs_super_block_third *usb3;
 
 	lock_kernel();
 
-	uspi = UFS_SB(sb)->s_uspi;
-	usb1 = ubh_get_usb_first (uspi);
-	usb  = (struct ufs_super_block *)
-		((struct ufs_buffer_head *)uspi)->bh[0]->b_data ;
+	usb1 = ubh_get_usb_first(uspi);
+	usb2 = ubh_get_usb_second(uspi);
+	usb3 = ubh_get_usb_third(uspi);
 	
-	flags = UFS_SB(sb)->s_flags;
 	if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) {
 		buf->f_type = UFS2_MAGIC;
-		buf->f_blocks = fs64_to_cpu(sb, usb->fs_u11.fs_u2.fs_dsize);
-		buf->f_bfree = ufs_blkstofrags(fs64_to_cpu(sb, usb->fs_u11.fs_u2.fs_cstotal.cs_nbfree)) +
-			fs64_to_cpu(sb, usb->fs_u11.fs_u2.fs_cstotal.cs_nffree);
-		buf->f_ffree = fs64_to_cpu(sb,
-        		usb->fs_u11.fs_u2.fs_cstotal.cs_nifree);
-	}
-	else {
+		buf->f_blocks = fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_dsize);
+		buf->f_bfree = ufs_blkstofrags(
+			fs64_to_cpu(sb, usb2->fs_un.fs_u2.cs_nbfree)) +
+			fs64_to_cpu(sb, usb3->fs_un1.fs_u2.cs_nffree);
+		buf->f_ffree = fs64_to_cpu(sb, usb3->fs_un1.fs_u2.cs_nifree);
+	} else {
 		buf->f_type = UFS_MAGIC;
 		buf->f_blocks = uspi->s_dsize;
 		buf->f_bfree = ufs_blkstofrags(fs32_to_cpu(sb, usb1->fs_cstotal.cs_nbfree)) +
diff --git a/fs/ufs/util.h b/fs/ufs/util.h
index e95d1c46461f..eacd5e37b8e6 100644
--- a/fs/ufs/util.h
+++ b/fs/ufs/util.h
@@ -39,12 +39,12 @@ ufs_get_fs_state(struct super_block *sb, struct ufs_super_block_first *usb1,
 {
 	switch (UFS_SB(sb)->s_flags & UFS_ST_MASK) {
 	case UFS_ST_SUN:
-		return fs32_to_cpu(sb, usb3->fs_u2.fs_sun.fs_state);
+		return fs32_to_cpu(sb, usb3->fs_un2.fs_sun.fs_state);
 	case UFS_ST_SUNx86:
 		return fs32_to_cpu(sb, usb1->fs_u1.fs_sunx86.fs_state);
 	case UFS_ST_44BSD:
 	default:
-		return fs32_to_cpu(sb, usb3->fs_u2.fs_44.fs_state);
+		return fs32_to_cpu(sb, usb3->fs_un2.fs_44.fs_state);
 	}
 }
 
@@ -54,13 +54,13 @@ ufs_set_fs_state(struct super_block *sb, struct ufs_super_block_first *usb1,
 {
 	switch (UFS_SB(sb)->s_flags & UFS_ST_MASK) {
 	case UFS_ST_SUN:
-		usb3->fs_u2.fs_sun.fs_state = cpu_to_fs32(sb, value);
+		usb3->fs_un2.fs_sun.fs_state = cpu_to_fs32(sb, value);
 		break;
 	case UFS_ST_SUNx86:
 		usb1->fs_u1.fs_sunx86.fs_state = cpu_to_fs32(sb, value);
 		break;
 	case UFS_ST_44BSD:
-		usb3->fs_u2.fs_44.fs_state = cpu_to_fs32(sb, value);
+		usb3->fs_un2.fs_44.fs_state = cpu_to_fs32(sb, value);
 		break;
 	}
 }
@@ -70,7 +70,7 @@ ufs_get_fs_npsect(struct super_block *sb, struct ufs_super_block_first *usb1,
 		  struct ufs_super_block_third *usb3)
 {
 	if ((UFS_SB(sb)->s_flags & UFS_ST_MASK) == UFS_ST_SUNx86)
-		return fs32_to_cpu(sb, usb3->fs_u2.fs_sunx86.fs_npsect);
+		return fs32_to_cpu(sb, usb3->fs_un2.fs_sunx86.fs_npsect);
 	else
 		return fs32_to_cpu(sb, usb1->fs_u1.fs_sun.fs_npsect);
 }
@@ -82,16 +82,16 @@ ufs_get_fs_qbmask(struct super_block *sb, struct ufs_super_block_third *usb3)
 
 	switch (UFS_SB(sb)->s_flags & UFS_ST_MASK) {
 	case UFS_ST_SUN:
-		((__fs32 *)&tmp)[0] = usb3->fs_u2.fs_sun.fs_qbmask[0];
-		((__fs32 *)&tmp)[1] = usb3->fs_u2.fs_sun.fs_qbmask[1];
+		((__fs32 *)&tmp)[0] = usb3->fs_un2.fs_sun.fs_qbmask[0];
+		((__fs32 *)&tmp)[1] = usb3->fs_un2.fs_sun.fs_qbmask[1];
 		break;
 	case UFS_ST_SUNx86:
-		((__fs32 *)&tmp)[0] = usb3->fs_u2.fs_sunx86.fs_qbmask[0];
-		((__fs32 *)&tmp)[1] = usb3->fs_u2.fs_sunx86.fs_qbmask[1];
+		((__fs32 *)&tmp)[0] = usb3->fs_un2.fs_sunx86.fs_qbmask[0];
+		((__fs32 *)&tmp)[1] = usb3->fs_un2.fs_sunx86.fs_qbmask[1];
 		break;
 	case UFS_ST_44BSD:
-		((__fs32 *)&tmp)[0] = usb3->fs_u2.fs_44.fs_qbmask[0];
-		((__fs32 *)&tmp)[1] = usb3->fs_u2.fs_44.fs_qbmask[1];
+		((__fs32 *)&tmp)[0] = usb3->fs_un2.fs_44.fs_qbmask[0];
+		((__fs32 *)&tmp)[1] = usb3->fs_un2.fs_44.fs_qbmask[1];
 		break;
 	}
 
@@ -105,16 +105,16 @@ ufs_get_fs_qfmask(struct super_block *sb, struct ufs_super_block_third *usb3)
 
 	switch (UFS_SB(sb)->s_flags & UFS_ST_MASK) {
 	case UFS_ST_SUN:
-		((__fs32 *)&tmp)[0] = usb3->fs_u2.fs_sun.fs_qfmask[0];
-		((__fs32 *)&tmp)[1] = usb3->fs_u2.fs_sun.fs_qfmask[1];
+		((__fs32 *)&tmp)[0] = usb3->fs_un2.fs_sun.fs_qfmask[0];
+		((__fs32 *)&tmp)[1] = usb3->fs_un2.fs_sun.fs_qfmask[1];
 		break;
 	case UFS_ST_SUNx86:
-		((__fs32 *)&tmp)[0] = usb3->fs_u2.fs_sunx86.fs_qfmask[0];
-		((__fs32 *)&tmp)[1] = usb3->fs_u2.fs_sunx86.fs_qfmask[1];
+		((__fs32 *)&tmp)[0] = usb3->fs_un2.fs_sunx86.fs_qfmask[0];
+		((__fs32 *)&tmp)[1] = usb3->fs_un2.fs_sunx86.fs_qfmask[1];
 		break;
 	case UFS_ST_44BSD:
-		((__fs32 *)&tmp)[0] = usb3->fs_u2.fs_44.fs_qfmask[0];
-		((__fs32 *)&tmp)[1] = usb3->fs_u2.fs_44.fs_qfmask[1];
+		((__fs32 *)&tmp)[0] = usb3->fs_un2.fs_44.fs_qfmask[0];
+		((__fs32 *)&tmp)[1] = usb3->fs_un2.fs_44.fs_qfmask[1];
 		break;
 	}
 
@@ -302,24 +302,6 @@ static inline void *get_usb_offset(struct ufs_sb_private_info *uspi,
 #define ubh_blkmap(ubh,begin,bit) \
 	((*ubh_get_addr(ubh, (begin) + ((bit) >> 3)) >> ((bit) & 7)) & (0xff >> (UFS_MAXFRAG - uspi->s_fpb)))
 
-
-/*
- * Macros for access to superblock array structures
- */
-#define ubh_postbl(ubh,cylno,i) \
-	((uspi->s_postblformat != UFS_DYNAMICPOSTBLFMT) \
-	? (*(__s16*)(ubh_get_addr(ubh, \
-	(unsigned)(&((struct ufs_super_block *)0)->fs_opostbl) \
-	+ (((cylno) * 16 + (i)) << 1) ) )) \
-	: (*(__s16*)(ubh_get_addr(ubh, \
-	uspi->s_postbloff + (((cylno) * uspi->s_nrpos + (i)) << 1) ))))
-
-#define ubh_rotbl(ubh,i) \
-	((uspi->s_postblformat != UFS_DYNAMICPOSTBLFMT) \
-	? (*(__u8*)(ubh_get_addr(ubh, \
-	(unsigned)(&((struct ufs_super_block *)0)->fs_space) + (i)))) \
-	: (*(__u8*)(ubh_get_addr(ubh, uspi->s_rotbloff + (i)))))
-
 /*
  * Determine the number of available frags given a
  * percentage to hold in reserve.
diff --git a/include/linux/ufs_fs.h b/include/linux/ufs_fs.h
index 28b31591f688..87b0a658bec8 100644
--- a/include/linux/ufs_fs.h
+++ b/include/linux/ufs_fs.h
@@ -351,8 +351,12 @@ struct ufs2_csum_total {
 	__fs64   cs_spare[3];	/* future expansion */
 };
 
+#if 0
 /*
  * This is the actual superblock, as it is laid out on the disk.
+ * Do NOT use this structure, because of sizeof(ufs_super_block) > 512 and
+ * it may occupy several blocks, use
+ * struct ufs_super_block_(first,second,third) instead.
  */
 struct ufs_super_block {
 	__fs32	fs_link;	/* UNUSED */
@@ -498,6 +502,7 @@ struct ufs_super_block {
 	__fs32	fs_magic;		/* magic number */
 	__u8	fs_space[1];		/* list of blocks for each rotation */
 };
+#endif/*struct ufs_super_block*/
 
 /*
  * Preference for optimization.
@@ -837,16 +842,54 @@ struct ufs_super_block_first {
 };
 
 struct ufs_super_block_second {
-	__s8	fs_fsmnt[212];
-	__fs32	fs_cgrotor;
-	__fs32	fs_csp[UFS_MAXCSBUFS];
-	__fs32	fs_maxcluster;
-	__fs32	fs_cpc;
-	__fs16	fs_opostbl[82];
-};	
+	union {
+		struct {
+			__s8	fs_fsmnt[212];
+			__fs32	fs_cgrotor;
+			__fs32	fs_csp[UFS_MAXCSBUFS];
+			__fs32	fs_maxcluster;
+			__fs32	fs_cpc;
+			__fs16	fs_opostbl[82];
+		} fs_u1;
+		struct {
+			__s8  fs_fsmnt[UFS2_MAXMNTLEN - UFS_MAXMNTLEN + 212];
+			__u8   fs_volname[UFS2_MAXVOLLEN];
+			__fs64  fs_swuid;
+			__fs32  fs_pad;
+			__fs32   fs_cgrotor;
+			__fs32   fs_ocsp[UFS2_NOCSPTRS];
+			__fs32   fs_contigdirs;
+			__fs32   fs_csp;
+			__fs32   fs_maxcluster;
+			__fs32   fs_active;
+			__fs32   fs_old_cpc;
+			__fs32   fs_maxbsize;
+			__fs64   fs_sparecon64[17];
+			__fs64   fs_sblockloc;
+			__fs64	cs_ndir;
+			__fs64	cs_nbfree;
+		} fs_u2;
+	} fs_un;
+};
 
 struct ufs_super_block_third {
-	__fs16	fs_opostbl[46];
+	union {
+		struct {
+			__fs16	fs_opostbl[46];
+		} fs_u1;
+		struct {
+			__fs64	cs_nifree;	/* number of free inodes */
+			__fs64	cs_nffree;	/* number of free frags */
+			__fs64   cs_numclusters;	/* number of free clusters */
+			__fs64   cs_spare[3];	/* future expansion */
+			struct  ufs_timeval    fs_time;		/* last time written */
+			__fs64    fs_size;		/* number of blocks in fs */
+			__fs64    fs_dsize;	/* number of data blocks in fs */
+			__fs64   fs_csaddr;	/* blk addr of cyl grp summary area */
+			__fs64    fs_pendingblocks;/* blocks in process of being freed */
+			__fs32    fs_pendinginodes;/*inodes in process of being freed */
+		} fs_u2;
+	} fs_un1;
 	union {
 		struct {
 			__fs32	fs_sparecon[53];/* reserved for future constants */
@@ -874,7 +917,7 @@ struct ufs_super_block_third {
 			__fs32	fs_qfmask[2];	/* ~usb_fmask */
 			__fs32	fs_state;	/* file system state time stamp */
 		} fs_44;
-	} fs_u2;
+	} fs_un2;
 	__fs32	fs_postblformat;
 	__fs32	fs_nrpos;
 	__fs32	fs_postbloff;
-- 
cgit v1.2.3


From ee3ffd6c126323693b3b32a71a1f1acfce30bd66 Mon Sep 17 00:00:00 2001
From: Evgeniy Dushistov <dushistov@mail.ru>
Date: Sun, 25 Jun 2006 05:47:30 -0700
Subject: [PATCH] ufs: make fsck -f happy

ufs super block contains some statistic about file systems, like amount of
directories, free blocks, inodes and so on.

UFS1 hold this information in one location and uses 32bit integers for such
information, UFS2 hold statistic in another location and uses 64bit integers.

There is transition variant, if UFS1 has type 44BSD and flags field in super
block has some special value this mean that we work with statistic like UFS2
does.  and this also means that nobody care about old(UFS1) statistic.

So if start fsck against such file system, after usage linux ufs driver, it
found error: at now only UFS1 like statistic is updated.

This patch should fix this.  Also it contains some minor cleanup: CodingSytle
and remove unused variables.

Signed-off-by: Evgeniy Dushistov <dushistov@mail.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/ufs/balloc.c        |  24 ++++-----
 fs/ufs/ialloc.c        |   8 +--
 fs/ufs/super.c         | 140 ++++++++++++++++++++++++++++++++++++-------------
 fs/ufs/util.h          |  10 ++--
 include/linux/ufs_fs.h |  14 ++++-
 5 files changed, 139 insertions(+), 57 deletions(-)

(limited to 'include/linux')

diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c
index 99d881812ad8..cb36d2dadef5 100644
--- a/fs/ufs/balloc.c
+++ b/fs/ufs/balloc.c
@@ -83,7 +83,7 @@ void ufs_free_fragments(struct inode *inode, unsigned fragment, unsigned count)
 
 	
 	fs32_add(sb, &ucg->cg_cs.cs_nffree, count);
-	fs32_add(sb, &usb1->fs_cstotal.cs_nffree, count);
+	uspi->cs_total.cs_nffree += count;
 	fs32_add(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, count);
 	blkmap = ubh_blkmap (UCPI_UBH(ucpi), ucpi->c_freeoff, bbase);
 	ufs_fragacct(sb, blkmap, ucg->cg_frsum, 1);
@@ -94,12 +94,12 @@ void ufs_free_fragments(struct inode *inode, unsigned fragment, unsigned count)
 	blkno = ufs_fragstoblks (bbase);
 	if (ubh_isblockset(UCPI_UBH(ucpi), ucpi->c_freeoff, blkno)) {
 		fs32_sub(sb, &ucg->cg_cs.cs_nffree, uspi->s_fpb);
-		fs32_sub(sb, &usb1->fs_cstotal.cs_nffree, uspi->s_fpb);
+		uspi->cs_total.cs_nffree -= uspi->s_fpb;
 		fs32_sub(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, uspi->s_fpb);
 		if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD)
 			ufs_clusteracct (sb, ucpi, blkno, 1);
 		fs32_add(sb, &ucg->cg_cs.cs_nbfree, 1);
-		fs32_add(sb, &usb1->fs_cstotal.cs_nbfree, 1);
+		uspi->cs_total.cs_nbfree++;
 		fs32_add(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nbfree, 1);
 		cylno = ufs_cbtocylno (bbase);
 		fs16_add(sb, &ubh_cg_blks(ucpi, cylno, ufs_cbtorpos(bbase)), 1);
@@ -185,7 +185,7 @@ do_more:
 		DQUOT_FREE_BLOCK(inode, uspi->s_fpb);
 
 		fs32_add(sb, &ucg->cg_cs.cs_nbfree, 1);
-		fs32_add(sb, &usb1->fs_cstotal.cs_nbfree, 1);
+		uspi->cs_total.cs_nbfree++;
 		fs32_add(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nbfree, 1);
 		cylno = ufs_cbtocylno(i);
 		fs16_add(sb, &ubh_cg_blks(ucpi, cylno, ufs_cbtorpos(i)), 1);
@@ -372,7 +372,7 @@ unsigned ufs_new_fragments(struct inode * inode, __fs32 * p, unsigned fragment,
 	/*
 	 * There is not enough space for user on the device
 	 */
-	if (!capable(CAP_SYS_RESOURCE) && ufs_freespace(usb1, UFS_MINFREE) <= 0) {
+	if (!capable(CAP_SYS_RESOURCE) && ufs_freespace(uspi, UFS_MINFREE) <= 0) {
 		unlock_super (sb);
 		UFSD("EXIT (FAILED)\n");
 		return 0;
@@ -418,8 +418,8 @@ unsigned ufs_new_fragments(struct inode * inode, __fs32 * p, unsigned fragment,
 	switch (fs32_to_cpu(sb, usb1->fs_optim)) {
 	    case UFS_OPTSPACE:
 		request = newcount;
-		if (uspi->s_minfree < 5 || fs32_to_cpu(sb, usb1->fs_cstotal.cs_nffree) 
-		    > uspi->s_dsize * uspi->s_minfree / (2 * 100) )
+		if (uspi->s_minfree < 5 || uspi->cs_total.cs_nffree
+		    > uspi->s_dsize * uspi->s_minfree / (2 * 100))
 			break;
 		usb1->fs_optim = cpu_to_fs32(sb, UFS_OPTTIME);
 		break;
@@ -428,7 +428,7 @@ unsigned ufs_new_fragments(struct inode * inode, __fs32 * p, unsigned fragment,
 	
 	    case UFS_OPTTIME:
 		request = uspi->s_fpb;
-		if (fs32_to_cpu(sb, usb1->fs_cstotal.cs_nffree) < uspi->s_dsize *
+		if (uspi->cs_total.cs_nffree < uspi->s_dsize *
 		    (uspi->s_minfree - 2) / 100)
 			break;
 		usb1->fs_optim = cpu_to_fs32(sb, UFS_OPTTIME);
@@ -516,7 +516,7 @@ ufs_add_fragments (struct inode * inode, unsigned fragment,
 
 	fs32_sub(sb, &ucg->cg_cs.cs_nffree, count);
 	fs32_sub(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, count);
-	fs32_sub(sb, &usb1->fs_cstotal.cs_nffree, count);
+	uspi->cs_total.cs_nffree -= count;
 	
 	ubh_mark_buffer_dirty (USPI_UBH(uspi));
 	ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
@@ -618,7 +618,7 @@ cg_found:
 		DQUOT_FREE_BLOCK(inode, i);
 
 		fs32_add(sb, &ucg->cg_cs.cs_nffree, i);
-		fs32_add(sb, &usb1->fs_cstotal.cs_nffree, i);
+		uspi->cs_total.cs_nffree += i;
 		fs32_add(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, i);
 		fs32_add(sb, &ucg->cg_frsum[i], 1);
 		goto succed;
@@ -635,7 +635,7 @@ cg_found:
 		ubh_clrbit (UCPI_UBH(ucpi), ucpi->c_freeoff, result + i);
 	
 	fs32_sub(sb, &ucg->cg_cs.cs_nffree, count);
-	fs32_sub(sb, &usb1->fs_cstotal.cs_nffree, count);
+	uspi->cs_total.cs_nffree -= count;
 	fs32_sub(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, count);
 	fs32_sub(sb, &ucg->cg_frsum[allocsize], 1);
 
@@ -703,7 +703,7 @@ gotit:
 	}
 
 	fs32_sub(sb, &ucg->cg_cs.cs_nbfree, 1);
-	fs32_sub(sb, &usb1->fs_cstotal.cs_nbfree, 1);
+	uspi->cs_total.cs_nbfree--;
 	fs32_sub(sb, &UFS_SB(sb)->fs_cs(ucpi->c_cgx).cs_nbfree, 1);
 	cylno = ufs_cbtocylno(result);
 	fs16_sub(sb, &ubh_cg_blks(ucpi, cylno, ufs_cbtorpos(result)), 1);
diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c
index c684aaad9998..6d7527350026 100644
--- a/fs/ufs/ialloc.c
+++ b/fs/ufs/ialloc.c
@@ -103,12 +103,12 @@ void ufs_free_inode (struct inode * inode)
 		if (ino < ucpi->c_irotor)
 			ucpi->c_irotor = ino;
 		fs32_add(sb, &ucg->cg_cs.cs_nifree, 1);
-		fs32_add(sb, &usb1->fs_cstotal.cs_nifree, 1);
+		uspi->cs_total.cs_nifree++;
 		fs32_add(sb, &UFS_SB(sb)->fs_cs(cg).cs_nifree, 1);
 
 		if (is_directory) {
 			fs32_sub(sb, &ucg->cg_cs.cs_ndir, 1);
-			fs32_sub(sb, &usb1->fs_cstotal.cs_ndir, 1);
+			uspi->cs_total.cs_ndir--;
 			fs32_sub(sb, &UFS_SB(sb)->fs_cs(cg).cs_ndir, 1);
 		}
 	}
@@ -228,12 +228,12 @@ cg_found:
 	}
 	
 	fs32_sub(sb, &ucg->cg_cs.cs_nifree, 1);
-	fs32_sub(sb, &usb1->fs_cstotal.cs_nifree, 1);
+	uspi->cs_total.cs_nifree--;
 	fs32_sub(sb, &sbi->fs_cs(cg).cs_nifree, 1);
 	
 	if (S_ISDIR(mode)) {
 		fs32_add(sb, &ucg->cg_cs.cs_ndir, 1);
-		fs32_add(sb, &usb1->fs_cstotal.cs_ndir, 1);
+		uspi->cs_total.cs_ndir++;
 		fs32_add(sb, &sbi->fs_cs(cg).cs_ndir, 1);
 	}
 
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 3aadbd3167a6..74ef5e9bedff 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -381,24 +381,57 @@ static int ufs_parse_options (char * options, unsigned * mount_options)
 }
 
 /*
- * Read on-disk structures associated with cylinder groups
+ * Diffrent types of UFS hold fs_cstotal in different
+ * places, and use diffrent data structure for it.
+ * To make things simplier we just copy fs_cstotal to ufs_sb_private_info
  */
-static int ufs_read_cylinder_structures (struct super_block *sb)
+static void ufs_setup_cstotal(struct super_block *sb)
 {
 	struct ufs_sb_info *sbi = UFS_SB(sb);
 	struct ufs_sb_private_info *uspi = sbi->s_uspi;
+	struct ufs_super_block_first *usb1;
+	struct ufs_super_block_second *usb2;
 	struct ufs_super_block_third *usb3;
+	unsigned mtype = sbi->s_mount_opt & UFS_MOUNT_UFSTYPE;
+
+	UFSD("ENTER, mtype=%u\n", mtype);
+	usb1 = ubh_get_usb_first(uspi);
+	usb2 = ubh_get_usb_second(uspi);
+	usb3 = ubh_get_usb_third(uspi);
+
+	if ((mtype == UFS_MOUNT_UFSTYPE_44BSD &&
+	     (usb1->fs_flags & UFS_FLAGS_UPDATED)) ||
+	    mtype == UFS_MOUNT_UFSTYPE_UFS2) {
+		/*we have statistic in different place, then usual*/
+		uspi->cs_total.cs_ndir = fs64_to_cpu(sb, usb2->fs_un.fs_u2.cs_ndir);
+		uspi->cs_total.cs_nbfree = fs64_to_cpu(sb, usb2->fs_un.fs_u2.cs_nbfree);
+		uspi->cs_total.cs_nifree = fs64_to_cpu(sb, usb3->fs_un1.fs_u2.cs_nifree);
+		uspi->cs_total.cs_nffree = fs64_to_cpu(sb, usb3->fs_un1.fs_u2.cs_nffree);
+	} else {
+		uspi->cs_total.cs_ndir = fs32_to_cpu(sb, usb1->fs_cstotal.cs_ndir);
+		uspi->cs_total.cs_nbfree = fs32_to_cpu(sb, usb1->fs_cstotal.cs_nbfree);
+		uspi->cs_total.cs_nifree = fs32_to_cpu(sb, usb1->fs_cstotal.cs_nifree);
+		uspi->cs_total.cs_nffree = fs32_to_cpu(sb, usb1->fs_cstotal.cs_nffree);
+	}
+	UFSD("EXIT\n");
+}
+
+/*
+ * Read on-disk structures associated with cylinder groups
+ */
+static int ufs_read_cylinder_structures(struct super_block *sb)
+{
+	struct ufs_sb_info *sbi = UFS_SB(sb);
+	struct ufs_sb_private_info *uspi = sbi->s_uspi;
+	unsigned flags = sbi->s_flags;
 	struct ufs_buffer_head * ubh;
 	unsigned char * base, * space;
 	unsigned size, blks, i;
-	unsigned flags = 0;
-	
+	struct ufs_super_block_third *usb3;
+
 	UFSD("ENTER\n");
-	
-	usb3 = ubh_get_usb_third(uspi);
 
-        flags = UFS_SB(sb)->s_flags;
-	
+	usb3 = ubh_get_usb_third(uspi);
 	/*
 	 * Read cs structures from (usually) first data block
 	 * on the device. 
@@ -475,21 +508,64 @@ failed:
 }
 
 /*
- * Put on-disk structures associated with cylinder groups and 
- * write them back to disk
+ * Sync our internal copy of fs_cstotal with disk
  */
-static void ufs_put_cylinder_structures (struct super_block *sb)
+static void ufs_put_cstotal(struct super_block *sb)
 {
-	struct ufs_sb_info * sbi = UFS_SB(sb);
-	struct ufs_sb_private_info * uspi;
+	unsigned mtype = UFS_SB(sb)->s_mount_opt & UFS_MOUNT_UFSTYPE;
+	struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi;
+	struct ufs_super_block_first *usb1;
+	struct ufs_super_block_second *usb2;
+	struct ufs_super_block_third *usb3;
+
+	UFSD("ENTER\n");
+	usb1 = ubh_get_usb_first(uspi);
+	usb2 = ubh_get_usb_second(uspi);
+	usb3 = ubh_get_usb_third(uspi);
+
+	if ((mtype == UFS_MOUNT_UFSTYPE_44BSD &&
+	     (usb1->fs_flags & UFS_FLAGS_UPDATED)) ||
+	    mtype == UFS_MOUNT_UFSTYPE_UFS2) {
+		/*we have statistic in different place, then usual*/
+		usb2->fs_un.fs_u2.cs_ndir =
+			cpu_to_fs64(sb, uspi->cs_total.cs_ndir);
+		usb2->fs_un.fs_u2.cs_nbfree =
+			cpu_to_fs64(sb, uspi->cs_total.cs_nbfree);
+		usb3->fs_un1.fs_u2.cs_nifree =
+			cpu_to_fs64(sb, uspi->cs_total.cs_nifree);
+		usb3->fs_un1.fs_u2.cs_nffree =
+			cpu_to_fs64(sb, uspi->cs_total.cs_nffree);
+	} else {
+		usb1->fs_cstotal.cs_ndir =
+			cpu_to_fs32(sb, uspi->cs_total.cs_ndir);
+		usb1->fs_cstotal.cs_nbfree =
+			cpu_to_fs32(sb, uspi->cs_total.cs_nbfree);
+		usb1->fs_cstotal.cs_nifree =
+			cpu_to_fs32(sb, uspi->cs_total.cs_nifree);
+		usb1->fs_cstotal.cs_nffree =
+			cpu_to_fs32(sb, uspi->cs_total.cs_nffree);
+	}
+	ubh_mark_buffer_dirty(USPI_UBH(uspi));
+	UFSD("EXIT\n");
+}
+
+/**
+ * ufs_put_super_internal() - put on-disk intrenal structures
+ * @sb: pointer to super_block structure
+ * Put on-disk structures associated with cylinder groups
+ * and write them back to disk, also update cs_total on disk
+ */
+static void ufs_put_super_internal(struct super_block *sb)
+{
+	struct ufs_sb_info *sbi = UFS_SB(sb);
+	struct ufs_sb_private_info *uspi = sbi->s_uspi;
 	struct ufs_buffer_head * ubh;
 	unsigned char * base, * space;
 	unsigned blks, size, i;
+
 	
 	UFSD("ENTER\n");
-	
-	uspi = sbi->s_uspi;
-
+	ufs_put_cstotal(sb);
 	size = uspi->s_cssize;
 	blks = (size + uspi->s_fsize - 1) >> uspi->s_fshift;
 	base = space = (char*) sbi->s_csp;
@@ -524,7 +600,6 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
 	struct ufs_super_block_first * usb1;
 	struct ufs_super_block_second * usb2;
 	struct ufs_super_block_third * usb3;
-	struct ufs_super_block *usb;
 	struct ufs_buffer_head * ubh;	
 	struct inode *inode;
 	unsigned block_size, super_block_size;
@@ -728,8 +803,6 @@ again:
 	usb1 = ubh_get_usb_first(uspi);
 	usb2 = ubh_get_usb_second(uspi);
 	usb3 = ubh_get_usb_third(uspi);
-	usb  = (struct ufs_super_block *)
-		((struct ufs_buffer_head *)uspi)->bh[0]->b_data ;
 
 	/*
 	 * Check ufs magic number
@@ -850,8 +923,7 @@ magic_found:
 			sb->s_flags |= MS_RDONLY;
 			break;
 		}
-	}
-	else {
+	} else {
 		printk("ufs_read_super: fs needs fsck\n");
 		sb->s_flags |= MS_RDONLY;
 	}
@@ -952,7 +1024,7 @@ magic_found:
 	if (!sb->s_root)
 		goto dalloc_failed;
 
-
+	ufs_setup_cstotal(sb);
 	/*
 	 * Read cylinder group structures
 	 */
@@ -1000,7 +1072,7 @@ static void ufs_write_super(struct super_block *sb)
 		  || (flags & UFS_ST_MASK) == UFS_ST_SUNx86)
 			ufs_set_fs_state(sb, usb1, usb3,
 					UFS_FSOK - fs32_to_cpu(sb, usb1->fs_time));
-		ubh_mark_buffer_dirty (USPI_UBH(uspi));
+		ufs_put_cstotal(sb);
 	}
 	sb->s_dirt = 0;
 	UFSD("EXIT\n");
@@ -1014,7 +1086,7 @@ static void ufs_put_super(struct super_block *sb)
 	UFSD("ENTER\n");
 
 	if (!(sb->s_flags & MS_RDONLY))
-		ufs_put_cylinder_structures (sb);
+		ufs_put_super_internal(sb);
 	
 	ubh_brelse_uspi (sbi->s_uspi);
 	kfree (sbi->s_uspi);
@@ -1049,8 +1121,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
 		return -EINVAL;
 	if (!(new_mount_opt & UFS_MOUNT_UFSTYPE)) {
 		new_mount_opt |= ufstype;
-	}
-	else if ((new_mount_opt & UFS_MOUNT_UFSTYPE) != ufstype) {
+	} else if ((new_mount_opt & UFS_MOUNT_UFSTYPE) != ufstype) {
 		printk("ufstype can't be changed during remount\n");
 		return -EINVAL;
 	}
@@ -1064,7 +1135,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
 	 * fs was mouted as rw, remounting ro
 	 */
 	if (*mount_flags & MS_RDONLY) {
-		ufs_put_cylinder_structures(sb);
+		ufs_put_super_internal(sb);
 		usb1->fs_time = cpu_to_fs32(sb, get_seconds());
 		if ((flags & UFS_ST_MASK) == UFS_ST_SUN
 		  || (flags & UFS_ST_MASK) == UFS_ST_SUNx86) 
@@ -1073,11 +1144,10 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
 		ubh_mark_buffer_dirty (USPI_UBH(uspi));
 		sb->s_dirt = 0;
 		sb->s_flags |= MS_RDONLY;
-	}
+	} else {
 	/*
 	 * fs was mounted as ro, remounting rw
 	 */
-	else {
 #ifndef CONFIG_UFS_FS_WRITE
 		printk("ufs was compiled with read-only support, "
 		"can't be mounted as read-write\n");
@@ -1089,7 +1159,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
 			printk("this ufstype is read-only supported\n");
 			return -EINVAL;
 		}
-		if (!ufs_read_cylinder_structures (sb)) {
+		if (!ufs_read_cylinder_structures(sb)) {
 			printk("failed during remounting\n");
 			return -EPERM;
 		}
@@ -1118,17 +1188,13 @@ static int ufs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) {
 		buf->f_type = UFS2_MAGIC;
 		buf->f_blocks = fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_dsize);
-		buf->f_bfree = ufs_blkstofrags(
-			fs64_to_cpu(sb, usb2->fs_un.fs_u2.cs_nbfree)) +
-			fs64_to_cpu(sb, usb3->fs_un1.fs_u2.cs_nffree);
-		buf->f_ffree = fs64_to_cpu(sb, usb3->fs_un1.fs_u2.cs_nifree);
 	} else {
 		buf->f_type = UFS_MAGIC;
 		buf->f_blocks = uspi->s_dsize;
-		buf->f_bfree = ufs_blkstofrags(fs32_to_cpu(sb, usb1->fs_cstotal.cs_nbfree)) +
-			fs32_to_cpu(sb, usb1->fs_cstotal.cs_nffree);
-		buf->f_ffree = fs32_to_cpu(sb, usb1->fs_cstotal.cs_nifree);
 	}
+	buf->f_bfree = ufs_blkstofrags(uspi->cs_total.cs_nbfree) +
+		uspi->cs_total.cs_nffree;
+	buf->f_ffree = uspi->cs_total.cs_nifree;
 	buf->f_bsize = sb->s_blocksize;
 	buf->f_bavail = (buf->f_bfree > (((long)buf->f_blocks / 100) * uspi->s_minfree))
 		? (buf->f_bfree - (((long)buf->f_blocks / 100) * uspi->s_minfree)) : 0;
diff --git a/fs/ufs/util.h b/fs/ufs/util.h
index eacd5e37b8e6..99bfd6bba6d0 100644
--- a/fs/ufs/util.h
+++ b/fs/ufs/util.h
@@ -306,9 +306,13 @@ static inline void *get_usb_offset(struct ufs_sb_private_info *uspi,
  * Determine the number of available frags given a
  * percentage to hold in reserve.
  */
-#define ufs_freespace(usb, percentreserved) \
-	(ufs_blkstofrags(fs32_to_cpu(sb, (usb)->fs_cstotal.cs_nbfree)) + \
-	fs32_to_cpu(sb, (usb)->fs_cstotal.cs_nffree) - (uspi->s_dsize * (percentreserved) / 100))
+static inline u64
+ufs_freespace(struct ufs_sb_private_info *uspi, int percentreserved)
+{
+	return ufs_blkstofrags(uspi->cs_total.cs_nbfree) +
+		uspi->cs_total.cs_nffree -
+		(uspi->s_dsize * (percentreserved) / 100);
+}
 
 /*
  * Macros to access cylinder group array structures
diff --git a/include/linux/ufs_fs.h b/include/linux/ufs_fs.h
index 87b0a658bec8..9a7f9b26564d 100644
--- a/include/linux/ufs_fs.h
+++ b/include/linux/ufs_fs.h
@@ -351,6 +351,17 @@ struct ufs2_csum_total {
 	__fs64   cs_spare[3];	/* future expansion */
 };
 
+/*
+ * File system flags
+ */
+#define UFS_UNCLEAN      0x01    /* file system not clean at mount (unused) */
+#define UFS_DOSOFTDEP    0x02    /* file system using soft dependencies */
+#define UFS_NEEDSFSCK    0x04    /* needs sync fsck (FreeBSD compat, unused) */
+#define UFS_INDEXDIRS    0x08    /* kernel supports indexed directories */
+#define UFS_ACLS         0x10    /* file system has ACLs enabled */
+#define UFS_MULTILABEL   0x20    /* file system is MAC multi-label */
+#define UFS_FLAGS_UPDATED 0x80   /* flags have been moved to new location */
+
 #if 0
 /*
  * This is the actual superblock, as it is laid out on the disk.
@@ -433,7 +444,7 @@ struct ufs_super_block {
 	__s8	fs_fmod;	/* super block modified flag */
 	__s8	fs_clean;	/* file system is clean flag */
 	__s8	fs_ronly;	/* mounted read-only flag */
-	__s8	fs_flags;	/* currently unused flag */
+	__s8	fs_flags;
 	union {
 		struct {
 			__s8	fs_fsmnt[UFS_MAXMNTLEN];/* name mounted on */
@@ -704,6 +715,7 @@ struct ufs_cg_private_info {
 
 struct ufs_sb_private_info {
 	struct ufs_buffer_head s_ubh; /* buffer containing super block */
+	struct ufs2_csum_total cs_total;
 	__u32	s_sblkno;	/* offset of super-blocks in filesys */
 	__u32	s_cblkno;	/* offset of cg-block in filesys */
 	__u32	s_iblkno;	/* offset of inode-blocks in filesys */
-- 
cgit v1.2.3


From 138bb68ac9d49b0ea7eeecb3a245dc4e20f181da Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Sun, 25 Jun 2006 05:47:32 -0700
Subject: [PATCH] fs/ufs/inode.c: make 2 functions static

Make two needlessly global functions static.

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/ufs/inode.c         | 9 ++++++---
 include/linux/ufs_fs.h | 2 --
 2 files changed, 6 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index 01c5f19cbabe..f2dbdf5a8769 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -41,6 +41,8 @@
 #include "swab.h"
 #include "util.h"
 
+static u64 ufs_frag_map(struct inode *inode, sector_t frag);
+
 static int ufs_block_to_path(struct inode *inode, sector_t i_block, sector_t offsets[4])
 {
 	struct ufs_sb_private_info *uspi = UFS_SB(inode->i_sb)->s_uspi;
@@ -80,7 +82,7 @@ static int ufs_block_to_path(struct inode *inode, sector_t i_block, sector_t off
  * the begining of the filesystem.
  */
 
-u64  ufs_frag_map(struct inode *inode, sector_t frag)
+static u64 ufs_frag_map(struct inode *inode, sector_t frag)
 {
 	struct ufs_inode_info *ufsi = UFS_I(inode);
 	struct super_block *sb = inode->i_sb;
@@ -514,8 +516,9 @@ abort_too_big:
 	goto abort;
 }
 
-struct buffer_head *ufs_getfrag(struct inode *inode, unsigned int fragment,
-				int create, int *err)
+static struct buffer_head *ufs_getfrag(struct inode *inode,
+				       unsigned int fragment,
+				       int create, int *err)
 {
 	struct buffer_head dummy;
 	int error;
diff --git a/include/linux/ufs_fs.h b/include/linux/ufs_fs.h
index 9a7f9b26564d..914f911325be 100644
--- a/include/linux/ufs_fs.h
+++ b/include/linux/ufs_fs.h
@@ -973,13 +973,11 @@ extern void ufs_free_inode (struct inode *inode);
 extern struct inode * ufs_new_inode (struct inode *, int);
 
 /* inode.c */
-extern u64  ufs_frag_map (struct inode *, sector_t);
 extern void ufs_read_inode (struct inode *);
 extern void ufs_put_inode (struct inode *);
 extern int ufs_write_inode (struct inode *, int);
 extern int ufs_sync_inode (struct inode *);
 extern void ufs_delete_inode (struct inode *);
-extern struct buffer_head * ufs_getfrag (struct inode *, unsigned, int, int *);
 extern struct buffer_head * ufs_bread (struct inode *, unsigned, int, int *);
 extern int ufs_getfrag_block (struct inode *inode, sector_t fragment, struct buffer_head *bh_result, int create);
 
-- 
cgit v1.2.3


From 76a8ad293912cd2f01eca075d80cd0ddec30c627 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <michael@ellerman.id.au>
Date: Sun, 25 Jun 2006 05:47:40 -0700
Subject: [PATCH] Make printk work for really early debugging

Currently printk is no use for early debugging because it refuses to
actually print anything to the console unless
cpu_online(smp_processor_id()) is true.

The stated explanation is that console drivers may require per-cpu
resources, or otherwise barf, because the system is not yet setup
correctly.  Fair enough.

However some console drivers might be quite happy running early during
boot, in fact we have one, and so it'd be nice if printk understood that.

So I added a flag (which I would have called CON_BOOT, but that's taken)
called CON_ANYTIME, which indicates that a console is happy to be called
anytime, even if the cpu is not yet online.

Tested on a Power 5 machine, with both a CON_ANYTIME driver and a bogus
console driver that BUG()s if called while offline.  No problems AFAICT.
Built for i386 UP & SMP.

Signed-off-by: Michael Ellerman <michael@ellerman.id.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/console.h |  1 +
 kernel/printk.c         | 50 ++++++++++++++++++++++++++++++++-----------------
 2 files changed, 34 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/console.h b/include/linux/console.h
index 08734e660d41..d0f8a8009490 100644
--- a/include/linux/console.h
+++ b/include/linux/console.h
@@ -87,6 +87,7 @@ void give_up_console(const struct consw *sw);
 #define CON_CONSDEV	(2) /* Last on the command line */
 #define CON_ENABLED	(4)
 #define CON_BOOT	(8)
+#define CON_ANYTIME	(16) /* Safe to call when cpu is offline */
 
 struct console
 {
diff --git a/kernel/printk.c b/kernel/printk.c
index 19a955619294..6b89dd9d11b6 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -327,7 +327,9 @@ static void __call_console_drivers(unsigned long start, unsigned long end)
 	struct console *con;
 
 	for (con = console_drivers; con; con = con->next) {
-		if ((con->flags & CON_ENABLED) && con->write)
+		if ((con->flags & CON_ENABLED) && con->write &&
+				(cpu_online(smp_processor_id()) ||
+				(con->flags & CON_ANYTIME)))
 			con->write(con, &LOG_BUF(start), end - start);
 	}
 }
@@ -453,6 +455,18 @@ __attribute__((weak)) unsigned long long printk_clock(void)
 	return sched_clock();
 }
 
+/* Check if we have any console registered that can be called early in boot. */
+static int have_callable_console(void)
+{
+	struct console *con;
+
+	for (con = console_drivers; con; con = con->next)
+		if (con->flags & CON_ANYTIME)
+			return 1;
+
+	return 0;
+}
+
 /**
  * printk - print a kernel message
  * @fmt: format string
@@ -566,27 +580,29 @@ asmlinkage int vprintk(const char *fmt, va_list args)
 			log_level_unknown = 1;
 	}
 
-	if (!cpu_online(smp_processor_id())) {
+	if (!down_trylock(&console_sem)) {
 		/*
-		 * Some console drivers may assume that per-cpu resources have
-		 * been allocated.  So don't allow them to be called by this
-		 * CPU until it is officially up.  We shouldn't be calling into
-		 * random console drivers on a CPU which doesn't exist yet..
+		 * We own the drivers.  We can drop the spinlock and
+		 * let release_console_sem() print the text, maybe ...
 		 */
+		console_locked = 1;
 		printk_cpu = UINT_MAX;
 		spin_unlock_irqrestore(&logbuf_lock, flags);
-		goto out;
-	}
-	if (!down_trylock(&console_sem)) {
-		console_locked = 1;
+
 		/*
-		 * We own the drivers.  We can drop the spinlock and let
-		 * release_console_sem() print the text
+		 * Console drivers may assume that per-cpu resources have
+		 * been allocated. So unless they're explicitly marked as
+		 * being able to cope (CON_ANYTIME) don't call them until
+		 * this CPU is officially up.
 		 */
-		printk_cpu = UINT_MAX;
-		spin_unlock_irqrestore(&logbuf_lock, flags);
-		console_may_schedule = 0;
-		release_console_sem();
+		if (cpu_online(smp_processor_id()) || have_callable_console()) {
+			console_may_schedule = 0;
+			release_console_sem();
+		} else {
+			/* Release by hand to avoid flushing the buffer. */
+			console_locked = 0;
+			up(&console_sem);
+		}
 	} else {
 		/*
 		 * Someone else owns the drivers.  We drop the spinlock, which
@@ -596,7 +612,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
 		printk_cpu = UINT_MAX;
 		spin_unlock_irqrestore(&logbuf_lock, flags);
 	}
-out:
+
 	preempt_enable();
 	return printed_len;
 }
-- 
cgit v1.2.3


From 83cc5ed3c4c65fc4c3729a5cec2111ede1ebf85e Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Sun, 25 Jun 2006 05:47:41 -0700
Subject: [PATCH] kernel/sys.c: cleanups

- proper prototypes for the following functions:
  - ctrl_alt_del()  (in include/linux/reboot.h)
  - getrusage()     (in include/linux/resource.h)
- make the following needlessly global functions static:
  - kernel_restart_prepare()
  - kernel_kexec()

[akpm@osdl.org: compile fix]
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/arm/mach-ixp4xx/nas100d-power.c | 3 +--
 arch/arm/mach-ixp4xx/nslu2-power.c   | 3 +--
 arch/mips/kernel/irixsig.c           | 3 +--
 arch/mips/kernel/sysirix.c           | 2 +-
 arch/um/drivers/mconsole_kern.c      | 2 --
 drivers/char/keyboard.c              | 1 +
 drivers/s390/char/sclp_quiesce.c     | 3 +--
 include/linux/reboot.h               | 4 ++--
 include/linux/resource.h             | 4 ++++
 kernel/exit.c                        | 3 +--
 kernel/sys.c                         | 5 ++---
 11 files changed, 15 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/mach-ixp4xx/nas100d-power.c b/arch/arm/mach-ixp4xx/nas100d-power.c
index 99d333d7ebdd..a3745ed37f9f 100644
--- a/arch/arm/mach-ixp4xx/nas100d-power.c
+++ b/arch/arm/mach-ixp4xx/nas100d-power.c
@@ -20,11 +20,10 @@
 #include <linux/module.h>
 #include <linux/reboot.h>
 #include <linux/interrupt.h>
+#include <linux/reboot.h>
 
 #include <asm/mach-types.h>
 
-extern void ctrl_alt_del(void);
-
 static irqreturn_t nas100d_reset_handler(int irq, void *dev_id, struct pt_regs *regs)
 {
 	/* Signal init to do the ctrlaltdel action, this will bypass init if
diff --git a/arch/arm/mach-ixp4xx/nslu2-power.c b/arch/arm/mach-ixp4xx/nslu2-power.c
index d80c362bc539..6d38e97142cc 100644
--- a/arch/arm/mach-ixp4xx/nslu2-power.c
+++ b/arch/arm/mach-ixp4xx/nslu2-power.c
@@ -20,11 +20,10 @@
 #include <linux/module.h>
 #include <linux/reboot.h>
 #include <linux/interrupt.h>
+#include <linux/reboot.h>
 
 #include <asm/mach-types.h>
 
-extern void ctrl_alt_del(void);
-
 static irqreturn_t nslu2_power_handler(int irq, void *dev_id, struct pt_regs *regs)
 {
 	/* Signal init to do the ctrlaltdel action, this will bypass init if
diff --git a/arch/mips/kernel/irixsig.c b/arch/mips/kernel/irixsig.c
index a9bf6cc3abd1..676e868d26fb 100644
--- a/arch/mips/kernel/irixsig.c
+++ b/arch/mips/kernel/irixsig.c
@@ -13,6 +13,7 @@
 #include <linux/smp_lock.h>
 #include <linux/time.h>
 #include <linux/ptrace.h>
+#include <linux/resource.h>
 
 #include <asm/ptrace.h>
 #include <asm/uaccess.h>
@@ -540,8 +541,6 @@ out:
 #define IRIX_P_PGID   2
 #define IRIX_P_ALL    7
 
-extern int getrusage(struct task_struct *, int, struct rusage __user *);
-
 #define W_EXITED     1
 #define W_TRAPPED    2
 #define W_STOPPED    4
diff --git a/arch/mips/kernel/sysirix.c b/arch/mips/kernel/sysirix.c
index 19e1ef43eb4b..1137dd6ea7aa 100644
--- a/arch/mips/kernel/sysirix.c
+++ b/arch/mips/kernel/sysirix.c
@@ -31,6 +31,7 @@
 #include <linux/socket.h>
 #include <linux/security.h>
 #include <linux/syscalls.h>
+#include <linux/resource.h>
 
 #include <asm/ptrace.h>
 #include <asm/page.h>
@@ -235,7 +236,6 @@ asmlinkage int irix_prctl(unsigned option, ...)
 #undef DEBUG_PROCGRPS
 
 extern unsigned long irix_mapelf(int fd, struct elf_phdr __user *user_phdrp, int cnt);
-extern int getrusage(struct task_struct *p, int who, struct rusage __user *ru);
 extern char *prom_getenv(char *name);
 extern long prom_setenv(char *name, char *value);
 
diff --git a/arch/um/drivers/mconsole_kern.c b/arch/um/drivers/mconsole_kern.c
index 6d7173fc55a3..79149314ed04 100644
--- a/arch/um/drivers/mconsole_kern.c
+++ b/arch/um/drivers/mconsole_kern.c
@@ -300,8 +300,6 @@ void mconsole_reboot(struct mc_request *req)
 	machine_restart(NULL);
 }
 
-extern void ctrl_alt_del(void);
-
 void mconsole_cad(struct mc_request *req)
 {
 	mconsole_reply(req, "", 0, 0);
diff --git a/drivers/char/keyboard.c b/drivers/char/keyboard.c
index 5755b7e5f187..edd996f6fb87 100644
--- a/drivers/char/keyboard.c
+++ b/drivers/char/keyboard.c
@@ -39,6 +39,7 @@
 #include <linux/vt_kern.h>
 #include <linux/sysrq.h>
 #include <linux/input.h>
+#include <linux/reboot.h>
 
 static void kbd_disconnect(struct input_handle *handle);
 extern void ctrl_alt_del(void);
diff --git a/drivers/s390/char/sclp_quiesce.c b/drivers/s390/char/sclp_quiesce.c
index 56fa69168898..a4c53c172db6 100644
--- a/drivers/s390/char/sclp_quiesce.c
+++ b/drivers/s390/char/sclp_quiesce.c
@@ -13,6 +13,7 @@
 #include <linux/cpumask.h>
 #include <linux/smp.h>
 #include <linux/init.h>
+#include <linux/reboot.h>
 #include <asm/atomic.h>
 #include <asm/ptrace.h>
 #include <asm/sigp.h>
@@ -66,8 +67,6 @@ do_machine_quiesce(void)
 }
 #endif
 
-extern void ctrl_alt_del(void);
-
 /* Handler for quiesce event. Start shutdown procedure. */
 static void
 sclp_quiesce_handler(struct evbuf_header *evbuf)
diff --git a/include/linux/reboot.h b/include/linux/reboot.h
index 015297ff73fa..1dd1c707311f 100644
--- a/include/linux/reboot.h
+++ b/include/linux/reboot.h
@@ -59,13 +59,13 @@ extern void machine_crash_shutdown(struct pt_regs *);
  * Architecture independent implemenations of sys_reboot commands.
  */
 
-extern void kernel_restart_prepare(char *cmd);
 extern void kernel_shutdown_prepare(enum system_states state);
 
 extern void kernel_restart(char *cmd);
 extern void kernel_halt(void);
 extern void kernel_power_off(void);
-extern void kernel_kexec(void);
+
+void ctrl_alt_del(void);
 
 /*
  * Emergency restart, callable from an interrupt handler.
diff --git a/include/linux/resource.h b/include/linux/resource.h
index 21a86cb6acdb..ae13db714742 100644
--- a/include/linux/resource.h
+++ b/include/linux/resource.h
@@ -3,6 +3,8 @@
 
 #include <linux/time.h>
 
+struct task_struct;
+
 /*
  * Resource control/accounting header file for linux
  */
@@ -67,4 +69,6 @@ struct rlimit {
  */
 #include <asm/resource.h>
 
+int getrusage(struct task_struct *p, int who, struct rusage __user *ru);
+
 #endif
diff --git a/kernel/exit.c b/kernel/exit.c
index a3baf92462bd..b12a4706f73f 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -36,6 +36,7 @@
 #include <linux/compat.h>
 #include <linux/pipe_fs_i.h>
 #include <linux/audit.h> /* for audit_free() */
+#include <linux/resource.h>
 
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
@@ -45,8 +46,6 @@
 extern void sem_exit (void);
 extern struct task_struct *child_reaper;
 
-int getrusage(struct task_struct *, int, struct rusage __user *);
-
 static void exit_mm(struct task_struct * tsk);
 
 static void __unhash_process(struct task_struct *p)
diff --git a/kernel/sys.c b/kernel/sys.c
index 7e0927bad713..2d5179c67cec 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -589,7 +589,7 @@ void emergency_restart(void)
 }
 EXPORT_SYMBOL_GPL(emergency_restart);
 
-void kernel_restart_prepare(char *cmd)
+static void kernel_restart_prepare(char *cmd)
 {
 	blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd);
 	system_state = SYSTEM_RESTART;
@@ -623,7 +623,7 @@ EXPORT_SYMBOL_GPL(kernel_restart);
  *	Move into place and start executing a preloaded standalone
  *	executable.  If nothing was preloaded return an error.
  */
-void kernel_kexec(void)
+static void kernel_kexec(void)
 {
 #ifdef CONFIG_KEXEC
 	struct kimage *image;
@@ -637,7 +637,6 @@ void kernel_kexec(void)
 	machine_kexec(image);
 #endif
 }
-EXPORT_SYMBOL_GPL(kernel_kexec);
 
 void kernel_shutdown_prepare(enum system_states state)
 {
-- 
cgit v1.2.3


From fe96e57d77481c8c1b6b0381d7e086870ac394fa Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@xenotime.net>
Date: Sun, 25 Jun 2006 05:47:42 -0700
Subject: [PATCH] fix list.h kernel-doc

kernel-doc:

Put all short function descriptions on one line or if they are too long,
omit the short description & add a Description: section for them.

Change some list iterator descriptions to use "current" point instead of
"existing" point.

Signed-off-by: Randy Dunlap <rdunlap@xenotime.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/list.h | 82 +++++++++++++++++++++++++++++++++-------------------
 1 file changed, 52 insertions(+), 30 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/list.h b/include/linux/list.h
index a02642e4710a..88ecfa8f31c3 100644
--- a/include/linux/list.h
+++ b/include/linux/list.h
@@ -281,16 +281,17 @@ static inline int list_empty(const struct list_head *head)
 }
 
 /**
- * list_empty_careful - tests whether a list is
- * empty _and_ checks that no other CPU might be
- * in the process of still modifying either member
+ * list_empty_careful - tests whether a list is empty and not being modified
+ * @head: the list to test
+ *
+ * Description:
+ * tests whether a list is empty _and_ checks that no other CPU might be
+ * in the process of modifying either member (next or prev)
  *
  * NOTE: using list_empty_careful() without synchronization
  * can only be safe if the only activity that can happen
  * to the list entry is list_del_init(). Eg. it cannot be used
  * if another CPU could re-list_add() it.
- *
- * @head: the list to test.
  */
 static inline int list_empty_careful(const struct list_head *head)
 {
@@ -380,7 +381,7 @@ static inline void list_splice_init(struct list_head *list,
         	pos = pos->prev)
 
 /**
- * list_for_each_safe	-	iterate over a list safe against removal of list entry
+ * list_for_each_safe - iterate over a list safe against removal of list entry
  * @pos:	the &struct list_head to use as a loop counter.
  * @n:		another &struct list_head to use as temporary storage
  * @head:	the head for your list.
@@ -412,21 +413,24 @@ static inline void list_splice_init(struct list_head *list,
 	     pos = list_entry(pos->member.prev, typeof(*pos), member))
 
 /**
- * list_prepare_entry - prepare a pos entry for use as a start point in
- *			list_for_each_entry_continue
+ * list_prepare_entry - prepare a pos entry for use in list_for_each_entry_continue
  * @pos:	the type * to use as a start point
  * @head:	the head of the list
  * @member:	the name of the list_struct within the struct.
+ *
+ * Prepares a pos entry for use as a start point in list_for_each_entry_continue.
  */
 #define list_prepare_entry(pos, head, member) \
 	((pos) ? : list_entry(head, typeof(*pos), member))
 
 /**
- * list_for_each_entry_continue -	iterate over list of given type
- *			continuing after existing point
+ * list_for_each_entry_continue - continue iteration over list of given type
  * @pos:	the type * to use as a loop counter.
  * @head:	the head for your list.
  * @member:	the name of the list_struct within the struct.
+ *
+ * Continue to iterate over list of given type, continuing after
+ * the current position.
  */
 #define list_for_each_entry_continue(pos, head, member) 		\
 	for (pos = list_entry(pos->member.next, typeof(*pos), member);	\
@@ -434,11 +438,12 @@ static inline void list_splice_init(struct list_head *list,
 	     pos = list_entry(pos->member.next, typeof(*pos), member))
 
 /**
- * list_for_each_entry_from -	iterate over list of given type
- *			continuing from existing point
+ * list_for_each_entry_from - iterate over list of given type from the current point
  * @pos:	the type * to use as a loop counter.
  * @head:	the head for your list.
  * @member:	the name of the list_struct within the struct.
+ *
+ * Iterate over list of given type, continuing from current position.
  */
 #define list_for_each_entry_from(pos, head, member) 			\
 	for (; prefetch(pos->member.next), &pos->member != (head);	\
@@ -458,12 +463,14 @@ static inline void list_splice_init(struct list_head *list,
 	     pos = n, n = list_entry(n->member.next, typeof(*n), member))
 
 /**
- * list_for_each_entry_safe_continue -	iterate over list of given type
- *			continuing after existing point safe against removal of list entry
+ * list_for_each_entry_safe_continue
  * @pos:	the type * to use as a loop counter.
  * @n:		another type * to use as temporary storage
  * @head:	the head for your list.
  * @member:	the name of the list_struct within the struct.
+ *
+ * Iterate over list of given type, continuing after current point,
+ * safe against removal of list entry.
  */
 #define list_for_each_entry_safe_continue(pos, n, head, member) 		\
 	for (pos = list_entry(pos->member.next, typeof(*pos), member), 		\
@@ -472,12 +479,14 @@ static inline void list_splice_init(struct list_head *list,
 	     pos = n, n = list_entry(n->member.next, typeof(*n), member))
 
 /**
- * list_for_each_entry_safe_from - iterate over list of given type
- *			from existing point safe against removal of list entry
+ * list_for_each_entry_safe_from
  * @pos:	the type * to use as a loop counter.
  * @n:		another type * to use as temporary storage
  * @head:	the head for your list.
  * @member:	the name of the list_struct within the struct.
+ *
+ * Iterate over list of given type from current point, safe against
+ * removal of list entry.
  */
 #define list_for_each_entry_safe_from(pos, n, head, member) 			\
 	for (n = list_entry(pos->member.next, typeof(*pos), member);		\
@@ -485,12 +494,14 @@ static inline void list_splice_init(struct list_head *list,
 	     pos = n, n = list_entry(n->member.next, typeof(*n), member))
 
 /**
- * list_for_each_entry_safe_reverse - iterate backwards over list of given type safe against
- *				      removal of list entry
+ * list_for_each_entry_safe_reverse
  * @pos:	the type * to use as a loop counter.
  * @n:		another type * to use as temporary storage
  * @head:	the head for your list.
  * @member:	the name of the list_struct within the struct.
+ *
+ * Iterate backwards over list of given type, safe against removal
+ * of list entry.
  */
 #define list_for_each_entry_safe_reverse(pos, n, head, member)		\
 	for (pos = list_entry((head)->prev, typeof(*pos), member),	\
@@ -518,12 +529,13 @@ static inline void list_splice_init(struct list_head *list,
         	pos = pos->next)
 
 /**
- * list_for_each_safe_rcu	-	iterate over an rcu-protected list safe
- *					against removal of list entry
+ * list_for_each_safe_rcu
  * @pos:	the &struct list_head to use as a loop counter.
  * @n:		another &struct list_head to use as temporary storage
  * @head:	the head for your list.
  *
+ * Iterate over an rcu-protected list, safe against removal of list entry.
+ *
  * This list-traversal primitive may safely run concurrently with
  * the _rcu list-mutation primitives such as list_add_rcu()
  * as long as the traversal is guarded by rcu_read_lock().
@@ -551,11 +563,12 @@ static inline void list_splice_init(struct list_head *list,
 
 
 /**
- * list_for_each_continue_rcu	-	iterate over an rcu-protected list
- *			continuing after existing point.
+ * list_for_each_continue_rcu
  * @pos:	the &struct list_head to use as a loop counter.
  * @head:	the head for your list.
  *
+ * Iterate over an rcu-protected list, continuing after current point.
+ *
  * This list-traversal primitive may safely run concurrently with
  * the _rcu list-mutation primitives such as list_add_rcu()
  * as long as the traversal is guarded by rcu_read_lock().
@@ -681,11 +694,14 @@ static inline void hlist_add_head(struct hlist_node *n, struct hlist_head *h)
 
 
 /**
- * hlist_add_head_rcu - adds the specified element to the specified hlist,
- * while permitting racing traversals.
+ * hlist_add_head_rcu
  * @n: the element to add to the hash list.
  * @h: the list to add to.
  *
+ * Description:
+ * Adds the specified element to the specified hlist,
+ * while permitting racing traversals.
+ *
  * The caller must take whatever precautions are necessary
  * (such as holding appropriate locks) to avoid racing
  * with another list-mutation primitive, such as hlist_add_head_rcu()
@@ -730,11 +746,14 @@ static inline void hlist_add_after(struct hlist_node *n,
 }
 
 /**
- * hlist_add_before_rcu - adds the specified element to the specified hlist
- * before the specified node while permitting racing traversals.
+ * hlist_add_before_rcu
  * @n: the new element to add to the hash list.
  * @next: the existing element to add the new element before.
  *
+ * Description:
+ * Adds the specified element to the specified hlist
+ * before the specified node while permitting racing traversals.
+ *
  * The caller must take whatever precautions are necessary
  * (such as holding appropriate locks) to avoid racing
  * with another list-mutation primitive, such as hlist_add_head_rcu()
@@ -755,11 +774,14 @@ static inline void hlist_add_before_rcu(struct hlist_node *n,
 }
 
 /**
- * hlist_add_after_rcu - adds the specified element to the specified hlist
- * after the specified node while permitting racing traversals.
+ * hlist_add_after_rcu
  * @prev: the existing element to add the new element after.
  * @n: the new element to add to the hash list.
  *
+ * Description:
+ * Adds the specified element to the specified hlist
+ * after the specified node while permitting racing traversals.
+ *
  * The caller must take whatever precautions are necessary
  * (such as holding appropriate locks) to avoid racing
  * with another list-mutation primitive, such as hlist_add_head_rcu()
@@ -804,7 +826,7 @@ static inline void hlist_add_after_rcu(struct hlist_node *prev,
 	     pos = pos->next)
 
 /**
- * hlist_for_each_entry_continue - iterate over a hlist continuing after existing point
+ * hlist_for_each_entry_continue - iterate over a hlist continuing after current point
  * @tpos:	the type * to use as a loop counter.
  * @pos:	the &struct hlist_node to use as a loop counter.
  * @member:	the name of the hlist_node within the struct.
@@ -816,7 +838,7 @@ static inline void hlist_add_after_rcu(struct hlist_node *prev,
 	     pos = pos->next)
 
 /**
- * hlist_for_each_entry_from - iterate over a hlist continuing from existing point
+ * hlist_for_each_entry_from - iterate over a hlist continuing from current point
  * @tpos:	the type * to use as a loop counter.
  * @pos:	the &struct hlist_node to use as a loop counter.
  * @member:	the name of the hlist_node within the struct.
-- 
cgit v1.2.3


From 8e3a67a99231f9f3f476bc3449e93c9a6a17f2e0 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@xenotime.net>
Date: Sun, 25 Jun 2006 05:47:43 -0700
Subject: [PATCH] list.h doc: change "counter" to "cursor"

Use loop "cursor" instead of loop "counter" for list iterator descriptions.
They are not counters, they are pointers or positions.

Signed-off-by: Randy Dunlap <rdunlap@xenotime.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/list.h | 52 ++++++++++++++++++++++++++--------------------------
 1 file changed, 26 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/list.h b/include/linux/list.h
index 88ecfa8f31c3..37ca31b21bb7 100644
--- a/include/linux/list.h
+++ b/include/linux/list.h
@@ -351,7 +351,7 @@ static inline void list_splice_init(struct list_head *list,
 
 /**
  * list_for_each	-	iterate over a list
- * @pos:	the &struct list_head to use as a loop counter.
+ * @pos:	the &struct list_head to use as a loop cursor.
  * @head:	the head for your list.
  */
 #define list_for_each(pos, head) \
@@ -360,7 +360,7 @@ static inline void list_splice_init(struct list_head *list,
 
 /**
  * __list_for_each	-	iterate over a list
- * @pos:	the &struct list_head to use as a loop counter.
+ * @pos:	the &struct list_head to use as a loop cursor.
  * @head:	the head for your list.
  *
  * This variant differs from list_for_each() in that it's the
@@ -373,7 +373,7 @@ static inline void list_splice_init(struct list_head *list,
 
 /**
  * list_for_each_prev	-	iterate over a list backwards
- * @pos:	the &struct list_head to use as a loop counter.
+ * @pos:	the &struct list_head to use as a loop cursor.
  * @head:	the head for your list.
  */
 #define list_for_each_prev(pos, head) \
@@ -382,7 +382,7 @@ static inline void list_splice_init(struct list_head *list,
 
 /**
  * list_for_each_safe - iterate over a list safe against removal of list entry
- * @pos:	the &struct list_head to use as a loop counter.
+ * @pos:	the &struct list_head to use as a loop cursor.
  * @n:		another &struct list_head to use as temporary storage
  * @head:	the head for your list.
  */
@@ -392,7 +392,7 @@ static inline void list_splice_init(struct list_head *list,
 
 /**
  * list_for_each_entry	-	iterate over list of given type
- * @pos:	the type * to use as a loop counter.
+ * @pos:	the type * to use as a loop cursor.
  * @head:	the head for your list.
  * @member:	the name of the list_struct within the struct.
  */
@@ -403,7 +403,7 @@ static inline void list_splice_init(struct list_head *list,
 
 /**
  * list_for_each_entry_reverse - iterate backwards over list of given type.
- * @pos:	the type * to use as a loop counter.
+ * @pos:	the type * to use as a loop cursor.
  * @head:	the head for your list.
  * @member:	the name of the list_struct within the struct.
  */
@@ -425,7 +425,7 @@ static inline void list_splice_init(struct list_head *list,
 
 /**
  * list_for_each_entry_continue - continue iteration over list of given type
- * @pos:	the type * to use as a loop counter.
+ * @pos:	the type * to use as a loop cursor.
  * @head:	the head for your list.
  * @member:	the name of the list_struct within the struct.
  *
@@ -439,7 +439,7 @@ static inline void list_splice_init(struct list_head *list,
 
 /**
  * list_for_each_entry_from - iterate over list of given type from the current point
- * @pos:	the type * to use as a loop counter.
+ * @pos:	the type * to use as a loop cursor.
  * @head:	the head for your list.
  * @member:	the name of the list_struct within the struct.
  *
@@ -451,7 +451,7 @@ static inline void list_splice_init(struct list_head *list,
 
 /**
  * list_for_each_entry_safe - iterate over list of given type safe against removal of list entry
- * @pos:	the type * to use as a loop counter.
+ * @pos:	the type * to use as a loop cursor.
  * @n:		another type * to use as temporary storage
  * @head:	the head for your list.
  * @member:	the name of the list_struct within the struct.
@@ -464,7 +464,7 @@ static inline void list_splice_init(struct list_head *list,
 
 /**
  * list_for_each_entry_safe_continue
- * @pos:	the type * to use as a loop counter.
+ * @pos:	the type * to use as a loop cursor.
  * @n:		another type * to use as temporary storage
  * @head:	the head for your list.
  * @member:	the name of the list_struct within the struct.
@@ -480,7 +480,7 @@ static inline void list_splice_init(struct list_head *list,
 
 /**
  * list_for_each_entry_safe_from
- * @pos:	the type * to use as a loop counter.
+ * @pos:	the type * to use as a loop cursor.
  * @n:		another type * to use as temporary storage
  * @head:	the head for your list.
  * @member:	the name of the list_struct within the struct.
@@ -495,7 +495,7 @@ static inline void list_splice_init(struct list_head *list,
 
 /**
  * list_for_each_entry_safe_reverse
- * @pos:	the type * to use as a loop counter.
+ * @pos:	the type * to use as a loop cursor.
  * @n:		another type * to use as temporary storage
  * @head:	the head for your list.
  * @member:	the name of the list_struct within the struct.
@@ -511,7 +511,7 @@ static inline void list_splice_init(struct list_head *list,
 
 /**
  * list_for_each_rcu	-	iterate over an rcu-protected list
- * @pos:	the &struct list_head to use as a loop counter.
+ * @pos:	the &struct list_head to use as a loop cursor.
  * @head:	the head for your list.
  *
  * This list-traversal primitive may safely run concurrently with
@@ -530,7 +530,7 @@ static inline void list_splice_init(struct list_head *list,
 
 /**
  * list_for_each_safe_rcu
- * @pos:	the &struct list_head to use as a loop counter.
+ * @pos:	the &struct list_head to use as a loop cursor.
  * @n:		another &struct list_head to use as temporary storage
  * @head:	the head for your list.
  *
@@ -547,7 +547,7 @@ static inline void list_splice_init(struct list_head *list,
 
 /**
  * list_for_each_entry_rcu	-	iterate over rcu list of given type
- * @pos:	the type * to use as a loop counter.
+ * @pos:	the type * to use as a loop cursor.
  * @head:	the head for your list.
  * @member:	the name of the list_struct within the struct.
  *
@@ -564,7 +564,7 @@ static inline void list_splice_init(struct list_head *list,
 
 /**
  * list_for_each_continue_rcu
- * @pos:	the &struct list_head to use as a loop counter.
+ * @pos:	the &struct list_head to use as a loop cursor.
  * @head:	the head for your list.
  *
  * Iterate over an rcu-protected list, continuing after current point.
@@ -814,8 +814,8 @@ static inline void hlist_add_after_rcu(struct hlist_node *prev,
 
 /**
  * hlist_for_each_entry	- iterate over list of given type
- * @tpos:	the type * to use as a loop counter.
- * @pos:	the &struct hlist_node to use as a loop counter.
+ * @tpos:	the type * to use as a loop cursor.
+ * @pos:	the &struct hlist_node to use as a loop cursor.
  * @head:	the head for your list.
  * @member:	the name of the hlist_node within the struct.
  */
@@ -827,8 +827,8 @@ static inline void hlist_add_after_rcu(struct hlist_node *prev,
 
 /**
  * hlist_for_each_entry_continue - iterate over a hlist continuing after current point
- * @tpos:	the type * to use as a loop counter.
- * @pos:	the &struct hlist_node to use as a loop counter.
+ * @tpos:	the type * to use as a loop cursor.
+ * @pos:	the &struct hlist_node to use as a loop cursor.
  * @member:	the name of the hlist_node within the struct.
  */
 #define hlist_for_each_entry_continue(tpos, pos, member)		 \
@@ -839,8 +839,8 @@ static inline void hlist_add_after_rcu(struct hlist_node *prev,
 
 /**
  * hlist_for_each_entry_from - iterate over a hlist continuing from current point
- * @tpos:	the type * to use as a loop counter.
- * @pos:	the &struct hlist_node to use as a loop counter.
+ * @tpos:	the type * to use as a loop cursor.
+ * @pos:	the &struct hlist_node to use as a loop cursor.
  * @member:	the name of the hlist_node within the struct.
  */
 #define hlist_for_each_entry_from(tpos, pos, member)			 \
@@ -850,8 +850,8 @@ static inline void hlist_add_after_rcu(struct hlist_node *prev,
 
 /**
  * hlist_for_each_entry_safe - iterate over list of given type safe against removal of list entry
- * @tpos:	the type * to use as a loop counter.
- * @pos:	the &struct hlist_node to use as a loop counter.
+ * @tpos:	the type * to use as a loop cursor.
+ * @pos:	the &struct hlist_node to use as a loop cursor.
  * @n:		another &struct hlist_node to use as temporary storage
  * @head:	the head for your list.
  * @member:	the name of the hlist_node within the struct.
@@ -864,8 +864,8 @@ static inline void hlist_add_after_rcu(struct hlist_node *prev,
 
 /**
  * hlist_for_each_entry_rcu - iterate over rcu list of given type
- * @tpos:	the type * to use as a loop counter.
- * @pos:	the &struct hlist_node to use as a loop counter.
+ * @tpos:	the type * to use as a loop cursor.
+ * @pos:	the &struct hlist_node to use as a loop cursor.
  * @head:	the head for your list.
  * @member:	the name of the hlist_node within the struct.
  *
-- 
cgit v1.2.3


From dbe217af3be08346f4b1abb885c2d9ec29c98fac Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@lxorguk.ukuu.org.uk>
Date: Sun, 25 Jun 2006 05:47:44 -0700
Subject: [PATCH] IDE CD end-of media error fix

This is a patch from Alan that fixes a real ide-cd.c regression causing
bogus "Media Check" failures for perfectly valid Fedora install ISOs, on
certain CD-ROM drives.

This is a forward port to 2.6.16 (from RHEL) of the minimal changes for the
end of media problem.  It may not be sufficient for some controllers
(promise notably) and it does not touch the locking so the error path
locking is as horked as in mainstream.

From: Ingo Molnar <mingo@elte.hu>

I have ported the patch to 2.6.17-rc4 and tested it by provoking
end-of-media IO errors with an unaligned ISO image.  Unlike the vanilla
kernel, the patched kernel interpreted the error condition correctly with
512 byte granularity:

 hdc: command error: status=0x51 { DriveReady SeekComplete Error }
 hdc: command error: error=0x54 { AbortedCommand LastFailedSense=0x05 }
 ide: failed opcode was: unknown
 ATAPI device hdc:
   Error: Illegal request -- (Sense key=0x05)
   Illegal mode for this track or incompatible medium -- (asc=0x64, ascq=0x00)
   The failed "Read 10" packet command was:
   "28 00 00 04 fb 78 00 00 06 00 00 00 00 00 00 00 "
 end_request: I/O error, dev hdc, sector 1306080
 Buffer I/O error on device hdc, logical block 163260
 Buffer I/O error on device hdc, logical block 163261
 Buffer I/O error on device hdc, logical block 163262

the unpatched kernel produces an incorrect error dump:

 hdc: command error: status=0x51 { DriveReady SeekComplete Error }
 hdc: command error: error=0x54 { AbortedCommand LastFailedSense=0x05 }
 ide: failed opcode was: unknown
 end_request: I/O error, dev hdc, sector 1306080
 Buffer I/O error on device hdc, logical block 163260
 hdc: command error: status=0x51 { DriveReady SeekComplete Error }
 hdc: command error: error=0x54 { AbortedCommand LastFailedSense=0x05 }
 ide: failed opcode was: unknown
 end_request: I/O error, dev hdc, sector 1306088
 Buffer I/O error on device hdc, logical block 163261
 hdc: command error: status=0x51 { DriveReady SeekComplete Error }
 hdc: command error: error=0x54 { AbortedCommand LastFailedSense=0x05 }
 ide: failed opcode was: unknown
 end_request: I/O error, dev hdc, sector 1306096
 Buffer I/O error on device hdc, logical block 163262

I do not have the right type of CD-ROM drive to reproduce the end-of-media
data corruption bug myself, but this same patch in RHEL solved it.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Cc: Bartlomiej Zolnierkiewicz <B.Zolnierkiewicz@elka.pw.edu.pl>
Cc: Jens Axboe <axboe@suse.de>
Cc: Matt Mackall <mpm@selenic.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/ide/ide-cd.c | 120 +++++++++++++++++++++++++++++++++++++++------------
 drivers/ide/ide-io.c |  57 ++++++++++++++++++++++++
 include/linux/ide.h  |   3 ++
 3 files changed, 152 insertions(+), 28 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index 6de3cd3d6e8e..99fa42402e71 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -395,7 +395,8 @@ static int cdrom_log_sense(ide_drive_t *drive, struct request *rq,
 			 * we cannot reliably check if drive can auto-close
 			 */
 			if (rq->cmd[0] == GPCMD_START_STOP_UNIT && sense->asc == 0x24)
-				log = 0;
+				break;
+			log = 1;
 			break;
 		case UNIT_ATTENTION:
 			/*
@@ -417,6 +418,11 @@ void cdrom_analyze_sense_data(ide_drive_t *drive,
 			      struct request *failed_command,
 			      struct request_sense *sense)
 {
+	unsigned long sector;
+	unsigned long bio_sectors;
+	unsigned long valid;
+	struct cdrom_info *info = drive->driver_data;
+
 	if (!cdrom_log_sense(drive, failed_command, sense))
 		return;
 
@@ -429,6 +435,37 @@ void cdrom_analyze_sense_data(ide_drive_t *drive,
 		if (sense->sense_key == 0x05 && sense->asc == 0x24)
 			return;
 
+ 	if (sense->error_code == 0x70) {	/* Current Error */
+ 		switch(sense->sense_key) {
+		case MEDIUM_ERROR:
+		case VOLUME_OVERFLOW:
+		case ILLEGAL_REQUEST:
+			if (!sense->valid)
+				break;
+			if (failed_command == NULL ||
+					!blk_fs_request(failed_command))
+				break;
+			sector = (sense->information[0] << 24) |
+				 (sense->information[1] << 16) |
+				 (sense->information[2] <<  8) |
+				 (sense->information[3]);
+
+			bio_sectors = bio_sectors(failed_command->bio);
+			if (bio_sectors < 4)
+				bio_sectors = 4;
+			if (drive->queue->hardsect_size == 2048)
+				sector <<= 2;	/* Device sector size is 2K */
+			sector &= ~(bio_sectors -1);
+			valid = (sector - failed_command->sector) << 9;
+
+			if (valid < 0)
+				valid = 0;
+			if (sector < get_capacity(info->disk) &&
+				drive->probed_capacity - sector < 4 * 75) {
+				set_capacity(info->disk, sector);
+			}
+ 		}
+ 	}
 #if VERBOSE_IDE_CD_ERRORS
 	{
 		int i;
@@ -609,17 +646,23 @@ static void cdrom_end_request (ide_drive_t *drive, int uptodate)
 				sense = failed->sense;
 				failed->sense_len = rq->sense_len;
 			}
-
+			cdrom_analyze_sense_data(drive, failed, sense);
 			/*
 			 * now end failed request
 			 */
-			spin_lock_irqsave(&ide_lock, flags);
-			end_that_request_chunk(failed, 0, failed->data_len);
-			end_that_request_last(failed, 0);
-			spin_unlock_irqrestore(&ide_lock, flags);
-		}
-
-		cdrom_analyze_sense_data(drive, failed, sense);
+			if (blk_fs_request(failed)) {
+				if (ide_end_dequeued_request(drive, failed, 0,
+						failed->hard_nr_sectors))
+					BUG();
+			} else {
+				spin_lock_irqsave(&ide_lock, flags);
+				end_that_request_chunk(failed, 0,
+							failed->data_len);
+				end_that_request_last(failed, 0);
+				spin_unlock_irqrestore(&ide_lock, flags);
+			}
+		} else
+			cdrom_analyze_sense_data(drive, NULL, sense);
 	}
 
 	if (!rq->current_nr_sectors && blk_fs_request(rq))
@@ -633,6 +676,13 @@ static void cdrom_end_request (ide_drive_t *drive, int uptodate)
 	ide_end_request(drive, uptodate, nsectors);
 }
 
+static void ide_dump_status_no_sense(ide_drive_t *drive, const char *msg, u8 stat)
+{
+	if (stat & 0x80)
+		return;
+	ide_dump_status(drive, msg, stat);
+}
+
 /* Returns 0 if the request should be continued.
    Returns 1 if the request was ended. */
 static int cdrom_decode_status(ide_drive_t *drive, int good_stat, int *stat_ret)
@@ -761,16 +811,16 @@ static int cdrom_decode_status(ide_drive_t *drive, int good_stat, int *stat_ret)
 			   sense_key == DATA_PROTECT) {
 			/* No point in retrying after an illegal
 			   request or data protect error.*/
-			ide_dump_status (drive, "command error", stat);
+			ide_dump_status_no_sense (drive, "command error", stat);
 			do_end_request = 1;
 		} else if (sense_key == MEDIUM_ERROR) {
 			/* No point in re-trying a zillion times on a bad 
 			 * sector...  If we got here the error is not correctable */
-			ide_dump_status (drive, "media error (bad sector)", stat);
+			ide_dump_status_no_sense (drive, "media error (bad sector)", stat);
 			do_end_request = 1;
 		} else if (sense_key == BLANK_CHECK) {
 			/* Disk appears blank ?? */
-			ide_dump_status (drive, "media error (blank)", stat);
+			ide_dump_status_no_sense (drive, "media error (blank)", stat);
 			do_end_request = 1;
 		} else if ((err & ~ABRT_ERR) != 0) {
 			/* Go to the default handler
@@ -782,13 +832,27 @@ static int cdrom_decode_status(ide_drive_t *drive, int good_stat, int *stat_ret)
 			do_end_request = 1;
 		}
 
-		if (do_end_request)
-			cdrom_end_request(drive, 0);
-
-		/* If we got a CHECK_CONDITION status,
-		   queue a request sense command. */
-		if ((stat & ERR_STAT) != 0)
-			cdrom_queue_request_sense(drive, NULL, NULL);
+		/* End a request through request sense analysis when we have
+		   sense data. We need this in order to perform end of media
+		   processing */
+
+		if (do_end_request) {
+			if (stat & ERR_STAT) {
+				unsigned long flags;
+				spin_lock_irqsave(&ide_lock, flags);
+				blkdev_dequeue_request(rq);
+				HWGROUP(drive)->rq = NULL;
+				spin_unlock_irqrestore(&ide_lock, flags);
+
+				cdrom_queue_request_sense(drive, rq->sense, rq);
+			} else
+				cdrom_end_request(drive, 0);
+		} else {
+			/* If we got a CHECK_CONDITION status,
+			   queue a request sense command. */
+			if (stat & ERR_STAT)
+				cdrom_queue_request_sense(drive, NULL, NULL);
+		}
 	} else {
 		blk_dump_rq_flags(rq, "ide-cd: bad rq");
 		cdrom_end_request(drive, 0);
@@ -1491,8 +1555,7 @@ static ide_startstop_t cdrom_do_packet_command (ide_drive_t *drive)
 }
 
 
-static
-int cdrom_queue_packet_command(ide_drive_t *drive, struct request *rq)
+static int cdrom_queue_packet_command(ide_drive_t *drive, struct request *rq)
 {
 	struct request_sense sense;
 	int retries = 10;
@@ -2220,6 +2283,9 @@ static int cdrom_read_toc(ide_drive_t *drive, struct request_sense *sense)
 		toc->capacity = 0x1fffff;
 
 	set_capacity(info->disk, toc->capacity * sectors_per_frame);
+	/* Save a private copy of te TOC capacity for error handling */
+	drive->probed_capacity = toc->capacity * sectors_per_frame;
+
 	blk_queue_hardsect_size(drive->queue,
 				sectors_per_frame << SECTOR_BITS);
 
@@ -2342,6 +2408,7 @@ static int cdrom_read_toc(ide_drive_t *drive, struct request_sense *sense)
 	if (!stat && (last_written > toc->capacity)) {
 		toc->capacity = last_written;
 		set_capacity(info->disk, toc->capacity * sectors_per_frame);
+		drive->probed_capacity = toc->capacity * sectors_per_frame;
 	}
 
 	/* Remember that we've read this stuff. */
@@ -2698,14 +2765,11 @@ int ide_cdrom_drive_status (struct cdrom_device_info *cdi, int slot_nr)
 	 * any other way to detect this...
 	 */
 	if (sense.sense_key == NOT_READY) {
-		if (sense.asc == 0x3a) {
-			if (sense.ascq == 1)
-				return CDS_NO_DISC;
-			else if (sense.ascq == 0 || sense.ascq == 2)
-				return CDS_TRAY_OPEN;
-		}
+		if (sense.asc == 0x3a && sense.ascq == 1)
+			return CDS_NO_DISC;
+		else
+			return CDS_TRAY_OPEN;
 	}
-
 	return CDS_DRIVE_NOT_READY;
 }
 
diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c
index 4f2f138de2ca..622a55c72f03 100644
--- a/drivers/ide/ide-io.c
+++ b/drivers/ide/ide-io.c
@@ -222,6 +222,63 @@ static ide_startstop_t ide_start_power_step(ide_drive_t *drive, struct request *
 	return ide_stopped;
 }
 
+/**
+ *	ide_end_dequeued_request	-	complete an IDE I/O
+ *	@drive: IDE device for the I/O
+ *	@uptodate:
+ *	@nr_sectors: number of sectors completed
+ *
+ *	Complete an I/O that is no longer on the request queue. This
+ *	typically occurs when we pull the request and issue a REQUEST_SENSE.
+ *	We must still finish the old request but we must not tamper with the
+ *	queue in the meantime.
+ *
+ *	NOTE: This path does not handle barrier, but barrier is not supported
+ *	on ide-cd anyway.
+ */
+
+int ide_end_dequeued_request(ide_drive_t *drive, struct request *rq,
+			     int uptodate, int nr_sectors)
+{
+	unsigned long flags;
+	int ret = 1;
+
+	spin_lock_irqsave(&ide_lock, flags);
+
+	BUG_ON(!(rq->flags & REQ_STARTED));
+
+	/*
+	 * if failfast is set on a request, override number of sectors and
+	 * complete the whole request right now
+	 */
+	if (blk_noretry_request(rq) && end_io_error(uptodate))
+		nr_sectors = rq->hard_nr_sectors;
+
+	if (!blk_fs_request(rq) && end_io_error(uptodate) && !rq->errors)
+		rq->errors = -EIO;
+
+	/*
+	 * decide whether to reenable DMA -- 3 is a random magic for now,
+	 * if we DMA timeout more than 3 times, just stay in PIO
+	 */
+	if (drive->state == DMA_PIO_RETRY && drive->retry_pio <= 3) {
+		drive->state = 0;
+		HWGROUP(drive)->hwif->ide_dma_on(drive);
+	}
+
+	if (!end_that_request_first(rq, uptodate, nr_sectors)) {
+		add_disk_randomness(rq->rq_disk);
+		if (blk_rq_tagged(rq))
+			blk_queue_end_tag(drive->queue, rq);
+		end_that_request_last(rq, uptodate);
+		ret = 0;
+	}
+	spin_unlock_irqrestore(&ide_lock, flags);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(ide_end_dequeued_request);
+
+
 /**
  *	ide_complete_pm_request - end the current Power Management request
  *	@drive: target drive
diff --git a/include/linux/ide.h b/include/linux/ide.h
index 77e66d055f5b..ef7bef207f48 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -630,6 +630,7 @@ typedef struct ide_drive_s {
 	unsigned int	usage;		/* current "open()" count for drive */
 	unsigned int	failures;	/* current failure count */
 	unsigned int	max_failures;	/* maximum allowed failure count */
+	u64		probed_capacity;/* initial reported media capacity (ide-cd only currently) */
 
 	u64		capacity64;	/* total number of sectors */
 
@@ -1005,6 +1006,8 @@ extern	ide_hwif_t	ide_hwifs[];		/* master data repository */
 extern int noautodma;
 
 extern int ide_end_request (ide_drive_t *drive, int uptodate, int nrsecs);
+int ide_end_dequeued_request(ide_drive_t *drive, struct request *rq,
+			     int uptodate, int nr_sectors);
 
 /*
  * This is used on exit from the driver to designate the next irq handler
-- 
cgit v1.2.3


From ad4063b0b2ffd7c8359b62c830e88152fc39ab20 Mon Sep 17 00:00:00 2001
From: Ben Dooks <ben-linux@fluff.org>
Date: Sun, 25 Jun 2006 05:48:03 -0700
Subject: [PATCH] AX88796 parallel port driver

Driver for the simple parallel port interface on the Asix AX88796 chip on
an platform_bus.

[akpm@osdl.org: x86_64 build fix]
Signed-off-by: Ben Dooks <ben-linux@fluff.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/parport/Kconfig           |  12 ++
 drivers/parport/Makefile          |   3 +-
 drivers/parport/parport_ax88796.c | 443 ++++++++++++++++++++++++++++++++++++++
 include/linux/parport.h           |   5 +
 4 files changed, 462 insertions(+), 1 deletion(-)
 create mode 100644 drivers/parport/parport_ax88796.c

(limited to 'include/linux')

diff --git a/drivers/parport/Kconfig b/drivers/parport/Kconfig
index 4d8dc27ea9d1..c7fa28a28b9f 100644
--- a/drivers/parport/Kconfig
+++ b/drivers/parport/Kconfig
@@ -136,6 +136,18 @@ config PARPORT_SUNBPP
 	  found on many Sun machines. Note that many of the newer Ultras
 	  actually have pc style hardware instead.
 
+config PARPORT_AX88796
+	tristate "AX88796 Parallel Port"
+	depends on PARPORT
+	select PARPORT_NOT_PC
+	help
+	  Say Y here if you need support for the parallel port hardware on
+	  the AX88796 network controller chip. This code is also available
+	  as a module (say M), called parport_ax88796.
+
+	  The driver is not dependant on the AX88796 network driver, and
+	  should not interfere with the networking functions of the chip.
+
 config PARPORT_1284
 	bool "IEEE 1284 transfer modes"
 	depends on PARPORT
diff --git a/drivers/parport/Makefile b/drivers/parport/Makefile
index a19de35f8de2..696b8d4ca887 100644
--- a/drivers/parport/Makefile
+++ b/drivers/parport/Makefile
@@ -17,4 +17,5 @@ obj-$(CONFIG_PARPORT_MFC3)	+= parport_mfc3.o
 obj-$(CONFIG_PARPORT_ATARI)	+= parport_atari.o
 obj-$(CONFIG_PARPORT_SUNBPP)	+= parport_sunbpp.o
 obj-$(CONFIG_PARPORT_GSC)	+= parport_gsc.o
-obj-$(CONFIG_PARPORT_IP32)	+= parport_ip32.o
+obj-$(CONFIG_PARPORT_AX88796)	+= parport_ax88796.o
+obj-$(CONFIG_PARPORT_IP32)	+= parport_ip32.o
\ No newline at end of file
diff --git a/drivers/parport/parport_ax88796.c b/drivers/parport/parport_ax88796.c
new file mode 100644
index 000000000000..4baa719439a2
--- /dev/null
+++ b/drivers/parport/parport_ax88796.c
@@ -0,0 +1,443 @@
+/* linux/drivers/parport/parport_ax88796.c
+ *
+ * (c) 2005,2006 Simtec Electronics
+ *	Ben Dooks <ben@simtec.co.uk>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+*/
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/parport.h>
+#include <linux/interrupt.h>
+#include <linux/errno.h>
+#include <linux/platform_device.h>
+
+#include <asm/io.h>
+#include <asm/irq.h>
+
+#define AX_SPR_BUSY		(1<<7)
+#define AX_SPR_ACK		(1<<6)
+#define AX_SPR_PE		(1<<5)
+#define AX_SPR_SLCT		(1<<4)
+#define AX_SPR_ERR		(1<<3)
+
+#define AX_CPR_nDOE		(1<<5)
+#define AX_CPR_SLCTIN		(1<<3)
+#define AX_CPR_nINIT		(1<<2)
+#define AX_CPR_ATFD		(1<<1)
+#define AX_CPR_STRB		(1<<0)
+
+struct ax_drvdata {
+	struct parport		*parport;
+	struct parport_state	 suspend;
+
+	struct device		*dev;
+	struct resource		*io;
+
+	unsigned char		 irq_enabled;
+
+	void __iomem		*base;
+	void __iomem		*spp_data;
+	void __iomem		*spp_spr;
+	void __iomem		*spp_cpr;
+};
+
+static inline struct ax_drvdata *pp_to_drv(struct parport *p)
+{
+	return p->private_data;
+}
+
+static unsigned char
+parport_ax88796_read_data(struct parport *p)
+{
+	struct ax_drvdata *dd = pp_to_drv(p);
+
+	return readb(dd->spp_data);
+}
+
+static void
+parport_ax88796_write_data(struct parport *p, unsigned char data)
+{
+	struct ax_drvdata *dd = pp_to_drv(p);
+
+	writeb(data, dd->spp_data);
+}
+
+static unsigned char
+parport_ax88796_read_control(struct parport *p)
+{
+	struct ax_drvdata *dd = pp_to_drv(p);
+	unsigned int cpr = readb(dd->spp_cpr);
+	unsigned int ret = 0;
+
+	if (!(cpr & AX_CPR_STRB))
+		ret |= PARPORT_CONTROL_STROBE;
+
+	if (!(cpr & AX_CPR_ATFD))
+		ret |= PARPORT_CONTROL_AUTOFD;
+
+	if (cpr & AX_CPR_nINIT)
+		ret |= PARPORT_CONTROL_INIT;
+
+	if (!(cpr & AX_CPR_SLCTIN))
+		ret |= PARPORT_CONTROL_SELECT;
+
+	return ret;
+}
+
+static void
+parport_ax88796_write_control(struct parport *p, unsigned char control)
+{
+	struct ax_drvdata *dd = pp_to_drv(p);
+	unsigned int cpr = readb(dd->spp_cpr);
+
+	cpr &= AX_CPR_nDOE;
+
+	if (!(control & PARPORT_CONTROL_STROBE))
+		cpr |= AX_CPR_STRB;
+
+	if (!(control & PARPORT_CONTROL_AUTOFD))
+		cpr |= AX_CPR_ATFD;
+
+	if (control & PARPORT_CONTROL_INIT)
+		cpr |= AX_CPR_nINIT;
+
+	if (!(control & PARPORT_CONTROL_SELECT))
+		cpr |= AX_CPR_SLCTIN;
+
+	dev_dbg(dd->dev, "write_control: ctrl=%02x, cpr=%02x\n", control, cpr);
+	writeb(cpr, dd->spp_cpr);
+
+	if (parport_ax88796_read_control(p) != control) {
+		dev_err(dd->dev, "write_control: read != set (%02x, %02x)\n",
+			parport_ax88796_read_control(p), control);
+	}
+}
+
+static unsigned char
+parport_ax88796_read_status(struct parport *p)
+{
+	struct ax_drvdata *dd = pp_to_drv(p);
+	unsigned int status = readb(dd->spp_spr);
+	unsigned int ret = 0;
+
+	if (status & AX_SPR_BUSY)
+		ret |= PARPORT_STATUS_BUSY;
+
+	if (status & AX_SPR_ACK)
+		ret |= PARPORT_STATUS_ACK;
+
+	if (status & AX_SPR_ERR)
+		ret |= PARPORT_STATUS_ERROR;
+
+	if (status & AX_SPR_SLCT)
+		ret |= PARPORT_STATUS_SELECT;
+
+	if (status & AX_SPR_PE)
+		ret |= PARPORT_STATUS_PAPEROUT;
+
+	return ret;
+}
+
+static unsigned char
+parport_ax88796_frob_control(struct parport *p, unsigned char mask,
+			     unsigned char val)
+{
+	struct ax_drvdata *dd = pp_to_drv(p);
+	unsigned char old = parport_ax88796_read_control(p);
+
+	dev_dbg(dd->dev, "frob: mask=%02x, val=%02x, old=%02x\n",
+		mask, val, old);
+
+	parport_ax88796_write_control(p, (old & ~mask) | val);
+	return old;
+}
+
+static void
+parport_ax88796_enable_irq(struct parport *p)
+{
+	struct ax_drvdata *dd = pp_to_drv(p);
+	unsigned long flags;
+
+	local_irq_save(flags);
+	if (!dd->irq_enabled) {
+		enable_irq(p->irq);
+		dd->irq_enabled = 1;
+	}
+	local_irq_restore(flags);
+}
+
+static void
+parport_ax88796_disable_irq(struct parport *p)
+{
+	struct ax_drvdata *dd = pp_to_drv(p);
+	unsigned long flags;
+
+	local_irq_save(flags);
+	if (dd->irq_enabled) {
+		disable_irq(p->irq);
+		dd->irq_enabled = 0;
+	}
+	local_irq_restore(flags);
+}
+
+static void
+parport_ax88796_data_forward(struct parport *p)
+{
+	struct ax_drvdata *dd = pp_to_drv(p);
+	void __iomem *cpr = dd->spp_cpr;
+
+	writeb((readb(cpr) & ~AX_CPR_nDOE), cpr);
+}
+
+static void
+parport_ax88796_data_reverse(struct parport *p)
+{
+	struct ax_drvdata *dd = pp_to_drv(p);
+	void __iomem *cpr = dd->spp_cpr;
+
+	writeb(readb(cpr) | AX_CPR_nDOE, cpr);
+}
+
+static void
+parport_ax88796_init_state(struct pardevice *d, struct parport_state *s)
+{
+	struct ax_drvdata *dd = pp_to_drv(d->port);
+
+	memset(s, 0, sizeof(struct parport_state));
+
+	dev_dbg(dd->dev, "init_state: %p: state=%p\n", d, s);
+	s->u.ax88796.cpr = readb(dd->spp_cpr);
+}
+
+static void
+parport_ax88796_save_state(struct parport *p, struct parport_state *s)
+{
+	struct ax_drvdata *dd = pp_to_drv(p);
+
+	dev_dbg(dd->dev, "save_state: %p: state=%p\n", p, s);
+	s->u.ax88796.cpr = readb(dd->spp_cpr);
+}
+
+static void
+parport_ax88796_restore_state(struct parport *p, struct parport_state *s)
+{
+	struct ax_drvdata *dd = pp_to_drv(p);
+
+	dev_dbg(dd->dev, "restore_state: %p: state=%p\n", p, s);
+	writeb(s->u.ax88796.cpr, dd->spp_cpr);
+}
+
+static irqreturn_t
+parport_ax88796_interrupt(int irq, void *dev_id, struct pt_regs *regs)
+{
+        parport_generic_irq(irq, dev_id, regs);
+        return IRQ_HANDLED;
+}
+
+
+static struct parport_operations parport_ax88796_ops = {
+	.write_data	= parport_ax88796_write_data,
+	.read_data	= parport_ax88796_read_data,
+
+	.write_control	= parport_ax88796_write_control,
+	.read_control	= parport_ax88796_read_control,
+	.frob_control	= parport_ax88796_frob_control,
+
+	.read_status	= parport_ax88796_read_status,
+
+	.enable_irq	= parport_ax88796_enable_irq,
+	.disable_irq	= parport_ax88796_disable_irq,
+
+	.data_forward	= parport_ax88796_data_forward,
+	.data_reverse	= parport_ax88796_data_reverse,
+
+	.init_state	= parport_ax88796_init_state,
+	.save_state	= parport_ax88796_save_state,
+	.restore_state	= parport_ax88796_restore_state,
+
+	.epp_write_data	= parport_ieee1284_epp_write_data,
+	.epp_read_data	= parport_ieee1284_epp_read_data,
+	.epp_write_addr	= parport_ieee1284_epp_write_addr,
+	.epp_read_addr	= parport_ieee1284_epp_read_addr,
+
+	.ecp_write_data	= parport_ieee1284_ecp_write_data,
+	.ecp_read_data	= parport_ieee1284_ecp_read_data,
+	.ecp_write_addr	= parport_ieee1284_ecp_write_addr,
+
+	.compat_write_data	= parport_ieee1284_write_compat,
+	.nibble_read_data	= parport_ieee1284_read_nibble,
+	.byte_read_data		= parport_ieee1284_read_byte,
+
+	.owner		= THIS_MODULE,
+};
+
+static int parport_ax88796_probe(struct platform_device *pdev)
+{
+	struct device *_dev = &pdev->dev;
+	struct ax_drvdata *dd;
+	struct parport *pp = NULL;
+	struct resource *res;
+	unsigned long size;
+	int spacing;
+	int irq;
+	int ret;
+
+	dd = kzalloc(sizeof(struct ax_drvdata), GFP_KERNEL);
+	if (dd == NULL) {
+		dev_err(_dev, "no memory for private data\n");
+		return -ENOMEM;
+	}
+
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	if (res == NULL) {
+		dev_err(_dev, "no MEM specified\n");
+		ret = -ENXIO;
+		goto exit_mem;
+	}
+
+	size = (res->end - res->start) + 1;
+	spacing = size / 3;
+
+	dd->io = request_mem_region(res->start, size, pdev->name);
+	if (dd->io == NULL) {
+		dev_err(_dev, "cannot reserve memory\n");
+		ret = -ENXIO;
+		goto exit_mem;
+	}
+
+	dd->base = ioremap(res->start, size);
+	if (dd->base == NULL) {
+		dev_err(_dev, "cannot ioremap region\n");
+		ret = -ENXIO;
+		goto exit_res;
+	}
+
+	irq = platform_get_irq(pdev, 0);
+	if (irq <= 0)
+		irq = PARPORT_IRQ_NONE;
+
+	pp = parport_register_port((unsigned long)dd->base, irq,
+				   PARPORT_DMA_NONE,
+				   &parport_ax88796_ops);
+
+	if (pp == NULL) {
+		dev_err(_dev, "failed to register parallel port\n");
+		ret = -ENOMEM;
+		goto exit_unmap;
+	}
+
+	pp->private_data = dd;
+	dd->parport = pp;
+	dd->dev = _dev;
+
+	dd->spp_data = dd->base;
+	dd->spp_spr  = dd->base + (spacing * 1);
+	dd->spp_cpr  = dd->base + (spacing * 2);
+
+	/* initialise the port controls */
+	writeb(AX_CPR_STRB, dd->spp_cpr);
+
+	if (irq >= 0) {
+		/* request irq */
+		ret = request_irq(irq, parport_ax88796_interrupt,
+				  SA_TRIGGER_FALLING, pdev->name, pp);
+
+		if (ret < 0)
+			goto exit_port;
+
+		dd->irq_enabled = 1;
+	}
+
+	platform_set_drvdata(pdev, pp);
+
+	dev_info(_dev, "attached parallel port driver\n");
+	parport_announce_port(pp);
+
+	return 0;
+
+ exit_port:
+	parport_remove_port(pp);
+ exit_unmap:
+	iounmap(dd->base);
+ exit_res:
+	release_resource(dd->io);
+	kfree(dd->io);
+ exit_mem:
+	kfree(dd);
+	return ret;
+}
+
+static int parport_ax88796_remove(struct platform_device *pdev)
+{
+	struct parport *p = platform_get_drvdata(pdev);
+	struct ax_drvdata *dd = pp_to_drv(p);
+
+	free_irq(p->irq, p);
+	parport_remove_port(p);
+	iounmap(dd->base);
+	release_resource(dd->io);
+	kfree(dd->io);
+	kfree(dd);
+
+	return 0;
+}
+
+#ifdef CONFIG_PM
+
+static int parport_ax88796_suspend(struct platform_device *dev,
+				   pm_message_t state)
+{
+	struct parport *p = platform_get_drvdata(dev);
+	struct ax_drvdata *dd = pp_to_drv(p);
+
+	parport_ax88796_save_state(p, &dd->suspend);
+	writeb(AX_CPR_nDOE | AX_CPR_STRB, dd->spp_cpr);
+	return 0;
+}
+
+static int parport_ax88796_resume(struct platform_device *dev)
+{
+	struct parport *p = platform_get_drvdata(dev);
+	struct ax_drvdata *dd = pp_to_drv(p);
+
+	parport_ax88796_restore_state(p, &dd->suspend);
+	return 0;
+}
+
+#else
+#define parport_ax88796_suspend NULL
+#define parport_ax88796_resume  NULL
+#endif
+
+static struct platform_driver axdrv = {
+	.driver		= {
+		.name	= "ax88796-pp",
+		.owner	= THIS_MODULE,
+	},
+	.probe		= parport_ax88796_probe,
+	.remove		= parport_ax88796_remove,
+	.suspend	= parport_ax88796_suspend,
+	.resume		= parport_ax88796_resume,
+};
+
+static int __init parport_ax88796_init(void)
+{
+	return platform_driver_register(&axdrv);
+}
+
+static void __exit parport_ax88796_exit(void)
+{
+	platform_driver_unregister(&axdrv);
+}
+
+module_init(parport_ax88796_init)
+module_exit(parport_ax88796_exit)
+
+MODULE_AUTHOR("Ben Dooks <ben@simtec.co.uk>");
+MODULE_DESCRIPTION("AX88796 Parport parallel port driver");
+MODULE_LICENSE("GPL");
diff --git a/include/linux/parport.h b/include/linux/parport.h
index d42737eeee06..5bf321e82c99 100644
--- a/include/linux/parport.h
+++ b/include/linux/parport.h
@@ -127,6 +127,10 @@ struct amiga_parport_state {
        unsigned char statusdir;/* ciab.ddrb & 7 */
 };
 
+struct ax88796_parport_state {
+	unsigned char cpr;
+};
+
 struct ip32_parport_state {
 	unsigned int dcr;
 	unsigned int ecr;
@@ -138,6 +142,7 @@ struct parport_state {
 		/* ARC has no state. */
 		struct ax_parport_state ax;
 		struct amiga_parport_state amiga;
+		struct ax88796_parport_state ax88796;
 		/* Atari has not state. */
 		struct ip32_parport_state ip32;
 		void *misc; 
-- 
cgit v1.2.3


From 1c2bf374a4b8c2e1a3e6ff3a64fb67272a8cd2e2 Mon Sep 17 00:00:00 2001
From: Mingming Cao <cmm@us.ibm.com>
Date: Sun, 25 Jun 2006 05:48:06 -0700
Subject: [PATCH] ext3_fsblk_t: filesystem, group blocks and bug fixes

Some of the in-kernel ext3 block variable type are treated as signed 4 bytes
int type, thus limited ext3 filesystem to 8TB (4kblock size based).  While
trying to fix them, it seems quite confusing in the ext3 code where some
blocks are filesystem-wide blocks, some are group relative offsets that need
to be signed value (as -1 has special meaning).  So it seem saner to define
two types of physical blocks: one is filesystem wide blocks, another is
group-relative blocks.  The following patches clarify these two types of
blocks in the ext3 code, and fix the type bugs which limit current 32 bit ext3
filesystem limit to 8TB.

With this series of patches and the percpu counter data type changes in the mm
tree, we are able to extend exts filesystem limit to 16TB.

This work is also a pre-request for the recent >32 bit ext3 work, and makes
the kernel to able to address 48 bit ext3 block a lot easier: Simply redefine
ext3_fsblk_t from unsigned long to sector_t and redefine the format string for
ext3 filesystem block corresponding.

Two RFC with a series patches have been posted to ext2-devel list and have
been reviewed and discussed:
http://marc.theaimsgroup.com/?l=ext2-devel&m=114722190816690&w=2

http://marc.theaimsgroup.com/?l=ext2-devel&m=114784919525942&w=2

Patches are tested on both 32 bit machine and 64 bit machine, <8TB ext3 and
>8TB ext3 filesystem(with the latest to be released e2fsprogs-1.39).  Tests
includes overnight fsx, tiobench, dbench and fsstress.

This patch:

Defines ext3_fsblk_t and ext3_grpblk_t, and the printk format string for
filesystem wide blocks.

This patch classifies all block group relative blocks, and ext3_fsblk_t blocks
occurs in the same function where used to be confusing before.  Also include
kernel bug fixes for filesystem wide in-kernel block variables.  There are
some fileystem wide blocks are treated as int/unsigned int type in the kernel
currently, especially in ext3 block allocation and reservation code.  This
patch fixed those bugs by converting those variables to ext3_fsblk_t(unsigned
long) type.

Signed-off-by: Mingming Cao <cmm@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/ext3/balloc.c          | 215 ++++++++++++++++++++++++----------------------
 fs/ext3/ialloc.c          |  10 ++-
 fs/ext3/inode.c           |   2 +-
 fs/ext3/resize.c          |  43 ++++++----
 fs/ext3/super.c           |   2 +-
 fs/ext3/xattr.c           |  27 +++---
 include/linux/ext3_fs.h   |  19 ++--
 include/linux/ext3_fs_i.h |   8 ++
 8 files changed, 177 insertions(+), 149 deletions(-)

(limited to 'include/linux')

diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index 77927d6938f6..b1633cd28eca 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -163,10 +163,10 @@ restart:
 #endif
 
 static int
-goal_in_my_reservation(struct ext3_reserve_window *rsv, int goal,
+goal_in_my_reservation(struct ext3_reserve_window *rsv, ext3_grpblk_t grp_goal,
 			unsigned int group, struct super_block * sb)
 {
-	unsigned long group_first_block, group_last_block;
+	ext3_fsblk_t group_first_block, group_last_block;
 
 	group_first_block = le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block) +
 				group * EXT3_BLOCKS_PER_GROUP(sb);
@@ -175,8 +175,8 @@ goal_in_my_reservation(struct ext3_reserve_window *rsv, int goal,
 	if ((rsv->_rsv_start > group_last_block) ||
 	    (rsv->_rsv_end < group_first_block))
 		return 0;
-	if ((goal >= 0) && ((goal + group_first_block < rsv->_rsv_start)
-		|| (goal + group_first_block > rsv->_rsv_end)))
+	if ((grp_goal >= 0) && ((grp_goal + group_first_block < rsv->_rsv_start)
+		|| (grp_goal + group_first_block > rsv->_rsv_end)))
 		return 0;
 	return 1;
 }
@@ -187,7 +187,7 @@ goal_in_my_reservation(struct ext3_reserve_window *rsv, int goal,
  * Returns NULL if there are no windows or if all windows start after the goal.
  */
 static struct ext3_reserve_window_node *
-search_reserve_window(struct rb_root *root, unsigned long goal)
+search_reserve_window(struct rb_root *root, ext3_fsblk_t goal)
 {
 	struct rb_node *n = root->rb_node;
 	struct ext3_reserve_window_node *rsv;
@@ -223,7 +223,7 @@ void ext3_rsv_window_add(struct super_block *sb,
 {
 	struct rb_root *root = &EXT3_SB(sb)->s_rsv_window_root;
 	struct rb_node *node = &rsv->rsv_node;
-	unsigned int start = rsv->rsv_start;
+	ext3_fsblk_t start = rsv->rsv_start;
 
 	struct rb_node ** p = &root->rb_node;
 	struct rb_node * parent = NULL;
@@ -310,20 +310,20 @@ void ext3_discard_reservation(struct inode *inode)
 
 /* Free given blocks, update quota and i_blocks field */
 void ext3_free_blocks_sb(handle_t *handle, struct super_block *sb,
-			 unsigned long block, unsigned long count,
-			 int *pdquot_freed_blocks)
+			 ext3_fsblk_t block, unsigned long count,
+			 unsigned long *pdquot_freed_blocks)
 {
 	struct buffer_head *bitmap_bh = NULL;
 	struct buffer_head *gd_bh;
 	unsigned long block_group;
-	unsigned long bit;
+	ext3_grpblk_t bit;
 	unsigned long i;
 	unsigned long overflow;
 	struct ext3_group_desc * desc;
 	struct ext3_super_block * es;
 	struct ext3_sb_info *sbi;
 	int err = 0, ret;
-	unsigned group_freed;
+	ext3_grpblk_t group_freed;
 
 	*pdquot_freed_blocks = 0;
 	sbi = EXT3_SB(sb);
@@ -333,7 +333,7 @@ void ext3_free_blocks_sb(handle_t *handle, struct super_block *sb,
 	    block + count > le32_to_cpu(es->s_blocks_count)) {
 		ext3_error (sb, "ext3_free_blocks",
 			    "Freeing blocks not in datazone - "
-			    "block = %lu, count = %lu", block, count);
+			    "block = "E3FSBLK", count = %lu", block, count);
 		goto error_return;
 	}
 
@@ -369,7 +369,7 @@ do_more:
 		      sbi->s_itb_per_group))
 		ext3_error (sb, "ext3_free_blocks",
 			    "Freeing blocks in system zones - "
-			    "Block = %lu, count = %lu",
+			    "Block = "E3FSBLK", count = %lu",
 			    block, count);
 
 	/*
@@ -453,7 +453,8 @@ do_more:
 						bit + i, bitmap_bh->b_data)) {
 			jbd_unlock_bh_state(bitmap_bh);
 			ext3_error(sb, __FUNCTION__,
-				"bit already cleared for block %lu", block + i);
+				"bit already cleared for block "E3FSBLK,
+				 block + i);
 			jbd_lock_bh_state(bitmap_bh);
 			BUFFER_TRACE(bitmap_bh, "bit already cleared");
 		} else {
@@ -493,10 +494,10 @@ error_return:
 
 /* Free given blocks, update quota and i_blocks field */
 void ext3_free_blocks(handle_t *handle, struct inode *inode,
-			unsigned long block, unsigned long count)
+			ext3_fsblk_t block, unsigned long count)
 {
 	struct super_block * sb;
-	int dquot_freed_blocks;
+	unsigned long dquot_freed_blocks;
 
 	sb = inode->i_sb;
 	if (!sb) {
@@ -525,7 +526,7 @@ void ext3_free_blocks(handle_t *handle, struct inode *inode,
  * data-writes at some point, and disable it for metadata allocations or
  * sync-data inodes.
  */
-static int ext3_test_allocatable(int nr, struct buffer_head *bh)
+static int ext3_test_allocatable(ext3_grpblk_t nr, struct buffer_head *bh)
 {
 	int ret;
 	struct journal_head *jh = bh2jh(bh);
@@ -542,11 +543,11 @@ static int ext3_test_allocatable(int nr, struct buffer_head *bh)
 	return ret;
 }
 
-static int
-bitmap_search_next_usable_block(int start, struct buffer_head *bh,
-					int maxblocks)
+static ext3_grpblk_t
+bitmap_search_next_usable_block(ext3_grpblk_t start, struct buffer_head *bh,
+					ext3_grpblk_t maxblocks)
 {
-	int next;
+	ext3_grpblk_t next;
 	struct journal_head *jh = bh2jh(bh);
 
 	/*
@@ -576,10 +577,11 @@ bitmap_search_next_usable_block(int start, struct buffer_head *bh,
  * the initial goal; then for a free byte somewhere in the bitmap; then
  * for any free bit in the bitmap.
  */
-static int
-find_next_usable_block(int start, struct buffer_head *bh, int maxblocks)
+static ext3_grpblk_t
+find_next_usable_block(ext3_grpblk_t start, struct buffer_head *bh,
+			ext3_grpblk_t maxblocks)
 {
-	int here, next;
+	ext3_grpblk_t here, next;
 	char *p, *r;
 
 	if (start > 0) {
@@ -591,7 +593,7 @@ find_next_usable_block(int start, struct buffer_head *bh, int maxblocks)
 		 * less than EXT3_BLOCKS_PER_GROUP. Aligning up to the
 		 * next 64-bit boundary is simple..
 		 */
-		int end_goal = (start + 63) & ~63;
+		ext3_grpblk_t end_goal = (start + 63) & ~63;
 		if (end_goal > maxblocks)
 			end_goal = maxblocks;
 		here = ext3_find_next_zero_bit(bh->b_data, end_goal, start);
@@ -628,7 +630,7 @@ find_next_usable_block(int start, struct buffer_head *bh, int maxblocks)
  * zero (failure).
  */
 static inline int
-claim_block(spinlock_t *lock, int block, struct buffer_head *bh)
+claim_block(spinlock_t *lock, ext3_grpblk_t block, struct buffer_head *bh)
 {
 	struct journal_head *jh = bh2jh(bh);
 	int ret;
@@ -651,12 +653,13 @@ claim_block(spinlock_t *lock, int block, struct buffer_head *bh)
  * new bitmap.  In that case we must release write access to the old one via
  * ext3_journal_release_buffer(), else we'll run out of credits.
  */
-static int
+static ext3_grpblk_t
 ext3_try_to_allocate(struct super_block *sb, handle_t *handle, int group,
-			struct buffer_head *bitmap_bh, int goal,
+			struct buffer_head *bitmap_bh, ext3_grpblk_t grp_goal,
 			unsigned long *count, struct ext3_reserve_window *my_rsv)
 {
-	int group_first_block, start, end;
+	ext3_fsblk_t group_first_block;
+	ext3_grpblk_t start, end;
 	unsigned long num = 0;
 
 	/* we do allocation within the reservation window if we have a window */
@@ -673,13 +676,13 @@ ext3_try_to_allocate(struct super_block *sb, handle_t *handle, int group,
 		if (end > EXT3_BLOCKS_PER_GROUP(sb))
 			/* reservation window crosses group boundary */
 			end = EXT3_BLOCKS_PER_GROUP(sb);
-		if ((start <= goal) && (goal < end))
-			start = goal;
+		if ((start <= grp_goal) && (grp_goal < end))
+			start = grp_goal;
 		else
-			goal = -1;
+			grp_goal = -1;
 	} else {
-		if (goal > 0)
-			start = goal;
+		if (grp_goal > 0)
+			start = grp_goal;
 		else
 			start = 0;
 		end = EXT3_BLOCKS_PER_GROUP(sb);
@@ -688,43 +691,43 @@ ext3_try_to_allocate(struct super_block *sb, handle_t *handle, int group,
 	BUG_ON(start > EXT3_BLOCKS_PER_GROUP(sb));
 
 repeat:
-	if (goal < 0 || !ext3_test_allocatable(goal, bitmap_bh)) {
-		goal = find_next_usable_block(start, bitmap_bh, end);
-		if (goal < 0)
+	if (grp_goal < 0 || !ext3_test_allocatable(grp_goal, bitmap_bh)) {
+		grp_goal = find_next_usable_block(start, bitmap_bh, end);
+		if (grp_goal < 0)
 			goto fail_access;
 		if (!my_rsv) {
 			int i;
 
-			for (i = 0; i < 7 && goal > start &&
-					ext3_test_allocatable(goal - 1,
+			for (i = 0; i < 7 && grp_goal > start &&
+					ext3_test_allocatable(grp_goal - 1,
 								bitmap_bh);
-					i++, goal--)
+					i++, grp_goal--)
 				;
 		}
 	}
-	start = goal;
+	start = grp_goal;
 
-	if (!claim_block(sb_bgl_lock(EXT3_SB(sb), group), goal, bitmap_bh)) {
+	if (!claim_block(sb_bgl_lock(EXT3_SB(sb), group), grp_goal, bitmap_bh)) {
 		/*
 		 * The block was allocated by another thread, or it was
 		 * allocated and then freed by another thread
 		 */
 		start++;
-		goal++;
+		grp_goal++;
 		if (start >= end)
 			goto fail_access;
 		goto repeat;
 	}
 	num++;
-	goal++;
-	while (num < *count && goal < end
-		&& ext3_test_allocatable(goal, bitmap_bh)
-		&& claim_block(sb_bgl_lock(EXT3_SB(sb), group), goal, bitmap_bh)) {
+	grp_goal++;
+	while (num < *count && grp_goal < end
+		&& ext3_test_allocatable(grp_goal, bitmap_bh)
+		&& claim_block(sb_bgl_lock(EXT3_SB(sb), group), grp_goal, bitmap_bh)) {
 		num++;
-		goal++;
+		grp_goal++;
 	}
 	*count = num;
-	return goal - num;
+	return grp_goal - num;
 fail_access:
 	*count = num;
 	return -1;
@@ -766,12 +769,13 @@ fail_access:
 static int find_next_reservable_window(
 				struct ext3_reserve_window_node *search_head,
 				struct ext3_reserve_window_node *my_rsv,
-				struct super_block * sb, int start_block,
-				int last_block)
+				struct super_block * sb,
+				ext3_fsblk_t start_block,
+				ext3_fsblk_t last_block)
 {
 	struct rb_node *next;
 	struct ext3_reserve_window_node *rsv, *prev;
-	int cur;
+	ext3_fsblk_t cur;
 	int size = my_rsv->rsv_goal_size;
 
 	/* TODO: make the start of the reservation window byte-aligned */
@@ -873,10 +877,10 @@ static int find_next_reservable_window(
  *
  *	@rsv: the reservation
  *
- *	@goal: The goal (group-relative).  It is where the search for a
+ *	@grp_goal: The goal (group-relative).  It is where the search for a
  *		free reservable space should start from.
- *		if we have a goal(goal >0 ), then start from there,
- *		no goal(goal = -1), we start from the first block
+ *		if we have a grp_goal(grp_goal >0 ), then start from there,
+ *		no grp_goal(grp_goal = -1), we start from the first block
  *		of the group.
  *
  *	@sb: the super block
@@ -885,12 +889,12 @@ static int find_next_reservable_window(
  *
  */
 static int alloc_new_reservation(struct ext3_reserve_window_node *my_rsv,
-		int goal, struct super_block *sb,
+		ext3_grpblk_t grp_goal, struct super_block *sb,
 		unsigned int group, struct buffer_head *bitmap_bh)
 {
 	struct ext3_reserve_window_node *search_head;
-	int group_first_block, group_end_block, start_block;
-	int first_free_block;
+	ext3_fsblk_t group_first_block, group_end_block, start_block;
+	ext3_grpblk_t first_free_block;
 	struct rb_root *fs_rsv_root = &EXT3_SB(sb)->s_rsv_window_root;
 	unsigned long size;
 	int ret;
@@ -900,10 +904,10 @@ static int alloc_new_reservation(struct ext3_reserve_window_node *my_rsv,
 				group * EXT3_BLOCKS_PER_GROUP(sb);
 	group_end_block = group_first_block + EXT3_BLOCKS_PER_GROUP(sb) - 1;
 
-	if (goal < 0)
+	if (grp_goal < 0)
 		start_block = group_first_block;
 	else
-		start_block = goal + group_first_block;
+		start_block = grp_goal + group_first_block;
 
 	size = my_rsv->rsv_goal_size;
 
@@ -1057,14 +1061,15 @@ static void try_to_extend_reservation(struct ext3_reserve_window_node *my_rsv,
  * sorted double linked list should be fast.
  *
  */
-static int
+static ext3_grpblk_t
 ext3_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle,
 			unsigned int group, struct buffer_head *bitmap_bh,
-			int goal, struct ext3_reserve_window_node * my_rsv,
+			ext3_grpblk_t grp_goal,
+			struct ext3_reserve_window_node * my_rsv,
 			unsigned long *count, int *errp)
 {
-	unsigned long group_first_block;
-	int ret = 0;
+	ext3_fsblk_t group_first_block;
+	ext3_grpblk_t ret = 0;
 	int fatal;
 	unsigned long num = *count;
 
@@ -1090,12 +1095,12 @@ ext3_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle,
 	 */
 	if (my_rsv == NULL ) {
 		ret = ext3_try_to_allocate(sb, handle, group, bitmap_bh,
-						goal, count, NULL);
+						grp_goal, count, NULL);
 		goto out;
 	}
 	/*
-	 * goal is a group relative block number (if there is a goal)
-	 * 0 < goal < EXT3_BLOCKS_PER_GROUP(sb)
+	 * grp_goal is a group relative block number (if there is a goal)
+	 * 0 < grp_goal < EXT3_BLOCKS_PER_GROUP(sb)
 	 * first block is a filesystem wide block number
 	 * first block is the block number of the first block in this group
 	 */
@@ -1119,24 +1124,24 @@ ext3_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle,
 	 */
 	while (1) {
 		if (rsv_is_empty(&my_rsv->rsv_window) || (ret < 0) ||
-			!goal_in_my_reservation(&my_rsv->rsv_window, goal, group, sb)) {
+			!goal_in_my_reservation(&my_rsv->rsv_window, grp_goal, group, sb)) {
 			if (my_rsv->rsv_goal_size < *count)
 				my_rsv->rsv_goal_size = *count;
-			ret = alloc_new_reservation(my_rsv, goal, sb,
+			ret = alloc_new_reservation(my_rsv, grp_goal, sb,
 							group, bitmap_bh);
 			if (ret < 0)
 				break;			/* failed */
 
-			if (!goal_in_my_reservation(&my_rsv->rsv_window, goal, group, sb))
-				goal = -1;
-		} else if (goal > 0 && (my_rsv->rsv_end-goal+1) < *count)
+			if (!goal_in_my_reservation(&my_rsv->rsv_window, grp_goal, group, sb))
+				grp_goal = -1;
+		} else if (grp_goal > 0 && (my_rsv->rsv_end-grp_goal+1) < *count)
 			try_to_extend_reservation(my_rsv, sb,
-					*count-my_rsv->rsv_end + goal - 1);
+					*count-my_rsv->rsv_end + grp_goal - 1);
 
 		if ((my_rsv->rsv_start >= group_first_block + EXT3_BLOCKS_PER_GROUP(sb))
 		    || (my_rsv->rsv_end < group_first_block))
 			BUG();
-		ret = ext3_try_to_allocate(sb, handle, group, bitmap_bh, goal,
+		ret = ext3_try_to_allocate(sb, handle, group, bitmap_bh, grp_goal,
 					   &num, &my_rsv->rsv_window);
 		if (ret >= 0) {
 			my_rsv->rsv_alloc_hit += num;
@@ -1164,7 +1169,7 @@ out:
 
 static int ext3_has_free_blocks(struct ext3_sb_info *sbi)
 {
-	int free_blocks, root_blocks;
+	ext3_fsblk_t free_blocks, root_blocks;
 
 	free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
 	root_blocks = le32_to_cpu(sbi->s_es->s_r_blocks_count);
@@ -1200,19 +1205,20 @@ int ext3_should_retry_alloc(struct super_block *sb, int *retries)
  * bitmap, and then for any free bit if that fails.
  * This function also updates quota and i_blocks field.
  */
-int ext3_new_blocks(handle_t *handle, struct inode *inode,
-			unsigned long goal, unsigned long *count, int *errp)
+ext3_fsblk_t ext3_new_blocks(handle_t *handle, struct inode *inode,
+			ext3_fsblk_t goal, unsigned long *count, int *errp)
 {
 	struct buffer_head *bitmap_bh = NULL;
 	struct buffer_head *gdp_bh;
 	int group_no;
 	int goal_group;
-	int ret_block;
+	ext3_grpblk_t grp_target_blk;	/* blockgroup relative goal block */
+	ext3_grpblk_t grp_alloc_blk;	/* blockgroup-relative allocated block*/
+	ext3_fsblk_t ret_block;		/* filesyetem-wide allocated block */
 	int bgi;			/* blockgroup iteration index */
-	int target_block;
 	int fatal = 0, err;
 	int performed_allocation = 0;
-	int free_blocks;
+	ext3_grpblk_t free_blocks;	/* number of free blocks in a group */
 	struct super_block *sb;
 	struct ext3_group_desc *gdp;
 	struct ext3_super_block *es;
@@ -1285,16 +1291,17 @@ retry:
 		my_rsv = NULL;
 
 	if (free_blocks > 0) {
-		ret_block = ((goal - le32_to_cpu(es->s_first_data_block)) %
+		grp_target_blk = ((goal - le32_to_cpu(es->s_first_data_block)) %
 				EXT3_BLOCKS_PER_GROUP(sb));
 		bitmap_bh = read_block_bitmap(sb, group_no);
 		if (!bitmap_bh)
 			goto io_error;
-		ret_block = ext3_try_to_allocate_with_rsv(sb, handle, group_no,
-					bitmap_bh, ret_block, my_rsv, &num, &fatal);
+		grp_alloc_blk = ext3_try_to_allocate_with_rsv(sb, handle,
+					group_no, bitmap_bh, grp_target_blk,
+					my_rsv,	&num, &fatal);
 		if (fatal)
 			goto out;
-		if (ret_block >= 0)
+		if (grp_alloc_blk >= 0)
 			goto allocated;
 	}
 
@@ -1327,11 +1334,15 @@ retry:
 		bitmap_bh = read_block_bitmap(sb, group_no);
 		if (!bitmap_bh)
 			goto io_error;
-		ret_block = ext3_try_to_allocate_with_rsv(sb, handle, group_no,
-					bitmap_bh, -1, my_rsv, &num, &fatal);
+		/*
+		 * try to allocate block(s) from this group, without a goal(-1).
+		 */
+		grp_alloc_blk = ext3_try_to_allocate_with_rsv(sb, handle,
+					group_no, bitmap_bh, -1, my_rsv,
+					&num, &fatal);
 		if (fatal)
 			goto out;
-		if (ret_block >= 0) 
+		if (grp_alloc_blk >= 0)
 			goto allocated;
 	}
 	/*
@@ -1360,18 +1371,19 @@ allocated:
 	if (fatal)
 		goto out;
 
-	target_block = ret_block + group_no * EXT3_BLOCKS_PER_GROUP(sb)
+	ret_block = grp_alloc_blk + group_no * EXT3_BLOCKS_PER_GROUP(sb)
 				+ le32_to_cpu(es->s_first_data_block);
 
-	if (in_range(le32_to_cpu(gdp->bg_block_bitmap), target_block, num) ||
-	    in_range(le32_to_cpu(gdp->bg_inode_bitmap), target_block, num) ||
-	    in_range(target_block, le32_to_cpu(gdp->bg_inode_table),
+	if (in_range(le32_to_cpu(gdp->bg_block_bitmap), ret_block, num) ||
+	    in_range(le32_to_cpu(gdp->bg_inode_bitmap), ret_block, num) ||
+	    in_range(ret_block, le32_to_cpu(gdp->bg_inode_table),
 		      EXT3_SB(sb)->s_itb_per_group) ||
-	    in_range(target_block + num - 1, le32_to_cpu(gdp->bg_inode_table),
+	    in_range(ret_block + num - 1, le32_to_cpu(gdp->bg_inode_table),
 		      EXT3_SB(sb)->s_itb_per_group))
 		ext3_error(sb, "ext3_new_block",
 			    "Allocating block in system zone - "
-			    "blocks from %u, length %lu", target_block, num);
+			    "blocks from "E3FSBLK", length %lu",
+			     ret_block, num);
 
 	performed_allocation = 1;
 
@@ -1380,7 +1392,7 @@ allocated:
 		struct buffer_head *debug_bh;
 
 		/* Record bitmap buffer state in the newly allocated block */
-		debug_bh = sb_find_get_block(sb, target_block);
+		debug_bh = sb_find_get_block(sb, ret_block);
 		if (debug_bh) {
 			BUFFER_TRACE(debug_bh, "state when allocated");
 			BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap state");
@@ -1393,24 +1405,21 @@ allocated:
 		int i;
 
 		for (i = 0; i < num; i++) {
-			if (ext3_test_bit(ret_block,
+			if (ext3_test_bit(grp_alloc_blk+i,
 					bh2jh(bitmap_bh)->b_committed_data)) {
 				printk("%s: block was unexpectedly set in "
 					"b_committed_data\n", __FUNCTION__);
 			}
 		}
 	}
-	ext3_debug("found bit %d\n", ret_block);
+	ext3_debug("found bit %d\n", grp_alloc_blk);
 	spin_unlock(sb_bgl_lock(sbi, group_no));
 	jbd_unlock_bh_state(bitmap_bh);
 #endif
 
-	/* ret_block was blockgroup-relative.  Now it becomes fs-relative */
-	ret_block = target_block;
-
 	if (ret_block + num - 1 >= le32_to_cpu(es->s_blocks_count)) {
 		ext3_error(sb, "ext3_new_block",
-			    "block(%d) >= blocks count(%d) - "
+			    "block("E3FSBLK") >= blocks count(%d) - "
 			    "block_group = %d, es == %p ", ret_block,
 			le32_to_cpu(es->s_blocks_count), group_no, es);
 		goto out;
@@ -1421,7 +1430,7 @@ allocated:
 	 * list of some description.  We don't know in advance whether
 	 * the caller wants to use it as metadata or data.
 	 */
-	ext3_debug("allocating block %d. Goal hits %d of %d.\n",
+	ext3_debug("allocating block %lu. Goal hits %d of %d.\n",
 			ret_block, goal_hits, goal_attempts);
 
 	spin_lock(sb_bgl_lock(sbi, group_no));
@@ -1461,8 +1470,8 @@ out:
 	return 0;
 }
 
-int ext3_new_block(handle_t *handle, struct inode *inode,
-			unsigned long goal, int *errp)
+ext3_fsblk_t ext3_new_block(handle_t *handle, struct inode *inode,
+			ext3_fsblk_t goal, int *errp)
 {
 	unsigned long count = 1;
 
@@ -1520,7 +1529,7 @@ unsigned long ext3_count_free_blocks(struct super_block *sb)
 }
 
 static inline int
-block_in_use(unsigned long block, struct super_block *sb, unsigned char *map)
+block_in_use(ext3_fsblk_t block, struct super_block *sb, unsigned char *map)
 {
 	return ext3_test_bit ((block -
 		le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block)) %
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
index dc826464f313..36546ed36a14 100644
--- a/fs/ext3/ialloc.c
+++ b/fs/ext3/ialloc.c
@@ -262,9 +262,11 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent)
 	int ngroups = sbi->s_groups_count;
 	int inodes_per_group = EXT3_INODES_PER_GROUP(sb);
 	int freei, avefreei;
-	int freeb, avefreeb;
-	int blocks_per_dir, ndirs;
-	int max_debt, max_dirs, min_blocks, min_inodes;
+	ext3_fsblk_t freeb, avefreeb;
+	ext3_fsblk_t blocks_per_dir;
+	int ndirs;
+	int max_debt, max_dirs, min_inodes;
+	ext3_grpblk_t min_blocks;
 	int group = -1, i;
 	struct ext3_group_desc *desc;
 	struct buffer_head *bh;
@@ -307,7 +309,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent)
 	min_inodes = avefreei - inodes_per_group / 4;
 	min_blocks = avefreeb - EXT3_BLOCKS_PER_GROUP(sb) / 4;
 
-	max_debt = EXT3_BLOCKS_PER_GROUP(sb) / max(blocks_per_dir, BLOCK_COST);
+	max_debt = EXT3_BLOCKS_PER_GROUP(sb) / max(blocks_per_dir, (ext3_fsblk_t)BLOCK_COST);
 	if (max_debt * INODE_COST > inodes_per_group)
 		max_debt = inodes_per_group / INODE_COST;
 	if (max_debt > 255)
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 2edd7eec88fd..b02bc32c57a4 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -62,7 +62,7 @@ static int ext3_inode_is_fast_symlink(struct inode *inode)
  * still needs to be revoked.
  */
 int ext3_forget(handle_t *handle, int is_metadata, struct inode *inode,
-			struct buffer_head *bh, int blocknr)
+			struct buffer_head *bh, ext3_fsblk_t blocknr)
 {
 	int err;
 
diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c
index a31dff81ed77..82c678e92682 100644
--- a/fs/ext3/resize.c
+++ b/fs/ext3/resize.c
@@ -28,16 +28,16 @@ static int verify_group_input(struct super_block *sb,
 {
 	struct ext3_sb_info *sbi = EXT3_SB(sb);
 	struct ext3_super_block *es = sbi->s_es;
-	unsigned start = le32_to_cpu(es->s_blocks_count);
-	unsigned end = start + input->blocks_count;
+	ext3_fsblk_t start = le32_to_cpu(es->s_blocks_count);
+	ext3_fsblk_t end = start + input->blocks_count;
 	unsigned group = input->group;
-	unsigned itend = input->inode_table + sbi->s_itb_per_group;
+	ext3_fsblk_t itend = input->inode_table + sbi->s_itb_per_group;
 	unsigned overhead = ext3_bg_has_super(sb, group) ?
 		(1 + ext3_bg_num_gdb(sb, group) +
 		 le16_to_cpu(es->s_reserved_gdt_blocks)) : 0;
-	unsigned metaend = start + overhead;
+	ext3_fsblk_t metaend = start + overhead;
 	struct buffer_head *bh = NULL;
-	int free_blocks_count;
+	ext3_grpblk_t free_blocks_count;
 	int err = -EINVAL;
 
 	input->free_blocks_count = free_blocks_count =
@@ -64,7 +64,8 @@ static int verify_group_input(struct super_block *sb,
 		ext3_warning(sb, __FUNCTION__, "Bad blocks count %u",
 			     input->blocks_count);
 	else if (!(bh = sb_bread(sb, end - 1)))
-		ext3_warning(sb, __FUNCTION__, "Cannot read last block (%u)",
+		ext3_warning(sb, __FUNCTION__,
+			     "Cannot read last block ("E3FSBLK")",
 			     end - 1);
 	else if (outside(input->block_bitmap, start, end))
 		ext3_warning(sb, __FUNCTION__,
@@ -77,7 +78,7 @@ static int verify_group_input(struct super_block *sb,
 	else if (outside(input->inode_table, start, end) ||
 	         outside(itend - 1, start, end))
 		ext3_warning(sb, __FUNCTION__,
-			     "Inode table not in group (blocks %u-%u)",
+			     "Inode table not in group (blocks %u-"E3FSBLK")",
 			     input->inode_table, itend - 1);
 	else if (input->inode_bitmap == input->block_bitmap)
 		ext3_warning(sb, __FUNCTION__,
@@ -85,24 +86,27 @@ static int verify_group_input(struct super_block *sb,
 			     input->block_bitmap);
 	else if (inside(input->block_bitmap, input->inode_table, itend))
 		ext3_warning(sb, __FUNCTION__,
-			     "Block bitmap (%u) in inode table (%u-%u)",
+			     "Block bitmap (%u) in inode table (%u-"E3FSBLK")",
 			     input->block_bitmap, input->inode_table, itend-1);
 	else if (inside(input->inode_bitmap, input->inode_table, itend))
 		ext3_warning(sb, __FUNCTION__,
-			     "Inode bitmap (%u) in inode table (%u-%u)",
+			     "Inode bitmap (%u) in inode table (%u-"E3FSBLK")",
 			     input->inode_bitmap, input->inode_table, itend-1);
 	else if (inside(input->block_bitmap, start, metaend))
 		ext3_warning(sb, __FUNCTION__,
-			     "Block bitmap (%u) in GDT table (%u-%u)",
+			     "Block bitmap (%u) in GDT table"
+			     " ("E3FSBLK"-"E3FSBLK")",
 			     input->block_bitmap, start, metaend - 1);
 	else if (inside(input->inode_bitmap, start, metaend))
 		ext3_warning(sb, __FUNCTION__,
-			     "Inode bitmap (%u) in GDT table (%u-%u)",
+			     "Inode bitmap (%u) in GDT table"
+			     " ("E3FSBLK"-"E3FSBLK")",
 			     input->inode_bitmap, start, metaend - 1);
 	else if (inside(input->inode_table, start, metaend) ||
 	         inside(itend - 1, start, metaend))
 		ext3_warning(sb, __FUNCTION__,
-			     "Inode table (%u-%u) overlaps GDT table (%u-%u)",
+			     "Inode table (%u-"E3FSBLK") overlaps"
+			     "GDT table ("E3FSBLK"-"E3FSBLK")",
 			     input->inode_table, itend - 1, start, metaend - 1);
 	else
 		err = 0;
@@ -171,7 +175,7 @@ static int setup_new_group_blocks(struct super_block *sb,
 	struct buffer_head *bh;
 	handle_t *handle;
 	unsigned long block;
-	int bit;
+	ext3_grpblk_t bit;
 	int i;
 	int err = 0, err2;
 
@@ -340,7 +344,7 @@ static int verify_reserved_gdb(struct super_block *sb,
 	while ((grp = ext3_list_backups(sb, &three, &five, &seven)) < end) {
 		if (le32_to_cpu(*p++) != grp * EXT3_BLOCKS_PER_GROUP(sb) + blk){
 			ext3_warning(sb, __FUNCTION__,
-				     "reserved GDT %ld missing grp %d (%ld)",
+				     "reserved GDT %lu missing grp %d (%lu)",
 				     blk, grp,
 				     grp * EXT3_BLOCKS_PER_GROUP(sb) + blk);
 			return -EINVAL;
@@ -906,11 +910,12 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
 {
 	unsigned long o_blocks_count;
 	unsigned long o_groups_count;
-	unsigned long last;
-	int add;
+	ext3_grpblk_t last;
+	ext3_grpblk_t add;
 	struct buffer_head * bh;
 	handle_t *handle;
-	int err, freed_blocks;
+	int err;
+	unsigned long freed_blocks;
 
 	/* We don't need to worry about locking wrt other resizers just
 	 * yet: we're going to revalidate es->s_blocks_count after
@@ -1001,10 +1006,10 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
 	ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
 	sb->s_dirt = 1;
 	unlock_super(sb);
-	ext3_debug("freeing blocks %ld through %ld\n", o_blocks_count,
+	ext3_debug("freeing blocks %lu through %lu\n", o_blocks_count,
 		   o_blocks_count + add);
 	ext3_free_blocks_sb(handle, sb, o_blocks_count, add, &freed_blocks);
-	ext3_debug("freed blocks %ld through %ld\n", o_blocks_count,
+	ext3_debug("freed blocks %lu through %lu\n", o_blocks_count,
 		   o_blocks_count + add);
 	if ((err = ext3_journal_stop(handle)))
 		goto exit_put;
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index e0fc0c83be90..94113500fc55 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -1841,7 +1841,7 @@ static journal_t *ext3_get_dev_journal(struct super_block *sb,
 	struct buffer_head * bh;
 	journal_t *journal;
 	int start;
-	int len;
+	ext3_fsblk_t len;
 	int hblock, blocksize;
 	unsigned long sb_block;
 	unsigned long offset;
diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c
index e8d60bf6b7df..1ba515de5a75 100644
--- a/fs/ext3/xattr.c
+++ b/fs/ext3/xattr.c
@@ -225,7 +225,7 @@ ext3_xattr_block_get(struct inode *inode, int name_index, const char *name,
 	error = -ENODATA;
 	if (!EXT3_I(inode)->i_file_acl)
 		goto cleanup;
-	ea_idebug(inode, "reading block %d", EXT3_I(inode)->i_file_acl);
+	ea_idebug(inode, "reading block %u", EXT3_I(inode)->i_file_acl);
 	bh = sb_bread(inode->i_sb, EXT3_I(inode)->i_file_acl);
 	if (!bh)
 		goto cleanup;
@@ -233,7 +233,7 @@ ext3_xattr_block_get(struct inode *inode, int name_index, const char *name,
 		atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
 	if (ext3_xattr_check_block(bh)) {
 bad_block:	ext3_error(inode->i_sb, __FUNCTION__,
-			   "inode %ld: bad block %d", inode->i_ino,
+			   "inode %ld: bad block %u", inode->i_ino,
 			   EXT3_I(inode)->i_file_acl);
 		error = -EIO;
 		goto cleanup;
@@ -366,7 +366,7 @@ ext3_xattr_block_list(struct inode *inode, char *buffer, size_t buffer_size)
 	error = 0;
 	if (!EXT3_I(inode)->i_file_acl)
 		goto cleanup;
-	ea_idebug(inode, "reading block %d", EXT3_I(inode)->i_file_acl);
+	ea_idebug(inode, "reading block %u", EXT3_I(inode)->i_file_acl);
 	bh = sb_bread(inode->i_sb, EXT3_I(inode)->i_file_acl);
 	error = -EIO;
 	if (!bh)
@@ -375,7 +375,7 @@ ext3_xattr_block_list(struct inode *inode, char *buffer, size_t buffer_size)
 		atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
 	if (ext3_xattr_check_block(bh)) {
 		ext3_error(inode->i_sb, __FUNCTION__,
-			   "inode %ld: bad block %d", inode->i_ino,
+			   "inode %ld: bad block %u", inode->i_ino,
 			   EXT3_I(inode)->i_file_acl);
 		error = -EIO;
 		goto cleanup;
@@ -647,7 +647,7 @@ ext3_xattr_block_find(struct inode *inode, struct ext3_xattr_info *i,
 			le32_to_cpu(BHDR(bs->bh)->h_refcount));
 		if (ext3_xattr_check_block(bs->bh)) {
 			ext3_error(sb, __FUNCTION__,
-				"inode %ld: bad block %d", inode->i_ino,
+				"inode %ld: bad block %u", inode->i_ino,
 				EXT3_I(inode)->i_file_acl);
 			error = -EIO;
 			goto cleanup;
@@ -792,11 +792,12 @@ inserted:
 			get_bh(new_bh);
 		} else {
 			/* We need to allocate a new block */
-			int goal = le32_to_cpu(
+			ext3_fsblk_t goal = le32_to_cpu(
 					EXT3_SB(sb)->s_es->s_first_data_block) +
-				EXT3_I(inode)->i_block_group *
+				(ext3_fsblk_t)EXT3_I(inode)->i_block_group *
 				EXT3_BLOCKS_PER_GROUP(sb);
-			int block = ext3_new_block(handle, inode, goal, &error);
+			ext3_fsblk_t block = ext3_new_block(handle, inode,
+							goal, &error);
 			if (error)
 				goto cleanup;
 			ea_idebug(inode, "creating block %d", block);
@@ -847,7 +848,7 @@ cleanup_dquot:
 
 bad_block:
 	ext3_error(inode->i_sb, __FUNCTION__,
-		   "inode %ld: bad block %d", inode->i_ino,
+		   "inode %ld: bad block %u", inode->i_ino,
 		   EXT3_I(inode)->i_file_acl);
 	goto cleanup;
 
@@ -1076,14 +1077,14 @@ ext3_xattr_delete_inode(handle_t *handle, struct inode *inode)
 	bh = sb_bread(inode->i_sb, EXT3_I(inode)->i_file_acl);
 	if (!bh) {
 		ext3_error(inode->i_sb, __FUNCTION__,
-			"inode %ld: block %d read error", inode->i_ino,
+			"inode %ld: block %u read error", inode->i_ino,
 			EXT3_I(inode)->i_file_acl);
 		goto cleanup;
 	}
 	if (BHDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) ||
 	    BHDR(bh)->h_blocks != cpu_to_le32(1)) {
 		ext3_error(inode->i_sb, __FUNCTION__,
-			"inode %ld: bad block %d", inode->i_ino,
+			"inode %ld: bad block %u", inode->i_ino,
 			EXT3_I(inode)->i_file_acl);
 		goto cleanup;
 	}
@@ -1210,11 +1211,11 @@ again:
 		bh = sb_bread(inode->i_sb, ce->e_block);
 		if (!bh) {
 			ext3_error(inode->i_sb, __FUNCTION__,
-				"inode %ld: block %ld read error",
+				"inode %ld: block %lu read error",
 				inode->i_ino, (unsigned long) ce->e_block);
 		} else if (le32_to_cpu(BHDR(bh)->h_refcount) >=
 				EXT3_XATTR_REFCOUNT_MAX) {
-			ea_idebug(inode, "block %ld refcount %d>=%d",
+			ea_idebug(inode, "block %lu refcount %d>=%d",
 				  (unsigned long) ce->e_block,
 				  le32_to_cpu(BHDR(bh)->h_refcount),
 					  EXT3_XATTR_REFCOUNT_MAX);
diff --git a/include/linux/ext3_fs.h b/include/linux/ext3_fs.h
index 757d54d8f1a5..34136ff02aca 100644
--- a/include/linux/ext3_fs.h
+++ b/include/linux/ext3_fs.h
@@ -730,13 +730,15 @@ struct dir_private_info {
 /* balloc.c */
 extern int ext3_bg_has_super(struct super_block *sb, int group);
 extern unsigned long ext3_bg_num_gdb(struct super_block *sb, int group);
-extern int ext3_new_block (handle_t *, struct inode *, unsigned long, int *);
-extern int ext3_new_blocks (handle_t *, struct inode *, unsigned long,
-			unsigned long *, int *);
-extern void ext3_free_blocks (handle_t *, struct inode *, unsigned long,
-			      unsigned long);
-extern void ext3_free_blocks_sb (handle_t *, struct super_block *,
-				 unsigned long, unsigned long, int *);
+extern ext3_fsblk_t ext3_new_block (handle_t *handle, struct inode *inode,
+			ext3_fsblk_t goal, int *errp);
+extern ext3_fsblk_t ext3_new_blocks (handle_t *handle, struct inode *inode,
+			ext3_fsblk_t goal, unsigned long *count, int *errp);
+extern void ext3_free_blocks (handle_t *handle, struct inode *inode,
+			ext3_fsblk_t block, unsigned long count);
+extern void ext3_free_blocks_sb (handle_t *handle, struct super_block *sb,
+				 ext3_fsblk_t block, unsigned long count,
+				unsigned long *pdquot_freed_blocks);
 extern unsigned long ext3_count_free_blocks (struct super_block *);
 extern void ext3_check_blocks_bitmap (struct super_block *);
 extern struct ext3_group_desc * ext3_get_group_desc(struct super_block * sb,
@@ -773,7 +775,8 @@ extern unsigned long ext3_count_free (struct buffer_head *, unsigned);
 
 
 /* inode.c */
-int ext3_forget(handle_t *, int, struct inode *, struct buffer_head *, int);
+int ext3_forget(handle_t *handle, int is_metadata, struct inode *inode,
+		struct buffer_head *bh, ext3_fsblk_t blocknr);
 struct buffer_head * ext3_getblk (handle_t *, struct inode *, long, int, int *);
 struct buffer_head * ext3_bread (handle_t *, struct inode *, int, int, int *);
 int ext3_get_blocks_handle(handle_t *handle, struct inode *inode,
diff --git a/include/linux/ext3_fs_i.h b/include/linux/ext3_fs_i.h
index 7abf90147180..e1c7509c0c9f 100644
--- a/include/linux/ext3_fs_i.h
+++ b/include/linux/ext3_fs_i.h
@@ -21,6 +21,14 @@
 #include <linux/seqlock.h>
 #include <linux/mutex.h>
 
+/* data type for block offset of block group */
+typedef int ext3_grpblk_t;
+
+/* data type for filesystem-wide blocks number */
+typedef unsigned long ext3_fsblk_t;
+
+#define E3FSBLK "%lu"
+
 struct ext3_reserve_window {
 	__u32			_rsv_start;	/* First byte reserved */
 	__u32			_rsv_end;	/* Last byte reserved or 0 */
-- 
cgit v1.2.3


From 43d23f9039fc810ecd621f1e4f9d578eadce058a Mon Sep 17 00:00:00 2001
From: Mingming Cao <cmm@us.ibm.com>
Date: Sun, 25 Jun 2006 05:48:07 -0700
Subject: [PATCH] ext3_fsblk_t: the rest of in-kernel filesystem blocks
 conversion

Convert the ext3 in-kernel filesystem blocks to ext3_fsblk_t.  Convert the
rest of all unsigned long type in-kernel filesystem blocks to ext3_fsblk_t,
and replace the printk format string respondingly.

Signed-off-by: Mingming Cao <cmm@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/ext3/balloc.c          | 29 +++++++++++--------------
 fs/ext3/inode.c           | 55 ++++++++++++++++++++++++-----------------------
 fs/ext3/ioctl.c           |  2 +-
 fs/ext3/resize.c          | 34 +++++++++++++++--------------
 fs/ext3/super.c           | 32 +++++++++++++--------------
 fs/ext3/xattr.c           | 12 +++++------
 include/linux/ext3_fs.h   | 12 +++++++++--
 include/linux/ext3_fs_i.h |  8 +++----
 8 files changed, 96 insertions(+), 88 deletions(-)

(limited to 'include/linux')

diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index b1633cd28eca..96172e89ddc3 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -168,8 +168,7 @@ goal_in_my_reservation(struct ext3_reserve_window *rsv, ext3_grpblk_t grp_goal,
 {
 	ext3_fsblk_t group_first_block, group_last_block;
 
-	group_first_block = le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block) +
-				group * EXT3_BLOCKS_PER_GROUP(sb);
+	group_first_block = ext3_group_first_block_no(sb, group);
 	group_last_block = group_first_block + EXT3_BLOCKS_PER_GROUP(sb) - 1;
 
 	if ((rsv->_rsv_start > group_last_block) ||
@@ -664,9 +663,7 @@ ext3_try_to_allocate(struct super_block *sb, handle_t *handle, int group,
 
 	/* we do allocation within the reservation window if we have a window */
 	if (my_rsv) {
-		group_first_block =
-			le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block) +
-			group * EXT3_BLOCKS_PER_GROUP(sb);
+		group_first_block = ext3_group_first_block_no(sb, group);
 		if (my_rsv->_rsv_start >= group_first_block)
 			start = my_rsv->_rsv_start - group_first_block;
 		else
@@ -900,8 +897,7 @@ static int alloc_new_reservation(struct ext3_reserve_window_node *my_rsv,
 	int ret;
 	spinlock_t *rsv_lock = &EXT3_SB(sb)->s_rsv_window_lock;
 
-	group_first_block = le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block) +
-				group * EXT3_BLOCKS_PER_GROUP(sb);
+	group_first_block = ext3_group_first_block_no(sb, group);
 	group_end_block = group_first_block + EXT3_BLOCKS_PER_GROUP(sb) - 1;
 
 	if (grp_goal < 0)
@@ -1104,8 +1100,7 @@ ext3_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle,
 	 * first block is a filesystem wide block number
 	 * first block is the block number of the first block in this group
 	 */
-	group_first_block = le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block) +
-			group * EXT3_BLOCKS_PER_GROUP(sb);
+	group_first_block = ext3_group_first_block_no(sb, group);
 
 	/*
 	 * Basically we will allocate a new block from inode's reservation
@@ -1371,8 +1366,7 @@ allocated:
 	if (fatal)
 		goto out;
 
-	ret_block = grp_alloc_blk + group_no * EXT3_BLOCKS_PER_GROUP(sb)
-				+ le32_to_cpu(es->s_first_data_block);
+	ret_block = grp_alloc_blk + ext3_group_first_block_no(sb, group_no);
 
 	if (in_range(le32_to_cpu(gdp->bg_block_bitmap), ret_block, num) ||
 	    in_range(le32_to_cpu(gdp->bg_inode_bitmap), ret_block, num) ||
@@ -1478,15 +1472,16 @@ ext3_fsblk_t ext3_new_block(handle_t *handle, struct inode *inode,
 	return ext3_new_blocks(handle, inode, goal, &count, errp);
 }
 
-unsigned long ext3_count_free_blocks(struct super_block *sb)
+ext3_fsblk_t ext3_count_free_blocks(struct super_block *sb)
 {
-	unsigned long desc_count;
+	ext3_fsblk_t desc_count;
 	struct ext3_group_desc *gdp;
 	int i;
 	unsigned long ngroups = EXT3_SB(sb)->s_groups_count;
 #ifdef EXT3FS_DEBUG
 	struct ext3_super_block *es;
-	unsigned long bitmap_count, x;
+	ext3_fsblk_t bitmap_count;
+	unsigned long x;
 	struct buffer_head *bitmap_bh = NULL;
 
 	es = EXT3_SB(sb)->s_es;
@@ -1511,8 +1506,10 @@ unsigned long ext3_count_free_blocks(struct super_block *sb)
 		bitmap_count += x;
 	}
 	brelse(bitmap_bh);
-	printk("ext3_count_free_blocks: stored = %u, computed = %lu, %lu\n",
-	       le32_to_cpu(es->s_free_blocks_count), desc_count, bitmap_count);
+	printk("ext3_count_free_blocks: stored = "E3FSBLK
+		", computed = "E3FSBLK", "E3FSBLK"\n",
+	       le32_to_cpu(es->s_free_blocks_count),
+		desc_count, bitmap_count);
 	return bitmap_count;
 #else
 	desc_count = 0;
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index b02bc32c57a4..0321e1b9034a 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -407,13 +407,13 @@ no_block:
  *
  *	Caller must make sure that @ind is valid and will stay that way.
  */
-static unsigned long ext3_find_near(struct inode *inode, Indirect *ind)
+static ext3_fsblk_t ext3_find_near(struct inode *inode, Indirect *ind)
 {
 	struct ext3_inode_info *ei = EXT3_I(inode);
 	__le32 *start = ind->bh ? (__le32*) ind->bh->b_data : ei->i_data;
 	__le32 *p;
-	unsigned long bg_start;
-	unsigned long colour;
+	ext3_fsblk_t bg_start;
+	ext3_grpblk_t colour;
 
 	/* Try to find previous block */
 	for (p = ind->p - 1; p >= start; p--) {
@@ -429,8 +429,7 @@ static unsigned long ext3_find_near(struct inode *inode, Indirect *ind)
 	 * It is going to be referred to from the inode itself? OK, just put it
 	 * into the same cylinder group then.
 	 */
-	bg_start = (ei->i_block_group * EXT3_BLOCKS_PER_GROUP(inode->i_sb)) +
-		le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block);
+	bg_start = ext3_group_first_block_no(inode->i_sb, ei->i_block_group);
 	colour = (current->pid % 16) *
 			(EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16);
 	return bg_start + colour;
@@ -448,7 +447,7 @@ static unsigned long ext3_find_near(struct inode *inode, Indirect *ind)
  *	stores it in *@goal and returns zero.
  */
 
-static unsigned long ext3_find_goal(struct inode *inode, long block,
+static ext3_fsblk_t ext3_find_goal(struct inode *inode, long block,
 		Indirect chain[4], Indirect *partial)
 {
 	struct ext3_block_alloc_info *block_i;
@@ -516,13 +515,13 @@ static int ext3_blks_to_allocate(Indirect *branch, int k, unsigned long blks,
  *		direct blocks
  */
 static int ext3_alloc_blocks(handle_t *handle, struct inode *inode,
-			unsigned long goal, int indirect_blks, int blks,
-			unsigned long long new_blocks[4], int *err)
+			ext3_fsblk_t goal, int indirect_blks, int blks,
+			ext3_fsblk_t new_blocks[4], int *err)
 {
 	int target, i;
 	unsigned long count = 0;
 	int index = 0;
-	unsigned long current_block = 0;
+	ext3_fsblk_t current_block = 0;
 	int ret = 0;
 
 	/*
@@ -592,7 +591,7 @@ failed_out:
  *	as described above and return 0.
  */
 static int ext3_alloc_branch(handle_t *handle, struct inode *inode,
-			int indirect_blks, int *blks, unsigned long goal,
+			int indirect_blks, int *blks, ext3_fsblk_t goal,
 			int *offsets, Indirect *branch)
 {
 	int blocksize = inode->i_sb->s_blocksize;
@@ -600,8 +599,8 @@ static int ext3_alloc_branch(handle_t *handle, struct inode *inode,
 	int err = 0;
 	struct buffer_head *bh;
 	int num;
-	unsigned long long new_blocks[4];
-	unsigned long long current_block;
+	ext3_fsblk_t new_blocks[4];
+	ext3_fsblk_t current_block;
 
 	num = ext3_alloc_blocks(handle, inode, goal, indirect_blks,
 				*blks, new_blocks, &err);
@@ -688,7 +687,7 @@ static int ext3_splice_branch(handle_t *handle, struct inode *inode,
 	int i;
 	int err = 0;
 	struct ext3_block_alloc_info *block_i;
-	unsigned long current_block;
+	ext3_fsblk_t current_block;
 
 	block_i = EXT3_I(inode)->i_block_alloc_info;
 	/*
@@ -795,13 +794,13 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode,
 	int offsets[4];
 	Indirect chain[4];
 	Indirect *partial;
-	unsigned long goal;
+	ext3_fsblk_t goal;
 	int indirect_blks;
 	int blocks_to_boundary = 0;
 	int depth;
 	struct ext3_inode_info *ei = EXT3_I(inode);
 	int count = 0;
-	unsigned long first_block = 0;
+	ext3_fsblk_t first_block = 0;
 
 
 	J_ASSERT(handle != NULL || create == 0);
@@ -819,7 +818,7 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode,
 		count++;
 		/*map more blocks*/
 		while (count < maxblocks && count <= blocks_to_boundary) {
-			unsigned long blk;
+			ext3_fsblk_t blk;
 
 			if (!verify_chain(chain, partial)) {
 				/*
@@ -1759,7 +1758,7 @@ void ext3_set_aops(struct inode *inode)
 static int ext3_block_truncate_page(handle_t *handle, struct page *page,
 		struct address_space *mapping, loff_t from)
 {
-	unsigned long index = from >> PAGE_CACHE_SHIFT;
+	ext3_fsblk_t index = from >> PAGE_CACHE_SHIFT;
 	unsigned offset = from & (PAGE_CACHE_SIZE-1);
 	unsigned blocksize, iblock, length, pos;
 	struct inode *inode = mapping->host;
@@ -1960,7 +1959,7 @@ no_top:
  * than `count' because there can be holes in there.
  */
 static void ext3_clear_blocks(handle_t *handle, struct inode *inode,
-		struct buffer_head *bh, unsigned long block_to_free,
+		struct buffer_head *bh, ext3_fsblk_t block_to_free,
 		unsigned long count, __le32 *first, __le32 *last)
 {
 	__le32 *p;
@@ -2022,12 +2021,12 @@ static void ext3_free_data(handle_t *handle, struct inode *inode,
 			   struct buffer_head *this_bh,
 			   __le32 *first, __le32 *last)
 {
-	unsigned long block_to_free = 0;    /* Starting block # of a run */
+	ext3_fsblk_t block_to_free = 0;    /* Starting block # of a run */
 	unsigned long count = 0;	    /* Number of blocks in the run */ 
 	__le32 *block_to_free_p = NULL;	    /* Pointer into inode/ind
 					       corresponding to
 					       block_to_free */
-	unsigned long nr;		    /* Current block # */
+	ext3_fsblk_t nr;		    /* Current block # */
 	__le32 *p;			    /* Pointer into inode/ind
 					       for current block */
 	int err;
@@ -2089,7 +2088,7 @@ static void ext3_free_branches(handle_t *handle, struct inode *inode,
 			       struct buffer_head *parent_bh,
 			       __le32 *first, __le32 *last, int depth)
 {
-	unsigned long nr;
+	ext3_fsblk_t nr;
 	__le32 *p;
 
 	if (is_handle_aborted(handle))
@@ -2113,7 +2112,7 @@ static void ext3_free_branches(handle_t *handle, struct inode *inode,
 			 */
 			if (!bh) {
 				ext3_error(inode->i_sb, "ext3_free_branches",
-					   "Read failure, inode=%ld, block=%ld",
+					   "Read failure, inode=%ld, block="E3FSBLK,
 					   inode->i_ino, nr);
 				continue;
 			}
@@ -2394,11 +2393,12 @@ out_stop:
 	ext3_journal_stop(handle);
 }
 
-static unsigned long ext3_get_inode_block(struct super_block *sb,
+static ext3_fsblk_t ext3_get_inode_block(struct super_block *sb,
 		unsigned long ino, struct ext3_iloc *iloc)
 {
 	unsigned long desc, group_desc, block_group;
-	unsigned long offset, block;
+	unsigned long offset;
+	ext3_fsblk_t block;
 	struct buffer_head *bh;
 	struct ext3_group_desc * gdp;
 
@@ -2448,7 +2448,7 @@ static unsigned long ext3_get_inode_block(struct super_block *sb,
 static int __ext3_get_inode_loc(struct inode *inode,
 				struct ext3_iloc *iloc, int in_mem)
 {
-	unsigned long block;
+	ext3_fsblk_t block;
 	struct buffer_head *bh;
 
 	block = ext3_get_inode_block(inode->i_sb, inode->i_ino, iloc);
@@ -2459,7 +2459,8 @@ static int __ext3_get_inode_loc(struct inode *inode,
 	if (!bh) {
 		ext3_error (inode->i_sb, "ext3_get_inode_loc",
 				"unable to read inode block - "
-				"inode=%lu, block=%lu", inode->i_ino, block);
+				"inode=%lu, block="E3FSBLK,
+				 inode->i_ino, block);
 		return -EIO;
 	}
 	if (!buffer_uptodate(bh)) {
@@ -2540,7 +2541,7 @@ make_io:
 		if (!buffer_uptodate(bh)) {
 			ext3_error(inode->i_sb, "ext3_get_inode_loc",
 					"unable to read inode block - "
-					"inode=%lu, block=%lu",
+					"inode=%lu, block="E3FSBLK,
 					inode->i_ino, block);
 			brelse(bh);
 			return -EIO;
diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c
index 8c22aa9a7fbb..3a6b012d120c 100644
--- a/fs/ext3/ioctl.c
+++ b/fs/ext3/ioctl.c
@@ -204,7 +204,7 @@ flags_err:
 		return 0;
 	}
 	case EXT3_IOC_GROUP_EXTEND: {
-		unsigned long n_blocks_count;
+		ext3_fsblk_t n_blocks_count;
 		struct super_block *sb = inode->i_sb;
 		int err;
 
diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c
index 82c678e92682..dfd811895d8f 100644
--- a/fs/ext3/resize.c
+++ b/fs/ext3/resize.c
@@ -116,7 +116,7 @@ static int verify_group_input(struct super_block *sb,
 }
 
 static struct buffer_head *bclean(handle_t *handle, struct super_block *sb,
-				  unsigned long blk)
+				  ext3_fsblk_t blk)
 {
 	struct buffer_head *bh;
 	int err;
@@ -167,14 +167,13 @@ static int setup_new_group_blocks(struct super_block *sb,
 				  struct ext3_new_group_data *input)
 {
 	struct ext3_sb_info *sbi = EXT3_SB(sb);
-	unsigned long start = input->group * sbi->s_blocks_per_group +
-		le32_to_cpu(sbi->s_es->s_first_data_block);
+	ext3_fsblk_t start = ext3_group_first_block_no(sb, input->group);
 	int reserved_gdb = ext3_bg_has_super(sb, input->group) ?
 		le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) : 0;
 	unsigned long gdblocks = ext3_bg_num_gdb(sb, input->group);
 	struct buffer_head *bh;
 	handle_t *handle;
-	unsigned long block;
+	ext3_fsblk_t block;
 	ext3_grpblk_t bit;
 	int i;
 	int err = 0, err2;
@@ -332,7 +331,7 @@ static unsigned ext3_list_backups(struct super_block *sb, unsigned *three,
 static int verify_reserved_gdb(struct super_block *sb,
 			       struct buffer_head *primary)
 {
-	const unsigned long blk = primary->b_blocknr;
+	const ext3_fsblk_t blk = primary->b_blocknr;
 	const unsigned long end = EXT3_SB(sb)->s_groups_count;
 	unsigned three = 1;
 	unsigned five = 5;
@@ -344,7 +343,8 @@ static int verify_reserved_gdb(struct super_block *sb,
 	while ((grp = ext3_list_backups(sb, &three, &five, &seven)) < end) {
 		if (le32_to_cpu(*p++) != grp * EXT3_BLOCKS_PER_GROUP(sb) + blk){
 			ext3_warning(sb, __FUNCTION__,
-				     "reserved GDT %lu missing grp %d (%lu)",
+				     "reserved GDT "E3FSBLK
+				     " missing grp %d ("E3FSBLK")",
 				     blk, grp,
 				     grp * EXT3_BLOCKS_PER_GROUP(sb) + blk);
 			return -EINVAL;
@@ -376,7 +376,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
 	struct super_block *sb = inode->i_sb;
 	struct ext3_super_block *es = EXT3_SB(sb)->s_es;
 	unsigned long gdb_num = input->group / EXT3_DESC_PER_BLOCK(sb);
-	unsigned long gdblock = EXT3_SB(sb)->s_sbh->b_blocknr + 1 + gdb_num;
+	ext3_fsblk_t gdblock = EXT3_SB(sb)->s_sbh->b_blocknr + 1 + gdb_num;
 	struct buffer_head **o_group_desc, **n_group_desc;
 	struct buffer_head *dind;
 	int gdbackups;
@@ -421,7 +421,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
 	data = (__u32 *)dind->b_data;
 	if (le32_to_cpu(data[gdb_num % EXT3_ADDR_PER_BLOCK(sb)]) != gdblock) {
 		ext3_warning(sb, __FUNCTION__,
-			     "new group %u GDT block %lu not reserved",
+			     "new group %u GDT block "E3FSBLK" not reserved",
 			     input->group, gdblock);
 		err = -EINVAL;
 		goto exit_dind;
@@ -519,7 +519,7 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
 	struct buffer_head **primary;
 	struct buffer_head *dind;
 	struct ext3_iloc iloc;
-	unsigned long blk;
+	ext3_fsblk_t blk;
 	__u32 *data, *end;
 	int gdbackups = 0;
 	int res, i;
@@ -544,7 +544,8 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
 	for (res = 0; res < reserved_gdb; res++, blk++) {
 		if (le32_to_cpu(*data) != blk) {
 			ext3_warning(sb, __FUNCTION__,
-				     "reserved block %lu not at offset %ld",
+				     "reserved block "E3FSBLK
+				     " not at offset %ld",
 				     blk, (long)(data - (__u32 *)dind->b_data));
 			err = -EINVAL;
 			goto exit_bh;
@@ -906,9 +907,9 @@ exit_put:
  * GDT blocks are reserved to grow to the desired size.
  */
 int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
-		      unsigned long n_blocks_count)
+		      ext3_fsblk_t n_blocks_count)
 {
-	unsigned long o_blocks_count;
+	ext3_fsblk_t o_blocks_count;
 	unsigned long o_groups_count;
 	ext3_grpblk_t last;
 	ext3_grpblk_t add;
@@ -924,7 +925,7 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
 	o_groups_count = EXT3_SB(sb)->s_groups_count;
 
 	if (test_opt(sb, DEBUG))
-		printk(KERN_DEBUG "EXT3-fs: extending last group from %lu to %lu blocks\n",
+		printk(KERN_DEBUG "EXT3-fs: extending last group from "E3FSBLK" uto "E3FSBLK" blocks\n",
 		       o_blocks_count, n_blocks_count);
 
 	if (n_blocks_count == 0 || n_blocks_count == o_blocks_count)
@@ -963,7 +964,8 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
 
 	if (o_blocks_count + add < n_blocks_count)
 		ext3_warning(sb, __FUNCTION__,
-			     "will only finish group (%lu blocks, %u new)",
+			     "will only finish group ("E3FSBLK
+			     " blocks, %u new)",
 			     o_blocks_count + add, add);
 
 	/* See if the device is actually as big as what was requested */
@@ -1006,10 +1008,10 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
 	ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
 	sb->s_dirt = 1;
 	unlock_super(sb);
-	ext3_debug("freeing blocks %lu through %lu\n", o_blocks_count,
+	ext3_debug("freeing blocks %lu through "E3FSBLK"\n", o_blocks_count,
 		   o_blocks_count + add);
 	ext3_free_blocks_sb(handle, sb, o_blocks_count, add, &freed_blocks);
-	ext3_debug("freed blocks %lu through %lu\n", o_blocks_count,
+	ext3_debug("freed blocks "E3FSBLK" through "E3FSBLK"\n", o_blocks_count,
 		   o_blocks_count + add);
 	if ((err = ext3_journal_stop(handle)))
 		goto exit_put;
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 94113500fc55..b2891cc29db1 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -689,14 +689,15 @@ static match_table_t tokens = {
 	{Opt_resize, "resize"},
 };
 
-static unsigned long get_sb_block(void **data)
+static ext3_fsblk_t get_sb_block(void **data)
 {
-	unsigned long 	sb_block;
+	ext3_fsblk_t 	sb_block;
 	char 		*options = (char *) *data;
 
 	if (!options || strncmp(options, "sb=", 3) != 0)
 		return 1;	/* Default location */
 	options += 3;
+	/*todo: use simple_strtoll with >32bit ext3 */
 	sb_block = simple_strtoul(options, &options, 0);
 	if (*options && *options != ',') {
 		printk("EXT3-fs: Invalid sb specification: %s\n",
@@ -711,7 +712,7 @@ static unsigned long get_sb_block(void **data)
 
 static int parse_options (char *options, struct super_block *sb,
 			  unsigned long *inum, unsigned long *journal_devnum,
-			  unsigned long *n_blocks_count, int is_remount)
+			  ext3_fsblk_t *n_blocks_count, int is_remount)
 {
 	struct ext3_sb_info *sbi = EXT3_SB(sb);
 	char * p;
@@ -1128,7 +1129,7 @@ static int ext3_setup_super(struct super_block *sb, struct ext3_super_block *es,
 static int ext3_check_descriptors (struct super_block * sb)
 {
 	struct ext3_sb_info *sbi = EXT3_SB(sb);
-	unsigned long block = le32_to_cpu(sbi->s_es->s_first_data_block);
+	ext3_fsblk_t block = le32_to_cpu(sbi->s_es->s_first_data_block);
 	struct ext3_group_desc * gdp = NULL;
 	int desc_block = 0;
 	int i;
@@ -1315,15 +1316,14 @@ static loff_t ext3_max_size(int bits)
 	return res;
 }
 
-static unsigned long descriptor_loc(struct super_block *sb,
-				    unsigned long logic_sb_block,
+static ext3_fsblk_t descriptor_loc(struct super_block *sb,
+				    ext3_fsblk_t logic_sb_block,
 				    int nr)
 {
 	struct ext3_sb_info *sbi = EXT3_SB(sb);
-	unsigned long bg, first_data_block, first_meta_bg;
+	unsigned long bg, first_meta_bg;
 	int has_super = 0;
 
-	first_data_block = le32_to_cpu(sbi->s_es->s_first_data_block);
 	first_meta_bg = le32_to_cpu(sbi->s_es->s_first_meta_bg);
 
 	if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_META_BG) ||
@@ -1332,7 +1332,7 @@ static unsigned long descriptor_loc(struct super_block *sb,
 	bg = sbi->s_desc_per_block * nr;
 	if (ext3_bg_has_super(sb, bg))
 		has_super = 1;
-	return (first_data_block + has_super + (bg * sbi->s_blocks_per_group));
+	return (has_super + ext3_group_first_block_no(sb, bg));
 }
 
 
@@ -1341,9 +1341,9 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 	struct buffer_head * bh;
 	struct ext3_super_block *es = NULL;
 	struct ext3_sb_info *sbi;
-	unsigned long block;
-	unsigned long sb_block = get_sb_block(&data);
-	unsigned long logic_sb_block;
+	ext3_fsblk_t block;
+	ext3_fsblk_t sb_block = get_sb_block(&data);
+	ext3_fsblk_t logic_sb_block;
 	unsigned long offset = 0;
 	unsigned long journal_inum = 0;
 	unsigned long journal_devnum = 0;
@@ -1840,10 +1840,10 @@ static journal_t *ext3_get_dev_journal(struct super_block *sb,
 {
 	struct buffer_head * bh;
 	journal_t *journal;
-	int start;
+	ext3_fsblk_t start;
 	ext3_fsblk_t len;
 	int hblock, blocksize;
-	unsigned long sb_block;
+	ext3_fsblk_t sb_block;
 	unsigned long offset;
 	struct ext3_super_block * es;
 	struct block_device *bdev;
@@ -2216,7 +2216,7 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
 {
 	struct ext3_super_block * es;
 	struct ext3_sb_info *sbi = EXT3_SB(sb);
-	unsigned long n_blocks_count = 0;
+	ext3_fsblk_t n_blocks_count = 0;
 	unsigned long old_sb_flags;
 	struct ext3_mount_options old_opts;
 	int err;
@@ -2336,7 +2336,7 @@ static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf)
 	struct super_block *sb = dentry->d_sb;
 	struct ext3_sb_info *sbi = EXT3_SB(sb);
 	struct ext3_super_block *es = sbi->s_es;
-	unsigned long overhead;
+	ext3_fsblk_t overhead;
 	int i;
 
 	if (test_opt (sb, MINIX_DF))
diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c
index 1ba515de5a75..a44a0562203a 100644
--- a/fs/ext3/xattr.c
+++ b/fs/ext3/xattr.c
@@ -233,7 +233,7 @@ ext3_xattr_block_get(struct inode *inode, int name_index, const char *name,
 		atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
 	if (ext3_xattr_check_block(bh)) {
 bad_block:	ext3_error(inode->i_sb, __FUNCTION__,
-			   "inode %ld: bad block %u", inode->i_ino,
+			   "inode %ld: bad block "E3FSBLK, inode->i_ino,
 			   EXT3_I(inode)->i_file_acl);
 		error = -EIO;
 		goto cleanup;
@@ -375,7 +375,7 @@ ext3_xattr_block_list(struct inode *inode, char *buffer, size_t buffer_size)
 		atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
 	if (ext3_xattr_check_block(bh)) {
 		ext3_error(inode->i_sb, __FUNCTION__,
-			   "inode %ld: bad block %u", inode->i_ino,
+			   "inode %ld: bad block "E3FSBLK, inode->i_ino,
 			   EXT3_I(inode)->i_file_acl);
 		error = -EIO;
 		goto cleanup;
@@ -647,7 +647,7 @@ ext3_xattr_block_find(struct inode *inode, struct ext3_xattr_info *i,
 			le32_to_cpu(BHDR(bs->bh)->h_refcount));
 		if (ext3_xattr_check_block(bs->bh)) {
 			ext3_error(sb, __FUNCTION__,
-				"inode %ld: bad block %u", inode->i_ino,
+				"inode %ld: bad block "E3FSBLK, inode->i_ino,
 				EXT3_I(inode)->i_file_acl);
 			error = -EIO;
 			goto cleanup;
@@ -848,7 +848,7 @@ cleanup_dquot:
 
 bad_block:
 	ext3_error(inode->i_sb, __FUNCTION__,
-		   "inode %ld: bad block %u", inode->i_ino,
+		   "inode %ld: bad block "E3FSBLK, inode->i_ino,
 		   EXT3_I(inode)->i_file_acl);
 	goto cleanup;
 
@@ -1077,14 +1077,14 @@ ext3_xattr_delete_inode(handle_t *handle, struct inode *inode)
 	bh = sb_bread(inode->i_sb, EXT3_I(inode)->i_file_acl);
 	if (!bh) {
 		ext3_error(inode->i_sb, __FUNCTION__,
-			"inode %ld: block %u read error", inode->i_ino,
+			"inode %ld: block "E3FSBLK" read error", inode->i_ino,
 			EXT3_I(inode)->i_file_acl);
 		goto cleanup;
 	}
 	if (BHDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) ||
 	    BHDR(bh)->h_blocks != cpu_to_le32(1)) {
 		ext3_error(inode->i_sb, __FUNCTION__,
-			"inode %ld: bad block %u", inode->i_ino,
+			"inode %ld: bad block "E3FSBLK, inode->i_ino,
 			EXT3_I(inode)->i_file_acl);
 		goto cleanup;
 	}
diff --git a/include/linux/ext3_fs.h b/include/linux/ext3_fs.h
index 34136ff02aca..5607e6457a65 100644
--- a/include/linux/ext3_fs.h
+++ b/include/linux/ext3_fs.h
@@ -710,6 +710,14 @@ struct dir_private_info {
 	__u32		next_hash;
 };
 
+/* calculate the first block number of the group */
+static inline ext3_fsblk_t
+ext3_group_first_block_no(struct super_block *sb, unsigned long group_no)
+{
+	return group_no * (ext3_fsblk_t)EXT3_BLOCKS_PER_GROUP(sb) +
+		le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block);
+}
+
 /*
  * Special error return code only used by dx_probe() and its callers.
  */
@@ -739,7 +747,7 @@ extern void ext3_free_blocks (handle_t *handle, struct inode *inode,
 extern void ext3_free_blocks_sb (handle_t *handle, struct super_block *sb,
 				 ext3_fsblk_t block, unsigned long count,
 				unsigned long *pdquot_freed_blocks);
-extern unsigned long ext3_count_free_blocks (struct super_block *);
+extern ext3_fsblk_t ext3_count_free_blocks (struct super_block *);
 extern void ext3_check_blocks_bitmap (struct super_block *);
 extern struct ext3_group_desc * ext3_get_group_desc(struct super_block * sb,
 						    unsigned int block_group,
@@ -811,7 +819,7 @@ extern int ext3_group_add(struct super_block *sb,
 				struct ext3_new_group_data *input);
 extern int ext3_group_extend(struct super_block *sb,
 				struct ext3_super_block *es,
-				unsigned long n_blocks_count);
+				ext3_fsblk_t n_blocks_count);
 
 /* super.c */
 extern void ext3_error (struct super_block *, const char *, const char *, ...)
diff --git a/include/linux/ext3_fs_i.h b/include/linux/ext3_fs_i.h
index e1c7509c0c9f..2f18b9511f21 100644
--- a/include/linux/ext3_fs_i.h
+++ b/include/linux/ext3_fs_i.h
@@ -30,8 +30,8 @@ typedef unsigned long ext3_fsblk_t;
 #define E3FSBLK "%lu"
 
 struct ext3_reserve_window {
-	__u32			_rsv_start;	/* First byte reserved */
-	__u32			_rsv_end;	/* Last byte reserved or 0 */
+	ext3_fsblk_t	_rsv_start;	/* First byte reserved */
+	ext3_fsblk_t	_rsv_end;	/* Last byte reserved or 0 */
 };
 
 struct ext3_reserve_window_node {
@@ -58,7 +58,7 @@ struct ext3_block_alloc_info {
 	 * allocated to this file.  This give us the goal (target) for the next
 	 * allocation when we detect linearly ascending requests.
 	 */
-	__u32                   last_alloc_physical_block;
+	ext3_fsblk_t		last_alloc_physical_block;
 };
 
 #define rsv_start rsv_window._rsv_start
@@ -75,7 +75,7 @@ struct ext3_inode_info {
 	__u8	i_frag_no;
 	__u8	i_frag_size;
 #endif
-	__u32	i_file_acl;
+	ext3_fsblk_t	i_file_acl;
 	__u32	i_dir_acl;
 	__u32	i_dtime;
 
-- 
cgit v1.2.3


From 9de9adb615bddbdb786273c41ec3c03837e32fa5 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 25 Jun 2006 05:48:10 -0700
Subject: [PATCH] for_each_cpu_mask() warning fix

On UP, this:

       cpumask_t mask = node_to_cpumask(numa_node_id());

       for_each_cpu_mask(cpu, mask)

does this:

mm/readahead.c: In function `node_readahead_aging':
mm/readahead.c:850: warning: unused variable `mask'

which is unpleasantly fixed by this:

Acked-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/cpumask.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index fb5b761e3444..b268a3c0c376 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -317,7 +317,8 @@ static inline void __cpus_remap(cpumask_t *dstp, const cpumask_t *srcp,
 		(cpu) < NR_CPUS;		\
 		(cpu) = next_cpu((cpu), (mask)))
 #else /* NR_CPUS == 1 */
-#define for_each_cpu_mask(cpu, mask) for ((cpu) = 0; (cpu) < 1; (cpu)++)
+#define for_each_cpu_mask(cpu, mask)		\
+	for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask)
 #endif /* NR_CPUS */
 
 /*
-- 
cgit v1.2.3


From 4ad3bcf3146aa12f41262bb5dd1d9f1778e085b1 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Sun, 25 Jun 2006 05:48:13 -0700
Subject: [PATCH] nbd: endian annotations

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Paul Clements <Paul.Clements@steeleye.com>
Cc: Jens Axboe <axboe@suse.de>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/nbd.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/nbd.h b/include/linux/nbd.h
index 1d7cdd20b553..e712e7d47cc2 100644
--- a/include/linux/nbd.h
+++ b/include/linux/nbd.h
@@ -77,11 +77,11 @@ struct nbd_device {
  * server. All data are in network byte order.
  */
 struct nbd_request {
-	__u32 magic;
-	__u32 type;	/* == READ || == WRITE 	*/
+	__be32 magic;
+	__be32 type;	/* == READ || == WRITE 	*/
 	char handle[8];
-	__u64 from;
-	__u32 len;
+	__be64 from;
+	__be32 len;
 }
 #ifdef __GNUC__
 	__attribute__ ((packed))
@@ -93,8 +93,8 @@ struct nbd_request {
  * it has completed an I/O request (or an error occurs).
  */
 struct nbd_reply {
-	__u32 magic;
-	__u32 error;		/* 0 = ok, else error	*/
+	__be32 magic;
+	__be32 error;		/* 0 = ok, else error	*/
 	char handle[8];		/* handle you got from request	*/
 };
 #endif
-- 
cgit v1.2.3


From 3419b23a919698f75944d3e0d97eb1d9c51e4bb6 Mon Sep 17 00:00:00 2001
From: Davide Libenzi <davidel@xmailserver.org>
Date: Sun, 25 Jun 2006 05:48:14 -0700
Subject: [PATCH] epoll: use unlocked wqueue operations

A few days ago Arjan signaled a lockdep red flag on epoll locks, and
precisely between the epoll's device structure lock (->lock) and the wait
queue head lock (->lock).

Like I explained in another email, and directly to Arjan, this can't happen
in reality because of the explicit check at eventpoll.c:592, that does not
allow to drop an epoll fd inside the same epoll fd.  Since lockdep is
working on per-structure locks, it will never be able to know of policies
enforced in other parts of the code.

It was decided time ago of having the ability to drop epoll fds inside
other epoll fds, that triggers a very trick wakeup operations (due to
possibly reentrant callback-driven wakeups) handled by the
ep_poll_safewake() function.  While looking again at the code though, I
noticed that all the operations done on the epoll's main structure wait
queue head (->wq) are already protected by the epoll lock (->lock), so that
locked-style functions can be used to manipulate the ->wq member.  This
makes both a lock-acquire save, and lockdep happy.

Running totalmess on my dual opteron for a while did not reveal any problem
so far:

http://www.xmailserver.org/totalmess.c

Signed-off-by: Davide Libenzi <davidel@xmailserver.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/eventpoll.c            | 17 ++++++++++-------
 include/linux/eventpoll.h |  2 +-
 2 files changed, 11 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 08e7e6a555ca..9c677bbd0b08 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1,6 +1,6 @@
 /*
  *  fs/eventpoll.c ( Efficent event polling implementation )
- *  Copyright (C) 2001,...,2003	 Davide Libenzi
+ *  Copyright (C) 2001,...,2006	 Davide Libenzi
  *
  *  This program is free software; you can redistribute it and/or modify
  *  it under the terms of the GNU General Public License as published by
@@ -1004,7 +1004,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
 
 		/* Notify waiting tasks that events are available */
 		if (waitqueue_active(&ep->wq))
-			wake_up(&ep->wq);
+			__wake_up_locked(&ep->wq, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE);
 		if (waitqueue_active(&ep->poll_wait))
 			pwake++;
 	}
@@ -1083,7 +1083,8 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
 
 				/* Notify waiting tasks that events are available */
 				if (waitqueue_active(&ep->wq))
-					wake_up(&ep->wq);
+					__wake_up_locked(&ep->wq, TASK_UNINTERRUPTIBLE |
+							 TASK_INTERRUPTIBLE);
 				if (waitqueue_active(&ep->poll_wait))
 					pwake++;
 			}
@@ -1260,7 +1261,8 @@ is_linked:
 	 * wait list.
 	 */
 	if (waitqueue_active(&ep->wq))
-		wake_up(&ep->wq);
+		__wake_up_locked(&ep->wq, TASK_UNINTERRUPTIBLE |
+				 TASK_INTERRUPTIBLE);
 	if (waitqueue_active(&ep->poll_wait))
 		pwake++;
 
@@ -1444,7 +1446,8 @@ static void ep_reinject_items(struct eventpoll *ep, struct list_head *txlist)
 		 * wait list.
 		 */
 		if (waitqueue_active(&ep->wq))
-			wake_up(&ep->wq);
+			__wake_up_locked(&ep->wq, TASK_UNINTERRUPTIBLE |
+					 TASK_INTERRUPTIBLE);
 		if (waitqueue_active(&ep->poll_wait))
 			pwake++;
 	}
@@ -1516,7 +1519,7 @@ retry:
 		 * ep_poll_callback() when events will become available.
 		 */
 		init_waitqueue_entry(&wait, current);
-		add_wait_queue(&ep->wq, &wait);
+		__add_wait_queue(&ep->wq, &wait);
 
 		for (;;) {
 			/*
@@ -1536,7 +1539,7 @@ retry:
 			jtimeout = schedule_timeout(jtimeout);
 			write_lock_irqsave(&ep->lock, flags);
 		}
-		remove_wait_queue(&ep->wq, &wait);
+		__remove_wait_queue(&ep->wq, &wait);
 
 		set_current_state(TASK_RUNNING);
 	}
diff --git a/include/linux/eventpoll.h b/include/linux/eventpoll.h
index 1e4bdfcf83a2..84cfa8bbdc36 100644
--- a/include/linux/eventpoll.h
+++ b/include/linux/eventpoll.h
@@ -1,6 +1,6 @@
 /*
  *  include/linux/eventpoll.h ( Efficent event polling implementation )
- *  Copyright (C) 2001,...,2003	 Davide Libenzi
+ *  Copyright (C) 2001,...,2006	 Davide Libenzi
  *
  *  This program is free software; you can redistribute it and/or modify
  *  it under the terms of the GNU General Public License as published by
-- 
cgit v1.2.3


From 655066c3835e7b51794c4d56f042eb78b5a79f53 Mon Sep 17 00:00:00 2001
From: Atsushi Nemoto <anemo@mba.ocn.ne.jp>
Date: Sun, 25 Jun 2006 05:48:17 -0700
Subject: [PATCH] RTC: rtc-dev UIE emulation

Import genrtc's RTC UIE emulation (CONFIG_GEN_RTC_X) to rtc-dev driver with
slight adjustments/refinements.  This makes UIE-less rtc drivers work
better with programs doing read/poll on /dev/rtc, such as hwclock.  This
emulation should not harm rtc drivers with UIE support, since
rtc_dev_ioctl() calls underlaying rtc driver's ioctl() first.

Signed-off-by: Atsushi Nemoto <anemo@mba.ocn.ne.jp>
Acked-by: Alessandro Zummo <a.zummo@towertech.it>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/rtc/Kconfig   |   7 ++++
 drivers/rtc/rtc-dev.c | 102 ++++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/rtc.h   |  10 +++++
 3 files changed, 119 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/rtc/Kconfig b/drivers/rtc/Kconfig
index 9e249650baf0..725d6b696792 100644
--- a/drivers/rtc/Kconfig
+++ b/drivers/rtc/Kconfig
@@ -73,6 +73,13 @@ config RTC_INTF_DEV
 	  This driver can also be built as a module. If so, the module
 	  will be called rtc-dev.
 
+config RTC_INTF_DEV_UIE_EMUL
+	bool "RTC UIE emulation on dev interface"
+	depends on RTC_INTF_DEV
+	help
+	  Provides an emulation for RTC_UIE if the underlaying rtc chip
+	  driver did not provide RTC_UIE ioctls.
+
 comment "RTC drivers"
 	depends on RTC_CLASS
 
diff --git a/drivers/rtc/rtc-dev.c b/drivers/rtc/rtc-dev.c
index 2011567005f9..07387c99df0d 100644
--- a/drivers/rtc/rtc-dev.c
+++ b/drivers/rtc/rtc-dev.c
@@ -48,6 +48,93 @@ static int rtc_dev_open(struct inode *inode, struct file *file)
 	return err;
 }
 
+#ifdef CONFIG_RTC_INTF_DEV_UIE_EMUL
+/*
+ * Routine to poll RTC seconds field for change as often as possible,
+ * after first RTC_UIE use timer to reduce polling
+ */
+static void rtc_uie_task(void *data)
+{
+	struct rtc_device *rtc = data;
+	struct rtc_time tm;
+	int num = 0;
+	int err;
+
+	err = rtc_read_time(&rtc->class_dev, &tm);
+	spin_lock_irq(&rtc->irq_lock);
+	if (rtc->stop_uie_polling || err) {
+		rtc->uie_task_active = 0;
+	} else if (rtc->oldsecs != tm.tm_sec) {
+		num = (tm.tm_sec + 60 - rtc->oldsecs) % 60;
+		rtc->oldsecs = tm.tm_sec;
+		rtc->uie_timer.expires = jiffies + HZ - (HZ/10);
+		rtc->uie_timer_active = 1;
+		rtc->uie_task_active = 0;
+		add_timer(&rtc->uie_timer);
+	} else if (schedule_work(&rtc->uie_task) == 0) {
+		rtc->uie_task_active = 0;
+	}
+	spin_unlock_irq(&rtc->irq_lock);
+	if (num)
+		rtc_update_irq(&rtc->class_dev, num, RTC_UF | RTC_IRQF);
+}
+
+static void rtc_uie_timer(unsigned long data)
+{
+	struct rtc_device *rtc = (struct rtc_device *)data;
+	unsigned long flags;
+
+	spin_lock_irqsave(&rtc->irq_lock, flags);
+	rtc->uie_timer_active = 0;
+	rtc->uie_task_active = 1;
+	if ((schedule_work(&rtc->uie_task) == 0))
+		rtc->uie_task_active = 0;
+	spin_unlock_irqrestore(&rtc->irq_lock, flags);
+}
+
+static void clear_uie(struct rtc_device *rtc)
+{
+	spin_lock_irq(&rtc->irq_lock);
+	if (rtc->irq_active) {
+		rtc->stop_uie_polling = 1;
+		if (rtc->uie_timer_active) {
+			spin_unlock_irq(&rtc->irq_lock);
+			del_timer_sync(&rtc->uie_timer);
+			spin_lock_irq(&rtc->irq_lock);
+			rtc->uie_timer_active = 0;
+		}
+		if (rtc->uie_task_active) {
+			spin_unlock_irq(&rtc->irq_lock);
+			flush_scheduled_work();
+			spin_lock_irq(&rtc->irq_lock);
+		}
+		rtc->irq_active = 0;
+	}
+	spin_unlock_irq(&rtc->irq_lock);
+}
+
+static int set_uie(struct rtc_device *rtc)
+{
+	struct rtc_time tm;
+	int err;
+
+	err = rtc_read_time(&rtc->class_dev, &tm);
+	if (err)
+		return err;
+	spin_lock_irq(&rtc->irq_lock);
+	if (!rtc->irq_active) {
+		rtc->irq_active = 1;
+		rtc->stop_uie_polling = 0;
+		rtc->oldsecs = tm.tm_sec;
+		rtc->uie_task_active = 1;
+		if (schedule_work(&rtc->uie_task) == 0)
+			rtc->uie_task_active = 0;
+	}
+	rtc->irq_data = 0;
+	spin_unlock_irq(&rtc->irq_lock);
+	return 0;
+}
+#endif /* CONFIG_RTC_INTF_DEV_UIE_EMUL */
 
 static ssize_t
 rtc_dev_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
@@ -232,6 +319,14 @@ static int rtc_dev_ioctl(struct inode *inode, struct file *file,
 			return -EFAULT;
 		break;
 
+#ifdef CONFIG_RTC_INTF_DEV_UIE_EMUL
+	case RTC_UIE_OFF:
+		clear_uie(rtc);
+		return 0;
+
+	case RTC_UIE_ON:
+		return set_uie(rtc);
+#endif
 	default:
 		err = -ENOTTY;
 		break;
@@ -244,6 +339,9 @@ static int rtc_dev_release(struct inode *inode, struct file *file)
 {
 	struct rtc_device *rtc = to_rtc_device(file->private_data);
 
+#ifdef CONFIG_RTC_INTF_DEV_UIE_EMUL
+	clear_uie(rtc);
+#endif
 	if (rtc->ops->release)
 		rtc->ops->release(rtc->class_dev.dev);
 
@@ -284,6 +382,10 @@ static int rtc_dev_add_device(struct class_device *class_dev,
 	mutex_init(&rtc->char_lock);
 	spin_lock_init(&rtc->irq_lock);
 	init_waitqueue_head(&rtc->irq_queue);
+#ifdef CONFIG_RTC_INTF_DEV_UIE_EMUL
+	INIT_WORK(&rtc->uie_task, rtc_uie_task, rtc);
+	setup_timer(&rtc->uie_timer, rtc_uie_timer, (unsigned long)rtc);
+#endif
 
 	cdev_init(&rtc->char_dev, &rtc_dev_fops);
 	rtc->char_dev.owner = rtc->owner;
diff --git a/include/linux/rtc.h b/include/linux/rtc.h
index ab61cd1199f2..43310760fe73 100644
--- a/include/linux/rtc.h
+++ b/include/linux/rtc.h
@@ -155,6 +155,16 @@ struct rtc_device
 	struct rtc_task *irq_task;
 	spinlock_t irq_task_lock;
 	int irq_freq;
+#ifdef CONFIG_RTC_INTF_DEV_UIE_EMUL
+	struct work_struct uie_task;
+	struct timer_list uie_timer;
+	/* Those fields are protected by rtc->irq_lock */
+	unsigned int oldsecs;
+	unsigned int irq_active:1;
+	unsigned int stop_uie_polling:1;
+	unsigned int uie_task_active:1;
+	unsigned int uie_timer_active:1;
+#endif
 };
 #define to_rtc_device(d) container_of(d, struct rtc_device, class_dev)
 
-- 
cgit v1.2.3


From 110d693d5898649da606cd6e5f6af4d7f70a405f Mon Sep 17 00:00:00 2001
From: Alessandro Zummo <alessandro.zummo@towertech.it>
Date: Sun, 25 Jun 2006 05:48:20 -0700
Subject: [PATCH] rtc subsystem: add capability checks

Centralize CAP_SYS_XXX checks to avoid duplicate code and missing checks in
the drivers.

Signed-off-by: Alessandro Zummo <a.zummo@towertech.it>
Cc: Richard Purdie <rpurdie@rpsys.net>
Cc: Yoichi Yuasa <yoichi_yuasa@tripeaks.co.jp>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/rtc/class.c      |  1 +
 drivers/rtc/rtc-dev.c    | 29 ++++++++++++++++++++++-------
 drivers/rtc/rtc-sa1100.c |  4 ----
 drivers/rtc/rtc-vr41xx.c |  8 --------
 include/linux/rtc.h      |  1 +
 5 files changed, 24 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/rtc/class.c b/drivers/rtc/class.c
index 413c7d54ea10..5396beec30d0 100644
--- a/drivers/rtc/class.c
+++ b/drivers/rtc/class.c
@@ -69,6 +69,7 @@ struct rtc_device *rtc_device_register(const char *name, struct device *dev,
 	rtc->id = id;
 	rtc->ops = ops;
 	rtc->owner = owner;
+	rtc->max_user_freq = 64;
 	rtc->class_dev.dev = dev;
 	rtc->class_dev.class = rtc_class;
 	rtc->class_dev.release = rtc_device_release;
diff --git a/drivers/rtc/rtc-dev.c b/drivers/rtc/rtc-dev.c
index 07387c99df0d..61a58259c93f 100644
--- a/drivers/rtc/rtc-dev.c
+++ b/drivers/rtc/rtc-dev.c
@@ -214,6 +214,28 @@ static int rtc_dev_ioctl(struct inode *inode, struct file *file,
 	struct rtc_wkalrm alarm;
 	void __user *uarg = (void __user *) arg;
 
+	/* check that the calles has appropriate permissions
+	 * for certain ioctls. doing this check here is useful
+	 * to avoid duplicate code in each driver.
+	 */
+	switch (cmd) {
+	case RTC_EPOCH_SET:
+	case RTC_SET_TIME:
+		if (!capable(CAP_SYS_TIME))
+			return -EACCES;
+		break;
+
+	case RTC_IRQP_SET:
+		if (arg > rtc->max_user_freq && !capable(CAP_SYS_RESOURCE))
+			return -EACCES;
+		break;
+
+	case RTC_PIE_ON:
+		if (!capable(CAP_SYS_RESOURCE))
+			return -EACCES;
+		break;
+	}
+
 	/* avoid conflicting IRQ users */
 	if (cmd == RTC_PIE_ON || cmd == RTC_PIE_OFF || cmd == RTC_IRQP_SET) {
 		spin_lock(&rtc->irq_task_lock);
@@ -272,9 +294,6 @@ static int rtc_dev_ioctl(struct inode *inode, struct file *file,
 		break;
 
 	case RTC_SET_TIME:
-		if (!capable(CAP_SYS_TIME))
-			return -EACCES;
-
 		if (copy_from_user(&tm, uarg, sizeof(tm)))
 			return -EFAULT;
 
@@ -290,10 +309,6 @@ static int rtc_dev_ioctl(struct inode *inode, struct file *file,
 			err = -EINVAL;
 			break;
 		}
-		if (!capable(CAP_SYS_TIME)) {
-			err = -EACCES;
-			break;
-		}
 		rtc_epoch = arg;
 		err = 0;
 #endif
diff --git a/drivers/rtc/rtc-sa1100.c b/drivers/rtc/rtc-sa1100.c
index a997529f8926..ab486fbc828d 100644
--- a/drivers/rtc/rtc-sa1100.c
+++ b/drivers/rtc/rtc-sa1100.c
@@ -229,8 +229,6 @@ static int sa1100_rtc_ioctl(struct device *dev, unsigned int cmd,
 		spin_unlock_irq(&sa1100_rtc_lock);
 		return 0;
 	case RTC_PIE_ON:
-		if ((rtc_freq > 64) && !capable(CAP_SYS_RESOURCE))
-			return -EACCES;
 		spin_lock_irq(&sa1100_rtc_lock);
 		OSMR1 = TIMER_FREQ/rtc_freq + OSCR;
 		OIER |= OIER_E1;
@@ -242,8 +240,6 @@ static int sa1100_rtc_ioctl(struct device *dev, unsigned int cmd,
 	case RTC_IRQP_SET:
 		if (arg < 1 || arg > TIMER_FREQ)
 			return -EINVAL;
-		if ((arg > 64) && (!capable(CAP_SYS_RESOURCE)))
-			return -EACCES;
 		rtc_freq = arg;
 		return 0;
 	}
diff --git a/drivers/rtc/rtc-vr41xx.c b/drivers/rtc/rtc-vr41xx.c
index 277596c302e3..33e029207e26 100644
--- a/drivers/rtc/rtc-vr41xx.c
+++ b/drivers/rtc/rtc-vr41xx.c
@@ -81,7 +81,6 @@ MODULE_LICENSE("GPL");
 
 #define RTC_FREQUENCY		32768
 #define MAX_PERIODIC_RATE	6553
-#define MAX_USER_PERIODIC_RATE	64
 
 static void __iomem *rtc1_base;
 static void __iomem *rtc2_base;
@@ -240,9 +239,6 @@ static int vr41xx_rtc_ioctl(struct device *dev, unsigned int cmd, unsigned long
 		if (arg > MAX_PERIODIC_RATE)
 			return -EINVAL;
 
-		if (arg > MAX_USER_PERIODIC_RATE && capable(CAP_SYS_RESOURCE) == 0)
-			return -EACCES;
-
 		periodic_frequency = arg;
 
 		count = RTC_FREQUENCY;
@@ -263,10 +259,6 @@ static int vr41xx_rtc_ioctl(struct device *dev, unsigned int cmd, unsigned long
 		/* Doesn't support before 1900 */
 		if (arg < 1900)
 			return -EINVAL;
-
-		if (capable(CAP_SYS_TIME) == 0)
-			return -EACCES;
-
 		epoch = arg;
 		break;
 	default:
diff --git a/include/linux/rtc.h b/include/linux/rtc.h
index 43310760fe73..c12cbc1b83c5 100644
--- a/include/linux/rtc.h
+++ b/include/linux/rtc.h
@@ -155,6 +155,7 @@ struct rtc_device
 	struct rtc_task *irq_task;
 	spinlock_t irq_task_lock;
 	int irq_freq;
+	int max_user_freq;
 #ifdef CONFIG_RTC_INTF_DEV_UIE_EMUL
 	struct work_struct uie_task;
 	struct timer_list uie_timer;
-- 
cgit v1.2.3


From 362600fe60fd18a25b4de8ec544b9e24e77e1484 Mon Sep 17 00:00:00 2001
From: Raphael Assenat <raph@raphnet.net>
Date: Sun, 25 Jun 2006 05:48:24 -0700
Subject: [PATCH] Add v3020 RTC support

This patch adds support for the v3020 RTC from EM Microelectronic.

The v3020 RTC is designed to be connected on a bus using only one data bit.
 Since any data bit may be used, it is necessary to specify this to the
driver by passing a struct v3020_platform_data pointer (see
include/linux/rtc-v3020.h) to the driver.

Part of the following code comes from the kernel patchs produced by
Compulab for their products.  The original file (available here:
http://raph.people.8d.com/misc/emv3020.c) was released under the terms of
the GPL license.

[akpm@osdl.org: cleanups]
Signed-off-by: Raphael Assenat <raph@raphnet.net>
Cc: Alessandro Zummo <a.zummo@towertech.it>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/rtc/Kconfig       |  10 ++
 drivers/rtc/Makefile      |   1 +
 drivers/rtc/rtc-v3020.c   | 264 ++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/rtc-v3020.h |  35 ++++++
 4 files changed, 310 insertions(+)
 create mode 100644 drivers/rtc/rtc-v3020.c
 create mode 100644 include/linux/rtc-v3020.h

(limited to 'include/linux')

diff --git a/drivers/rtc/Kconfig b/drivers/rtc/Kconfig
index 94746cfac7b5..8534012ebdef 100644
--- a/drivers/rtc/Kconfig
+++ b/drivers/rtc/Kconfig
@@ -227,4 +227,14 @@ config RTC_DRV_MAX6902
 	  This driver can also be built as a module. If so, the module
 	  will be called rtc-max6902.
 
+config RTC_DRV_V3020
+	tristate "EM Microelectronic V3020"
+	depends on RTC_CLASS
+	help
+	  If you say yes here you will get support for the
+	  EM Microelectronic v3020 RTC chip.
+
+	  This driver can also be built as a module. If so, the module
+	  will be called rtc-v3020.
+
 endmenu
diff --git a/drivers/rtc/Makefile b/drivers/rtc/Makefile
index dd480d6bcdf8..cbb8e8a7f620 100644
--- a/drivers/rtc/Makefile
+++ b/drivers/rtc/Makefile
@@ -24,3 +24,4 @@ obj-$(CONFIG_RTC_DRV_SA1100)	+= rtc-sa1100.o
 obj-$(CONFIG_RTC_DRV_VR41XX)	+= rtc-vr41xx.o
 obj-$(CONFIG_RTC_DRV_PL031)	+= rtc-pl031.o
 obj-$(CONFIG_RTC_DRV_MAX6902)	+= rtc-max6902.o
+obj-$(CONFIG_RTC_DRV_V3020)	+= rtc-v3020.o
diff --git a/drivers/rtc/rtc-v3020.c b/drivers/rtc/rtc-v3020.c
new file mode 100644
index 000000000000..a40f400acff6
--- /dev/null
+++ b/drivers/rtc/rtc-v3020.c
@@ -0,0 +1,264 @@
+/* drivers/rtc/rtc-v3020.c
+ *
+ * Copyright (C) 2006 8D Technologies inc.
+ * Copyright (C) 2004 Compulab Ltd.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Driver for the V3020 RTC
+ *
+ * Changelog:
+ *
+ *  10-May-2006: Raphael Assenat <raph@8d.com>
+ *				- Converted to platform driver
+ *				- Use the generic rtc class
+ *
+ *  ??-???-2004: Someone at Compulab
+ *  			- Initial driver creation.
+ *
+ */
+#include <linux/platform_device.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/rtc.h>
+#include <linux/types.h>
+#include <linux/bcd.h>
+#include <linux/rtc-v3020.h>
+
+#include <asm/io.h>
+
+#undef DEBUG
+
+struct v3020 {
+	void __iomem *ioaddress;
+	int leftshift;
+	struct rtc_device *rtc;
+};
+
+static void v3020_set_reg(struct v3020 *chip, unsigned char address,
+			unsigned char data)
+{
+	int i;
+	unsigned char tmp;
+
+	tmp = address;
+	for (i = 0; i < 4; i++) {
+		writel((tmp & 1) << chip->leftshift, chip->ioaddress);
+		tmp >>= 1;
+	}
+
+	/* Commands dont have data */
+	if (!V3020_IS_COMMAND(address)) {
+		for (i = 0; i < 8; i++) {
+			writel((data & 1) << chip->leftshift, chip->ioaddress);
+			data >>= 1;
+		}
+	}
+}
+
+static unsigned char v3020_get_reg(struct v3020 *chip, unsigned char address)
+{
+	unsigned int data=0;
+	int i;
+
+	for (i = 0; i < 4; i++) {
+		writel((address & 1) << chip->leftshift, chip->ioaddress);
+		address >>= 1;
+	}
+
+	for (i = 0; i < 8; i++) {
+		data >>= 1;
+		if (readl(chip->ioaddress) & (1 << chip->leftshift))
+			data |= 0x80;
+	}
+
+	return data;
+}
+
+static int v3020_read_time(struct device *dev, struct rtc_time *dt)
+{
+	struct v3020 *chip = dev_get_drvdata(dev);
+	int tmp;
+
+	/* Copy the current time to ram... */
+	v3020_set_reg(chip, V3020_CMD_CLOCK2RAM, 0);
+
+	/* ...and then read constant values. */
+	tmp = v3020_get_reg(chip, V3020_SECONDS);
+	dt->tm_sec	= BCD2BIN(tmp);
+	tmp = v3020_get_reg(chip, V3020_MINUTES);
+	dt->tm_min	= BCD2BIN(tmp);
+	tmp = v3020_get_reg(chip, V3020_HOURS);
+	dt->tm_hour	= BCD2BIN(tmp);
+	tmp = v3020_get_reg(chip, V3020_MONTH_DAY);
+	dt->tm_mday	= BCD2BIN(tmp);
+	tmp = v3020_get_reg(chip, V3020_MONTH);
+	dt->tm_mon	= BCD2BIN(tmp);
+	tmp = v3020_get_reg(chip, V3020_WEEK_DAY);
+	dt->tm_wday	= BCD2BIN(tmp);
+	tmp = v3020_get_reg(chip, V3020_YEAR);
+	dt->tm_year = BCD2BIN(tmp)+100;
+
+#ifdef DEBUG
+	printk("\n%s : Read RTC values\n",__FUNCTION__);
+	printk("tm_hour: %i\n",dt->tm_hour);
+	printk("tm_min : %i\n",dt->tm_min);
+	printk("tm_sec : %i\n",dt->tm_sec);
+	printk("tm_year: %i\n",dt->tm_year);
+	printk("tm_mon : %i\n",dt->tm_mon);
+	printk("tm_mday: %i\n",dt->tm_mday);
+	printk("tm_wday: %i\n",dt->tm_wday);
+#endif
+
+	return 0;
+}
+
+
+static int v3020_set_time(struct device *dev, struct rtc_time *dt)
+{
+	struct v3020 *chip = dev_get_drvdata(dev);
+
+#ifdef DEBUG
+	printk("\n%s : Setting RTC values\n",__FUNCTION__);
+	printk("tm_sec : %i\n",dt->tm_sec);
+	printk("tm_min : %i\n",dt->tm_min);
+	printk("tm_hour: %i\n",dt->tm_hour);
+	printk("tm_mday: %i\n",dt->tm_mday);
+	printk("tm_wday: %i\n",dt->tm_wday);
+	printk("tm_year: %i\n",dt->tm_year);
+#endif
+
+	/* Write all the values to ram... */
+	v3020_set_reg(chip, V3020_SECONDS, 	BIN2BCD(dt->tm_sec));
+	v3020_set_reg(chip, V3020_MINUTES, 	BIN2BCD(dt->tm_min));
+	v3020_set_reg(chip, V3020_HOURS, 	BIN2BCD(dt->tm_hour));
+	v3020_set_reg(chip, V3020_MONTH_DAY,	BIN2BCD(dt->tm_mday));
+	v3020_set_reg(chip, V3020_MONTH, 	BIN2BCD(dt->tm_mon));
+	v3020_set_reg(chip, V3020_WEEK_DAY, 	BIN2BCD(dt->tm_wday));
+	v3020_set_reg(chip, V3020_YEAR, 	BIN2BCD(dt->tm_year % 100));
+
+	/* ...and set the clock. */
+	v3020_set_reg(chip, V3020_CMD_RAM2CLOCK, 0);
+
+	/* Compulab used this delay here. I dont know why,
+	 * the datasheet does not specify a delay. */
+	/*mdelay(5);*/
+
+	return 0;
+}
+
+static struct rtc_class_ops v3020_rtc_ops = {
+	.read_time	= v3020_read_time,
+	.set_time	= v3020_set_time,
+};
+
+static int rtc_probe(struct platform_device *pdev)
+{
+	struct v3020_platform_data *pdata = pdev->dev.platform_data;
+	struct v3020 *chip;
+	struct rtc_device *rtc;
+	int retval = -EBUSY;
+	int i;
+	int temp;
+
+	if (pdev->num_resources != 1)
+		return -EBUSY;
+
+	if (pdev->resource[0].flags != IORESOURCE_MEM)
+		return -EBUSY;
+
+	if (pdev == NULL)
+		return -EBUSY;
+
+	chip = kzalloc(sizeof *chip, GFP_KERNEL);
+	if (!chip)
+		return -ENOMEM;
+
+	chip->leftshift = pdata->leftshift;
+	chip->ioaddress = ioremap(pdev->resource[0].start, 1);
+	if (chip->ioaddress == NULL)
+		goto err_chip;
+
+	/* Make sure the v3020 expects a communication cycle
+	 * by reading 8 times */
+	for (i = 0; i < 8; i++)
+		temp = readl(chip->ioaddress);
+
+	/* Test chip by doing a write/read sequence
+	 * to the chip ram */
+	v3020_set_reg(chip, V3020_SECONDS, 0x33);
+	if(v3020_get_reg(chip, V3020_SECONDS) != 0x33) {
+		retval = -ENODEV;
+		goto err_io;
+	}
+
+	/* Make sure frequency measurment mode, test modes, and lock
+	 * are all disabled */
+	v3020_set_reg(chip, V3020_STATUS_0, 0x0);
+
+	dev_info(&pdev->dev, "Chip available at physical address 0x%p,"
+		"data connected to D%d\n",
+		(void*)pdev->resource[0].start,
+		chip->leftshift);
+
+	platform_set_drvdata(pdev, chip);
+
+	rtc = rtc_device_register("v3020",
+				&pdev->dev, &v3020_rtc_ops, THIS_MODULE);
+	if (IS_ERR(rtc)) {
+		retval = PTR_ERR(rtc);
+		goto err_io;
+	}
+	chip->rtc = rtc;
+
+	return 0;
+
+err_io:
+	iounmap(chip->ioaddress);
+err_chip:
+	kfree(chip);
+
+	return retval;
+}
+
+static int rtc_remove(struct platform_device *dev)
+{
+	struct v3020 *chip = platform_get_drvdata(dev);
+	struct rtc_device *rtc = chip->rtc;
+
+	if (rtc)
+		rtc_device_unregister(rtc);
+
+	iounmap(chip->ioaddress);
+	kfree(chip);
+
+	return 0;
+}
+
+static struct platform_driver rtc_device_driver = {
+	.probe	= rtc_probe,
+	.remove = rtc_remove,
+	.driver = {
+		.name	= "v3020",
+		.owner	= THIS_MODULE,
+	},
+};
+
+static __init int v3020_init(void)
+{
+	return platform_driver_register(&rtc_device_driver);
+}
+
+static __exit void v3020_exit(void)
+{
+	platform_driver_unregister(&rtc_device_driver);
+}
+
+module_init(v3020_init);
+module_exit(v3020_exit);
+
+MODULE_DESCRIPTION("V3020 RTC");
+MODULE_AUTHOR("Raphael Assenat");
+MODULE_LICENSE("GPL");
diff --git a/include/linux/rtc-v3020.h b/include/linux/rtc-v3020.h
new file mode 100644
index 000000000000..bf74e63c98fe
--- /dev/null
+++ b/include/linux/rtc-v3020.h
@@ -0,0 +1,35 @@
+/*
+ * v3020.h - Registers definition and platform data structure for the v3020 RTC.
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (C) 2006, 8D Technologies inc.
+ */
+#ifndef __LINUX_V3020_H
+#define __LINUX_V3020_H
+
+/* The v3020 has only one data pin but which one
+ * is used depends on the board. */
+struct v3020_platform_data {
+	int leftshift; /* (1<<(leftshift)) & readl() */
+};
+
+#define V3020_STATUS_0	0x00
+#define V3020_STATUS_1	0x01
+#define V3020_SECONDS	0x02
+#define V3020_MINUTES	0x03
+#define V3020_HOURS		0x04
+#define V3020_MONTH_DAY	0x05
+#define V3020_MONTH		0x06
+#define V3020_YEAR		0x07
+#define V3020_WEEK_DAY	0x08
+#define V3020_WEEK		0x09
+
+#define V3020_IS_COMMAND(val) ((val)>=0x0E)
+
+#define V3020_CMD_RAM2CLOCK	0x0E
+#define V3020_CMD_CLOCK2RAM	0x0F
+
+#endif /* __LINUX_V3020_H */
-- 
cgit v1.2.3


From 8232212e0b4ee4eb3e407f5a9b098f6377820164 Mon Sep 17 00:00:00 2001
From: Andrew Victor <andrew@sanpeople.com>
Date: Sun, 25 Jun 2006 05:48:25 -0700
Subject: [PATCH] RTC: Add rtc_year_days() to calculate tm_yday

RTC: Add exported function rtc_year_days() to calculate the tm_yday value.

Signed-off-by: Andrew Victor <andrew@sanpeople.com>
Signed-off-by: Alessandro Zummo <a.zummo@towertech.it>
Cc: Russell King <rmk@arm.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/rtc/rtc-lib.c | 19 +++++++++++++++++++
 include/linux/rtc.h   |  1 +
 2 files changed, 20 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/rtc/rtc-lib.c b/drivers/rtc/rtc-lib.c
index cfedc1d28ee1..9812120f3a7c 100644
--- a/drivers/rtc/rtc-lib.c
+++ b/drivers/rtc/rtc-lib.c
@@ -18,15 +18,34 @@ static const unsigned char rtc_days_in_month[] = {
 	31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31
 };
 
+static const unsigned short rtc_ydays[2][13] = {
+	/* Normal years */
+	{ 0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 365 },
+	/* Leap years */
+	{ 0, 31, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335, 366 }
+};
+
 #define LEAPS_THRU_END_OF(y) ((y)/4 - (y)/100 + (y)/400)
 #define LEAP_YEAR(year) ((!(year % 4) && (year % 100)) || !(year % 400))
 
+/*
+ * The number of days in the month.
+ */
 int rtc_month_days(unsigned int month, unsigned int year)
 {
 	return rtc_days_in_month[month] + (LEAP_YEAR(year) && month == 1);
 }
 EXPORT_SYMBOL(rtc_month_days);
 
+/*
+ * The number of days since January 1. (0 to 365)
+ */
+int rtc_year_days(unsigned int day, unsigned int month, unsigned int year)
+{
+	return rtc_ydays[LEAP_YEAR(year)][month] + day-1;
+}
+EXPORT_SYMBOL(rtc_year_days);
+
 /*
  * Convert seconds since 01-01-1970 00:00:00 to Gregorian date.
  */
diff --git a/include/linux/rtc.h b/include/linux/rtc.h
index c12cbc1b83c5..36e2bf4b4315 100644
--- a/include/linux/rtc.h
+++ b/include/linux/rtc.h
@@ -102,6 +102,7 @@ struct rtc_pll_info {
 #include <linux/interrupt.h>
 
 extern int rtc_month_days(unsigned int month, unsigned int year);
+extern int rtc_year_days(unsigned int day, unsigned int month, unsigned int year);
 extern int rtc_valid_tm(struct rtc_time *tm);
 extern int rtc_tm_to_time(struct rtc_time *tm, unsigned long *time);
 extern void rtc_time_to_tm(unsigned long time, struct rtc_time *tm);
-- 
cgit v1.2.3


From eab03ac7bd3e0da99eb9dc068772a85a5e3f3577 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@osdl.org>
Date: Sun, 25 Jun 2006 05:48:31 -0700
Subject: [PATCH] Get rid of /proc/sys/proc

The table is empty, why does it still exist?

Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/sysctl.h |  4 +---
 kernel/sysctl.c        | 11 -----------
 2 files changed, 1 insertion(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index c7132029af0f..6a60770984e9 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -55,7 +55,7 @@ enum
 	CTL_KERN=1,		/* General kernel info and control */
 	CTL_VM=2,		/* VM management */
 	CTL_NET=3,		/* Networking */
-	CTL_PROC=4,		/* Process info */
+	/* was CTL_PROC */
 	CTL_FS=5,		/* Filesystems */
 	CTL_DEBUG=6,		/* Debugging */
 	CTL_DEV=7,		/* Devices */
@@ -767,8 +767,6 @@ enum {
 	NET_BRIDGE_NF_FILTER_VLAN_TAGGED = 4,
 };
 
-/* CTL_PROC names: */
-
 /* CTL_FS names: */
 enum
 {
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index eb8bd214e7d7..2c0e65819448 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -143,7 +143,6 @@ static struct ctl_table_header root_table_header =
 
 static ctl_table kern_table[];
 static ctl_table vm_table[];
-static ctl_table proc_table[];
 static ctl_table fs_table[];
 static ctl_table debug_table[];
 static ctl_table dev_table[];
@@ -202,12 +201,6 @@ static ctl_table root_table[] = {
 		.child		= net_table,
 	},
 #endif
-	{
-		.ctl_name	= CTL_PROC,
-		.procname	= "proc",
-		.mode		= 0555,
-		.child		= proc_table,
-	},
 	{
 		.ctl_name	= CTL_FS,
 		.procname	= "fs",
@@ -927,10 +920,6 @@ static ctl_table vm_table[] = {
 	{ .ctl_name = 0 }
 };
 
-static ctl_table proc_table[] = {
-	{ .ctl_name = 0 }
-};
-
 static ctl_table fs_table[] = {
 	{
 		.ctl_name	= FS_NRINODE,
-- 
cgit v1.2.3


From 3e8c54fad89144b8d63cc41619f363df1ec7cc42 Mon Sep 17 00:00:00 2001
From: Jan Engelhardt <jengelh@linux01.gwdg.de>
Date: Sun, 25 Jun 2006 05:48:49 -0700
Subject: [PATCH] fuse: use MISC_MAJOR

The following patches add POSIX file locking to the fuse interface.

Additional changes ralated to this are:

  - asynchronous interrupt of requests by SIGKILL no longer supported

  - separate control filesystem, instead of using sysfs objects

  - add support for synchronously interrupting requests

Details are documented in Documentation/filesystems/fuse.txt throughout the
patches.

This patch:

Have fuse.h use MISC_MAJOR rather than a hardcoded '10'.

Signed-off-by: Jan Engelhardt <jengelh@gmx.de>
Signed-off-by: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/fuse.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/fuse.h b/include/linux/fuse.h
index 5425b60021e3..8e4319614bef 100644
--- a/include/linux/fuse.h
+++ b/include/linux/fuse.h
@@ -9,6 +9,7 @@
 /* This file defines the kernel interface of FUSE */
 
 #include <asm/types.h>
+#include <linux/major.h>
 
 /** Version number of this interface */
 #define FUSE_KERNEL_VERSION 7
@@ -20,7 +21,7 @@
 #define FUSE_ROOT_ID 1
 
 /** The major number of the fuse character device */
-#define FUSE_MAJOR 10
+#define FUSE_MAJOR MISC_MAJOR
 
 /** The minor number of the fuse character device */
 #define FUSE_MINOR 229
-- 
cgit v1.2.3


From 7142125937e1482ad3ae4366594c6586153dfc86 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <miklos@szeredi.hu>
Date: Sun, 25 Jun 2006 05:48:52 -0700
Subject: [PATCH] fuse: add POSIX file locking support

This patch adds POSIX file locking support to the fuse interface.

This implementation doesn't keep any locking state in kernel.  Unlocking on
close() is handled by the FLUSH message, which now contains the lock owner id.

Mandatory locking is not supported.  The filesystem may enfoce mandatory
locking in userspace if needed.

Signed-off-by: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/fuse/file.c       | 132 +++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/fuse/fuse_i.h     |   4 ++
 fs/fuse/inode.c      |  20 +++++++-
 include/linux/fuse.h |  26 +++++++++-
 4 files changed, 178 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 1d59af306b28..d9a8289297c0 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -160,6 +160,18 @@ static int fuse_release(struct inode *inode, struct file *file)
 	return fuse_release_common(inode, file, 0);
 }
 
+/*
+ * It would be nice to scramble the ID space, so that the value of the
+ * files_struct pointer is not exposed to userspace.  Symmetric crypto
+ * functions are overkill, since the inverse function doesn't need to
+ * be implemented (though it does have to exist).  Is there something
+ * simpler?
+ */
+static inline u64 fuse_lock_owner_id(fl_owner_t id)
+{
+	return (unsigned long) id;
+}
+
 static int fuse_flush(struct file *file, fl_owner_t id)
 {
 	struct inode *inode = file->f_dentry->d_inode;
@@ -181,11 +193,13 @@ static int fuse_flush(struct file *file, fl_owner_t id)
 
 	memset(&inarg, 0, sizeof(inarg));
 	inarg.fh = ff->fh;
+	inarg.lock_owner = fuse_lock_owner_id(id);
 	req->in.h.opcode = FUSE_FLUSH;
 	req->in.h.nodeid = get_node_id(inode);
 	req->in.numargs = 1;
 	req->in.args[0].size = sizeof(inarg);
 	req->in.args[0].value = &inarg;
+	req->force = 1;
 	request_send(fc, req);
 	err = req->out.h.error;
 	fuse_put_request(fc, req);
@@ -604,6 +618,122 @@ static int fuse_set_page_dirty(struct page *page)
 	return 0;
 }
 
+static int convert_fuse_file_lock(const struct fuse_file_lock *ffl,
+				  struct file_lock *fl)
+{
+	switch (ffl->type) {
+	case F_UNLCK:
+		break;
+
+	case F_RDLCK:
+	case F_WRLCK:
+		if (ffl->start > OFFSET_MAX || ffl->end > OFFSET_MAX ||
+		    ffl->end < ffl->start)
+			return -EIO;
+
+		fl->fl_start = ffl->start;
+		fl->fl_end = ffl->end;
+		fl->fl_pid = ffl->pid;
+		break;
+
+	default:
+		return -EIO;
+	}
+	fl->fl_type = ffl->type;
+	return 0;
+}
+
+static void fuse_lk_fill(struct fuse_req *req, struct file *file,
+			 const struct file_lock *fl, int opcode, pid_t pid)
+{
+	struct inode *inode = file->f_dentry->d_inode;
+	struct fuse_file *ff = file->private_data;
+	struct fuse_lk_in *arg = &req->misc.lk_in;
+
+	arg->fh = ff->fh;
+	arg->owner = fuse_lock_owner_id(fl->fl_owner);
+	arg->lk.start = fl->fl_start;
+	arg->lk.end = fl->fl_end;
+	arg->lk.type = fl->fl_type;
+	arg->lk.pid = pid;
+	req->in.h.opcode = opcode;
+	req->in.h.nodeid = get_node_id(inode);
+	req->in.numargs = 1;
+	req->in.args[0].size = sizeof(*arg);
+	req->in.args[0].value = arg;
+}
+
+static int fuse_getlk(struct file *file, struct file_lock *fl)
+{
+	struct inode *inode = file->f_dentry->d_inode;
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_req *req;
+	struct fuse_lk_out outarg;
+	int err;
+
+	req = fuse_get_req(fc);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
+
+	fuse_lk_fill(req, file, fl, FUSE_GETLK, 0);
+	req->out.numargs = 1;
+	req->out.args[0].size = sizeof(outarg);
+	req->out.args[0].value = &outarg;
+	request_send(fc, req);
+	err = req->out.h.error;
+	fuse_put_request(fc, req);
+	if (!err)
+		err = convert_fuse_file_lock(&outarg.lk, fl);
+
+	return err;
+}
+
+static int fuse_setlk(struct file *file, struct file_lock *fl)
+{
+	struct inode *inode = file->f_dentry->d_inode;
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_req *req;
+	int opcode = (fl->fl_flags & FL_SLEEP) ? FUSE_SETLKW : FUSE_SETLK;
+	pid_t pid = fl->fl_type != F_UNLCK ? current->tgid : 0;
+	int err;
+
+	/* Unlock on close is handled by the flush method */
+	if (fl->fl_flags & FL_CLOSE)
+		return 0;
+
+	req = fuse_get_req(fc);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
+
+	fuse_lk_fill(req, file, fl, opcode, pid);
+	request_send(fc, req);
+	err = req->out.h.error;
+	fuse_put_request(fc, req);
+	return err;
+}
+
+static int fuse_file_lock(struct file *file, int cmd, struct file_lock *fl)
+{
+	struct inode *inode = file->f_dentry->d_inode;
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	int err;
+
+	if (cmd == F_GETLK) {
+		if (fc->no_lock) {
+			if (!posix_test_lock(file, fl, fl))
+				fl->fl_type = F_UNLCK;
+			err = 0;
+		} else
+			err = fuse_getlk(file, fl);
+	} else {
+		if (fc->no_lock)
+			err = posix_lock_file_wait(file, fl);
+		else
+			err = fuse_setlk(file, fl);
+	}
+	return err;
+}
+
 static const struct file_operations fuse_file_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= generic_file_read,
@@ -613,6 +743,7 @@ static const struct file_operations fuse_file_operations = {
 	.flush		= fuse_flush,
 	.release	= fuse_release,
 	.fsync		= fuse_fsync,
+	.lock		= fuse_file_lock,
 	.sendfile	= generic_file_sendfile,
 };
 
@@ -624,6 +755,7 @@ static const struct file_operations fuse_direct_io_file_operations = {
 	.flush		= fuse_flush,
 	.release	= fuse_release,
 	.fsync		= fuse_fsync,
+	.lock		= fuse_file_lock,
 	/* no mmap and sendfile */
 };
 
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index ac12b01f4446..eb3166625ca9 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -190,6 +190,7 @@ struct fuse_req {
 		struct fuse_init_in init_in;
 		struct fuse_init_out init_out;
 		struct fuse_read_in read_in;
+		struct fuse_lk_in lk_in;
 	} misc;
 
 	/** page vector */
@@ -307,6 +308,9 @@ struct fuse_conn {
 	/** Is removexattr not implemented by fs? */
 	unsigned no_removexattr : 1;
 
+	/** Are file locking primitives not implemented by fs? */
+	unsigned no_lock : 1;
+
 	/** Is access not implemented by fs? */
 	unsigned no_access : 1;
 
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 13a7e8ab7a78..412892905838 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -98,6 +98,14 @@ static void fuse_clear_inode(struct inode *inode)
 	}
 }
 
+static int fuse_remount_fs(struct super_block *sb, int *flags, char *data)
+{
+	if (*flags & MS_MANDLOCK)
+		return -EINVAL;
+
+	return 0;
+}
+
 void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr)
 {
 	if (S_ISREG(inode->i_mode) && i_size_read(inode) != attr->size)
@@ -409,6 +417,7 @@ static struct super_operations fuse_super_operations = {
 	.destroy_inode  = fuse_destroy_inode,
 	.read_inode	= fuse_read_inode,
 	.clear_inode	= fuse_clear_inode,
+	.remount_fs	= fuse_remount_fs,
 	.put_super	= fuse_put_super,
 	.umount_begin	= fuse_umount_begin,
 	.statfs		= fuse_statfs,
@@ -428,8 +437,12 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
 			ra_pages = arg->max_readahead / PAGE_CACHE_SIZE;
 			if (arg->flags & FUSE_ASYNC_READ)
 				fc->async_read = 1;
-		} else
+			if (!(arg->flags & FUSE_POSIX_LOCKS))
+				fc->no_lock = 1;
+		} else {
 			ra_pages = fc->max_read / PAGE_CACHE_SIZE;
+			fc->no_lock = 1;
+		}
 
 		fc->bdi.ra_pages = min(fc->bdi.ra_pages, ra_pages);
 		fc->minor = arg->minor;
@@ -447,7 +460,7 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)
 	arg->major = FUSE_KERNEL_VERSION;
 	arg->minor = FUSE_KERNEL_MINOR_VERSION;
 	arg->max_readahead = fc->bdi.ra_pages * PAGE_CACHE_SIZE;
-	arg->flags |= FUSE_ASYNC_READ;
+	arg->flags |= FUSE_ASYNC_READ | FUSE_POSIX_LOCKS;
 	req->in.h.opcode = FUSE_INIT;
 	req->in.numargs = 1;
 	req->in.args[0].size = sizeof(*arg);
@@ -479,6 +492,9 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 	struct fuse_req *init_req;
 	int err;
 
+	if (sb->s_flags & MS_MANDLOCK)
+		return -EINVAL;
+
 	if (!parse_fuse_opt((char *) data, &d))
 		return -EINVAL;
 
diff --git a/include/linux/fuse.h b/include/linux/fuse.h
index 8e4319614bef..e7a76ec0f05c 100644
--- a/include/linux/fuse.h
+++ b/include/linux/fuse.h
@@ -1,6 +1,6 @@
 /*
     FUSE: Filesystem in Userspace
-    Copyright (C) 2001-2005  Miklos Szeredi <miklos@szeredi.hu>
+    Copyright (C) 2001-2006  Miklos Szeredi <miklos@szeredi.hu>
 
     This program can be distributed under the terms of the GNU GPL.
     See the file COPYING.
@@ -15,7 +15,7 @@
 #define FUSE_KERNEL_VERSION 7
 
 /** Minor version number of this interface */
-#define FUSE_KERNEL_MINOR_VERSION 6
+#define FUSE_KERNEL_MINOR_VERSION 7
 
 /** The node ID of the root inode */
 #define FUSE_ROOT_ID 1
@@ -59,6 +59,13 @@ struct fuse_kstatfs {
 	__u32	spare[6];
 };
 
+struct fuse_file_lock {
+	__u64	start;
+	__u64	end;
+	__u32	type;
+	__u32	pid; /* tgid */
+};
+
 /**
  * Bitmasks for fuse_setattr_in.valid
  */
@@ -83,6 +90,7 @@ struct fuse_kstatfs {
  * INIT request/reply flags
  */
 #define FUSE_ASYNC_READ		(1 << 0)
+#define FUSE_POSIX_LOCKS	(1 << 1)
 
 enum fuse_opcode {
 	FUSE_LOOKUP	   = 1,
@@ -113,6 +121,9 @@ enum fuse_opcode {
 	FUSE_READDIR       = 28,
 	FUSE_RELEASEDIR    = 29,
 	FUSE_FSYNCDIR      = 30,
+	FUSE_GETLK         = 31,
+	FUSE_SETLK         = 32,
+	FUSE_SETLKW        = 33,
 	FUSE_ACCESS        = 34,
 	FUSE_CREATE        = 35
 };
@@ -200,6 +211,7 @@ struct fuse_flush_in {
 	__u64	fh;
 	__u32	flush_flags;
 	__u32	padding;
+	__u64	lock_owner;
 };
 
 struct fuse_read_in {
@@ -248,6 +260,16 @@ struct fuse_getxattr_out {
 	__u32	padding;
 };
 
+struct fuse_lk_in {
+	__u64	fh;
+	__u64	owner;
+	struct fuse_file_lock lk;
+};
+
+struct fuse_lk_out {
+	struct fuse_file_lock lk;
+};
+
 struct fuse_access_in {
 	__u32	mask;
 	__u32	padding;
-- 
cgit v1.2.3


From a4d27e75ffb7b8ecb7eed0c7db0df975525f3fd7 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <miklos@szeredi.hu>
Date: Sun, 25 Jun 2006 05:48:54 -0700
Subject: [PATCH] fuse: add request interruption

Add synchronous request interruption.  This is needed for file locking
operations which have to be interruptible.  However filesystem may implement
interruptibility of other operations (e.g.  like NFS 'intr' mount option).

Signed-off-by: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/filesystems/fuse.txt |  48 ++++++++++-
 fs/fuse/dev.c                      | 162 ++++++++++++++++++++++++++++++-------
 fs/fuse/file.c                     |   3 +
 fs/fuse/fuse_i.h                   |  16 ++++
 fs/fuse/inode.c                    |   1 +
 include/linux/fuse.h               |   7 +-
 6 files changed, 205 insertions(+), 32 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/filesystems/fuse.txt b/Documentation/filesystems/fuse.txt
index 324df27704cc..a584f05403a4 100644
--- a/Documentation/filesystems/fuse.txt
+++ b/Documentation/filesystems/fuse.txt
@@ -124,6 +124,46 @@ For each connection the following files exist within this directory:
 
 Only the owner of the mount may read or write these files.
 
+Interrupting filesystem operations
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+If a process issuing a FUSE filesystem request is interrupted, the
+following will happen:
+
+  1) If the request is not yet sent to userspace AND the signal is
+     fatal (SIGKILL or unhandled fatal signal), then the request is
+     dequeued and returns immediately.
+
+  2) If the request is not yet sent to userspace AND the signal is not
+     fatal, then an 'interrupted' flag is set for the request.  When
+     the request has been successfully transfered to userspace and
+     this flag is set, an INTERRUPT request is queued.
+
+  3) If the request is already sent to userspace, then an INTERRUPT
+     request is queued.
+
+INTERRUPT requests take precedence over other requests, so the
+userspace filesystem will receive queued INTERRUPTs before any others.
+
+The userspace filesystem may ignore the INTERRUPT requests entirely,
+or may honor them by sending a reply to the _original_ request, with
+the error set to EINTR.
+
+It is also possible that there's a race between processing the
+original request and it's INTERRUPT request.  There are two possibilities:
+
+  1) The INTERRUPT request is processed before the original request is
+     processed
+
+  2) The INTERRUPT request is processed after the original request has
+     been answered
+
+If the filesystem cannot find the original request, it should wait for
+some timeout and/or a number of new requests to arrive, after which it
+should reply to the INTERRUPT request with an EAGAIN error.  In case
+1) the INTERRUPT request will be requeued.  In case 2) the INTERRUPT
+reply will be ignored.
+
 Aborting a filesystem connection
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
@@ -351,10 +391,10 @@ but is caused by a pagefault.
 
 Solution is basically the same as above.
 
-An additional problem is that while the write buffer is being
-copied to the request, the request must not be interrupted.  This
-is because the destination address of the copy may not be valid
-after the request is interrupted.
+An additional problem is that while the write buffer is being copied
+to the request, the request must not be interrupted/aborted.  This is
+because the destination address of the copy may not be valid after the
+request has returned.
 
 This is solved with doing the copy atomically, and allowing abort
 while the page(s) belonging to the write buffer are faulted with
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 6b5f74cb7b54..1e2006caf158 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -34,6 +34,7 @@ static void fuse_request_init(struct fuse_req *req)
 {
 	memset(req, 0, sizeof(*req));
 	INIT_LIST_HEAD(&req->list);
+	INIT_LIST_HEAD(&req->intr_entry);
 	init_waitqueue_head(&req->waitq);
 	atomic_set(&req->count, 1);
 }
@@ -215,6 +216,7 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req)
 	void (*end) (struct fuse_conn *, struct fuse_req *) = req->end;
 	req->end = NULL;
 	list_del(&req->list);
+	list_del(&req->intr_entry);
 	req->state = FUSE_REQ_FINISHED;
 	if (req->background) {
 		if (fc->num_background == FUSE_MAX_BACKGROUND) {
@@ -235,28 +237,63 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req)
 		fuse_put_request(fc, req);
 }
 
+static void wait_answer_interruptible(struct fuse_conn *fc,
+				      struct fuse_req *req)
+{
+	if (signal_pending(current))
+		return;
+
+	spin_unlock(&fc->lock);
+	wait_event_interruptible(req->waitq, req->state == FUSE_REQ_FINISHED);
+	spin_lock(&fc->lock);
+}
+
+static void queue_interrupt(struct fuse_conn *fc, struct fuse_req *req)
+{
+	list_add_tail(&req->intr_entry, &fc->interrupts);
+	wake_up(&fc->waitq);
+	kill_fasync(&fc->fasync, SIGIO, POLL_IN);
+}
+
 /* Called with fc->lock held.  Releases, and then reacquires it. */
 static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req)
 {
-	sigset_t oldset;
+	if (!fc->no_interrupt) {
+		/* Any signal may interrupt this */
+		wait_answer_interruptible(fc, req);
 
-	spin_unlock(&fc->lock);
-	if (req->force)
+		if (req->aborted)
+			goto aborted;
+		if (req->state == FUSE_REQ_FINISHED)
+			return;
+
+		req->interrupted = 1;
+		if (req->state == FUSE_REQ_SENT)
+			queue_interrupt(fc, req);
+	}
+
+	if (req->force) {
+		spin_unlock(&fc->lock);
 		wait_event(req->waitq, req->state == FUSE_REQ_FINISHED);
-	else {
+		spin_lock(&fc->lock);
+	} else {
+		sigset_t oldset;
+
+		/* Only fatal signals may interrupt this */
 		block_sigs(&oldset);
-		wait_event_interruptible(req->waitq,
-					 req->state == FUSE_REQ_FINISHED);
+		wait_answer_interruptible(fc, req);
 		restore_sigs(&oldset);
 	}
-	spin_lock(&fc->lock);
-	if (req->state == FUSE_REQ_FINISHED && !req->aborted)
-		return;
 
-	if (!req->aborted) {
-		req->out.h.error = -EINTR;
-		req->aborted = 1;
-	}
+	if (req->aborted)
+		goto aborted;
+	if (req->state == FUSE_REQ_FINISHED)
+ 		return;
+
+	req->out.h.error = -EINTR;
+	req->aborted = 1;
+
+ aborted:
 	if (req->locked) {
 		/* This is uninterruptible sleep, because data is
 		   being copied to/from the buffers of req.  During
@@ -288,13 +325,19 @@ static unsigned len_args(unsigned numargs, struct fuse_arg *args)
 	return nbytes;
 }
 
+static u64 fuse_get_unique(struct fuse_conn *fc)
+ {
+ 	fc->reqctr++;
+ 	/* zero is special */
+ 	if (fc->reqctr == 0)
+ 		fc->reqctr = 1;
+
+	return fc->reqctr;
+}
+
 static void queue_request(struct fuse_conn *fc, struct fuse_req *req)
 {
-	fc->reqctr++;
-	/* zero is special */
-	if (fc->reqctr == 0)
-		fc->reqctr = 1;
-	req->in.h.unique = fc->reqctr;
+	req->in.h.unique = fuse_get_unique(fc);
 	req->in.h.len = sizeof(struct fuse_in_header) +
 		len_args(req->in.numargs, (struct fuse_arg *) req->in.args);
 	list_add_tail(&req->list, &fc->pending);
@@ -307,9 +350,6 @@ static void queue_request(struct fuse_conn *fc, struct fuse_req *req)
 	kill_fasync(&fc->fasync, SIGIO, POLL_IN);
 }
 
-/*
- * This can only be interrupted by a SIGKILL
- */
 void request_send(struct fuse_conn *fc, struct fuse_req *req)
 {
 	req->isreply = 1;
@@ -566,13 +606,18 @@ static int fuse_copy_args(struct fuse_copy_state *cs, unsigned numargs,
 	return err;
 }
 
+static int request_pending(struct fuse_conn *fc)
+{
+	return !list_empty(&fc->pending) || !list_empty(&fc->interrupts);
+}
+
 /* Wait until a request is available on the pending list */
 static void request_wait(struct fuse_conn *fc)
 {
 	DECLARE_WAITQUEUE(wait, current);
 
 	add_wait_queue_exclusive(&fc->waitq, &wait);
-	while (fc->connected && list_empty(&fc->pending)) {
+	while (fc->connected && !request_pending(fc)) {
 		set_current_state(TASK_INTERRUPTIBLE);
 		if (signal_pending(current))
 			break;
@@ -585,6 +630,45 @@ static void request_wait(struct fuse_conn *fc)
 	remove_wait_queue(&fc->waitq, &wait);
 }
 
+/*
+ * Transfer an interrupt request to userspace
+ *
+ * Unlike other requests this is assembled on demand, without a need
+ * to allocate a separate fuse_req structure.
+ *
+ * Called with fc->lock held, releases it
+ */
+static int fuse_read_interrupt(struct fuse_conn *fc, struct fuse_req *req,
+			       const struct iovec *iov, unsigned long nr_segs)
+{
+	struct fuse_copy_state cs;
+	struct fuse_in_header ih;
+	struct fuse_interrupt_in arg;
+	unsigned reqsize = sizeof(ih) + sizeof(arg);
+	int err;
+
+	list_del_init(&req->intr_entry);
+	req->intr_unique = fuse_get_unique(fc);
+	memset(&ih, 0, sizeof(ih));
+	memset(&arg, 0, sizeof(arg));
+	ih.len = reqsize;
+	ih.opcode = FUSE_INTERRUPT;
+	ih.unique = req->intr_unique;
+	arg.unique = req->in.h.unique;
+
+	spin_unlock(&fc->lock);
+	if (iov_length(iov, nr_segs) < reqsize)
+		return -EINVAL;
+
+	fuse_copy_init(&cs, fc, 1, NULL, iov, nr_segs);
+	err = fuse_copy_one(&cs, &ih, sizeof(ih));
+	if (!err)
+		err = fuse_copy_one(&cs, &arg, sizeof(arg));
+	fuse_copy_finish(&cs);
+
+	return err ? err : reqsize;
+}
+
 /*
  * Read a single request into the userspace filesystem's buffer.  This
  * function waits until a request is available, then removes it from
@@ -610,7 +694,7 @@ static ssize_t fuse_dev_readv(struct file *file, const struct iovec *iov,
 	spin_lock(&fc->lock);
 	err = -EAGAIN;
 	if ((file->f_flags & O_NONBLOCK) && fc->connected &&
-	    list_empty(&fc->pending))
+	    !request_pending(fc))
 		goto err_unlock;
 
 	request_wait(fc);
@@ -618,9 +702,15 @@ static ssize_t fuse_dev_readv(struct file *file, const struct iovec *iov,
 	if (!fc->connected)
 		goto err_unlock;
 	err = -ERESTARTSYS;
-	if (list_empty(&fc->pending))
+	if (!request_pending(fc))
 		goto err_unlock;
 
+	if (!list_empty(&fc->interrupts)) {
+		req = list_entry(fc->interrupts.next, struct fuse_req,
+				 intr_entry);
+		return fuse_read_interrupt(fc, req, iov, nr_segs);
+	}
+
 	req = list_entry(fc->pending.next, struct fuse_req, list);
 	req->state = FUSE_REQ_READING;
 	list_move(&req->list, &fc->io);
@@ -658,6 +748,8 @@ static ssize_t fuse_dev_readv(struct file *file, const struct iovec *iov,
 	else {
 		req->state = FUSE_REQ_SENT;
 		list_move_tail(&req->list, &fc->processing);
+		if (req->interrupted)
+			queue_interrupt(fc, req);
 		spin_unlock(&fc->lock);
 	}
 	return reqsize;
@@ -684,7 +776,7 @@ static struct fuse_req *request_find(struct fuse_conn *fc, u64 unique)
 	list_for_each(entry, &fc->processing) {
 		struct fuse_req *req;
 		req = list_entry(entry, struct fuse_req, list);
-		if (req->in.h.unique == unique)
+		if (req->in.h.unique == unique || req->intr_unique == unique)
 			return req;
 	}
 	return NULL;
@@ -750,7 +842,6 @@ static ssize_t fuse_dev_writev(struct file *file, const struct iovec *iov,
 		goto err_unlock;
 
 	req = request_find(fc, oh.unique);
-	err = -EINVAL;
 	if (!req)
 		goto err_unlock;
 
@@ -761,6 +852,23 @@ static ssize_t fuse_dev_writev(struct file *file, const struct iovec *iov,
 		request_end(fc, req);
 		return -ENOENT;
 	}
+	/* Is it an interrupt reply? */
+	if (req->intr_unique == oh.unique) {
+		err = -EINVAL;
+		if (nbytes != sizeof(struct fuse_out_header))
+			goto err_unlock;
+
+		if (oh.error == -ENOSYS)
+			fc->no_interrupt = 1;
+		else if (oh.error == -EAGAIN)
+			queue_interrupt(fc, req);
+
+		spin_unlock(&fc->lock);
+		fuse_copy_finish(&cs);
+		return nbytes;
+	}
+
+	req->state = FUSE_REQ_WRITING;
 	list_move(&req->list, &fc->io);
 	req->out.h = oh;
 	req->locked = 1;
@@ -809,7 +917,7 @@ static unsigned fuse_dev_poll(struct file *file, poll_table *wait)
 	spin_lock(&fc->lock);
 	if (!fc->connected)
 		mask = POLLERR;
-	else if (!list_empty(&fc->pending))
+	else if (request_pending(fc))
 		mask |= POLLIN | POLLRDNORM;
 	spin_unlock(&fc->lock);
 
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index ce759414cff9..36f92f181d2f 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -705,6 +705,9 @@ static int fuse_setlk(struct file *file, struct file_lock *fl)
 	fuse_lk_fill(req, file, fl, opcode, pid);
 	request_send(fc, req);
 	err = req->out.h.error;
+	/* locking is restartable */
+	if (err == -EINTR)
+		err = -ERESTARTSYS;
 	fuse_put_request(fc, req);
 	return err;
 }
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index fd65e75e1622..c862df58da92 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -131,6 +131,7 @@ enum fuse_req_state {
 	FUSE_REQ_PENDING,
 	FUSE_REQ_READING,
 	FUSE_REQ_SENT,
+	FUSE_REQ_WRITING,
 	FUSE_REQ_FINISHED
 };
 
@@ -144,9 +145,15 @@ struct fuse_req {
 	    fuse_conn */
 	struct list_head list;
 
+	/** Entry on the interrupts list  */
+	struct list_head intr_entry;
+
 	/** refcount */
 	atomic_t count;
 
+	/** Unique ID for the interrupt request */
+	u64 intr_unique;
+
 	/*
 	 * The following bitfields are either set once before the
 	 * request is queued or setting/clearing them is protected by
@@ -165,6 +172,9 @@ struct fuse_req {
 	/** Request is sent in the background */
 	unsigned background:1;
 
+	/** The request has been interrupted */
+	unsigned interrupted:1;
+
 	/** Data is being copied to/from the request */
 	unsigned locked:1;
 
@@ -262,6 +272,9 @@ struct fuse_conn {
 	/** Number of requests currently in the background */
 	unsigned num_background;
 
+	/** Pending interrupts */
+	struct list_head interrupts;
+
 	/** Flag indicating if connection is blocked.  This will be
 	    the case before the INIT reply is received, and if there
 	    are too many outstading backgrounds requests */
@@ -320,6 +333,9 @@ struct fuse_conn {
 	/** Is create not implemented by fs? */
 	unsigned no_create : 1;
 
+	/** Is interrupt not implemented by fs? */
+	unsigned no_interrupt : 1;
+
 	/** The number of requests waiting for completion */
 	atomic_t num_waiting;
 
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 412892905838..e21ef8a3ad30 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -381,6 +381,7 @@ static struct fuse_conn *new_conn(void)
 		INIT_LIST_HEAD(&fc->pending);
 		INIT_LIST_HEAD(&fc->processing);
 		INIT_LIST_HEAD(&fc->io);
+		INIT_LIST_HEAD(&fc->interrupts);
 		atomic_set(&fc->num_waiting, 0);
 		fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
 		fc->bdi.unplug_io_fn = default_unplug_io_fn;
diff --git a/include/linux/fuse.h b/include/linux/fuse.h
index e7a76ec0f05c..9fc48a674b82 100644
--- a/include/linux/fuse.h
+++ b/include/linux/fuse.h
@@ -125,7 +125,8 @@ enum fuse_opcode {
 	FUSE_SETLK         = 32,
 	FUSE_SETLKW        = 33,
 	FUSE_ACCESS        = 34,
-	FUSE_CREATE        = 35
+	FUSE_CREATE        = 35,
+	FUSE_INTERRUPT     = 36,
 };
 
 /* The read buffer is required to be at least 8k, but may be much larger */
@@ -291,6 +292,10 @@ struct fuse_init_out {
 	__u32	max_write;
 };
 
+struct fuse_interrupt_in {
+	__u64	unique;
+};
+
 struct fuse_in_header {
 	__u32	len;
 	__u32	opcode;
-- 
cgit v1.2.3


From c7b2eff059fcc2d1b7085ee3d84b79fd657a537b Mon Sep 17 00:00:00 2001
From: "Serge E. Hallyn" <serue@us.ibm.com>
Date: Sun, 25 Jun 2006 05:48:59 -0700
Subject: [PATCH] kthread: update loop.c to use kthread

Update loop.c to use a kthread instead of a deprecated kernel_thread for
loop devices.

[akpm@osdl.org: don't change the thread's name]
Signed-off-by: Serge E. Hallyn <serue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/block/loop.c | 24 +++++++++++-------------
 include/linux/loop.h |  2 +-
 2 files changed, 12 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 3c74ea729fc7..9dc294a74953 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -74,6 +74,7 @@
 #include <linux/completion.h>
 #include <linux/highmem.h>
 #include <linux/gfp.h>
+#include <linux/kthread.h>
 
 #include <asm/uaccess.h>
 
@@ -578,8 +579,6 @@ static int loop_thread(void *data)
 	struct loop_device *lo = data;
 	struct bio *bio;
 
-	daemonize("loop%d", lo->lo_number);
-
 	/*
 	 * loop can be used in an encrypted device,
 	 * hence, it mustn't be stopped at all
@@ -592,11 +591,6 @@ static int loop_thread(void *data)
 	lo->lo_state = Lo_bound;
 	lo->lo_pending = 1;
 
-	/*
-	 * complete it, we are running
-	 */
-	complete(&lo->lo_done);
-
 	for (;;) {
 		int pending;
 
@@ -629,7 +623,6 @@ static int loop_thread(void *data)
 			break;
 	}
 
-	complete(&lo->lo_done);
 	return 0;
 }
 
@@ -746,6 +739,7 @@ static int loop_set_fd(struct loop_device *lo, struct file *lo_file,
 	unsigned lo_blocksize;
 	int		lo_flags = 0;
 	int		error;
+	struct task_struct *tsk;
 	loff_t		size;
 
 	/* This is safe, since we have a reference from open(). */
@@ -839,10 +833,11 @@ static int loop_set_fd(struct loop_device *lo, struct file *lo_file,
 
 	set_blocksize(bdev, lo_blocksize);
 
-	error = kernel_thread(loop_thread, lo, CLONE_KERNEL);
-	if (error < 0)
+	tsk = kthread_run(loop_thread, lo, "loop%d", lo->lo_number);
+	if (IS_ERR(tsk)) {
+		error = PTR_ERR(tsk);
 		goto out_putf;
-	wait_for_completion(&lo->lo_done);
+	}
 	return 0;
 
  out_putf:
@@ -898,6 +893,9 @@ static int loop_clr_fd(struct loop_device *lo, struct block_device *bdev)
 	if (lo->lo_state != Lo_bound)
 		return -ENXIO;
 
+	if (!lo->lo_thread)
+		return -EINVAL;
+
 	if (lo->lo_refcnt > 1)	/* we needed one fd for the ioctl */
 		return -EBUSY;
 
@@ -911,7 +909,7 @@ static int loop_clr_fd(struct loop_device *lo, struct block_device *bdev)
 		complete(&lo->lo_bh_done);
 	spin_unlock_irq(&lo->lo_lock);
 
-	wait_for_completion(&lo->lo_done);
+	kthread_stop(lo->lo_thread);
 
 	lo->lo_backing_file = NULL;
 
@@ -924,6 +922,7 @@ static int loop_clr_fd(struct loop_device *lo, struct block_device *bdev)
 	lo->lo_sizelimit = 0;
 	lo->lo_encrypt_key_size = 0;
 	lo->lo_flags = 0;
+	lo->lo_thread = NULL;
 	memset(lo->lo_encrypt_key, 0, LO_KEY_SIZE);
 	memset(lo->lo_crypt_name, 0, LO_NAME_SIZE);
 	memset(lo->lo_file_name, 0, LO_NAME_SIZE);
@@ -1288,7 +1287,6 @@ static int __init loop_init(void)
 		if (!lo->lo_queue)
 			goto out_mem4;
 		mutex_init(&lo->lo_ctl_mutex);
-		init_completion(&lo->lo_done);
 		init_completion(&lo->lo_bh_done);
 		lo->lo_number = i;
 		spin_lock_init(&lo->lo_lock);
diff --git a/include/linux/loop.h b/include/linux/loop.h
index e76c7611d6cc..bf3d2345ce99 100644
--- a/include/linux/loop.h
+++ b/include/linux/loop.h
@@ -59,7 +59,7 @@ struct loop_device {
 	struct bio 		*lo_bio;
 	struct bio		*lo_biotail;
 	int			lo_state;
-	struct completion	lo_done;
+	struct task_struct	*lo_thread;
 	struct completion	lo_bh_done;
 	struct mutex		lo_ctl_mutex;
 	int			lo_pending;
-- 
cgit v1.2.3


From 45c9b11a1d07770cabb48cb0f7960a77650ffc64 Mon Sep 17 00:00:00 2001
From: Ulrich Drepper <drepper@redhat.com>
Date: Sun, 25 Jun 2006 05:49:11 -0700
Subject: [PATCH] Implement AT_SYMLINK_FOLLOW flag for linkat

When the linkat() syscall was added the flag parameter was added in the
last minute but it wasn't used so far.  The following patch should change
that.  My tests show that this is all that's needed.

If OLDNAME is a symlink setting the flag causes linkat to follow the
symlink and create a hardlink with the target.  This is actually the
behavior POSIX demands for link() as well but Linux wisely does not do
this.  With this flag (which will most likely be in the next POSIX
revision) the programmer can choose the behavior, defaulting to the safe
variant.  As a side effect it is now possible to implement a
POSIX-compliant link(2) function for those who are interested.

  touch file
  ln -s file symlink

  linkat(fd, "symlink", fd, "newlink", 0)
    -> newlink is hardlink of symlink

  linkat(fd, "symlink", fd, "newlink", AT_SYMLINK_FOLLOW)
    -> newlink is hardlink of file

The value of AT_SYMLINK_FOLLOW is determined by the definition we already
use in glibc.

Signed-off-by: Ulrich Drepper <drepper@redhat.com>
Acked-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/namei.c            | 6 ++++--
 include/linux/fcntl.h | 1 +
 2 files changed, 5 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/fs/namei.c b/fs/namei.c
index bb4a3e40e432..c784e8bb57a3 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2243,14 +2243,16 @@ asmlinkage long sys_linkat(int olddfd, const char __user *oldname,
 	int error;
 	char * to;
 
-	if (flags != 0)
+	if ((flags & ~AT_SYMLINK_FOLLOW) != 0)
 		return -EINVAL;
 
 	to = getname(newname);
 	if (IS_ERR(to))
 		return PTR_ERR(to);
 
-	error = __user_walk_fd(olddfd, oldname, 0, &old_nd);
+	error = __user_walk_fd(olddfd, oldname,
+			       flags & AT_SYMLINK_FOLLOW ? LOOKUP_FOLLOW : 0,
+			       &old_nd);
 	if (error)
 		goto exit;
 	error = do_path_lookup(newdfd, to, LOOKUP_PARENT, &nd);
diff --git a/include/linux/fcntl.h b/include/linux/fcntl.h
index c52a63755fdd..996f5611cd59 100644
--- a/include/linux/fcntl.h
+++ b/include/linux/fcntl.h
@@ -29,6 +29,7 @@
 #define AT_SYMLINK_NOFOLLOW	0x100   /* Do not follow symbolic links.  */
 #define AT_REMOVEDIR		0x200   /* Remove directory instead of
                                            unlinking file.  */
+#define AT_SYMLINK_FOLLOW	0x400   /* Follow symbolic links.  */
 
 #ifdef __KERNEL__
 
-- 
cgit v1.2.3


From fa9799e33d362aeca4555cd6318735bab1c04d16 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@xenotime.net>
Date: Sun, 25 Jun 2006 05:49:15 -0700
Subject: [PATCH] ktime/hrtimer: fix kernel-doc comments

Fix kernel-doc formatting in ktime.h and hrtimer.[ch] files.

Signed-off-by: Randy Dunlap <rdunlap@xenotime.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/hrtimer.h |  3 ---
 include/linux/ktime.h   |  8 --------
 kernel/hrtimer.c        | 11 +----------
 3 files changed, 1 insertion(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 7d2a1b974c5e..07d7305f131e 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -40,7 +40,6 @@ struct hrtimer_base;
 
 /**
  * struct hrtimer - the basic hrtimer structure
- *
  * @node:	red black tree node for time ordered insertion
  * @expires:	the absolute expiry time in the hrtimers internal
  *		representation. The time is related to the clock on
@@ -59,7 +58,6 @@ struct hrtimer {
 
 /**
  * struct hrtimer_sleeper - simple sleeper structure
- *
  * @timer:	embedded timer structure
  * @task:	task to wake up
  *
@@ -72,7 +70,6 @@ struct hrtimer_sleeper {
 
 /**
  * struct hrtimer_base - the timer base for a specific clock
- *
  * @index:		clock type index for per_cpu support when moving a timer
  *			to a base on another cpu.
  * @lock:		lock protecting the base and associated timers
diff --git a/include/linux/ktime.h b/include/linux/ktime.h
index 62bc57580707..ed3396dcc4f7 100644
--- a/include/linux/ktime.h
+++ b/include/linux/ktime.h
@@ -66,7 +66,6 @@ typedef union {
 
 /**
  * ktime_set - Set a ktime_t variable from a seconds/nanoseconds value
- *
  * @secs:	seconds to set
  * @nsecs:	nanoseconds to set
  *
@@ -138,7 +137,6 @@ static inline ktime_t ktime_set(const long secs, const unsigned long nsecs)
 
 /**
  * ktime_sub - subtract two ktime_t variables
- *
  * @lhs:	minuend
  * @rhs:	subtrahend
  *
@@ -157,7 +155,6 @@ static inline ktime_t ktime_sub(const ktime_t lhs, const ktime_t rhs)
 
 /**
  * ktime_add - add two ktime_t variables
- *
  * @add1:	addend1
  * @add2:	addend2
  *
@@ -184,7 +181,6 @@ static inline ktime_t ktime_add(const ktime_t add1, const ktime_t add2)
 
 /**
  * ktime_add_ns - Add a scalar nanoseconds value to a ktime_t variable
- *
  * @kt:		addend
  * @nsec:	the scalar nsec value to add
  *
@@ -194,7 +190,6 @@ extern ktime_t ktime_add_ns(const ktime_t kt, u64 nsec);
 
 /**
  * timespec_to_ktime - convert a timespec to ktime_t format
- *
  * @ts:		the timespec variable to convert
  *
  * Returns a ktime_t variable with the converted timespec value
@@ -207,7 +202,6 @@ static inline ktime_t timespec_to_ktime(const struct timespec ts)
 
 /**
  * timeval_to_ktime - convert a timeval to ktime_t format
- *
  * @tv:		the timeval variable to convert
  *
  * Returns a ktime_t variable with the converted timeval value
@@ -220,7 +214,6 @@ static inline ktime_t timeval_to_ktime(const struct timeval tv)
 
 /**
  * ktime_to_timespec - convert a ktime_t variable to timespec format
- *
  * @kt:		the ktime_t variable to convert
  *
  * Returns the timespec representation of the ktime value
@@ -233,7 +226,6 @@ static inline struct timespec ktime_to_timespec(const ktime_t kt)
 
 /**
  * ktime_to_timeval - convert a ktime_t variable to timeval format
- *
  * @kt:		the ktime_t variable to convert
  *
  * Returns the timeval representation of the ktime value
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 9587aac72f4d..55601b3ce60e 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -98,7 +98,6 @@ static DEFINE_PER_CPU(struct hrtimer_base, hrtimer_bases[MAX_HRTIMER_BASES]) =
 
 /**
  * ktime_get_ts - get the monotonic clock in timespec format
- *
  * @ts:		pointer to timespec variable
  *
  * The function calculates the monotonic clock from the realtime
@@ -238,7 +237,6 @@ lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
 # ifndef CONFIG_KTIME_SCALAR
 /**
  * ktime_add_ns - Add a scalar nanoseconds value to a ktime_t variable
- *
  * @kt:		addend
  * @nsec:	the scalar nsec value to add
  *
@@ -299,7 +297,6 @@ void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
 
 /**
  * hrtimer_forward - forward the timer expiry
- *
  * @timer:	hrtimer to forward
  * @now:	forward past this time
  * @interval:	the interval to forward
@@ -411,7 +408,6 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base)
 
 /**
  * hrtimer_start - (re)start an relative timer on the current CPU
- *
  * @timer:	the timer to be added
  * @tim:	expiry time
  * @mode:	expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL)
@@ -460,14 +456,13 @@ EXPORT_SYMBOL_GPL(hrtimer_start);
 
 /**
  * hrtimer_try_to_cancel - try to deactivate a timer
- *
  * @timer:	hrtimer to stop
  *
  * Returns:
  *  0 when the timer was not active
  *  1 when the timer was active
  * -1 when the timer is currently excuting the callback function and
- *    can not be stopped
+ *    cannot be stopped
  */
 int hrtimer_try_to_cancel(struct hrtimer *timer)
 {
@@ -489,7 +484,6 @@ EXPORT_SYMBOL_GPL(hrtimer_try_to_cancel);
 
 /**
  * hrtimer_cancel - cancel a timer and wait for the handler to finish.
- *
  * @timer:	the timer to be cancelled
  *
  * Returns:
@@ -510,7 +504,6 @@ EXPORT_SYMBOL_GPL(hrtimer_cancel);
 
 /**
  * hrtimer_get_remaining - get remaining time for the timer
- *
  * @timer:	the timer to read
  */
 ktime_t hrtimer_get_remaining(const struct hrtimer *timer)
@@ -564,7 +557,6 @@ ktime_t hrtimer_get_next_event(void)
 
 /**
  * hrtimer_init - initialize a timer to the given clock
- *
  * @timer:	the timer to be initialized
  * @clock_id:	the clock to be used
  * @mode:	timer mode abs/rel
@@ -588,7 +580,6 @@ EXPORT_SYMBOL_GPL(hrtimer_init);
 
 /**
  * hrtimer_get_res - get the timer resolution for a clock
- *
  * @which_clock: which clock to query
  * @tp:		 pointer to timespec variable to store the resolution
  *
-- 
cgit v1.2.3


From e905914f96e11862b130dd229f73045dad9a34e8 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@xensource.com>
Date: Sun, 25 Jun 2006 05:49:17 -0700
Subject: [PATCH] Implement kasprintf

Implement kasprintf, a kernel version of asprintf.  This allocates the
memory required for the formatted string, including the trailing '\0'.
Returns NULL on allocation failure.

Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/kernel.h |  2 ++
 lib/vsprintf.c         | 23 +++++++++++++++++++++++
 2 files changed, 25 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 8c21aaa248b4..3c5e4c2e517d 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -117,6 +117,8 @@ extern int scnprintf(char * buf, size_t size, const char * fmt, ...)
 	__attribute__ ((format (printf, 3, 4)));
 extern int vscnprintf(char *buf, size_t size, const char *fmt, va_list args)
 	__attribute__ ((format (printf, 3, 0)));
+extern char *kasprintf(gfp_t gfp, const char *fmt, ...)
+	__attribute__ ((format (printf, 2, 3)));
 
 extern int sscanf(const char *, const char *, ...)
 	__attribute__ ((format (scanf, 2, 3)));
diff --git a/lib/vsprintf.c b/lib/vsprintf.c
index f5959476e53d..797428afd111 100644
--- a/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@ -849,3 +849,26 @@ int sscanf(const char * buf, const char * fmt, ...)
 }
 
 EXPORT_SYMBOL(sscanf);
+
+
+/* Simplified asprintf. */
+char *kasprintf(gfp_t gfp, const char *fmt, ...)
+{
+	va_list ap;
+	unsigned int len;
+	char *p;
+
+	va_start(ap, fmt);
+	len = vsnprintf(NULL, 0, fmt, ap);
+	va_end(ap);
+
+	p = kmalloc(len+1, gfp);
+	if (!p)
+		return NULL;
+	va_start(ap, fmt);
+	vsnprintf(p, len+1, fmt, ap);
+	va_end(ap);
+	return p;
+}
+
+EXPORT_SYMBOL(kasprintf);
-- 
cgit v1.2.3


From 9e37bd301ee130598fa1406c1281caa159473bf8 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@xenotime.net>
Date: Sun, 25 Jun 2006 05:49:19 -0700
Subject: [PATCH] kthread: move kernel-doc and put it into DocBook

Move kthread API kernel-doc from kthread.h to kthread.c & fix it.
Add kthread API to kernel-api DocBook.

Signed-off-by: Randy Dunlap <rdunlap@xenotime.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/DocBook/kernel-api.tmpl |  2 ++
 include/linux/kthread.h               | 65 ++---------------------------------
 kernel/kthread.c                      | 61 +++++++++++++++++++++++++++++++-
 3 files changed, 65 insertions(+), 63 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/DocBook/kernel-api.tmpl b/Documentation/DocBook/kernel-api.tmpl
index 5a4abe0d5165..8305eb7b8c15 100644
--- a/Documentation/DocBook/kernel-api.tmpl
+++ b/Documentation/DocBook/kernel-api.tmpl
@@ -62,6 +62,8 @@
      <sect1><title>Internal Functions</title>
 !Ikernel/exit.c
 !Ikernel/signal.c
+!Iinclude/linux/kthread.h
+!Ekernel/kthread.c
      </sect1>
 
      <sect1><title>Kernel objects manipulation</title>
diff --git a/include/linux/kthread.h b/include/linux/kthread.h
index ebdd41fd1082..7cce5dfa092f 100644
--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -4,37 +4,19 @@
 #include <linux/err.h>
 #include <linux/sched.h>
 
-/**
- * kthread_create: create a kthread.
- * @threadfn: the function to run until signal_pending(current).
- * @data: data ptr for @threadfn.
- * @namefmt: printf-style name for the thread.
- *
- * Description: This helper function creates and names a kernel
- * thread.  The thread will be stopped: use wake_up_process() to start
- * it.  See also kthread_run(), kthread_create_on_cpu().
- *
- * When woken, the thread will run @threadfn() with @data as its
- * argument. @threadfn can either call do_exit() directly if it is a
- * standalone thread for which noone will call kthread_stop(), or
- * return when 'kthread_should_stop()' is true (which means
- * kthread_stop() has been called).  The return value should be zero
- * or a negative error number: it will be passed to kthread_stop().
- *
- * Returns a task_struct or ERR_PTR(-ENOMEM).
- */
 struct task_struct *kthread_create(int (*threadfn)(void *data),
 				   void *data,
 				   const char namefmt[], ...);
 
 /**
- * kthread_run: create and wake a thread.
+ * kthread_run - create and wake a thread.
  * @threadfn: the function to run until signal_pending(current).
  * @data: data ptr for @threadfn.
  * @namefmt: printf-style name for the thread.
  *
  * Description: Convenient wrapper for kthread_create() followed by
- * wake_up_process().  Returns the kthread, or ERR_PTR(-ENOMEM). */
+ * wake_up_process().  Returns the kthread or ERR_PTR(-ENOMEM).
+ */
 #define kthread_run(threadfn, data, namefmt, ...)			   \
 ({									   \
 	struct task_struct *__k						   \
@@ -44,50 +26,9 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
 	__k;								   \
 })
 
-/**
- * kthread_bind: bind a just-created kthread to a cpu.
- * @k: thread created by kthread_create().
- * @cpu: cpu (might not be online, must be possible) for @k to run on.
- *
- * Description: This function is equivalent to set_cpus_allowed(),
- * except that @cpu doesn't need to be online, and the thread must be
- * stopped (ie. just returned from kthread_create().
- */
 void kthread_bind(struct task_struct *k, unsigned int cpu);
-
-/**
- * kthread_stop: stop a thread created by kthread_create().
- * @k: thread created by kthread_create().
- *
- * Sets kthread_should_stop() for @k to return true, wakes it, and
- * waits for it to exit.  Your threadfn() must not call do_exit()
- * itself if you use this function!  This can also be called after
- * kthread_create() instead of calling wake_up_process(): the thread
- * will exit without calling threadfn().
- *
- * Returns the result of threadfn(), or -EINTR if wake_up_process()
- * was never called. */
 int kthread_stop(struct task_struct *k);
-
-/**
- * kthread_stop_sem: stop a thread created by kthread_create().
- * @k: thread created by kthread_create().
- * @s: semaphore that @k waits on while idle.
- *
- * Does essentially the same thing as kthread_stop() above, but wakes
- * @k by calling up(@s).
- *
- * Returns the result of threadfn(), or -EINTR if wake_up_process()
- * was never called. */
 int kthread_stop_sem(struct task_struct *k, struct semaphore *s);
-
-/**
- * kthread_should_stop: should this kthread return now?
- *
- * When someone calls kthread_stop on your kthread, it will be woken
- * and this will return true.  You should then return, and your return
- * value will be passed through to kthread_stop().
- */
 int kthread_should_stop(void);
 
 #endif /* _LINUX_KTHREAD_H */
diff --git a/kernel/kthread.c b/kernel/kthread.c
index c5f3c6613b6d..24be714b04c7 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -45,6 +45,13 @@ struct kthread_stop_info
 static DEFINE_MUTEX(kthread_stop_lock);
 static struct kthread_stop_info kthread_stop_info;
 
+/**
+ * kthread_should_stop - should this kthread return now?
+ *
+ * When someone calls kthread_stop on your kthread, it will be woken
+ * and this will return true.  You should then return, and your return
+ * value will be passed through to kthread_stop().
+ */
 int kthread_should_stop(void)
 {
 	return (kthread_stop_info.k == current);
@@ -122,6 +129,25 @@ static void keventd_create_kthread(void *_create)
 	complete(&create->done);
 }
 
+/**
+ * kthread_create - create a kthread.
+ * @threadfn: the function to run until signal_pending(current).
+ * @data: data ptr for @threadfn.
+ * @namefmt: printf-style name for the thread.
+ *
+ * Description: This helper function creates and names a kernel
+ * thread.  The thread will be stopped: use wake_up_process() to start
+ * it.  See also kthread_run(), kthread_create_on_cpu().
+ *
+ * When woken, the thread will run @threadfn() with @data as its
+ * argument. @threadfn can either call do_exit() directly if it is a
+ * standalone thread for which noone will call kthread_stop(), or
+ * return when 'kthread_should_stop()' is true (which means
+ * kthread_stop() has been called).  The return value should be zero
+ * or a negative error number; it will be passed to kthread_stop().
+ *
+ * Returns a task_struct or ERR_PTR(-ENOMEM).
+ */
 struct task_struct *kthread_create(int (*threadfn)(void *data),
 				   void *data,
 				   const char namefmt[],
@@ -156,6 +182,15 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
 }
 EXPORT_SYMBOL(kthread_create);
 
+/**
+ * kthread_bind - bind a just-created kthread to a cpu.
+ * @k: thread created by kthread_create().
+ * @cpu: cpu (might not be online, must be possible) for @k to run on.
+ *
+ * Description: This function is equivalent to set_cpus_allowed(),
+ * except that @cpu doesn't need to be online, and the thread must be
+ * stopped (i.e., just returned from kthread_create().
+ */
 void kthread_bind(struct task_struct *k, unsigned int cpu)
 {
 	BUG_ON(k->state != TASK_INTERRUPTIBLE);
@@ -166,12 +201,36 @@ void kthread_bind(struct task_struct *k, unsigned int cpu)
 }
 EXPORT_SYMBOL(kthread_bind);
 
+/**
+ * kthread_stop - stop a thread created by kthread_create().
+ * @k: thread created by kthread_create().
+ *
+ * Sets kthread_should_stop() for @k to return true, wakes it, and
+ * waits for it to exit.  Your threadfn() must not call do_exit()
+ * itself if you use this function!  This can also be called after
+ * kthread_create() instead of calling wake_up_process(): the thread
+ * will exit without calling threadfn().
+ *
+ * Returns the result of threadfn(), or %-EINTR if wake_up_process()
+ * was never called.
+ */
 int kthread_stop(struct task_struct *k)
 {
 	return kthread_stop_sem(k, NULL);
 }
 EXPORT_SYMBOL(kthread_stop);
 
+/**
+ * kthread_stop_sem - stop a thread created by kthread_create().
+ * @k: thread created by kthread_create().
+ * @s: semaphore that @k waits on while idle.
+ *
+ * Does essentially the same thing as kthread_stop() above, but wakes
+ * @k by calling up(@s).
+ *
+ * Returns the result of threadfn(), or %-EINTR if wake_up_process()
+ * was never called.
+ */
 int kthread_stop_sem(struct task_struct *k, struct semaphore *s)
 {
 	int ret;
@@ -210,5 +269,5 @@ static __init int helper_init(void)
 
 	return 0;
 }
-core_initcall(helper_init);
 
+core_initcall(helper_init);
-- 
cgit v1.2.3


From 643f3319b9132c768081ce94f938a29139a16de9 Mon Sep 17 00:00:00 2001
From: Paul Fulghum <paulkf@microgate.com>
Date: Sun, 25 Jun 2006 05:49:20 -0700
Subject: [PATCH] add synclink_gt custom hdlc idle

Add custom HDLC idle pattern feature.

It allows the user to specify an arbitrary 8 or 16 bit repeating pattern on
the transmit data pin between HDLC frames.

In most cases the idle pattern is continuous ones or flags as supported by off
the shelf synchronous controllers and defined in the ISO3309 standard.  Some
applications (radio/satellite modems, connections to legacy military hardware)
require non-standard patterns.

Signed-off-by: Paul Fulghum <paulkf@microgate.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/char/synclink_gt.c | 46 +++++++++++++++++++++++++++++++++-------------
 include/linux/synclink.h   |  4 +++-
 2 files changed, 36 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/char/synclink_gt.c b/drivers/char/synclink_gt.c
index b4f1a5a435aa..6f93a0149fbf 100644
--- a/drivers/char/synclink_gt.c
+++ b/drivers/char/synclink_gt.c
@@ -2515,7 +2515,8 @@ static int set_txidle(struct slgt_info *info, int idle_mode)
 	DBGINFO(("%s set_txidle(%d)\n", info->device_name, idle_mode));
 	spin_lock_irqsave(&info->lock,flags);
 	info->idle_mode = idle_mode;
-	tx_set_idle(info);
+	if (info->params.mode != MGSL_MODE_ASYNC)
+		tx_set_idle(info);
 	spin_unlock_irqrestore(&info->lock,flags);
 	return 0;
 }
@@ -3940,8 +3941,6 @@ static void async_mode(struct slgt_info *info)
 
 	msc_set_vcr(info);
 
-	tx_set_idle(info);
-
 	/* SCR (serial control)
 	 *
 	 * 15  1=tx req on FIFO half empty
@@ -4175,17 +4174,38 @@ static void hdlc_mode(struct slgt_info *info)
  */
 static void tx_set_idle(struct slgt_info *info)
 {
-	unsigned char val = 0xff;
+	unsigned char val;
+	unsigned short tcr;
 
-	switch(info->idle_mode)
-	{
-	case HDLC_TXIDLE_FLAGS:          val = 0x7e; break;
-	case HDLC_TXIDLE_ALT_ZEROS_ONES: val = 0xaa; break;
-	case HDLC_TXIDLE_ZEROS:          val = 0x00; break;
-	case HDLC_TXIDLE_ONES:           val = 0xff; break;
-	case HDLC_TXIDLE_ALT_MARK_SPACE: val = 0xaa; break;
-	case HDLC_TXIDLE_SPACE:          val = 0x00; break;
-	case HDLC_TXIDLE_MARK:           val = 0xff; break;
+	/* if preamble enabled (tcr[6] == 1) then tx idle size = 8 bits
+	 * else tcr[5:4] = tx idle size: 00 = 8 bits, 01 = 16 bits
+	 */
+	tcr = rd_reg16(info, TCR);
+	if (info->idle_mode & HDLC_TXIDLE_CUSTOM_16) {
+		/* disable preamble, set idle size to 16 bits */
+		tcr = (tcr & ~(BIT6 + BIT5)) | BIT4;
+		/* MSB of 16 bit idle specified in tx preamble register (TPR) */
+		wr_reg8(info, TPR, (unsigned char)((info->idle_mode >> 8) & 0xff));
+	} else if (!(tcr & BIT6)) {
+		/* preamble is disabled, set idle size to 8 bits */
+		tcr &= ~(BIT5 + BIT4);
+	}
+	wr_reg16(info, TCR, tcr);
+
+	if (info->idle_mode & (HDLC_TXIDLE_CUSTOM_8 | HDLC_TXIDLE_CUSTOM_16)) {
+		/* LSB of custom tx idle specified in tx idle register */
+		val = (unsigned char)(info->idle_mode & 0xff);
+	} else {
+		/* standard 8 bit idle patterns */
+		switch(info->idle_mode)
+		{
+		case HDLC_TXIDLE_FLAGS:          val = 0x7e; break;
+		case HDLC_TXIDLE_ALT_ZEROS_ONES:
+		case HDLC_TXIDLE_ALT_MARK_SPACE: val = 0xaa; break;
+		case HDLC_TXIDLE_ZEROS:
+		case HDLC_TXIDLE_SPACE:          val = 0x00; break;
+		default:                         val = 0xff;
+		}
 	}
 
 	wr_reg8(info, TIR, val);
diff --git a/include/linux/synclink.h b/include/linux/synclink.h
index 2993302f7923..b0e3535c38bf 100644
--- a/include/linux/synclink.h
+++ b/include/linux/synclink.h
@@ -1,7 +1,7 @@
 /*
  * SyncLink Multiprotocol Serial Adapter Driver
  *
- * $Id: synclink.h,v 3.11 2006/02/06 21:20:29 paulkf Exp $
+ * $Id: synclink.h,v 3.13 2006/05/23 18:25:06 paulkf Exp $
  *
  * Copyright (C) 1998-2000 by Microgate Corporation
  *
@@ -97,6 +97,8 @@
 #define HDLC_TXIDLE_ALT_MARK_SPACE	4
 #define HDLC_TXIDLE_SPACE		5
 #define HDLC_TXIDLE_MARK		6
+#define HDLC_TXIDLE_CUSTOM_8            0x10000000
+#define HDLC_TXIDLE_CUSTOM_16           0x20000000
 
 #define HDLC_ENCODING_NRZ			0
 #define HDLC_ENCODING_NRZB			1
-- 
cgit v1.2.3


From 6f84be84b4cde72fa2a2f0d10ac284a31e923200 Mon Sep 17 00:00:00 2001
From: Paul Fulghum <paulkf@microgate.com>
Date: Sun, 25 Jun 2006 05:49:22 -0700
Subject: [PATCH] synclink_gt: add GT2 adapter support

Add support for SyncLink GT2 adapter to driver.

Signed-off-by: Paul Fulghum <paulkf@microgate.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/char/synclink_gt.c | 8 +++++++-
 include/linux/synclink.h   | 1 +
 2 files changed, 8 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/char/synclink_gt.c b/drivers/char/synclink_gt.c
index 41eab9831ad5..4e35d4181224 100644
--- a/drivers/char/synclink_gt.c
+++ b/drivers/char/synclink_gt.c
@@ -101,6 +101,7 @@ MODULE_LICENSE("GPL");
 
 static struct pci_device_id pci_table[] = {
 	{PCI_VENDOR_ID_MICROGATE, SYNCLINK_GT_DEVICE_ID, PCI_ANY_ID, PCI_ANY_ID,},
+	{PCI_VENDOR_ID_MICROGATE, SYNCLINK_GT2_DEVICE_ID, PCI_ANY_ID, PCI_ANY_ID,},
 	{PCI_VENDOR_ID_MICROGATE, SYNCLINK_GT4_DEVICE_ID, PCI_ANY_ID, PCI_ANY_ID,},
 	{PCI_VENDOR_ID_MICROGATE, SYNCLINK_AC_DEVICE_ID, PCI_ANY_ID, PCI_ANY_ID,},
 	{0,}, /* terminate list */
@@ -3277,6 +3278,9 @@ static void add_device(struct slgt_info *info)
 	case SYNCLINK_GT_DEVICE_ID:
 		devstr = "GT";
 		break;
+	case SYNCLINK_GT2_DEVICE_ID:
+		devstr = "GT2";
+		break;
 	case SYNCLINK_GT4_DEVICE_ID:
 		devstr = "GT4";
 		break;
@@ -3354,7 +3358,9 @@ static void device_init(int adapter_num, struct pci_dev *pdev)
 	int i;
 	int port_count = 1;
 
-	if (pdev->device == SYNCLINK_GT4_DEVICE_ID)
+	if (pdev->device == SYNCLINK_GT2_DEVICE_ID)
+		port_count = 2;
+	else if (pdev->device == SYNCLINK_GT4_DEVICE_ID)
 		port_count = 4;
 
 	/* allocate device instances for all ports */
diff --git a/include/linux/synclink.h b/include/linux/synclink.h
index b0e3535c38bf..0577f5284cbc 100644
--- a/include/linux/synclink.h
+++ b/include/linux/synclink.h
@@ -172,6 +172,7 @@ typedef struct _MGSL_PARAMS
 #define SYNCLINK_GT_DEVICE_ID 0x0070
 #define SYNCLINK_GT4_DEVICE_ID 0x0080
 #define SYNCLINK_AC_DEVICE_ID  0x0090
+#define SYNCLINK_GT2_DEVICE_ID 0x00A0
 #define MGSL_MAX_SERIAL_NUMBER 30
 
 /*
-- 
cgit v1.2.3


From 0e4648141af02331f21aabcd34940c70f09a2d04 Mon Sep 17 00:00:00 2001
From: KaiGai Kohei <kaigai@ak.jp.nec.com>
Date: Sun, 25 Jun 2006 05:49:24 -0700
Subject: [PATCH] pacct: add pacct_struct to fix some pacct bugs.

The pacct facility need an i/o operation when an accounting record is
generated.  There is a possibility to wake OOM killer up.  If OOM killer is
activated, it kills some processes to make them release process memory
regions.

But acct_process() is called in the killed processes context before calling
exit_mm(), so those processes cannot release own memory.  In the results, any
processes stop in this point and it finally cause a system stall.
---
 include/linux/acct.h  |  4 ++++
 include/linux/sched.h |  7 +++++++
 kernel/acct.c         | 51 ++++++++++++++++++++++++++++++++++++---------------
 kernel/exit.c         |  4 +++-
 kernel/fork.c         |  1 +
 5 files changed, 51 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/acct.h b/include/linux/acct.h
index 3d54fbcf969e..5bca9b3ef2d7 100644
--- a/include/linux/acct.h
+++ b/include/linux/acct.h
@@ -121,12 +121,16 @@ struct vfsmount;
 struct super_block;
 extern void acct_auto_close_mnt(struct vfsmount *m);
 extern void acct_auto_close(struct super_block *sb);
+extern void acct_init_pacct(struct pacct_struct *pacct);
+extern void acct_collect();
 extern void acct_process(long exitcode);
 extern void acct_update_integrals(struct task_struct *tsk);
 extern void acct_clear_integrals(struct task_struct *tsk);
 #else
 #define acct_auto_close_mnt(x)	do { } while (0)
 #define acct_auto_close(x)	do { } while (0)
+#define acct_init_pacct(x)	do { } while (0)
+#define acct_collect()		do { } while (0)
 #define acct_process(x)		do { } while (0)
 #define acct_update_integrals(x)		do { } while (0)
 #define acct_clear_integrals(task)	do { } while (0)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 38b4791e6a5d..abada7c1d5e4 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -358,6 +358,10 @@ struct sighand_struct {
 	spinlock_t		siglock;
 };
 
+struct pacct_struct {
+	unsigned long		ac_mem;
+};
+
 /*
  * NOTE! "signal_struct" does not have it's own
  * locking, because a shared signal_struct always
@@ -449,6 +453,9 @@ struct signal_struct {
 	struct key *session_keyring;	/* keyring inherited over fork */
 	struct key *process_keyring;	/* keyring private to this process */
 #endif
+#ifdef CONFIG_BSD_PROCESS_ACCT
+	struct pacct_struct pacct;	/* per-process accounting information */
+#endif
 };
 
 /* Context switch must be unlocked if interrupts are to be enabled */
diff --git a/kernel/acct.c b/kernel/acct.c
index 44dd6bd63517..b35263137824 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -421,9 +421,9 @@ static u32 encode_float(u64 value)
  */
 static void do_acct_process(long exitcode, struct file *file)
 {
+	struct pacct_struct *pacct = &current->signal->pacct;
 	acct_t ac;
 	mm_segment_t fs;
-	unsigned long vsize;
 	unsigned long flim;
 	u64 elapsed;
 	u64 run_time;
@@ -505,20 +505,9 @@ static void do_acct_process(long exitcode, struct file *file)
 		ac.ac_flag |= ACORE;
 	if (current->flags & PF_SIGNALED)
 		ac.ac_flag |= AXSIG;
-
-	vsize = 0;
-	if (current->mm) {
-		struct vm_area_struct *vma;
-		down_read(&current->mm->mmap_sem);
-		vma = current->mm->mmap;
-		while (vma) {
-			vsize += vma->vm_end - vma->vm_start;
-			vma = vma->vm_next;
-		}
-		up_read(&current->mm->mmap_sem);
-	}
-	vsize = vsize / 1024;
-	ac.ac_mem = encode_comp_t(vsize);
+	spin_lock(&current->sighand->siglock);
+	ac.ac_mem = encode_comp_t(pacct->ac_mem);
+	spin_unlock(&current->sighand->siglock);
 	ac.ac_io = encode_comp_t(0 /* current->io_usage */);	/* %% */
 	ac.ac_rw = encode_comp_t(ac.ac_io / 1024);
 	ac.ac_minflt = encode_comp_t(current->signal->min_flt +
@@ -545,6 +534,38 @@ static void do_acct_process(long exitcode, struct file *file)
 	set_fs(fs);
 }
 
+/**
+ * acct_init_pacct - initialize a new pacct_struct
+ */
+void acct_init_pacct(struct pacct_struct *pacct)
+{
+	memset(pacct, 0, sizeof(struct pacct_struct));
+}
+
+/**
+ * acct_collect - collect accounting information into pacct_struct
+ */
+void acct_collect(void)
+{
+	struct pacct_struct *pacct = &current->signal->pacct;
+	unsigned long vsize = 0;
+
+	if (current->mm) {
+		struct vm_area_struct *vma;
+		down_read(&current->mm->mmap_sem);
+		vma = current->mm->mmap;
+		while (vma) {
+			vsize += vma->vm_end - vma->vm_start;
+			vma = vma->vm_next;
+		}
+		up_read(&current->mm->mmap_sem);
+	}
+
+	spin_lock(&current->sighand->siglock);
+	pacct->ac_mem = vsize / 1024;
+	spin_unlock(&current->sighand->siglock);
+}
+
 /**
  * acct_process - now just a wrapper around do_acct_process
  * @exitcode: task exit code
diff --git a/kernel/exit.c b/kernel/exit.c
index 601263c0806f..819d82c2efba 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -894,7 +894,7 @@ fastcall NORET_TYPE void do_exit(long code)
 	if (group_dead) {
  		hrtimer_cancel(&tsk->signal->real_timer);
 		exit_itimers(tsk->signal);
-		acct_process(code);
+		acct_collect();
 	}
 	if (unlikely(tsk->robust_list))
 		exit_robust_list(tsk);
@@ -906,6 +906,8 @@ fastcall NORET_TYPE void do_exit(long code)
 		audit_free(tsk);
 	exit_mm(tsk);
 
+	if (group_dead)
+		acct_process(code);
 	exit_sem(tsk);
 	__exit_files(tsk);
 	__exit_fs(tsk);
diff --git a/kernel/fork.c b/kernel/fork.c
index 49adc0e8d47c..dfd10cb370c3 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -874,6 +874,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
 		tsk->it_prof_expires =
 			secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur);
 	}
+	acct_init_pacct(&sig->pacct);
 
 	return 0;
 }
-- 
cgit v1.2.3


From f6ec29a42d7ac3b309a9cef179b686d23986ab98 Mon Sep 17 00:00:00 2001
From: KaiGai Kohei <kaigai@ak.jp.nec.com>
Date: Sun, 25 Jun 2006 05:49:25 -0700
Subject: [PATCH] pacct: avoidance to refer the last thread as a representation
 of the process

When pacct facility generate an 'ac_flag' field in accounting record, it
refers a task_struct of the thread which died last in the process.  But any
other task_structs are ignored.

Therefore, pacct facility drops ASU flag even if root-privilege operations are
used by any other threads except the last one.  In addition, AFORK flag is
always set when the thread of group-leader didn't die last, although this
process has called execve() after fork().

We have a same matter in ac_exitcode.  The recorded ac_exitcode is an exit
code of the last thread in the process.  There is a possibility this exitcode
is not the group leader's one.
---
 include/linux/acct.h  |  8 ++++----
 include/linux/sched.h |  2 ++
 kernel/acct.c         | 42 ++++++++++++++++++++++++------------------
 kernel/exit.c         |  4 ++--
 4 files changed, 32 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/acct.h b/include/linux/acct.h
index 5bca9b3ef2d7..e86bae7324d2 100644
--- a/include/linux/acct.h
+++ b/include/linux/acct.h
@@ -122,16 +122,16 @@ struct super_block;
 extern void acct_auto_close_mnt(struct vfsmount *m);
 extern void acct_auto_close(struct super_block *sb);
 extern void acct_init_pacct(struct pacct_struct *pacct);
-extern void acct_collect();
-extern void acct_process(long exitcode);
+extern void acct_collect(long exitcode, int group_dead);
+extern void acct_process(void);
 extern void acct_update_integrals(struct task_struct *tsk);
 extern void acct_clear_integrals(struct task_struct *tsk);
 #else
 #define acct_auto_close_mnt(x)	do { } while (0)
 #define acct_auto_close(x)	do { } while (0)
 #define acct_init_pacct(x)	do { } while (0)
-#define acct_collect()		do { } while (0)
-#define acct_process(x)		do { } while (0)
+#define acct_collect(x,y)	do { } while (0)
+#define acct_process()		do { } while (0)
 #define acct_update_integrals(x)		do { } while (0)
 #define acct_clear_integrals(task)	do { } while (0)
 #endif
diff --git a/include/linux/sched.h b/include/linux/sched.h
index abada7c1d5e4..d8429dc250f0 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -359,6 +359,8 @@ struct sighand_struct {
 };
 
 struct pacct_struct {
+	int			ac_flag;
+	long			ac_exitcode;
 	unsigned long		ac_mem;
 };
 
diff --git a/kernel/acct.c b/kernel/acct.c
index b35263137824..4c85fdf615da 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -75,7 +75,7 @@ int acct_parm[3] = {4, 2, 30};
 /*
  * External references and all of the globals.
  */
-static void do_acct_process(long, struct file *);
+static void do_acct_process(struct file *);
 
 /*
  * This structure is used so that all the data protected by lock
@@ -196,7 +196,7 @@ static void acct_file_reopen(struct file *file)
 	if (old_acct) {
 		mnt_unpin(old_acct->f_vfsmnt);
 		spin_unlock(&acct_globals.lock);
-		do_acct_process(0, old_acct);
+		do_acct_process(old_acct);
 		filp_close(old_acct, NULL);
 		spin_lock(&acct_globals.lock);
 	}
@@ -419,7 +419,7 @@ static u32 encode_float(u64 value)
 /*
  *  do_acct_process does all actual work. Caller holds the reference to file.
  */
-static void do_acct_process(long exitcode, struct file *file)
+static void do_acct_process(struct file *file)
 {
 	struct pacct_struct *pacct = &current->signal->pacct;
 	acct_t ac;
@@ -496,17 +496,10 @@ static void do_acct_process(long exitcode, struct file *file)
 		old_encode_dev(tty_devnum(current->signal->tty)) : 0;
 	read_unlock(&tasklist_lock);
 
-	ac.ac_flag = 0;
-	if (current->flags & PF_FORKNOEXEC)
-		ac.ac_flag |= AFORK;
-	if (current->flags & PF_SUPERPRIV)
-		ac.ac_flag |= ASU;
-	if (current->flags & PF_DUMPCORE)
-		ac.ac_flag |= ACORE;
-	if (current->flags & PF_SIGNALED)
-		ac.ac_flag |= AXSIG;
 	spin_lock(&current->sighand->siglock);
+	ac.ac_flag = pacct->ac_flag;
 	ac.ac_mem = encode_comp_t(pacct->ac_mem);
+	ac.ac_exitcode = pacct->ac_exitcode;
 	spin_unlock(&current->sighand->siglock);
 	ac.ac_io = encode_comp_t(0 /* current->io_usage */);	/* %% */
 	ac.ac_rw = encode_comp_t(ac.ac_io / 1024);
@@ -515,7 +508,6 @@ static void do_acct_process(long exitcode, struct file *file)
 	ac.ac_majflt = encode_comp_t(current->signal->maj_flt +
 				     current->maj_flt);
 	ac.ac_swaps = encode_comp_t(0);
-	ac.ac_exitcode = exitcode;
 
 	/*
          * Kernel segment override to datasegment and write it
@@ -544,13 +536,15 @@ void acct_init_pacct(struct pacct_struct *pacct)
 
 /**
  * acct_collect - collect accounting information into pacct_struct
+ * @exitcode: task exit code
+ * @group_dead: not 0, if this thread is the last one in the process.
  */
-void acct_collect(void)
+void acct_collect(long exitcode, int group_dead)
 {
 	struct pacct_struct *pacct = &current->signal->pacct;
 	unsigned long vsize = 0;
 
-	if (current->mm) {
+	if (group_dead && current->mm) {
 		struct vm_area_struct *vma;
 		down_read(&current->mm->mmap_sem);
 		vma = current->mm->mmap;
@@ -562,7 +556,19 @@ void acct_collect(void)
 	}
 
 	spin_lock(&current->sighand->siglock);
-	pacct->ac_mem = vsize / 1024;
+	if (group_dead)
+		pacct->ac_mem = vsize / 1024;
+	if (thread_group_leader(current)) {
+		pacct->ac_exitcode = exitcode;
+		if (current->flags & PF_FORKNOEXEC)
+			pacct->ac_flag |= AFORK;
+	}
+	if (current->flags & PF_SUPERPRIV)
+		pacct->ac_flag |= ASU;
+	if (current->flags & PF_DUMPCORE)
+		pacct->ac_flag |= ACORE;
+	if (current->flags & PF_SIGNALED)
+		pacct->ac_flag |= AXSIG;
 	spin_unlock(&current->sighand->siglock);
 }
 
@@ -572,7 +578,7 @@ void acct_collect(void)
  *
  * handles process accounting for an exiting task
  */
-void acct_process(long exitcode)
+void acct_process()
 {
 	struct file *file = NULL;
 
@@ -591,7 +597,7 @@ void acct_process(long exitcode)
 	get_file(file);
 	spin_unlock(&acct_globals.lock);
 
-	do_acct_process(exitcode, file);
+	do_acct_process(file);
 	fput(file);
 }
 
diff --git a/kernel/exit.c b/kernel/exit.c
index 819d82c2efba..e76bd02e930e 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -894,8 +894,8 @@ fastcall NORET_TYPE void do_exit(long code)
 	if (group_dead) {
  		hrtimer_cancel(&tsk->signal->real_timer);
 		exit_itimers(tsk->signal);
-		acct_collect();
 	}
+	acct_collect(code, group_dead);
 	if (unlikely(tsk->robust_list))
 		exit_robust_list(tsk);
 #if defined(CONFIG_FUTEX) && defined(CONFIG_COMPAT)
@@ -907,7 +907,7 @@ fastcall NORET_TYPE void do_exit(long code)
 	exit_mm(tsk);
 
 	if (group_dead)
-		acct_process(code);
+		acct_process();
 	exit_sem(tsk);
 	__exit_files(tsk);
 	__exit_fs(tsk);
-- 
cgit v1.2.3


From 77787bfb44da6e6166af088226707aeccee27968 Mon Sep 17 00:00:00 2001
From: KaiGai Kohei <kaigai@ak.jp.nec.com>
Date: Sun, 25 Jun 2006 05:49:26 -0700
Subject: [PATCH] pacct: none-delayed process accounting accumulation

In current 2.6.17 implementation, signal_struct refered from task_struct is
used for per-process data structure.  The pacct facility also uses it as a
per-process data structure to store stime, utime, minflt, majflt.  But those
members are saved in __exit_signal().  It's too late.

For example, if some threads exits at same time, pacct facility has a
possibility to drop accountings for a part of those threads.  (see, the
following 'The results of original 2.6.17 kernel') I think accounting
information should be completely collected into the per-process data structure
before writing out an accounting record.

This patch fixes this matter.  Accumulation of stime, utime, minflt and majflt
are done before generating accounting record.

[mingo@elte.hu: fix acct_collect() siglock bug found by lockdep]
Signed-off-by: KaiGai Kohei <kaigai@ak.jp.nec.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/sched.h |  2 ++
 kernel/acct.c         | 24 +++++++++++-------------
 2 files changed, 13 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index d8429dc250f0..8d11d9310db0 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -362,6 +362,8 @@ struct pacct_struct {
 	int			ac_flag;
 	long			ac_exitcode;
 	unsigned long		ac_mem;
+	cputime_t		ac_utime, ac_stime;
+	unsigned long		ac_minflt, ac_majflt;
 };
 
 /*
diff --git a/kernel/acct.c b/kernel/acct.c
index 4c85fdf615da..368c4f03fe0e 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -428,7 +428,6 @@ static void do_acct_process(struct file *file)
 	u64 elapsed;
 	u64 run_time;
 	struct timespec uptime;
-	unsigned long jiffies;
 
 	/*
 	 * First check to see if there is enough free_space to continue
@@ -469,12 +468,6 @@ static void do_acct_process(struct file *file)
 #endif
 	do_div(elapsed, AHZ);
 	ac.ac_btime = xtime.tv_sec - elapsed;
-	jiffies = cputime_to_jiffies(cputime_add(current->utime,
-						 current->signal->utime));
-	ac.ac_utime = encode_comp_t(jiffies_to_AHZ(jiffies));
-	jiffies = cputime_to_jiffies(cputime_add(current->stime,
-						 current->signal->stime));
-	ac.ac_stime = encode_comp_t(jiffies_to_AHZ(jiffies));
 	/* we really need to bite the bullet and change layout */
 	ac.ac_uid = current->uid;
 	ac.ac_gid = current->gid;
@@ -497,16 +490,16 @@ static void do_acct_process(struct file *file)
 	read_unlock(&tasklist_lock);
 
 	spin_lock(&current->sighand->siglock);
+	ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime)));
+	ac.ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime)));
 	ac.ac_flag = pacct->ac_flag;
 	ac.ac_mem = encode_comp_t(pacct->ac_mem);
+	ac.ac_minflt = encode_comp_t(pacct->ac_minflt);
+	ac.ac_majflt = encode_comp_t(pacct->ac_majflt);
 	ac.ac_exitcode = pacct->ac_exitcode;
 	spin_unlock(&current->sighand->siglock);
 	ac.ac_io = encode_comp_t(0 /* current->io_usage */);	/* %% */
 	ac.ac_rw = encode_comp_t(ac.ac_io / 1024);
-	ac.ac_minflt = encode_comp_t(current->signal->min_flt +
-				     current->min_flt);
-	ac.ac_majflt = encode_comp_t(current->signal->maj_flt +
-				     current->maj_flt);
 	ac.ac_swaps = encode_comp_t(0);
 
 	/*
@@ -532,6 +525,7 @@ static void do_acct_process(struct file *file)
 void acct_init_pacct(struct pacct_struct *pacct)
 {
 	memset(pacct, 0, sizeof(struct pacct_struct));
+	pacct->ac_utime = pacct->ac_stime = cputime_zero;
 }
 
 /**
@@ -555,7 +549,7 @@ void acct_collect(long exitcode, int group_dead)
 		up_read(&current->mm->mmap_sem);
 	}
 
-	spin_lock(&current->sighand->siglock);
+	spin_lock_irq(&current->sighand->siglock);
 	if (group_dead)
 		pacct->ac_mem = vsize / 1024;
 	if (thread_group_leader(current)) {
@@ -569,7 +563,11 @@ void acct_collect(long exitcode, int group_dead)
 		pacct->ac_flag |= ACORE;
 	if (current->flags & PF_SIGNALED)
 		pacct->ac_flag |= AXSIG;
-	spin_unlock(&current->sighand->siglock);
+	pacct->ac_utime = cputime_add(pacct->ac_utime, current->utime);
+	pacct->ac_stime = cputime_add(pacct->ac_stime, current->stime);
+	pacct->ac_minflt += current->min_flt;
+	pacct->ac_majflt += current->maj_flt;
+	spin_unlock_irq(&current->sighand->siglock);
 }
 
 /**
-- 
cgit v1.2.3


From d84a84775bba661d5a3fd06757bbb419381937f3 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Sun, 25 Jun 2006 05:49:32 -0700
Subject: [PATCH] Fix "biovec-(256)" in /proc/slabinfo

Stringify does what it was told to do.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/bio.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/bio.h b/include/linux/bio.h
index b60ffe32cd21..76bdaeab6f62 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -47,7 +47,7 @@
 #define BIO_BUG_ON
 #endif
 
-#define BIO_MAX_PAGES		(256)
+#define BIO_MAX_PAGES		256
 #define BIO_MAX_SIZE		(BIO_MAX_PAGES << PAGE_CACHE_SHIFT)
 #define BIO_MAX_SECTORS		(BIO_MAX_SIZE >> 9)
 
-- 
cgit v1.2.3


From 3448097fccdce4ea8f0fcad4f37f502a8cd72e68 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@g5.osdl.org>
Date: Sun, 25 Jun 2006 18:41:00 -0700
Subject: Revert "swsusp special saveable pages support" commits

This reverts commits

  3e3318dee0878d42ed62a19c292a2ac284135db3 [PATCH] swsusp: x86_64 mark special saveable/unsaveable pages
  b6370d96e09944c6e3ae8d5743ca8a8ab1f79f6c [PATCH] swsusp: i386 mark special saveable/unsaveable pages
  ce4ab0012b32c1a4a1d6e934aeb73bf3151c48d9 [PATCH] swsusp: add architecture special saveable pages support

because not only do they apparently cause page faults on x86, the
infrastructure doesn't compile on powerpc.

Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/setup.c   | 106 ------------------------------------------
 arch/x86_64/kernel/setup.c |  95 --------------------------------------
 include/linux/suspend.h    |   1 -
 kernel/power/power.h       |   4 --
 kernel/power/snapshot.c    | 112 ++-------------------------------------------
 kernel/power/swsusp.c      |  18 ++++++--
 6 files changed, 18 insertions(+), 318 deletions(-)

(limited to 'include/linux')

diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c
index 6c1639836e06..6bef9273733e 100644
--- a/arch/i386/kernel/setup.c
+++ b/arch/i386/kernel/setup.c
@@ -48,7 +48,6 @@
 #include <linux/crash_dump.h>
 #include <linux/dmi.h>
 #include <linux/pfn.h>
-#include <linux/suspend.h>
 
 #include <video/edid.h>
 
@@ -1427,111 +1426,6 @@ static void set_mca_bus(int x)
 static void set_mca_bus(int x) { }
 #endif
 
-#ifdef CONFIG_SOFTWARE_SUSPEND
-static void __init mark_nosave_page_range(unsigned long start, unsigned long end)
-{
-	struct page *page;
-	while (start <= end) {
-		page = pfn_to_page(start);
-		SetPageNosave(page);
-		start++;
-	}
-}
-
-static void __init e820_nosave_reserved_pages(void)
-{
-	int i;
-	unsigned long r_start = 0, r_end = 0;
-
-	/* Assume e820 map is sorted */
-	for (i = 0; i < e820.nr_map; i++) {
-		struct e820entry *ei = &e820.map[i];
-		unsigned long start, end;
-
-		start = PFN_DOWN(ei->addr);
-		end = PFN_UP(ei->addr + ei->size);
-		if (start >= end)
-			continue;
-		if (ei->type == E820_RESERVED)
-			continue;
-		r_end = start;
-		/*
-		 * Highmem 'Reserved' pages are marked as reserved, swsusp
-		 * will not save/restore them, so we ignore these pages here.
-		 */
-		if (r_end > max_low_pfn)
-			r_end = max_low_pfn;
-		if (r_end > r_start)
-			mark_nosave_page_range(r_start, r_end-1);
-		if (r_end >= max_low_pfn)
-			break;
-		r_start = end;
-	}
-}
-
-static void __init e820_save_acpi_pages(void)
-{
-	int i;
-
-	/* Assume e820 map is sorted */
-	for (i = 0; i < e820.nr_map; i++) {
-		struct e820entry *ei = &e820.map[i];
-		unsigned long start, end;
-
-		start = ei->addr;
-		end = ei->addr + ei->size;
-		if (start >= end)
-			continue;
-		if (ei->type != E820_ACPI && ei->type != E820_NVS)
-			continue;
-		/*
-		 * If the region is below max_low_pfn, it will be
-		 * saved/restored by swsusp follow 'RAM' type.
-		 */
-		if (start < (max_low_pfn << PAGE_SHIFT))
-			start = max_low_pfn << PAGE_SHIFT;
-		/*
-		 * Highmem pages (ACPI NVS/Data) are reserved, but swsusp
-		 * highmem save/restore will not save/restore them. We marked
-		 * them as arch saveable pages here
-		 */
-		if (end > start)
-			swsusp_add_arch_pages(start, end);
-	}
-}
-
-extern char __start_rodata, __end_rodata;
-/*
- * BIOS reserved region/hole - no save/restore
- * ACPI NVS - save/restore
- * ACPI Data - this is a little tricky, the mem could be used by OS after OS
- * reads tables from the region, but anyway save/restore the memory hasn't any
- * side effect and Linux runtime module load/unload might use it.
- * kernel rodata - no save/restore (kernel rodata isn't changed)
- */
-static int __init mark_nosave_pages(void)
-{
-	unsigned long pfn_start, pfn_end;
-
-	/* FIXME: provide a version for efi BIOS */
-	if (efi_enabled)
-		return 0;
-	/* BIOS reserved regions & holes */
-	e820_nosave_reserved_pages();
-
-	/* kernel rodata */
-	pfn_start = PFN_UP(virt_to_phys(&__start_rodata));
-	pfn_end = PFN_DOWN(virt_to_phys(&__end_rodata));
-	mark_nosave_page_range(pfn_start, pfn_end-1);
-
-	/* record ACPI Data/NVS as saveable */
-	e820_save_acpi_pages();
-
-	return 0;
-}
-core_initcall(mark_nosave_pages);
-#endif
-
 /*
  * Determine if we were loaded by an EFI loader.  If so, then we have also been
  * passed the efi memmap, systab, etc., so we should use these data structures
diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
index fdb82658b1a1..fb850b52b4da 100644
--- a/arch/x86_64/kernel/setup.c
+++ b/arch/x86_64/kernel/setup.c
@@ -47,7 +47,6 @@
 #include <linux/dmi.h>
 #include <linux/dma-mapping.h>
 #include <linux/ctype.h>
-#include <linux/suspend.h>
 
 #include <asm/mtrr.h>
 #include <asm/uaccess.h>
@@ -596,100 +595,6 @@ static void discover_ebda(void)
 		ebda_size = 64*1024;
 }
 
-#ifdef CONFIG_SOFTWARE_SUSPEND
-static void __init mark_nosave_page_range(unsigned long start, unsigned long end)
-{
-	struct page *page;
-	while (start <= end) {
-		page = pfn_to_page(start);
-		SetPageNosave(page);
-		start++;
-	}
-}
-
-static void __init e820_nosave_reserved_pages(void)
-{
-	int i;
-	unsigned long r_start = 0, r_end = 0;
-
-	/* Assume e820 map is sorted */
-	for (i = 0; i < e820.nr_map; i++) {
-		struct e820entry *ei = &e820.map[i];
-		unsigned long start, end;
-
-		start = round_down(ei->addr, PAGE_SIZE);
-		end = round_up(ei->addr + ei->size, PAGE_SIZE);
-		if (start >= end)
-			continue;
-		if (ei->type == E820_RESERVED)
-			continue;
-		r_end = start>>PAGE_SHIFT;
-		/* swsusp ignores invalid pfn, ignore these pages here */
-		if (r_end > end_pfn)
-			r_end = end_pfn;
-		if (r_end > r_start)
-			mark_nosave_page_range(r_start, r_end-1);
-		if (r_end >= end_pfn)
-			break;
-		r_start = end>>PAGE_SHIFT;
-	}
-}
-
-static void __init e820_save_acpi_pages(void)
-{
-	int i;
-
-	/* Assume e820 map is sorted */
-	for (i = 0; i < e820.nr_map; i++) {
-		struct e820entry *ei = &e820.map[i];
-		unsigned long start, end;
-
-		start = ei->addr, PAGE_SIZE;
-		end = ei->addr + ei->size;
-		if (start >= end)
-			continue;
-		if (ei->type != E820_ACPI && ei->type != E820_NVS)
-			continue;
-		/*
-		 * If the region is below end_pfn, it will be
-		 * saved/restored by swsusp follow 'RAM' type.
-		 */
-		if (start < (end_pfn << PAGE_SHIFT))
-			start = end_pfn << PAGE_SHIFT;
-		if (end > start)
-			swsusp_add_arch_pages(start, end);
-	}
-}
-
-extern char __start_rodata, __end_rodata;
-/*
- * BIOS reserved region/hole - no save/restore
- * ACPI NVS - save/restore
- * ACPI Data - this is a little tricky, the mem could be used by OS after OS
- * reads tables from the region, but anyway save/restore the memory hasn't any
- * side effect and Linux runtime module load/unload might use it.
- * kernel rodata - no save/restore (kernel rodata isn't changed)
- */
-static int __init mark_nosave_pages(void)
-{
-	unsigned long pfn_start, pfn_end;
-
-	/* BIOS reserved regions & holes */
-	e820_nosave_reserved_pages();
-
-	/* kernel rodata */
-	pfn_start = round_up(__pa_symbol(&__start_rodata), PAGE_SIZE) >> PAGE_SHIFT;
-	pfn_end = round_down(__pa_symbol(&__end_rodata), PAGE_SIZE) >> PAGE_SHIFT;
-	mark_nosave_page_range(pfn_start, pfn_end-1);
-
-	/* record ACPI Data/NVS as saveable */
-	e820_save_acpi_pages();
-
-	return 0;
-}
-core_initcall(mark_nosave_pages);
-#endif
-
 void __init setup_arch(char **cmdline_p)
 {
 	unsigned long kernel_end;
diff --git a/include/linux/suspend.h b/include/linux/suspend.h
index e82cb10fb3ea..96e31aa64cc7 100644
--- a/include/linux/suspend.h
+++ b/include/linux/suspend.h
@@ -71,7 +71,6 @@ struct saved_context;
 void __save_processor_state(struct saved_context *ctxt);
 void __restore_processor_state(struct saved_context *ctxt);
 unsigned long get_safe_page(gfp_t gfp_mask);
-int swsusp_add_arch_pages(unsigned long start, unsigned long end);
 
 /*
  * XXX: We try to keep some more pages free so that I/O operations succeed
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 98c41423f3b1..57a792982fb9 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -105,10 +105,6 @@ extern struct bitmap_page *alloc_bitmap(unsigned int nr_bits);
 extern unsigned long alloc_swap_page(int swap, struct bitmap_page *bitmap);
 extern void free_all_swap_pages(int swap, struct bitmap_page *bitmap);
 
-extern unsigned int count_special_pages(void);
-extern int save_special_mem(void);
-extern int restore_special_mem(void);
-
 extern int swsusp_check(void);
 extern int swsusp_shrink_memory(void);
 extern void swsusp_free(void);
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 3d9284100b22..24c96f354231 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -39,90 +39,8 @@ static unsigned int nr_copy_pages;
 static unsigned int nr_meta_pages;
 static unsigned long *buffer;
 
-struct arch_saveable_page {
-	unsigned long start;
-	unsigned long end;
-	char *data;
-	struct arch_saveable_page *next;
-};
-static struct arch_saveable_page *arch_pages;
-
-int swsusp_add_arch_pages(unsigned long start, unsigned long end)
-{
-	struct arch_saveable_page *tmp;
-
-	while (start < end) {
-		tmp = kzalloc(sizeof(struct arch_saveable_page), GFP_KERNEL);
-		if (!tmp)
-			return -ENOMEM;
-		tmp->start = start;
-		tmp->end = ((start >> PAGE_SHIFT) + 1) << PAGE_SHIFT;
-		if (tmp->end > end)
-			tmp->end = end;
-		tmp->next = arch_pages;
-		start = tmp->end;
-		arch_pages = tmp;
-	}
-	return 0;
-}
-
-static unsigned int count_arch_pages(void)
-{
-	unsigned int count = 0;
-	struct arch_saveable_page *tmp = arch_pages;
-	while (tmp) {
-		count++;
-		tmp = tmp->next;
-	}
-	return count;
-}
-
-static int save_arch_mem(void)
-{
-	char *kaddr;
-	struct arch_saveable_page *tmp = arch_pages;
-	int offset;
-
-	pr_debug("swsusp: Saving arch specific memory");
-	while (tmp) {
-		tmp->data = (char *)__get_free_page(GFP_ATOMIC);
-		if (!tmp->data)
-			return -ENOMEM;
-		offset = tmp->start - (tmp->start & PAGE_MASK);
-		/* arch pages might haven't a 'struct page' */
-		kaddr = kmap_atomic_pfn(tmp->start >> PAGE_SHIFT, KM_USER0);
-		memcpy(tmp->data + offset, kaddr + offset,
-			tmp->end - tmp->start);
-		kunmap_atomic(kaddr, KM_USER0);
-
-		tmp = tmp->next;
-	}
-	return 0;
-}
-
-static int restore_arch_mem(void)
-{
-	char *kaddr;
-	struct arch_saveable_page *tmp = arch_pages;
-	int offset;
-
-	while (tmp) {
-		if (!tmp->data)
-			continue;
-		offset = tmp->start - (tmp->start & PAGE_MASK);
-		kaddr = kmap_atomic_pfn(tmp->start >> PAGE_SHIFT, KM_USER0);
-		memcpy(kaddr + offset, tmp->data + offset,
-			tmp->end - tmp->start);
-		kunmap_atomic(kaddr, KM_USER0);
-		free_page((long)tmp->data);
-		tmp->data = NULL;
-		tmp = tmp->next;
-	}
-	return 0;
-}
-
 #ifdef CONFIG_HIGHMEM
-static unsigned int count_highmem_pages(void)
+unsigned int count_highmem_pages(void)
 {
 	struct zone *zone;
 	unsigned long zone_pfn;
@@ -199,7 +117,7 @@ static int save_highmem_zone(struct zone *zone)
 	return 0;
 }
 
-static int save_highmem(void)
+int save_highmem(void)
 {
 	struct zone *zone;
 	int res = 0;
@@ -216,7 +134,7 @@ static int save_highmem(void)
 	return 0;
 }
 
-static int restore_highmem(void)
+int restore_highmem(void)
 {
 	printk("swsusp: Restoring Highmem\n");
 	while (highmem_copy) {
@@ -238,29 +156,6 @@ static inline int save_highmem(void) {return 0;}
 static inline int restore_highmem(void) {return 0;}
 #endif
 
-unsigned int count_special_pages(void)
-{
-	return count_arch_pages() + count_highmem_pages();
-}
-
-int save_special_mem(void)
-{
-	int ret;
-	ret = save_arch_mem();
-	if (!ret)
-		ret = save_highmem();
-	return ret;
-}
-
-int restore_special_mem(void)
-{
-	int ret;
-	ret = restore_arch_mem();
-	if (!ret)
-		ret = restore_highmem();
-	return ret;
-}
-
 static int pfn_is_nosave(unsigned long pfn)
 {
 	unsigned long nosave_begin_pfn = __pa(&__nosave_begin) >> PAGE_SHIFT;
@@ -286,6 +181,7 @@ static int saveable(struct zone *zone, unsigned long *zone_pfn)
 		return 0;
 
 	page = pfn_to_page(pfn);
+	BUG_ON(PageReserved(page) && PageNosave(page));
 	if (PageNosave(page))
 		return 0;
 	if (PageReserved(page) && pfn_is_nosave(pfn))
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index f0ee4e7780d6..17f669c83012 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -62,6 +62,16 @@ unsigned long image_size = 500 * 1024 * 1024;
 
 int in_suspend __nosavedata = 0;
 
+#ifdef CONFIG_HIGHMEM
+unsigned int count_highmem_pages(void);
+int save_highmem(void);
+int restore_highmem(void);
+#else
+static inline int save_highmem(void) { return 0; }
+static inline int restore_highmem(void) { return 0; }
+static inline unsigned int count_highmem_pages(void) { return 0; }
+#endif
+
 /**
  *	The following functions are used for tracing the allocated
  *	swap pages, so that they can be freed in case of an error.
@@ -182,7 +192,7 @@ int swsusp_shrink_memory(void)
 
 	printk("Shrinking memory...  ");
 	do {
-		size = 2 * count_special_pages();
+		size = 2 * count_highmem_pages();
 		size += size / 50 + count_data_pages();
 		size += (size + PBES_PER_PAGE - 1) / PBES_PER_PAGE +
 			PAGES_FOR_IO;
@@ -226,7 +236,7 @@ int swsusp_suspend(void)
 		goto Enable_irqs;
 	}
 
-	if ((error = save_special_mem())) {
+	if ((error = save_highmem())) {
 		printk(KERN_ERR "swsusp: Not enough free pages for highmem\n");
 		goto Restore_highmem;
 	}
@@ -237,7 +247,7 @@ int swsusp_suspend(void)
 	/* Restore control flow magically appears here */
 	restore_processor_state();
 Restore_highmem:
-	restore_special_mem();
+	restore_highmem();
 	device_power_up();
 Enable_irqs:
 	local_irq_enable();
@@ -263,7 +273,7 @@ int swsusp_resume(void)
 	 */
 	swsusp_free();
 	restore_processor_state();
-	restore_special_mem();
+	restore_highmem();
 	touch_softlockup_watchdog();
 	device_power_up();
 	local_irq_enable();
-- 
cgit v1.2.3


From f60d2b111cd55c335c2b70e50d66a612d2b10856 Mon Sep 17 00:00:00 2001
From: Dmitry Torokhov <dtor_core@ameritech.net>
Date: Mon, 26 Jun 2006 01:48:36 -0400
Subject: Input: reset name, phys and uniq when unregistering

Name, phys and uniq are quite often constant strings in moules implementing
particular input device. If a module unregisters input device and then gets
unloaded, the device could still be present in memory (pinned via sysfs),
but aforementioned members would point to some random memory. Set them all
to NULL when unregistering so sysfs handlers won't try dereferencing them.

Signed-off-by: Dmitry Torokhov <dtor@mail.ru>
---
 drivers/input/input.c | 19 ++++++++++++++++++-
 include/linux/input.h |  7 +------
 2 files changed, 19 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/input/input.c b/drivers/input/input.c
index b149c9434849..d3cdb139e962 100644
--- a/drivers/input/input.c
+++ b/drivers/input/input.c
@@ -29,6 +29,7 @@ MODULE_DESCRIPTION("Input core");
 MODULE_LICENSE("GPL");
 
 EXPORT_SYMBOL(input_allocate_device);
+EXPORT_SYMBOL(input_free_device);
 EXPORT_SYMBOL(input_register_device);
 EXPORT_SYMBOL(input_unregister_device);
 EXPORT_SYMBOL(input_register_handler);
@@ -872,6 +873,7 @@ struct input_dev *input_allocate_device(void)
 		dev->dynalloc = 1;
 		dev->cdev.class = &input_class;
 		class_device_initialize(&dev->cdev);
+		mutex_init(&dev->mutex);
 		INIT_LIST_HEAD(&dev->h_list);
 		INIT_LIST_HEAD(&dev->node);
 	}
@@ -879,6 +881,18 @@ struct input_dev *input_allocate_device(void)
 	return dev;
 }
 
+void input_free_device(struct input_dev *dev)
+{
+	if (dev) {
+
+		mutex_lock(&dev->mutex);
+		dev->name = dev->phys = dev->uniq = NULL;
+		mutex_unlock(&dev->mutex);
+
+		input_put_device(dev);
+	}
+}
+
 int input_register_device(struct input_dev *dev)
 {
 	static atomic_t input_no = ATOMIC_INIT(0);
@@ -895,7 +909,6 @@ int input_register_device(struct input_dev *dev)
 		return -EINVAL;
 	}
 
-	mutex_init(&dev->mutex);
 	set_bit(EV_SYN, dev->evbit);
 
 	/*
@@ -979,6 +992,10 @@ void input_unregister_device(struct input_dev *dev)
 	sysfs_remove_group(&dev->cdev.kobj, &input_dev_attr_group);
 	class_device_unregister(&dev->cdev);
 
+	mutex_lock(&dev->mutex);
+	dev->name = dev->phys = dev->uniq = NULL;
+	mutex_unlock(&dev->mutex);
+
 	input_wakeup_procfs_readers();
 }
 
diff --git a/include/linux/input.h b/include/linux/input.h
index b32c2b6e53f6..8d2b874d9e72 100644
--- a/include/linux/input.h
+++ b/include/linux/input.h
@@ -1005,6 +1005,7 @@ static inline void init_input_dev(struct input_dev *dev)
 }
 
 struct input_dev *input_allocate_device(void);
+void input_free_device(struct input_dev *dev);
 
 static inline struct input_dev *input_get_device(struct input_dev *dev)
 {
@@ -1016,12 +1017,6 @@ static inline void input_put_device(struct input_dev *dev)
 	class_device_put(&dev->cdev);
 }
 
-static inline void input_free_device(struct input_dev *dev)
-{
-	if (dev)
-		input_put_device(dev);
-}
-
 int input_register_device(struct input_dev *);
 void input_unregister_device(struct input_dev *);
 
-- 
cgit v1.2.3


From b9ab58dd8e771d30df110c56e785db1ae5e073df Mon Sep 17 00:00:00 2001
From: Jerome Pinot <ngc891@gmail.com>
Date: Mon, 26 Jun 2006 01:51:23 -0400
Subject: Input: fix misspelling of Hangeul key

Fix a mispelling of the korean alphabet name in the input subsystem.
See http://en.wikipedia.org/wiki/Hangeul#Names for more details.

KEY_HANGUEL left to not break people

Signed-off-by: Jerome Pinot <ngc891@gmail.com>
Signed-off-by: Dmitry Torokhov <dtor@mail.ru>
---
 drivers/char/keyboard.c        | 2 +-
 drivers/input/keyboard/atkbd.c | 8 ++++----
 drivers/macintosh/adbhid.c     | 2 +-
 drivers/usb/input/hid-debug.h  | 2 +-
 include/linux/input.h          | 3 ++-
 5 files changed, 9 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/char/keyboard.c b/drivers/char/keyboard.c
index 1b63acfc6c78..d82368bc46d0 100644
--- a/drivers/char/keyboard.c
+++ b/drivers/char/keyboard.c
@@ -1074,7 +1074,7 @@ static int emulate_raw(struct vc_data *vc, unsigned int keycode,
 			put_queue(vc, 0x1d | up_flag);
 			put_queue(vc, 0x45 | up_flag);
 			return 0;
-		case KEY_HANGUEL:
+		case KEY_HANGEUL:
 			if (!up_flag) put_queue(vc, 0xf1);
 			return 0;
 		case KEY_HANJA:
diff --git a/drivers/input/keyboard/atkbd.c b/drivers/input/keyboard/atkbd.c
index 0eb955ddc654..1bf61f00cbd2 100644
--- a/drivers/input/keyboard/atkbd.c
+++ b/drivers/input/keyboard/atkbd.c
@@ -150,7 +150,7 @@ static unsigned char atkbd_unxlate_table[128] = {
 #define ATKBD_RET_EMUL0		0xe0
 #define ATKBD_RET_EMUL1		0xe1
 #define ATKBD_RET_RELEASE	0xf0
-#define ATKBD_RET_HANGUEL	0xf1
+#define ATKBD_RET_HANGEUL	0xf1
 #define ATKBD_RET_HANJA		0xf2
 #define ATKBD_RET_ERR		0xff
 
@@ -304,7 +304,7 @@ static irqreturn_t atkbd_interrupt(struct serio *serio, unsigned char data,
 
 		if (atkbd->emul ||
 		    (code != ATKBD_RET_EMUL0 && code != ATKBD_RET_EMUL1 &&
-		     code != ATKBD_RET_HANGUEL && code != ATKBD_RET_HANJA &&
+		     code != ATKBD_RET_HANGEUL && code != ATKBD_RET_HANJA &&
 		     (code != ATKBD_RET_ERR || atkbd->err_xl) &&
 	             (code != ATKBD_RET_BAT || atkbd->bat_xl))) {
 			atkbd->release = code >> 7;
@@ -333,8 +333,8 @@ static irqreturn_t atkbd_interrupt(struct serio *serio, unsigned char data,
 		case ATKBD_RET_RELEASE:
 			atkbd->release = 1;
 			goto out;
-		case ATKBD_RET_HANGUEL:
-			atkbd_report_key(atkbd->dev, regs, KEY_HANGUEL, 3);
+		case ATKBD_RET_HANGEUL:
+			atkbd_report_key(atkbd->dev, regs, KEY_HANGEUL, 3);
 			goto out;
 		case ATKBD_RET_HANJA:
 			atkbd_report_key(atkbd->dev, regs, KEY_HANJA, 3);
diff --git a/drivers/macintosh/adbhid.c b/drivers/macintosh/adbhid.c
index c26e1236b275..cbfbbe2b150a 100644
--- a/drivers/macintosh/adbhid.c
+++ b/drivers/macintosh/adbhid.c
@@ -179,7 +179,7 @@ u8 adb_to_linux_keycodes[128] = {
 	/* 0x65 */ KEY_F9,		/*  67 */
 	/* 0x66 */ KEY_HANJA,		/* 123 */
 	/* 0x67 */ KEY_F11,		/*  87 */
-	/* 0x68 */ KEY_HANGUEL,		/* 122 */
+	/* 0x68 */ KEY_HANGEUL,		/* 122 */
 	/* 0x69 */ KEY_SYSRQ,		/*  99 */
 	/* 0x6a */ 0,
 	/* 0x6b */ KEY_SCROLLLOCK,	/*  70 */
diff --git a/drivers/usb/input/hid-debug.h b/drivers/usb/input/hid-debug.h
index 702c48c2f81b..f04d6d75c098 100644
--- a/drivers/usb/input/hid-debug.h
+++ b/drivers/usb/input/hid-debug.h
@@ -563,7 +563,7 @@ static char *keys[KEY_MAX + 1] = {
 	[KEY_VOLUMEUP] = "VolumeUp",		[KEY_POWER] = "Power",
 	[KEY_KPEQUAL] = "KPEqual",		[KEY_KPPLUSMINUS] = "KPPlusMinus",
 	[KEY_PAUSE] = "Pause",			[KEY_KPCOMMA] = "KPComma",
-	[KEY_HANGUEL] = "Hanguel",		[KEY_HANJA] = "Hanja",
+	[KEY_HANGUEL] = "Hangeul",		[KEY_HANJA] = "Hanja",
 	[KEY_YEN] = "Yen",			[KEY_LEFTMETA] = "LeftMeta",
 	[KEY_RIGHTMETA] = "RightMeta",		[KEY_COMPOSE] = "Compose",
 	[KEY_STOP] = "Stop",			[KEY_AGAIN] = "Again",
diff --git a/include/linux/input.h b/include/linux/input.h
index 8d2b874d9e72..56f1e0e1e598 100644
--- a/include/linux/input.h
+++ b/include/linux/input.h
@@ -232,7 +232,8 @@ struct input_absinfo {
 #define KEY_PAUSE		119
 
 #define KEY_KPCOMMA		121
-#define KEY_HANGUEL		122
+#define KEY_HANGEUL		122
+#define KEY_HANGUEL		KEY_HANGEUL
 #define KEY_HANJA		123
 #define KEY_YEN			124
 #define KEY_LEFTMETA		125
-- 
cgit v1.2.3


From 6048126440dcb3ba01316f961465c0ff5a255dd1 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Sun, 25 Jun 2006 23:58:10 -0700
Subject: [NET]: make net/core/dev.c:netdev_nit static

netdev_nit can now become static.

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 1 -
 net/core/dev.c            | 2 +-
 2 files changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index bc747e5d7138..03cd7551a7a1 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -699,7 +699,6 @@ extern int		dev_hard_start_xmit(struct sk_buff *skb,
 
 extern void		dev_init(void);
 
-extern int		netdev_nit;
 extern int		netdev_budget;
 
 /* Called by rtnetlink.c:rtnl_unlock() */
diff --git a/net/core/dev.c b/net/core/dev.c
index aa8454901719..f1c52cbd6ef7 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -230,7 +230,7 @@ extern void netdev_unregister_sysfs(struct net_device *);
  *	For efficiency
  */
 
-int netdev_nit;
+static int netdev_nit;
 
 /*
  *	Add a protocol ID to the list. Now that the input handler is
-- 
cgit v1.2.3


From 068c6e98bc7ec4419299b38cd40be26ebf4bdeda Mon Sep 17 00:00:00 2001
From: Neil Horman <nhorman@tuxdriver.com>
Date: Mon, 26 Jun 2006 00:04:27 -0700
Subject: [NET] netpoll: break recursive loop in netpoll rx path

The netpoll system currently has a rx to tx path via:

netpoll_rx
 __netpoll_rx
  arp_reply
   netpoll_send_skb
    dev->hard_start_tx

This rx->tx loop places network drivers at risk of inadvertently causing a
deadlock or BUG halt by recursively trying to acquire a spinlock that is
used in both their rx and tx paths (this problem was origionally reported
to me in the 3c59x driver, which shares a spinlock between the
boomerang_interrupt and boomerang_start_xmit routines).

This patch breaks this loop, by queueing arp frames, so that they can be
responded to after all receive operations have been completed.  Tested by
myself and the reported with successful results.

Specifically it was tested with netdump.  Heres the BZ with details:
https://bugzilla.redhat.com/bugzilla/show_bug.cgi?id=194055

Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
Acked-by: Matt Mackall <mpm@selenic.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netpoll.h |  1 +
 net/core/netpoll.c      | 26 ++++++++++++++++++++++++--
 2 files changed, 25 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h
index ca5a8733000f..1efe60c5c00c 100644
--- a/include/linux/netpoll.h
+++ b/include/linux/netpoll.h
@@ -31,6 +31,7 @@ struct netpoll_info {
 	int rx_flags;
 	spinlock_t rx_lock;
 	struct netpoll *rx_np; /* netpoll that registered an rx_hook */
+	struct sk_buff_head arp_tx; /* list of arp requests to reply to */
 };
 
 void netpoll_poll(struct netpoll *np);
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index 377d1e7257b5..471da451cd48 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -54,6 +54,7 @@ static atomic_t trapped;
 				sizeof(struct iphdr) + sizeof(struct ethhdr))
 
 static void zap_completion_queue(void);
+static void arp_reply(struct sk_buff *skb);
 
 static void queue_process(void *p)
 {
@@ -153,6 +154,22 @@ static void poll_napi(struct netpoll *np)
 	}
 }
 
+static void service_arp_queue(struct netpoll_info *npi)
+{
+	struct sk_buff *skb;
+
+	if (unlikely(!npi))
+		return;
+
+	skb = skb_dequeue(&npi->arp_tx);
+
+	while (skb != NULL) {
+		arp_reply(skb);
+		skb = skb_dequeue(&npi->arp_tx);
+	}
+	return;
+}
+
 void netpoll_poll(struct netpoll *np)
 {
 	if(!np->dev || !netif_running(np->dev) || !np->dev->poll_controller)
@@ -163,6 +180,8 @@ void netpoll_poll(struct netpoll *np)
 	if (np->dev->poll)
 		poll_napi(np);
 
+	service_arp_queue(np->dev->npinfo);
+
 	zap_completion_queue();
 }
 
@@ -442,7 +461,9 @@ int __netpoll_rx(struct sk_buff *skb)
 	int proto, len, ulen;
 	struct iphdr *iph;
 	struct udphdr *uh;
-	struct netpoll *np = skb->dev->npinfo->rx_np;
+	struct netpoll_info *npi = skb->dev->npinfo;
+	struct netpoll *np = npi->rx_np;
+
 
 	if (!np)
 		goto out;
@@ -452,7 +473,7 @@ int __netpoll_rx(struct sk_buff *skb)
 	/* check if netpoll clients need ARP */
 	if (skb->protocol == __constant_htons(ETH_P_ARP) &&
 	    atomic_read(&trapped)) {
-		arp_reply(skb);
+		skb_queue_tail(&npi->arp_tx, skb);
 		return 1;
 	}
 
@@ -647,6 +668,7 @@ int netpoll_setup(struct netpoll *np)
 		npinfo->poll_owner = -1;
 		npinfo->tries = MAX_RETRIES;
 		spin_lock_init(&npinfo->rx_lock);
+		skb_queue_head_init(&npinfo->arp_tx);
 	} else
 		npinfo = ndev->npinfo;
 
-- 
cgit v1.2.3


From 6c2bb98bc33ae33c7a33a133a4cd5a06395fece5 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Tue, 16 May 2006 22:09:29 +1000
Subject: [CRYPTO] all: Pass tfm instead of ctx to algorithms

Up until now algorithms have been happy to get a context pointer since
they know everything that's in the tfm already (e.g., alignment, block
size).

However, once we have parameterised algorithms, such information will
be specific to each tfm.  So the algorithm API needs to be changed to
pass the tfm structure instead of the context pointer.

This patch is basically a text substitution.  The only tricky bit is
the assembly routines that need to get the context pointer offset
through asm-offsets.h.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/i386/crypto/aes-i586-asm.S     | 28 +++++++++----------
 arch/i386/crypto/aes.c              | 10 +++----
 arch/i386/kernel/asm-offsets.c      |  3 ++
 arch/s390/crypto/aes_s390.c         | 14 +++++-----
 arch/s390/crypto/des_s390.c         | 42 ++++++++++++++--------------
 arch/s390/crypto/sha1_s390.c        | 15 +++++-----
 arch/s390/crypto/sha256_s390.c      | 13 +++++----
 arch/x86_64/crypto/aes-x86_64-asm.S | 18 +++++++-----
 arch/x86_64/crypto/aes.c            | 10 +++----
 arch/x86_64/kernel/asm-offsets.c    |  3 ++
 crypto/aes.c                        | 14 +++++-----
 crypto/anubis.c                     | 13 ++++-----
 crypto/arc4.c                       |  9 +++---
 crypto/blowfish.c                   | 19 +++++++------
 crypto/cast5.c                      | 14 +++++-----
 crypto/cast6.c                      | 15 +++++-----
 crypto/cipher.c                     | 14 +++++-----
 crypto/compress.c                   |  6 ++--
 crypto/crc32c.c                     | 19 +++++++------
 crypto/crypto_null.c                | 17 ++++++------
 crypto/deflate.c                    | 19 +++++++------
 crypto/des.c                        | 27 ++++++++++--------
 crypto/digest.c                     | 15 ++++------
 crypto/khazad.c                     | 14 +++++-----
 crypto/md4.c                        | 12 ++++----
 crypto/md5.c                        | 12 ++++----
 crypto/michael_mic.c                | 19 +++++++------
 crypto/serpent.c                    | 27 ++++++++++--------
 crypto/sha1.c                       | 17 ++++++------
 crypto/sha256.c                     | 17 ++++++------
 crypto/sha512.c                     | 25 ++++++++---------
 crypto/tea.c                        | 55 ++++++++++++++++++-------------------
 crypto/tgr192.c                     | 29 ++++++++++---------
 crypto/twofish.c                    | 14 +++++-----
 crypto/wp512.c                      | 24 ++++++++--------
 drivers/crypto/padlock-aes.c        | 29 ++++++++++---------
 include/linux/crypto.h              | 29 +++++++++----------
 37 files changed, 349 insertions(+), 331 deletions(-)

(limited to 'include/linux')

diff --git a/arch/i386/crypto/aes-i586-asm.S b/arch/i386/crypto/aes-i586-asm.S
index 2851f7fe51e6..f942f0c8f630 100644
--- a/arch/i386/crypto/aes-i586-asm.S
+++ b/arch/i386/crypto/aes-i586-asm.S
@@ -36,19 +36,19 @@
 .file "aes-i586-asm.S"
 .text
 
-#define tlen 1024   // length of each of 4 'xor' arrays (256 32-bit words)
-
-// offsets to parameters with one register pushed onto stack
+#include <asm/asm-offsets.h>
 
-#define in_blk   16  // input byte array address parameter
-#define out_blk  12  // output byte array address parameter
-#define ctx       8  // AES context structure
+#define tlen 1024   // length of each of 4 'xor' arrays (256 32-bit words)
 
-// offsets in context structure
+/* offsets to parameters with one register pushed onto stack */
+#define tfm 8
+#define out_blk 12
+#define in_blk 16
 
-#define ekey     0   // encryption key schedule base address
-#define nrnd   256   // number of rounds
-#define dkey   260   // decryption key schedule base address
+/* offsets in crypto_tfm structure */
+#define ekey (crypto_tfm_ctx_offset + 0)
+#define nrnd (crypto_tfm_ctx_offset + 256)
+#define dkey (crypto_tfm_ctx_offset + 260)
 
 // register mapping for encrypt and decrypt subroutines
 
@@ -217,7 +217,7 @@
 	do_col (table, r5,r0,r1,r4, r2,r3);		/* idx=r5 */
 
 // AES (Rijndael) Encryption Subroutine
-/* void aes_enc_blk(void *ctx, u8 *out_blk, const u8 *in_blk) */
+/* void aes_enc_blk(struct crypto_tfm *tfm, u8 *out_blk, const u8 *in_blk) */
 
 .global  aes_enc_blk
 
@@ -228,7 +228,7 @@
 
 aes_enc_blk:
 	push    %ebp
-	mov     ctx(%esp),%ebp      // pointer to context
+	mov     tfm(%esp),%ebp
 
 // CAUTION: the order and the values used in these assigns 
 // rely on the register mappings
@@ -293,7 +293,7 @@ aes_enc_blk:
 	ret
 
 // AES (Rijndael) Decryption Subroutine
-/* void aes_dec_blk(void *ctx, u8 *out_blk, const u8 *in_blk) */
+/* void aes_dec_blk(struct crypto_tfm *tfm, u8 *out_blk, const u8 *in_blk) */
 
 .global  aes_dec_blk
 
@@ -304,7 +304,7 @@ aes_enc_blk:
 
 aes_dec_blk:
 	push    %ebp
-	mov     ctx(%esp),%ebp       // pointer to context
+	mov     tfm(%esp),%ebp
 
 // CAUTION: the order and the values used in these assigns 
 // rely on the register mappings
diff --git a/arch/i386/crypto/aes.c b/arch/i386/crypto/aes.c
index a0e033510a3b..b9c7d99160f1 100644
--- a/arch/i386/crypto/aes.c
+++ b/arch/i386/crypto/aes.c
@@ -45,8 +45,8 @@
 #include <linux/crypto.h>
 #include <linux/linkage.h>
 
-asmlinkage void aes_enc_blk(void *ctx, u8 *dst, const u8 *src);
-asmlinkage void aes_dec_blk(void *ctx, u8 *dst, const u8 *src);
+asmlinkage void aes_enc_blk(struct crypto_tfm *tfm, u8 *dst, const u8 *src);
+asmlinkage void aes_dec_blk(struct crypto_tfm *tfm, u8 *dst, const u8 *src);
 
 #define AES_MIN_KEY_SIZE	16
 #define AES_MAX_KEY_SIZE	32
@@ -378,12 +378,12 @@ static void gen_tabs(void)
 	k[8*(i)+11] = ss[3];						\
 }
 
-static int
-aes_set_key(void *ctx_arg, const u8 *in_key, unsigned int key_len, u32 *flags)
+static int aes_set_key(struct crypto_tfm *tfm, const u8 *in_key,
+		       unsigned int key_len, u32 *flags)
 {
 	int i;
 	u32 ss[8];
-	struct aes_ctx *ctx = ctx_arg;
+	struct aes_ctx *ctx = crypto_tfm_ctx(tfm);
 	const __le32 *key = (const __le32 *)in_key;
 
 	/* encryption schedule */
diff --git a/arch/i386/kernel/asm-offsets.c b/arch/i386/kernel/asm-offsets.c
index 36d66e2077d0..1c3a809e6421 100644
--- a/arch/i386/kernel/asm-offsets.c
+++ b/arch/i386/kernel/asm-offsets.c
@@ -4,6 +4,7 @@
  * to extract and format the required data.
  */
 
+#include <linux/crypto.h>
 #include <linux/sched.h>
 #include <linux/signal.h>
 #include <linux/personality.h>
@@ -69,4 +70,6 @@ void foo(void)
 
 	DEFINE(PAGE_SIZE_asm, PAGE_SIZE);
 	DEFINE(VSYSCALL_BASE, __fix_to_virt(FIX_VSYSCALL));
+
+	OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx);
 }
diff --git a/arch/s390/crypto/aes_s390.c b/arch/s390/crypto/aes_s390.c
index c5ca2dc5d428..5713c7e5bd16 100644
--- a/arch/s390/crypto/aes_s390.c
+++ b/arch/s390/crypto/aes_s390.c
@@ -37,10 +37,10 @@ struct s390_aes_ctx {
 	int key_len;
 };
 
-static int aes_set_key(void *ctx, const u8 *in_key, unsigned int key_len,
-		       u32 *flags)
+static int aes_set_key(struct crypto_tfm *tfm, const u8 *in_key,
+		       unsigned int key_len, u32 *flags)
 {
-	struct s390_aes_ctx *sctx = ctx;
+	struct s390_aes_ctx *sctx = crypto_tfm_ctx(tfm);
 
 	switch (key_len) {
 	case 16:
@@ -70,9 +70,9 @@ fail:
 	return -EINVAL;
 }
 
-static void aes_encrypt(void *ctx, u8 *out, const u8 *in)
+static void aes_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
 {
-	const struct s390_aes_ctx *sctx = ctx;
+	const struct s390_aes_ctx *sctx = crypto_tfm_ctx(tfm);
 
 	switch (sctx->key_len) {
 	case 16:
@@ -90,9 +90,9 @@ static void aes_encrypt(void *ctx, u8 *out, const u8 *in)
 	}
 }
 
-static void aes_decrypt(void *ctx, u8 *out, const u8 *in)
+static void aes_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
 {
-	const struct s390_aes_ctx *sctx = ctx;
+	const struct s390_aes_ctx *sctx = crypto_tfm_ctx(tfm);
 
 	switch (sctx->key_len) {
 	case 16:
diff --git a/arch/s390/crypto/des_s390.c b/arch/s390/crypto/des_s390.c
index e3c37aa0a199..b3f7496a79b4 100644
--- a/arch/s390/crypto/des_s390.c
+++ b/arch/s390/crypto/des_s390.c
@@ -44,10 +44,10 @@ struct crypt_s390_des3_192_ctx {
 	u8 key[DES3_192_KEY_SIZE];
 };
 
-static int des_setkey(void *ctx, const u8 *key, unsigned int keylen,
-		      u32 *flags)
+static int des_setkey(struct crypto_tfm *tfm, const u8 *key,
+		      unsigned int keylen, u32 *flags)
 {
-	struct crypt_s390_des_ctx *dctx = ctx;
+	struct crypt_s390_des_ctx *dctx = crypto_tfm_ctx(tfm);
 	int ret;
 
 	/* test if key is valid (not a weak key) */
@@ -57,16 +57,16 @@ static int des_setkey(void *ctx, const u8 *key, unsigned int keylen,
 	return ret;
 }
 
-static void des_encrypt(void *ctx, u8 *out, const u8 *in)
+static void des_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
 {
-	struct crypt_s390_des_ctx *dctx = ctx;
+	struct crypt_s390_des_ctx *dctx = crypto_tfm_ctx(tfm);
 
 	crypt_s390_km(KM_DEA_ENCRYPT, dctx->key, out, in, DES_BLOCK_SIZE);
 }
 
-static void des_decrypt(void *ctx, u8 *out, const u8 *in)
+static void des_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
 {
-	struct crypt_s390_des_ctx *dctx = ctx;
+	struct crypt_s390_des_ctx *dctx = crypto_tfm_ctx(tfm);
 
 	crypt_s390_km(KM_DEA_DECRYPT, dctx->key, out, in, DES_BLOCK_SIZE);
 }
@@ -166,11 +166,11 @@ static struct crypto_alg des_alg = {
  *   Implementers MUST reject keys that exhibit this property.
  *
  */
-static int des3_128_setkey(void *ctx, const u8 *key, unsigned int keylen,
-			   u32 *flags)
+static int des3_128_setkey(struct crypto_tfm *tfm, const u8 *key,
+			   unsigned int keylen, u32 *flags)
 {
 	int i, ret;
-	struct crypt_s390_des3_128_ctx *dctx = ctx;
+	struct crypt_s390_des3_128_ctx *dctx = crypto_tfm_ctx(tfm);
 	const u8* temp_key = key;
 
 	if (!(memcmp(key, &key[DES_KEY_SIZE], DES_KEY_SIZE))) {
@@ -186,17 +186,17 @@ static int des3_128_setkey(void *ctx, const u8 *key, unsigned int keylen,
 	return 0;
 }
 
-static void des3_128_encrypt(void *ctx, u8 *dst, const u8 *src)
+static void des3_128_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
 {
-	struct crypt_s390_des3_128_ctx *dctx = ctx;
+	struct crypt_s390_des3_128_ctx *dctx = crypto_tfm_ctx(tfm);
 
 	crypt_s390_km(KM_TDEA_128_ENCRYPT, dctx->key, dst, (void*)src,
 		      DES3_128_BLOCK_SIZE);
 }
 
-static void des3_128_decrypt(void *ctx, u8 *dst, const u8 *src)
+static void des3_128_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
 {
-	struct crypt_s390_des3_128_ctx *dctx = ctx;
+	struct crypt_s390_des3_128_ctx *dctx = crypto_tfm_ctx(tfm);
 
 	crypt_s390_km(KM_TDEA_128_DECRYPT, dctx->key, dst, (void*)src,
 		      DES3_128_BLOCK_SIZE);
@@ -302,11 +302,11 @@ static struct crypto_alg des3_128_alg = {
  *   property.
  *
  */
-static int des3_192_setkey(void *ctx, const u8 *key, unsigned int keylen,
-			   u32 *flags)
+static int des3_192_setkey(struct crypto_tfm *tfm, const u8 *key,
+			   unsigned int keylen, u32 *flags)
 {
 	int i, ret;
-	struct crypt_s390_des3_192_ctx *dctx = ctx;
+	struct crypt_s390_des3_192_ctx *dctx = crypto_tfm_ctx(tfm);
 	const u8* temp_key = key;
 
 	if (!(memcmp(key, &key[DES_KEY_SIZE], DES_KEY_SIZE) &&
@@ -325,17 +325,17 @@ static int des3_192_setkey(void *ctx, const u8 *key, unsigned int keylen,
 	return 0;
 }
 
-static void des3_192_encrypt(void *ctx, u8 *dst, const u8 *src)
+static void des3_192_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
 {
-	struct crypt_s390_des3_192_ctx *dctx = ctx;
+	struct crypt_s390_des3_192_ctx *dctx = crypto_tfm_ctx(tfm);
 
 	crypt_s390_km(KM_TDEA_192_ENCRYPT, dctx->key, dst, (void*)src,
 		      DES3_192_BLOCK_SIZE);
 }
 
-static void des3_192_decrypt(void *ctx, u8 *dst, const u8 *src)
+static void des3_192_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
 {
-	struct crypt_s390_des3_192_ctx *dctx = ctx;
+	struct crypt_s390_des3_192_ctx *dctx = crypto_tfm_ctx(tfm);
 
 	crypt_s390_km(KM_TDEA_192_DECRYPT, dctx->key, dst, (void*)src,
 		      DES3_192_BLOCK_SIZE);
diff --git a/arch/s390/crypto/sha1_s390.c b/arch/s390/crypto/sha1_s390.c
index 36bb5346a8c4..9d34a35b1aa5 100644
--- a/arch/s390/crypto/sha1_s390.c
+++ b/arch/s390/crypto/sha1_s390.c
@@ -40,9 +40,9 @@ struct crypt_s390_sha1_ctx {
 	u8 buffer[2 * SHA1_BLOCK_SIZE];
 };
 
-static void sha1_init(void *ctx_arg) 
+static void sha1_init(struct crypto_tfm *tfm)
 {
-	struct crypt_s390_sha1_ctx *ctx = ctx_arg;
+	struct crypt_s390_sha1_ctx *ctx = crypto_tfm_ctx(tfm);
 	static const u32 initstate[5] = {
 		0x67452301,
 		0xEFCDAB89,
@@ -56,13 +56,13 @@ static void sha1_init(void *ctx_arg)
 	ctx->buf_len = 0;
 }
 
-static void
-sha1_update(void *ctx, const u8 *data, unsigned int len)
+static void sha1_update(struct crypto_tfm *tfm, const u8 *data,
+			unsigned int len)
 {
 	struct crypt_s390_sha1_ctx *sctx;
 	long imd_len;
 
-	sctx = ctx;
+	sctx = crypto_tfm_ctx(tfm);
 	sctx->count += len * 8; //message bit length
 
 	//anything in buffer yet? -> must be completed
@@ -111,10 +111,9 @@ pad_message(struct crypt_s390_sha1_ctx* sctx)
 }
 
 /* Add padding and return the message digest. */
-static void
-sha1_final(void* ctx, u8 *out)
+static void sha1_final(struct crypto_tfm *tfm, u8 *out)
 {
-	struct crypt_s390_sha1_ctx *sctx = ctx;
+	struct crypt_s390_sha1_ctx *sctx = crypto_tfm_ctx(tfm);
 
 	//must perform manual padding
 	pad_message(sctx);
diff --git a/arch/s390/crypto/sha256_s390.c b/arch/s390/crypto/sha256_s390.c
index 2c76e7bee41c..f573df30f31d 100644
--- a/arch/s390/crypto/sha256_s390.c
+++ b/arch/s390/crypto/sha256_s390.c
@@ -31,9 +31,9 @@ struct s390_sha256_ctx {
 	u8 buf[2 * SHA256_BLOCK_SIZE];
 };
 
-static void sha256_init(void *ctx)
+static void sha256_init(struct crypto_tfm *tfm)
 {
-	struct s390_sha256_ctx *sctx = ctx;
+	struct s390_sha256_ctx *sctx = crypto_tfm_ctx(tfm);
 
 	sctx->state[0] = 0x6a09e667;
 	sctx->state[1] = 0xbb67ae85;
@@ -46,9 +46,10 @@ static void sha256_init(void *ctx)
 	sctx->count = 0;
 }
 
-static void sha256_update(void *ctx, const u8 *data, unsigned int len)
+static void sha256_update(struct crypto_tfm *tfm, const u8 *data,
+			  unsigned int len)
 {
-	struct s390_sha256_ctx *sctx = ctx;
+	struct s390_sha256_ctx *sctx = crypto_tfm_ctx(tfm);
 	unsigned int index;
 	int ret;
 
@@ -107,9 +108,9 @@ static void pad_message(struct s390_sha256_ctx* sctx)
 }
 
 /* Add padding and return the message digest */
-static void sha256_final(void* ctx, u8 *out)
+static void sha256_final(struct crypto_tfm *tfm, u8 *out)
 {
-	struct s390_sha256_ctx *sctx = ctx;
+	struct s390_sha256_ctx *sctx = crypto_tfm_ctx(tfm);
 
 	/* must perform manual padding */
 	pad_message(sctx);
diff --git a/arch/x86_64/crypto/aes-x86_64-asm.S b/arch/x86_64/crypto/aes-x86_64-asm.S
index 483cbb23ab8d..f3ba643e144d 100644
--- a/arch/x86_64/crypto/aes-x86_64-asm.S
+++ b/arch/x86_64/crypto/aes-x86_64-asm.S
@@ -15,6 +15,10 @@
 
 .text
 
+#include <asm/asm-offsets.h>
+
+#define BASE crypto_tfm_ctx_offset
+
 #define R1	%rax
 #define R1E	%eax
 #define R1X	%ax
@@ -46,19 +50,19 @@
 #define R10	%r10
 #define R11	%r11
 
-#define prologue(FUNC,BASE,B128,B192,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11) \
+#define prologue(FUNC,KEY,B128,B192,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11) \
 	.global	FUNC;			\
 	.type	FUNC,@function;		\
 	.align	8;			\
 FUNC:	movq	r1,r2;			\
 	movq	r3,r4;			\
-	leaq	BASE+52(r8),r9;		\
+	leaq	BASE+KEY+52(r8),r9;	\
 	movq	r10,r11;		\
 	movl	(r7),r5 ## E;		\
 	movl	4(r7),r1 ## E;		\
 	movl	8(r7),r6 ## E;		\
 	movl	12(r7),r7 ## E;		\
-	movl	(r8),r10 ## E;		\
+	movl	BASE(r8),r10 ## E;	\
 	xorl	-48(r9),r5 ## E;	\
 	xorl	-44(r9),r1 ## E;	\
 	xorl	-40(r9),r6 ## E;	\
@@ -128,8 +132,8 @@ FUNC:	movq	r1,r2;			\
 	movl	r3 ## E,r1 ## E;	\
 	movl	r4 ## E,r2 ## E;
 
-#define entry(FUNC,BASE,B128,B192) \
-	prologue(FUNC,BASE,B128,B192,R2,R8,R7,R9,R1,R3,R4,R6,R10,R5,R11)
+#define entry(FUNC,KEY,B128,B192) \
+	prologue(FUNC,KEY,B128,B192,R2,R8,R7,R9,R1,R3,R4,R6,R10,R5,R11)
 
 #define return epilogue(R8,R2,R9,R7,R5,R6,R3,R4,R11)
 
@@ -147,7 +151,7 @@ FUNC:	movq	r1,r2;			\
 #define decrypt_final(TAB,OFFSET) \
 	round(TAB,OFFSET,R2,R1,R4,R3,R6,R5,R7,R10,R5,R6,R3,R4)
 
-/* void aes_encrypt(void *ctx, u8 *out, const u8 *in) */
+/* void aes_encrypt(stuct crypto_tfm *tfm, u8 *out, const u8 *in) */
 
 	entry(aes_encrypt,0,enc128,enc192)
 	encrypt_round(aes_ft_tab,-96)
@@ -166,7 +170,7 @@ enc128:	encrypt_round(aes_ft_tab,-32)
 	encrypt_final(aes_fl_tab,112)
 	return
 
-/* void aes_decrypt(void *ctx, u8 *out, const u8 *in) */
+/* void aes_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in) */
 
 	entry(aes_decrypt,240,dec128,dec192)
 	decrypt_round(aes_it_tab,-96)
diff --git a/arch/x86_64/crypto/aes.c b/arch/x86_64/crypto/aes.c
index 6f77e7700d32..d6f8e0463b5d 100644
--- a/arch/x86_64/crypto/aes.c
+++ b/arch/x86_64/crypto/aes.c
@@ -227,10 +227,10 @@ static void __init gen_tabs(void)
 	t ^= E_KEY[8 * i + 7]; E_KEY[8 * i + 15] = t;	\
 }
 
-static int aes_set_key(void *ctx_arg, const u8 *in_key, unsigned int key_len,
-		       u32 *flags)
+static int aes_set_key(struct crypto_tfm *tfm, const u8 *in_key,
+		       unsigned int key_len, u32 *flags)
 {
-	struct aes_ctx *ctx = ctx_arg;
+	struct aes_ctx *ctx = crypto_tfm_ctx(tfm);
 	const __le32 *key = (const __le32 *)in_key;
 	u32 i, j, t, u, v, w;
 
@@ -283,8 +283,8 @@ static int aes_set_key(void *ctx_arg, const u8 *in_key, unsigned int key_len,
 	return 0;
 }
 
-extern void aes_encrypt(void *ctx_arg, u8 *out, const u8 *in);
-extern void aes_decrypt(void *ctx_arg, u8 *out, const u8 *in);
+extern void aes_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in);
+extern void aes_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in);
 
 static struct crypto_alg aes_alg = {
 	.cra_name		=	"aes",
diff --git a/arch/x86_64/kernel/asm-offsets.c b/arch/x86_64/kernel/asm-offsets.c
index 38834bbbae11..96687e2beb2c 100644
--- a/arch/x86_64/kernel/asm-offsets.c
+++ b/arch/x86_64/kernel/asm-offsets.c
@@ -4,6 +4,7 @@
  * and format the required data.
  */
 
+#include <linux/crypto.h>
 #include <linux/sched.h> 
 #include <linux/stddef.h>
 #include <linux/errno.h> 
@@ -68,5 +69,7 @@ int main(void)
 	DEFINE(pbe_next, offsetof(struct pbe, next));
 	BLANK();
 	DEFINE(TSS_ist, offsetof(struct tss_struct, ist));
+	BLANK();
+	DEFINE(crypto_tfm_ctx_offset, offsetof(struct crypto_tfm, __crt_ctx));
 	return 0;
 }
diff --git a/crypto/aes.c b/crypto/aes.c
index a5017292e066..a038711831e7 100644
--- a/crypto/aes.c
+++ b/crypto/aes.c
@@ -248,10 +248,10 @@ gen_tabs (void)
     t ^= E_KEY[8 * i + 7]; E_KEY[8 * i + 15] = t;   \
 }
 
-static int
-aes_set_key(void *ctx_arg, const u8 *in_key, unsigned int key_len, u32 *flags)
+static int aes_set_key(struct crypto_tfm *tfm, const u8 *in_key,
+		       unsigned int key_len, u32 *flags)
 {
-	struct aes_ctx *ctx = ctx_arg;
+	struct aes_ctx *ctx = crypto_tfm_ctx(tfm);
 	const __le32 *key = (const __le32 *)in_key;
 	u32 i, t, u, v, w;
 
@@ -318,9 +318,9 @@ aes_set_key(void *ctx_arg, const u8 *in_key, unsigned int key_len, u32 *flags)
     f_rl(bo, bi, 2, k);     \
     f_rl(bo, bi, 3, k)
 
-static void aes_encrypt(void *ctx_arg, u8 *out, const u8 *in)
+static void aes_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
 {
-	const struct aes_ctx *ctx = ctx_arg;
+	const struct aes_ctx *ctx = crypto_tfm_ctx(tfm);
 	const __le32 *src = (const __le32 *)in;
 	__le32 *dst = (__le32 *)out;
 	u32 b0[4], b1[4];
@@ -373,9 +373,9 @@ static void aes_encrypt(void *ctx_arg, u8 *out, const u8 *in)
     i_rl(bo, bi, 2, k);     \
     i_rl(bo, bi, 3, k)
 
-static void aes_decrypt(void *ctx_arg, u8 *out, const u8 *in)
+static void aes_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
 {
-	const struct aes_ctx *ctx = ctx_arg;
+	const struct aes_ctx *ctx = crypto_tfm_ctx(tfm);
 	const __le32 *src = (const __le32 *)in;
 	__le32 *dst = (__le32 *)out;
 	u32 b0[4], b1[4];
diff --git a/crypto/anubis.c b/crypto/anubis.c
index 2c796bdb91a6..7e2e1a29800e 100644
--- a/crypto/anubis.c
+++ b/crypto/anubis.c
@@ -460,16 +460,15 @@ static const u32 rc[] = {
 	0xf726ffedU, 0xe89d6f8eU, 0x19a0f089U,
 };
 
-static int anubis_setkey(void *ctx_arg, const u8 *in_key,
+static int anubis_setkey(struct crypto_tfm *tfm, const u8 *in_key,
 			 unsigned int key_len, u32 *flags)
 {
+	struct anubis_ctx *ctx = crypto_tfm_ctx(tfm);
 	const __be32 *key = (const __be32 *)in_key;
 	int N, R, i, r;
 	u32 kappa[ANUBIS_MAX_N];
 	u32 inter[ANUBIS_MAX_N];
 
-	struct anubis_ctx *ctx = ctx_arg;
-
 	switch (key_len)
 	{
 		case 16: case 20: case 24: case 28:
@@ -660,15 +659,15 @@ static void anubis_crypt(u32 roundKey[ANUBIS_MAX_ROUNDS + 1][4],
 		dst[i] = cpu_to_be32(inter[i]);
 }
 
-static void anubis_encrypt(void *ctx_arg, u8 *dst, const u8 *src)
+static void anubis_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
 {
-	struct anubis_ctx *ctx = ctx_arg;
+	struct anubis_ctx *ctx = crypto_tfm_ctx(tfm);
 	anubis_crypt(ctx->E, dst, src, ctx->R);
 }
 
-static void anubis_decrypt(void *ctx_arg, u8 *dst, const u8 *src)
+static void anubis_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
 {
-	struct anubis_ctx *ctx = ctx_arg;
+	struct anubis_ctx *ctx = crypto_tfm_ctx(tfm);
 	anubis_crypt(ctx->D, dst, src, ctx->R);
 }
 
diff --git a/crypto/arc4.c b/crypto/arc4.c
index 9efbcaae88a1..5edc6a65b987 100644
--- a/crypto/arc4.c
+++ b/crypto/arc4.c
@@ -24,9 +24,10 @@ struct arc4_ctx {
 	u8 x, y;
 };
 
-static int arc4_set_key(void *ctx_arg, const u8 *in_key, unsigned int key_len, u32 *flags)
+static int arc4_set_key(struct crypto_tfm *tfm, const u8 *in_key,
+			unsigned int key_len, u32 *flags)
 {
-	struct arc4_ctx *ctx = ctx_arg;
+	struct arc4_ctx *ctx = crypto_tfm_ctx(tfm);
 	int i, j = 0, k = 0;
 
 	ctx->x = 1;
@@ -48,9 +49,9 @@ static int arc4_set_key(void *ctx_arg, const u8 *in_key, unsigned int key_len, u
 	return 0;
 }
 
-static void arc4_crypt(void *ctx_arg, u8 *out, const u8 *in)
+static void arc4_crypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
 {
-	struct arc4_ctx *ctx = ctx_arg;
+	struct arc4_ctx *ctx = crypto_tfm_ctx(tfm);
 
 	u8 *const S = ctx->S;
 	u8 x = ctx->x;
diff --git a/crypto/blowfish.c b/crypto/blowfish.c
index 7f710b201f20..490265f42b3b 100644
--- a/crypto/blowfish.c
+++ b/crypto/blowfish.c
@@ -349,7 +349,7 @@ static void encrypt_block(struct bf_ctx *bctx, u32 *dst, u32 *src)
 	dst[1] = yl;
 }
 
-static void bf_encrypt(void *ctx, u8 *dst, const u8 *src)
+static void bf_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
 {
 	const __be32 *in_blk = (const __be32 *)src;
 	__be32 *const out_blk = (__be32 *)dst;
@@ -357,17 +357,18 @@ static void bf_encrypt(void *ctx, u8 *dst, const u8 *src)
 
 	in32[0] = be32_to_cpu(in_blk[0]);
 	in32[1] = be32_to_cpu(in_blk[1]);
-	encrypt_block(ctx, out32, in32);
+	encrypt_block(crypto_tfm_ctx(tfm), out32, in32);
 	out_blk[0] = cpu_to_be32(out32[0]);
 	out_blk[1] = cpu_to_be32(out32[1]);
 }
 
-static void bf_decrypt(void *ctx, u8 *dst, const u8 *src)
+static void bf_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
 {
+	struct bf_ctx *ctx = crypto_tfm_ctx(tfm);
 	const __be32 *in_blk = (const __be32 *)src;
 	__be32 *const out_blk = (__be32 *)dst;
-	const u32 *P = ((struct bf_ctx *)ctx)->p;
-	const u32 *S = ((struct bf_ctx *)ctx)->s;
+	const u32 *P = ctx->p;
+	const u32 *S = ctx->s;
 	u32 yl = be32_to_cpu(in_blk[0]);
 	u32 yr = be32_to_cpu(in_blk[1]);
 
@@ -398,12 +399,14 @@ static void bf_decrypt(void *ctx, u8 *dst, const u8 *src)
 /* 
  * Calculates the blowfish S and P boxes for encryption and decryption.
  */
-static int bf_setkey(void *ctx, const u8 *key, unsigned int keylen, u32 *flags)
+static int bf_setkey(struct crypto_tfm *tfm, const u8 *key,
+		     unsigned int keylen, u32 *flags)
 {
+	struct bf_ctx *ctx = crypto_tfm_ctx(tfm);
+	u32 *P = ctx->p;
+	u32 *S = ctx->s;
 	short i, j, count;
 	u32 data[2], temp;
-	u32 *P = ((struct bf_ctx *)ctx)->p;
-	u32 *S = ((struct bf_ctx *)ctx)->s;
 
 	/* Copy the initialization s-boxes */
 	for (i = 0, count = 0; i < 256; i++)
diff --git a/crypto/cast5.c b/crypto/cast5.c
index 8834c8580c04..08eef58c1d3d 100644
--- a/crypto/cast5.c
+++ b/crypto/cast5.c
@@ -577,9 +577,9 @@ static const u32 sb8[256] = {
     (((s1[I >> 24] + s2[(I>>16)&0xff]) ^ s3[(I>>8)&0xff]) - s4[I&0xff]) )
 
 
-static void cast5_encrypt(void *ctx, u8 * outbuf, const u8 * inbuf)
+static void cast5_encrypt(struct crypto_tfm *tfm, u8 *outbuf, const u8 *inbuf)
 {
-	struct cast5_ctx *c = (struct cast5_ctx *) ctx;
+	struct cast5_ctx *c = crypto_tfm_ctx(tfm);
 	const __be32 *src = (const __be32 *)inbuf;
 	__be32 *dst = (__be32 *)outbuf;
 	u32 l, r, t;
@@ -642,9 +642,9 @@ static void cast5_encrypt(void *ctx, u8 * outbuf, const u8 * inbuf)
 	dst[1] = cpu_to_be32(l);
 }
 
-static void cast5_decrypt(void *ctx, u8 * outbuf, const u8 * inbuf)
+static void cast5_decrypt(struct crypto_tfm *tfm, u8 *outbuf, const u8 *inbuf)
 {
-	struct cast5_ctx *c = (struct cast5_ctx *) ctx;
+	struct cast5_ctx *c = crypto_tfm_ctx(tfm);
 	const __be32 *src = (const __be32 *)inbuf;
 	__be32 *dst = (__be32 *)outbuf;
 	u32 l, r, t;
@@ -769,15 +769,15 @@ static void key_schedule(u32 * x, u32 * z, u32 * k)
 }
 
 
-static int
-cast5_setkey(void *ctx, const u8 * key, unsigned key_len, u32 * flags)
+static int cast5_setkey(struct crypto_tfm *tfm, const u8 *key,
+			unsigned key_len, u32 *flags)
 {
+	struct cast5_ctx *c = crypto_tfm_ctx(tfm);
 	int i;
 	u32 x[4];
 	u32 z[4];
 	u32 k[16];
 	__be32 p_key[4];
-	struct cast5_ctx *c = (struct cast5_ctx *) ctx;
 	
 	if (key_len < 5 || key_len > 16) {
 		*flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
diff --git a/crypto/cast6.c b/crypto/cast6.c
index 9e28740ba775..08e33bfc3ad1 100644
--- a/crypto/cast6.c
+++ b/crypto/cast6.c
@@ -381,13 +381,13 @@ static inline void W(u32 *key, unsigned int i) {
 	key[7] ^= F2(key[0], Tr[i % 4][7], Tm[i][7]);
 }
 
-static int
-cast6_setkey(void *ctx, const u8 * in_key, unsigned key_len, u32 * flags)
+static int cast6_setkey(struct crypto_tfm *tfm, const u8 *in_key,
+			unsigned key_len, u32 *flags)
 {
 	int i;
 	u32 key[8];
 	__be32 p_key[8]; /* padded key */
-	struct cast6_ctx *c = (struct cast6_ctx *) ctx;
+	struct cast6_ctx *c = crypto_tfm_ctx(tfm);
 
 	if (key_len < 16 || key_len > 32 || key_len % 4 != 0) {
 		*flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
@@ -444,8 +444,9 @@ static inline void QBAR (u32 * block, u8 * Kr, u32 * Km) {
         block[2] ^= F1(block[3], Kr[0], Km[0]);
 }
 
-static void cast6_encrypt (void * ctx, u8 * outbuf, const u8 * inbuf) {
-	struct cast6_ctx * c = (struct cast6_ctx *)ctx;
+static void cast6_encrypt(struct crypto_tfm *tfm, u8 *outbuf, const u8 *inbuf)
+{
+	struct cast6_ctx *c = crypto_tfm_ctx(tfm);
 	const __be32 *src = (const __be32 *)inbuf;
 	__be32 *dst = (__be32 *)outbuf;
 	u32 block[4];
@@ -476,8 +477,8 @@ static void cast6_encrypt (void * ctx, u8 * outbuf, const u8 * inbuf) {
 	dst[3] = cpu_to_be32(block[3]);
 }	
 
-static void cast6_decrypt (void * ctx, u8 * outbuf, const u8 * inbuf) {
-	struct cast6_ctx * c = (struct cast6_ctx *)ctx;
+static void cast6_decrypt(struct crypto_tfm *tfm, u8 *outbuf, const u8 *inbuf) {
+	struct cast6_ctx * c = crypto_tfm_ctx(tfm);
 	const __be32 *src = (const __be32 *)inbuf;
 	__be32 *dst = (__be32 *)outbuf;
 	u32 block[4];
diff --git a/crypto/cipher.c b/crypto/cipher.c
index 65bcea0cd17c..b899eb97abd7 100644
--- a/crypto/cipher.c
+++ b/crypto/cipher.c
@@ -187,7 +187,7 @@ static unsigned int cbc_process_encrypt(const struct cipher_desc *desc,
 	void (*xor)(u8 *, const u8 *) = tfm->crt_u.cipher.cit_xor_block;
 	int bsize = crypto_tfm_alg_blocksize(tfm);
 
-	void (*fn)(void *, u8 *, const u8 *) = desc->crfn;
+	void (*fn)(struct crypto_tfm *, u8 *, const u8 *) = desc->crfn;
 	u8 *iv = desc->info;
 	unsigned int done = 0;
 
@@ -195,7 +195,7 @@ static unsigned int cbc_process_encrypt(const struct cipher_desc *desc,
 
 	do {
 		xor(iv, src);
-		fn(crypto_tfm_ctx(tfm), dst, iv);
+		fn(tfm, dst, iv);
 		memcpy(iv, dst, bsize);
 
 		src += bsize;
@@ -218,7 +218,7 @@ static unsigned int cbc_process_decrypt(const struct cipher_desc *desc,
 	u8 *buf = (u8 *)ALIGN((unsigned long)stack, alignmask + 1);
 	u8 **dst_p = src == dst ? &buf : &dst;
 
-	void (*fn)(void *, u8 *, const u8 *) = desc->crfn;
+	void (*fn)(struct crypto_tfm *, u8 *, const u8 *) = desc->crfn;
 	u8 *iv = desc->info;
 	unsigned int done = 0;
 
@@ -227,7 +227,7 @@ static unsigned int cbc_process_decrypt(const struct cipher_desc *desc,
 	do {
 		u8 *tmp_dst = *dst_p;
 
-		fn(crypto_tfm_ctx(tfm), tmp_dst, src);
+		fn(tfm, tmp_dst, src);
 		xor(tmp_dst, iv);
 		memcpy(iv, src, bsize);
 		if (tmp_dst != dst)
@@ -245,13 +245,13 @@ static unsigned int ecb_process(const struct cipher_desc *desc, u8 *dst,
 {
 	struct crypto_tfm *tfm = desc->tfm;
 	int bsize = crypto_tfm_alg_blocksize(tfm);
-	void (*fn)(void *, u8 *, const u8 *) = desc->crfn;
+	void (*fn)(struct crypto_tfm *, u8 *, const u8 *) = desc->crfn;
 	unsigned int done = 0;
 
 	nbytes -= bsize;
 
 	do {
-		fn(crypto_tfm_ctx(tfm), dst, src);
+		fn(tfm, dst, src);
 
 		src += bsize;
 		dst += bsize;
@@ -268,7 +268,7 @@ static int setkey(struct crypto_tfm *tfm, const u8 *key, unsigned int keylen)
 		tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
 		return -EINVAL;
 	} else
-		return cia->cia_setkey(crypto_tfm_ctx(tfm), key, keylen,
+		return cia->cia_setkey(tfm, key, keylen,
 		                       &tfm->crt_flags);
 }
 
diff --git a/crypto/compress.c b/crypto/compress.c
index eb36d9364da3..c12fc0c41dac 100644
--- a/crypto/compress.c
+++ b/crypto/compress.c
@@ -22,8 +22,7 @@ static int crypto_compress(struct crypto_tfm *tfm,
                             const u8 *src, unsigned int slen,
                             u8 *dst, unsigned int *dlen)
 {
-	return tfm->__crt_alg->cra_compress.coa_compress(crypto_tfm_ctx(tfm),
-	                                                 src, slen, dst,
+	return tfm->__crt_alg->cra_compress.coa_compress(tfm, src, slen, dst,
 	                                                 dlen);
 }
 
@@ -31,8 +30,7 @@ static int crypto_decompress(struct crypto_tfm *tfm,
                              const u8 *src, unsigned int slen,
                              u8 *dst, unsigned int *dlen)
 {
-	return tfm->__crt_alg->cra_compress.coa_decompress(crypto_tfm_ctx(tfm),
-	                                                   src, slen, dst,
+	return tfm->__crt_alg->cra_compress.coa_decompress(tfm, src, slen, dst,
 	                                                   dlen);
 }
 
diff --git a/crypto/crc32c.c b/crypto/crc32c.c
index 953362423a5c..f2660123aeb4 100644
--- a/crypto/crc32c.c
+++ b/crypto/crc32c.c
@@ -31,9 +31,9 @@ struct chksum_ctx {
  * crc using table.
  */
 
-static void chksum_init(void *ctx)
+static void chksum_init(struct crypto_tfm *tfm)
 {
-	struct chksum_ctx *mctx = ctx;
+	struct chksum_ctx *mctx = crypto_tfm_ctx(tfm);
 
 	mctx->crc = ~(u32)0;			/* common usage */
 }
@@ -43,10 +43,10 @@ static void chksum_init(void *ctx)
  * If your algorithm starts with ~0, then XOR with ~0 before you set
  * the seed.
  */
-static int chksum_setkey(void *ctx, const u8 *key, unsigned int keylen,
-	                  u32 *flags)
+static int chksum_setkey(struct crypto_tfm *tfm, const u8 *key,
+			 unsigned int keylen, u32 *flags)
 {
-	struct chksum_ctx *mctx = ctx;
+	struct chksum_ctx *mctx = crypto_tfm_ctx(tfm);
 
 	if (keylen != sizeof(mctx->crc)) {
 		if (flags)
@@ -57,9 +57,10 @@ static int chksum_setkey(void *ctx, const u8 *key, unsigned int keylen,
 	return 0;
 }
 
-static void chksum_update(void *ctx, const u8 *data, unsigned int length)
+static void chksum_update(struct crypto_tfm *tfm, const u8 *data,
+			  unsigned int length)
 {
-	struct chksum_ctx *mctx = ctx;
+	struct chksum_ctx *mctx = crypto_tfm_ctx(tfm);
 	u32 mcrc;
 
 	mcrc = crc32c(mctx->crc, data, (size_t)length);
@@ -67,9 +68,9 @@ static void chksum_update(void *ctx, const u8 *data, unsigned int length)
 	mctx->crc = mcrc;
 }
 
-static void chksum_final(void *ctx, u8 *out)
+static void chksum_final(struct crypto_tfm *tfm, u8 *out)
 {
-	struct chksum_ctx *mctx = ctx;
+	struct chksum_ctx *mctx = crypto_tfm_ctx(tfm);
 	u32 mcrc = (mctx->crc ^ ~(u32)0);
 	
 	*(u32 *)out = __le32_to_cpu(mcrc);
diff --git a/crypto/crypto_null.c b/crypto/crypto_null.c
index 3fcf6e887e87..a0d956b52949 100644
--- a/crypto/crypto_null.c
+++ b/crypto/crypto_null.c
@@ -27,8 +27,8 @@
 #define NULL_BLOCK_SIZE		1
 #define NULL_DIGEST_SIZE	0
 
-static int null_compress(void *ctx, const u8 *src, unsigned int slen,
-                         u8 *dst, unsigned int *dlen)
+static int null_compress(struct crypto_tfm *tfm, const u8 *src,
+			 unsigned int slen, u8 *dst, unsigned int *dlen)
 {
 	if (slen > *dlen)
 		return -EINVAL;
@@ -37,20 +37,21 @@ static int null_compress(void *ctx, const u8 *src, unsigned int slen,
 	return 0;
 }
 
-static void null_init(void *ctx)
+static void null_init(struct crypto_tfm *tfm)
 { }
 
-static void null_update(void *ctx, const u8 *data, unsigned int len)
+static void null_update(struct crypto_tfm *tfm, const u8 *data,
+			unsigned int len)
 { }
 
-static void null_final(void *ctx, u8 *out)
+static void null_final(struct crypto_tfm *tfm, u8 *out)
 { }
 
-static int null_setkey(void *ctx, const u8 *key,
-                       unsigned int keylen, u32 *flags)
+static int null_setkey(struct crypto_tfm *tfm, const u8 *key,
+		       unsigned int keylen, u32 *flags)
 { return 0; }
 
-static void null_crypt(void *ctx, u8 *dst, const u8 *src)
+static void null_crypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
 {
 	memcpy(dst, src, NULL_BLOCK_SIZE);
 }
diff --git a/crypto/deflate.c b/crypto/deflate.c
index f209368d62ae..5dd2404ae5b2 100644
--- a/crypto/deflate.c
+++ b/crypto/deflate.c
@@ -102,8 +102,9 @@ static void deflate_decomp_exit(struct deflate_ctx *ctx)
 	kfree(ctx->decomp_stream.workspace);
 }
 
-static int deflate_init(void *ctx)
+static int deflate_init(struct crypto_tfm *tfm)
 {
+	struct deflate_ctx *ctx = crypto_tfm_ctx(tfm);
 	int ret;
 	
 	ret = deflate_comp_init(ctx);
@@ -116,17 +117,19 @@ out:
 	return ret;
 }
 
-static void deflate_exit(void *ctx)
+static void deflate_exit(struct crypto_tfm *tfm)
 {
+	struct deflate_ctx *ctx = crypto_tfm_ctx(tfm);
+
 	deflate_comp_exit(ctx);
 	deflate_decomp_exit(ctx);
 }
 
-static int deflate_compress(void *ctx, const u8 *src, unsigned int slen,
-	                    u8 *dst, unsigned int *dlen)
+static int deflate_compress(struct crypto_tfm *tfm, const u8 *src,
+			    unsigned int slen, u8 *dst, unsigned int *dlen)
 {
 	int ret = 0;
-	struct deflate_ctx *dctx = ctx;
+	struct deflate_ctx *dctx = crypto_tfm_ctx(tfm);
 	struct z_stream_s *stream = &dctx->comp_stream;
 
 	ret = zlib_deflateReset(stream);
@@ -151,12 +154,12 @@ out:
 	return ret;
 }
  
-static int deflate_decompress(void *ctx, const u8 *src, unsigned int slen,
-                              u8 *dst, unsigned int *dlen)
+static int deflate_decompress(struct crypto_tfm *tfm, const u8 *src,
+			      unsigned int slen, u8 *dst, unsigned int *dlen)
 {
 	
 	int ret = 0;
-	struct deflate_ctx *dctx = ctx;
+	struct deflate_ctx *dctx = crypto_tfm_ctx(tfm);
 	struct z_stream_s *stream = &dctx->decomp_stream;
 
 	ret = zlib_inflateReset(stream);
diff --git a/crypto/des.c b/crypto/des.c
index 2d74cab40c3e..a9d3c235a6af 100644
--- a/crypto/des.c
+++ b/crypto/des.c
@@ -783,9 +783,10 @@ static void dkey(u32 *pe, const u8 *k)
 	}
 }
 
-static int des_setkey(void *ctx, const u8 *key, unsigned int keylen, u32 *flags)
+static int des_setkey(struct crypto_tfm *tfm, const u8 *key,
+		      unsigned int keylen, u32 *flags)
 {
-	struct des_ctx *dctx = ctx;
+	struct des_ctx *dctx = crypto_tfm_ctx(tfm);
 	u32 tmp[DES_EXPKEY_WORDS];
 	int ret;
 
@@ -803,9 +804,10 @@ static int des_setkey(void *ctx, const u8 *key, unsigned int keylen, u32 *flags)
 	return 0;
 }
 
-static void des_encrypt(void *ctx, u8 *dst, const u8 *src)
+static void des_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
 {
-	const u32 *K = ((struct des_ctx *)ctx)->expkey;
+	struct des_ctx *ctx = crypto_tfm_ctx(tfm);
+	const u32 *K = ctx->expkey;
 	const __le32 *s = (const __le32 *)src;
 	__le32 *d = (__le32 *)dst;
 	u32 L, R, A, B;
@@ -825,9 +827,10 @@ static void des_encrypt(void *ctx, u8 *dst, const u8 *src)
 	d[1] = cpu_to_le32(L);
 }
 
-static void des_decrypt(void *ctx, u8 *dst, const u8 *src)
+static void des_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
 {
-	const u32 *K = ((struct des_ctx *)ctx)->expkey + DES_EXPKEY_WORDS - 2;
+	struct des_ctx *ctx = crypto_tfm_ctx(tfm);
+	const u32 *K = ctx->expkey + DES_EXPKEY_WORDS - 2;
 	const __le32 *s = (const __le32 *)src;
 	__le32 *d = (__le32 *)dst;
 	u32 L, R, A, B;
@@ -860,11 +863,11 @@ static void des_decrypt(void *ctx, u8 *dst, const u8 *src)
  *   property.
  *
  */
-static int des3_ede_setkey(void *ctx, const u8 *key,
+static int des3_ede_setkey(struct crypto_tfm *tfm, const u8 *key,
 			   unsigned int keylen, u32 *flags)
 {
 	const u32 *K = (const u32 *)key;
-	struct des3_ede_ctx *dctx = ctx;
+	struct des3_ede_ctx *dctx = crypto_tfm_ctx(tfm);
 	u32 *expkey = dctx->expkey;
 
 	if (unlikely(!((K[0] ^ K[2]) | (K[1] ^ K[3])) ||
@@ -881,9 +884,9 @@ static int des3_ede_setkey(void *ctx, const u8 *key,
 	return 0;
 }
 
-static void des3_ede_encrypt(void *ctx, u8 *dst, const u8 *src)
+static void des3_ede_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
 {
-	struct des3_ede_ctx *dctx = ctx;
+	struct des3_ede_ctx *dctx = crypto_tfm_ctx(tfm);
 	const u32 *K = dctx->expkey;
 	const __le32 *s = (const __le32 *)src;
 	__le32 *d = (__le32 *)dst;
@@ -912,9 +915,9 @@ static void des3_ede_encrypt(void *ctx, u8 *dst, const u8 *src)
 	d[1] = cpu_to_le32(L);
 }
 
-static void des3_ede_decrypt(void *ctx, u8 *dst, const u8 *src)
+static void des3_ede_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
 {
-	struct des3_ede_ctx *dctx = ctx;
+	struct des3_ede_ctx *dctx = crypto_tfm_ctx(tfm);
 	const u32 *K = dctx->expkey + DES3_EDE_EXPKEY_WORDS - 2;
 	const __le32 *s = (const __le32 *)src;
 	__le32 *d = (__le32 *)dst;
diff --git a/crypto/digest.c b/crypto/digest.c
index 062d0a5a2c89..2d9d509c2c51 100644
--- a/crypto/digest.c
+++ b/crypto/digest.c
@@ -20,7 +20,7 @@
 
 static void init(struct crypto_tfm *tfm)
 {
-	tfm->__crt_alg->cra_digest.dia_init(crypto_tfm_ctx(tfm));
+	tfm->__crt_alg->cra_digest.dia_init(tfm);
 }
 
 static void update(struct crypto_tfm *tfm,
@@ -46,16 +46,14 @@ static void update(struct crypto_tfm *tfm,
 				unsigned int bytes =
 					alignmask + 1 - (offset & alignmask);
 				bytes = min(bytes, bytes_from_page);
-				tfm->__crt_alg->cra_digest.dia_update
-						(crypto_tfm_ctx(tfm), p,
-						 bytes);
+				tfm->__crt_alg->cra_digest.dia_update(tfm, p,
+								      bytes);
 				p += bytes;
 				bytes_from_page -= bytes;
 				l -= bytes;
 			}
-			tfm->__crt_alg->cra_digest.dia_update
-					(crypto_tfm_ctx(tfm), p,
-					 bytes_from_page);
+			tfm->__crt_alg->cra_digest.dia_update(tfm, p,
+							      bytes_from_page);
 			crypto_kunmap(src, 0);
 			crypto_yield(tfm);
 			offset = 0;
@@ -83,8 +81,7 @@ static int setkey(struct crypto_tfm *tfm, const u8 *key, unsigned int keylen)
 	u32 flags;
 	if (tfm->__crt_alg->cra_digest.dia_setkey == NULL)
 		return -ENOSYS;
-	return tfm->__crt_alg->cra_digest.dia_setkey(crypto_tfm_ctx(tfm),
-						     key, keylen, &flags);
+	return tfm->__crt_alg->cra_digest.dia_setkey(tfm, key, keylen, &flags);
 }
 
 static void digest(struct crypto_tfm *tfm,
diff --git a/crypto/khazad.c b/crypto/khazad.c
index 5b8dc9a2d374..d4c9d3657b36 100644
--- a/crypto/khazad.c
+++ b/crypto/khazad.c
@@ -754,10 +754,10 @@ static const u64 c[KHAZAD_ROUNDS + 1] = {
 	0xccc41d14c363da5dULL, 0x5fdc7dcd7f5a6c5cULL, 0xf726ffede89d6f8eULL
 };
 
-static int khazad_setkey(void *ctx_arg, const u8 *in_key,
-                       unsigned int key_len, u32 *flags)
+static int khazad_setkey(struct crypto_tfm *tfm, const u8 *in_key,
+			 unsigned int key_len, u32 *flags)
 {
-	struct khazad_ctx *ctx = ctx_arg;
+	struct khazad_ctx *ctx = crypto_tfm_ctx(tfm);
 	const __be32 *key = (const __be32 *)in_key;
 	int r;
 	const u64 *S = T7;
@@ -841,15 +841,15 @@ static void khazad_crypt(const u64 roundKey[KHAZAD_ROUNDS + 1],
 	*dst = cpu_to_be64(state);
 }
 
-static void khazad_encrypt(void *ctx_arg, u8 *dst, const u8 *src)
+static void khazad_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
 {
-	struct khazad_ctx *ctx = ctx_arg;
+	struct khazad_ctx *ctx = crypto_tfm_ctx(tfm);
 	khazad_crypt(ctx->E, dst, src);
 }
 
-static void khazad_decrypt(void *ctx_arg, u8 *dst, const u8 *src)
+static void khazad_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
 {
-	struct khazad_ctx *ctx = ctx_arg;
+	struct khazad_ctx *ctx = crypto_tfm_ctx(tfm);
 	khazad_crypt(ctx->D, dst, src);
 }
 
diff --git a/crypto/md4.c b/crypto/md4.c
index a2d6df5c0f8c..c1bc71bdc16b 100644
--- a/crypto/md4.c
+++ b/crypto/md4.c
@@ -152,9 +152,9 @@ static inline void md4_transform_helper(struct md4_ctx *ctx)
 	md4_transform(ctx->hash, ctx->block);
 }
 
-static void md4_init(void *ctx)
+static void md4_init(struct crypto_tfm *tfm)
 {
-	struct md4_ctx *mctx = ctx;
+	struct md4_ctx *mctx = crypto_tfm_ctx(tfm);
 
 	mctx->hash[0] = 0x67452301;
 	mctx->hash[1] = 0xefcdab89;
@@ -163,9 +163,9 @@ static void md4_init(void *ctx)
 	mctx->byte_count = 0;
 }
 
-static void md4_update(void *ctx, const u8 *data, unsigned int len)
+static void md4_update(struct crypto_tfm *tfm, const u8 *data, unsigned int len)
 {
-	struct md4_ctx *mctx = ctx;
+	struct md4_ctx *mctx = crypto_tfm_ctx(tfm);
 	const u32 avail = sizeof(mctx->block) - (mctx->byte_count & 0x3f);
 
 	mctx->byte_count += len;
@@ -193,9 +193,9 @@ static void md4_update(void *ctx, const u8 *data, unsigned int len)
 	memcpy(mctx->block, data, len);
 }
 
-static void md4_final(void *ctx, u8 *out)
+static void md4_final(struct crypto_tfm *tfm, u8 *out)
 {
-	struct md4_ctx *mctx = ctx;
+	struct md4_ctx *mctx = crypto_tfm_ctx(tfm);
 	const unsigned int offset = mctx->byte_count & 0x3f;
 	char *p = (char *)mctx->block + offset;
 	int padding = 56 - (offset + 1);
diff --git a/crypto/md5.c b/crypto/md5.c
index 7f041aef5da2..93d18e8b3d53 100644
--- a/crypto/md5.c
+++ b/crypto/md5.c
@@ -147,9 +147,9 @@ static inline void md5_transform_helper(struct md5_ctx *ctx)
 	md5_transform(ctx->hash, ctx->block);
 }
 
-static void md5_init(void *ctx)
+static void md5_init(struct crypto_tfm *tfm)
 {
-	struct md5_ctx *mctx = ctx;
+	struct md5_ctx *mctx = crypto_tfm_ctx(tfm);
 
 	mctx->hash[0] = 0x67452301;
 	mctx->hash[1] = 0xefcdab89;
@@ -158,9 +158,9 @@ static void md5_init(void *ctx)
 	mctx->byte_count = 0;
 }
 
-static void md5_update(void *ctx, const u8 *data, unsigned int len)
+static void md5_update(struct crypto_tfm *tfm, const u8 *data, unsigned int len)
 {
-	struct md5_ctx *mctx = ctx;
+	struct md5_ctx *mctx = crypto_tfm_ctx(tfm);
 	const u32 avail = sizeof(mctx->block) - (mctx->byte_count & 0x3f);
 
 	mctx->byte_count += len;
@@ -188,9 +188,9 @@ static void md5_update(void *ctx, const u8 *data, unsigned int len)
 	memcpy(mctx->block, data, len);
 }
 
-static void md5_final(void *ctx, u8 *out)
+static void md5_final(struct crypto_tfm *tfm, u8 *out)
 {
-	struct md5_ctx *mctx = ctx;
+	struct md5_ctx *mctx = crypto_tfm_ctx(tfm);
 	const unsigned int offset = mctx->byte_count & 0x3f;
 	char *p = (char *)mctx->block + offset;
 	int padding = 56 - (offset + 1);
diff --git a/crypto/michael_mic.c b/crypto/michael_mic.c
index 701f859ed767..d061da21cfda 100644
--- a/crypto/michael_mic.c
+++ b/crypto/michael_mic.c
@@ -45,16 +45,17 @@ do {				\
 } while (0)
 
 
-static void michael_init(void *ctx)
+static void michael_init(struct crypto_tfm *tfm)
 {
-	struct michael_mic_ctx *mctx = ctx;
+	struct michael_mic_ctx *mctx = crypto_tfm_ctx(tfm);
 	mctx->pending_len = 0;
 }
 
 
-static void michael_update(void *ctx, const u8 *data, unsigned int len)
+static void michael_update(struct crypto_tfm *tfm, const u8 *data,
+			   unsigned int len)
 {
-	struct michael_mic_ctx *mctx = ctx;
+	struct michael_mic_ctx *mctx = crypto_tfm_ctx(tfm);
 	const __le32 *src;
 
 	if (mctx->pending_len) {
@@ -90,9 +91,9 @@ static void michael_update(void *ctx, const u8 *data, unsigned int len)
 }
 
 
-static void michael_final(void *ctx, u8 *out)
+static void michael_final(struct crypto_tfm *tfm, u8 *out)
 {
-	struct michael_mic_ctx *mctx = ctx;
+	struct michael_mic_ctx *mctx = crypto_tfm_ctx(tfm);
 	u8 *data = mctx->pending;
 	__le32 *dst = (__le32 *)out;
 
@@ -121,10 +122,10 @@ static void michael_final(void *ctx, u8 *out)
 }
 
 
-static int michael_setkey(void *ctx, const u8 *key, unsigned int keylen,
-			  u32 *flags)
+static int michael_setkey(struct crypto_tfm *tfm, const u8 *key,
+			  unsigned int keylen, u32 *flags)
 {
-	struct michael_mic_ctx *mctx = ctx;
+	struct michael_mic_ctx *mctx = crypto_tfm_ctx(tfm);
 	const __le32 *data = (const __le32 *)key;
 
 	if (keylen != 8) {
diff --git a/crypto/serpent.c b/crypto/serpent.c
index e366406ab49d..de60cdddbf4a 100644
--- a/crypto/serpent.c
+++ b/crypto/serpent.c
@@ -215,9 +215,11 @@ struct serpent_ctx {
 };
 
 
-static int serpent_setkey(void *ctx, const u8 *key, unsigned int keylen, u32 *flags)
+static int serpent_setkey(struct crypto_tfm *tfm, const u8 *key,
+			  unsigned int keylen, u32 *flags)
 {
-	u32 *k = ((struct serpent_ctx *)ctx)->expkey;
+	struct serpent_ctx *ctx = crypto_tfm_ctx(tfm);
+	u32 *k = ctx->expkey;
 	u8  *k8 = (u8 *)k;
 	u32 r0,r1,r2,r3,r4;
 	int i;
@@ -365,10 +367,11 @@ static int serpent_setkey(void *ctx, const u8 *key, unsigned int keylen, u32 *fl
 	return 0;
 }
 
-static void serpent_encrypt(void *ctx, u8 *dst, const u8 *src)
+static void serpent_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
 {
+	struct serpent_ctx *ctx = crypto_tfm_ctx(tfm);
 	const u32
-		*k = ((struct serpent_ctx *)ctx)->expkey,
+		*k = ctx->expkey,
 		*s = (const u32 *)src;
 	u32	*d = (u32 *)dst,
 		r0, r1, r2, r3, r4;
@@ -423,8 +426,9 @@ static void serpent_encrypt(void *ctx, u8 *dst, const u8 *src)
 	d[3] = cpu_to_le32(r3);
 }
 
-static void serpent_decrypt(void *ctx, u8 *dst, const u8 *src)
+static void serpent_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
 {
+	struct serpent_ctx *ctx = crypto_tfm_ctx(tfm);
 	const u32
 		*k = ((struct serpent_ctx *)ctx)->expkey,
 		*s = (const u32 *)src;
@@ -492,7 +496,8 @@ static struct crypto_alg serpent_alg = {
 	.cia_decrypt  		=	serpent_decrypt } }
 };
 
-static int tnepres_setkey(void *ctx, const u8 *key, unsigned int keylen, u32 *flags)
+static int tnepres_setkey(struct crypto_tfm *tfm, const u8 *key,
+			  unsigned int keylen, u32 *flags)
 {
 	u8 rev_key[SERPENT_MAX_KEY_SIZE];
 	int i;
@@ -506,10 +511,10 @@ static int tnepres_setkey(void *ctx, const u8 *key, unsigned int keylen, u32 *fl
 	for (i = 0; i < keylen; ++i)
 		rev_key[keylen - i - 1] = key[i];
  
-	return serpent_setkey(ctx, rev_key, keylen, flags);
+	return serpent_setkey(tfm, rev_key, keylen, flags);
 }
 
-static void tnepres_encrypt(void *ctx, u8 *dst, const u8 *src)
+static void tnepres_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
 {
 	const u32 * const s = (const u32 * const)src;
 	u32 * const d = (u32 * const)dst;
@@ -521,7 +526,7 @@ static void tnepres_encrypt(void *ctx, u8 *dst, const u8 *src)
 	rs[2] = swab32(s[1]);
 	rs[3] = swab32(s[0]);
 
-	serpent_encrypt(ctx, (u8 *)rd, (u8 *)rs);
+	serpent_encrypt(tfm, (u8 *)rd, (u8 *)rs);
 
 	d[0] = swab32(rd[3]);
 	d[1] = swab32(rd[2]);
@@ -529,7 +534,7 @@ static void tnepres_encrypt(void *ctx, u8 *dst, const u8 *src)
 	d[3] = swab32(rd[0]);
 }
 
-static void tnepres_decrypt(void *ctx, u8 *dst, const u8 *src)
+static void tnepres_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
 {
 	const u32 * const s = (const u32 * const)src;
 	u32 * const d = (u32 * const)dst;
@@ -541,7 +546,7 @@ static void tnepres_decrypt(void *ctx, u8 *dst, const u8 *src)
 	rs[2] = swab32(s[1]);
 	rs[3] = swab32(s[0]);
 
-	serpent_decrypt(ctx, (u8 *)rd, (u8 *)rs);
+	serpent_decrypt(tfm, (u8 *)rd, (u8 *)rs);
 
 	d[0] = swab32(rd[3]);
 	d[1] = swab32(rd[2]);
diff --git a/crypto/sha1.c b/crypto/sha1.c
index b96f57d95a82..6c77b689f87e 100644
--- a/crypto/sha1.c
+++ b/crypto/sha1.c
@@ -34,9 +34,9 @@ struct sha1_ctx {
         u8 buffer[64];
 };
 
-static void sha1_init(void *ctx)
+static void sha1_init(struct crypto_tfm *tfm)
 {
-	struct sha1_ctx *sctx = ctx;
+	struct sha1_ctx *sctx = crypto_tfm_ctx(tfm);
 	static const struct sha1_ctx initstate = {
 	  0,
 	  { 0x67452301, 0xEFCDAB89, 0x98BADCFE, 0x10325476, 0xC3D2E1F0 },
@@ -46,9 +46,10 @@ static void sha1_init(void *ctx)
 	*sctx = initstate;
 }
 
-static void sha1_update(void *ctx, const u8 *data, unsigned int len)
+static void sha1_update(struct crypto_tfm *tfm, const u8 *data,
+			unsigned int len)
 {
-	struct sha1_ctx *sctx = ctx;
+	struct sha1_ctx *sctx = crypto_tfm_ctx(tfm);
 	unsigned int partial, done;
 	const u8 *src;
 
@@ -80,9 +81,9 @@ static void sha1_update(void *ctx, const u8 *data, unsigned int len)
 
 
 /* Add padding and return the message digest. */
-static void sha1_final(void* ctx, u8 *out)
+static void sha1_final(struct crypto_tfm *tfm, u8 *out)
 {
-	struct sha1_ctx *sctx = ctx;
+	struct sha1_ctx *sctx = crypto_tfm_ctx(tfm);
 	__be32 *dst = (__be32 *)out;
 	u32 i, index, padlen;
 	__be64 bits;
@@ -93,10 +94,10 @@ static void sha1_final(void* ctx, u8 *out)
 	/* Pad out to 56 mod 64 */
 	index = sctx->count & 0x3f;
 	padlen = (index < 56) ? (56 - index) : ((64+56) - index);
-	sha1_update(sctx, padding, padlen);
+	sha1_update(tfm, padding, padlen);
 
 	/* Append length */
-	sha1_update(sctx, (const u8 *)&bits, sizeof(bits));
+	sha1_update(tfm, (const u8 *)&bits, sizeof(bits));
 
 	/* Store state in digest */
 	for (i = 0; i < 5; i++)
diff --git a/crypto/sha256.c b/crypto/sha256.c
index 4533a0564895..bc71d85a7d02 100644
--- a/crypto/sha256.c
+++ b/crypto/sha256.c
@@ -230,9 +230,9 @@ static void sha256_transform(u32 *state, const u8 *input)
 	memset(W, 0, 64 * sizeof(u32));
 }
 
-static void sha256_init(void *ctx)
+static void sha256_init(struct crypto_tfm *tfm)
 {
-	struct sha256_ctx *sctx = ctx;
+	struct sha256_ctx *sctx = crypto_tfm_ctx(tfm);
 	sctx->state[0] = H0;
 	sctx->state[1] = H1;
 	sctx->state[2] = H2;
@@ -244,9 +244,10 @@ static void sha256_init(void *ctx)
 	sctx->count[0] = sctx->count[1] = 0;
 }
 
-static void sha256_update(void *ctx, const u8 *data, unsigned int len)
+static void sha256_update(struct crypto_tfm *tfm, const u8 *data,
+			  unsigned int len)
 {
-	struct sha256_ctx *sctx = ctx;
+	struct sha256_ctx *sctx = crypto_tfm_ctx(tfm);
 	unsigned int i, index, part_len;
 
 	/* Compute number of bytes mod 128 */
@@ -276,9 +277,9 @@ static void sha256_update(void *ctx, const u8 *data, unsigned int len)
 	memcpy(&sctx->buf[index], &data[i], len-i);
 }
 
-static void sha256_final(void* ctx, u8 *out)
+static void sha256_final(struct crypto_tfm *tfm, u8 *out)
 {
-	struct sha256_ctx *sctx = ctx;
+	struct sha256_ctx *sctx = crypto_tfm_ctx(tfm);
 	__be32 *dst = (__be32 *)out;
 	__be32 bits[2];
 	unsigned int index, pad_len;
@@ -292,10 +293,10 @@ static void sha256_final(void* ctx, u8 *out)
 	/* Pad out to 56 mod 64. */
 	index = (sctx->count[0] >> 3) & 0x3f;
 	pad_len = (index < 56) ? (56 - index) : ((64+56) - index);
-	sha256_update(sctx, padding, pad_len);
+	sha256_update(tfm, padding, pad_len);
 
 	/* Append length (before padding) */
-	sha256_update(sctx, (const u8 *)bits, sizeof(bits));
+	sha256_update(tfm, (const u8 *)bits, sizeof(bits));
 
 	/* Store state in digest */
 	for (i = 0; i < 8; i++)
diff --git a/crypto/sha512.c b/crypto/sha512.c
index bc77a66d9de2..2dfe7f170b48 100644
--- a/crypto/sha512.c
+++ b/crypto/sha512.c
@@ -161,9 +161,9 @@ sha512_transform(u64 *state, u64 *W, const u8 *input)
 }
 
 static void
-sha512_init(void *ctx)
+sha512_init(struct crypto_tfm *tfm)
 {
-        struct sha512_ctx *sctx = ctx;
+	struct sha512_ctx *sctx = crypto_tfm_ctx(tfm);
 	sctx->state[0] = H0;
 	sctx->state[1] = H1;
 	sctx->state[2] = H2;
@@ -176,9 +176,9 @@ sha512_init(void *ctx)
 }
 
 static void
-sha384_init(void *ctx)
+sha384_init(struct crypto_tfm *tfm)
 {
-        struct sha512_ctx *sctx = ctx;
+	struct sha512_ctx *sctx = crypto_tfm_ctx(tfm);
         sctx->state[0] = HP0;
         sctx->state[1] = HP1;
         sctx->state[2] = HP2;
@@ -191,9 +191,9 @@ sha384_init(void *ctx)
 }
 
 static void
-sha512_update(void *ctx, const u8 *data, unsigned int len)
+sha512_update(struct crypto_tfm *tfm, const u8 *data, unsigned int len)
 {
-        struct sha512_ctx *sctx = ctx;
+	struct sha512_ctx *sctx = crypto_tfm_ctx(tfm);
 
 	unsigned int i, index, part_len;
 
@@ -231,9 +231,9 @@ sha512_update(void *ctx, const u8 *data, unsigned int len)
 }
 
 static void
-sha512_final(void *ctx, u8 *hash)
+sha512_final(struct crypto_tfm *tfm, u8 *hash)
 {
-        struct sha512_ctx *sctx = ctx;
+	struct sha512_ctx *sctx = crypto_tfm_ctx(tfm);
         static u8 padding[128] = { 0x80, };
 	__be64 *dst = (__be64 *)hash;
 	__be32 bits[4];
@@ -249,10 +249,10 @@ sha512_final(void *ctx, u8 *hash)
 	/* Pad out to 112 mod 128. */
 	index = (sctx->count[0] >> 3) & 0x7f;
 	pad_len = (index < 112) ? (112 - index) : ((128+112) - index);
-	sha512_update(sctx, padding, pad_len);
+	sha512_update(tfm, padding, pad_len);
 
 	/* Append length (before padding) */
-	sha512_update(sctx, (const u8 *)bits, sizeof(bits));
+	sha512_update(tfm, (const u8 *)bits, sizeof(bits));
 
 	/* Store state in digest */
 	for (i = 0; i < 8; i++)
@@ -262,12 +262,11 @@ sha512_final(void *ctx, u8 *hash)
 	memset(sctx, 0, sizeof(struct sha512_ctx));
 }
 
-static void sha384_final(void *ctx, u8 *hash)
+static void sha384_final(struct crypto_tfm *tfm, u8 *hash)
 {
-        struct sha512_ctx *sctx = ctx;
         u8 D[64];
 
-        sha512_final(sctx, D);
+	sha512_final(tfm, D);
 
         memcpy(hash, D, 48);
         memset(D, 0, 64);
diff --git a/crypto/tea.c b/crypto/tea.c
index a6a02b30e470..5367adc82fc9 100644
--- a/crypto/tea.c
+++ b/crypto/tea.c
@@ -45,10 +45,10 @@ struct xtea_ctx {
 	u32 KEY[4];
 };
 
-static int tea_setkey(void *ctx_arg, const u8 *in_key,
-                       unsigned int key_len, u32 *flags)
-{ 
-	struct tea_ctx *ctx = ctx_arg;
+static int tea_setkey(struct crypto_tfm *tfm, const u8 *in_key,
+		      unsigned int key_len, u32 *flags)
+{
+	struct tea_ctx *ctx = crypto_tfm_ctx(tfm);
 	const __le32 *key = (const __le32 *)in_key;
 	
 	if (key_len != 16)
@@ -66,12 +66,11 @@ static int tea_setkey(void *ctx_arg, const u8 *in_key,
 
 }
 
-static void tea_encrypt(void *ctx_arg, u8 *dst, const u8 *src)
-{ 
+static void tea_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
+{
 	u32 y, z, n, sum = 0;
 	u32 k0, k1, k2, k3;
-
-	struct tea_ctx *ctx = ctx_arg;
+	struct tea_ctx *ctx = crypto_tfm_ctx(tfm);
 	const __le32 *in = (const __le32 *)src;
 	__le32 *out = (__le32 *)dst;
 
@@ -95,11 +94,11 @@ static void tea_encrypt(void *ctx_arg, u8 *dst, const u8 *src)
 	out[1] = cpu_to_le32(z);
 }
 
-static void tea_decrypt(void *ctx_arg, u8 *dst, const u8 *src)
-{ 
+static void tea_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
+{
 	u32 y, z, n, sum;
 	u32 k0, k1, k2, k3;
-	struct tea_ctx *ctx = ctx_arg;
+	struct tea_ctx *ctx = crypto_tfm_ctx(tfm);
 	const __le32 *in = (const __le32 *)src;
 	__le32 *out = (__le32 *)dst;
 
@@ -125,10 +124,10 @@ static void tea_decrypt(void *ctx_arg, u8 *dst, const u8 *src)
 	out[1] = cpu_to_le32(z);
 }
 
-static int xtea_setkey(void *ctx_arg, const u8 *in_key,
-                       unsigned int key_len, u32 *flags)
-{ 
-	struct xtea_ctx *ctx = ctx_arg;
+static int xtea_setkey(struct crypto_tfm *tfm, const u8 *in_key,
+		       unsigned int key_len, u32 *flags)
+{
+	struct xtea_ctx *ctx = crypto_tfm_ctx(tfm);
 	const __le32 *key = (const __le32 *)in_key;
 	
 	if (key_len != 16)
@@ -146,12 +145,11 @@ static int xtea_setkey(void *ctx_arg, const u8 *in_key,
 
 }
 
-static void xtea_encrypt(void *ctx_arg, u8 *dst, const u8 *src)
-{ 
+static void xtea_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
+{
 	u32 y, z, sum = 0;
 	u32 limit = XTEA_DELTA * XTEA_ROUNDS;
-
-	struct xtea_ctx *ctx = ctx_arg;
+	struct xtea_ctx *ctx = crypto_tfm_ctx(tfm);
 	const __le32 *in = (const __le32 *)src;
 	__le32 *out = (__le32 *)dst;
 
@@ -168,10 +166,10 @@ static void xtea_encrypt(void *ctx_arg, u8 *dst, const u8 *src)
 	out[1] = cpu_to_le32(z);
 }
 
-static void xtea_decrypt(void *ctx_arg, u8 *dst, const u8 *src)
-{ 
+static void xtea_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
+{
 	u32 y, z, sum;
-	struct tea_ctx *ctx = ctx_arg;
+	struct tea_ctx *ctx = crypto_tfm_ctx(tfm);
 	const __le32 *in = (const __le32 *)src;
 	__le32 *out = (__le32 *)dst;
 
@@ -191,12 +189,11 @@ static void xtea_decrypt(void *ctx_arg, u8 *dst, const u8 *src)
 }
 
 
-static void xeta_encrypt(void *ctx_arg, u8 *dst, const u8 *src)
-{ 
+static void xeta_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
+{
 	u32 y, z, sum = 0;
 	u32 limit = XTEA_DELTA * XTEA_ROUNDS;
-
-	struct xtea_ctx *ctx = ctx_arg;
+	struct xtea_ctx *ctx = crypto_tfm_ctx(tfm);
 	const __le32 *in = (const __le32 *)src;
 	__le32 *out = (__le32 *)dst;
 
@@ -213,10 +210,10 @@ static void xeta_encrypt(void *ctx_arg, u8 *dst, const u8 *src)
 	out[1] = cpu_to_le32(z);
 }
 
-static void xeta_decrypt(void *ctx_arg, u8 *dst, const u8 *src)
-{ 
+static void xeta_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
+{
 	u32 y, z, sum;
-	struct tea_ctx *ctx = ctx_arg;
+	struct tea_ctx *ctx = crypto_tfm_ctx(tfm);
 	const __le32 *in = (const __le32 *)src;
 	__le32 *out = (__le32 *)dst;
 
diff --git a/crypto/tgr192.c b/crypto/tgr192.c
index 004bb841cc5b..a0fadf3dd3e2 100644
--- a/crypto/tgr192.c
+++ b/crypto/tgr192.c
@@ -496,9 +496,9 @@ static void tgr192_transform(struct tgr192_ctx *tctx, const u8 * data)
 	tctx->c = c;
 }
 
-static void tgr192_init(void *ctx)
+static void tgr192_init(struct crypto_tfm *tfm)
 {
-	struct tgr192_ctx *tctx = ctx;
+	struct tgr192_ctx *tctx = crypto_tfm_ctx(tfm);
 
 	tctx->a = 0x0123456789abcdefULL;
 	tctx->b = 0xfedcba9876543210ULL;
@@ -510,9 +510,10 @@ static void tgr192_init(void *ctx)
 
 /* Update the message digest with the contents
  * of INBUF with length INLEN. */
-static void tgr192_update(void *ctx, const u8 * inbuf, unsigned int len)
+static void tgr192_update(struct crypto_tfm *tfm, const u8 *inbuf,
+			  unsigned int len)
 {
-	struct tgr192_ctx *tctx = ctx;
+	struct tgr192_ctx *tctx = crypto_tfm_ctx(tfm);
 
 	if (tctx->count == 64) {	/* flush the buffer */
 		tgr192_transform(tctx, tctx->hash);
@@ -526,7 +527,7 @@ static void tgr192_update(void *ctx, const u8 * inbuf, unsigned int len)
 		for (; len && tctx->count < 64; len--) {
 			tctx->hash[tctx->count++] = *inbuf++;
 		}
-		tgr192_update(tctx, NULL, 0);
+		tgr192_update(tfm, NULL, 0);
 		if (!len) {
 			return;
 		}
@@ -548,15 +549,15 @@ static void tgr192_update(void *ctx, const u8 * inbuf, unsigned int len)
 
 
 /* The routine terminates the computation */
-static void tgr192_final(void *ctx, u8 * out)
+static void tgr192_final(struct crypto_tfm *tfm, u8 * out)
 {
-	struct tgr192_ctx *tctx = ctx;
+	struct tgr192_ctx *tctx = crypto_tfm_ctx(tfm);
 	__be64 *dst = (__be64 *)out;
 	__be64 *be64p;
 	__le32 *le32p;
 	u32 t, msb, lsb;
 
-	tgr192_update(tctx, NULL, 0); /* flush */ ;
+	tgr192_update(tfm, NULL, 0); /* flush */ ;
 
 	msb = 0;
 	t = tctx->nblocks;
@@ -584,7 +585,7 @@ static void tgr192_final(void *ctx, u8 * out)
 		while (tctx->count < 64) {
 			tctx->hash[tctx->count++] = 0;
 		}
-		tgr192_update(tctx, NULL, 0); /* flush */ ;
+		tgr192_update(tfm, NULL, 0); /* flush */ ;
 		memset(tctx->hash, 0, 56);    /* fill next block with zeroes */
 	}
 	/* append the 64 bit count */
@@ -600,22 +601,20 @@ static void tgr192_final(void *ctx, u8 * out)
 	dst[2] = be64p[2] = cpu_to_be64(tctx->c);
 }
 
-static void tgr160_final(void *ctx, u8 * out)
+static void tgr160_final(struct crypto_tfm *tfm, u8 * out)
 {
-	struct tgr192_ctx *wctx = ctx;
 	u8 D[64];
 
-	tgr192_final(wctx, D);
+	tgr192_final(tfm, D);
 	memcpy(out, D, TGR160_DIGEST_SIZE);
 	memset(D, 0, TGR192_DIGEST_SIZE);
 }
 
-static void tgr128_final(void *ctx, u8 * out)
+static void tgr128_final(struct crypto_tfm *tfm, u8 * out)
 {
-	struct tgr192_ctx *wctx = ctx;
 	u8 D[64];
 
-	tgr192_final(wctx, D);
+	tgr192_final(tfm, D);
 	memcpy(out, D, TGR128_DIGEST_SIZE);
 	memset(D, 0, TGR192_DIGEST_SIZE);
 }
diff --git a/crypto/twofish.c b/crypto/twofish.c
index ddfd5a3fcc5f..ec2488242e2d 100644
--- a/crypto/twofish.c
+++ b/crypto/twofish.c
@@ -643,11 +643,11 @@ struct twofish_ctx {
 };
 
 /* Perform the key setup. */
-static int twofish_setkey(void *cx, const u8 *key,
-                          unsigned int key_len, u32 *flags)
+static int twofish_setkey(struct crypto_tfm *tfm, const u8 *key,
+			  unsigned int key_len, u32 *flags)
 {
 	
-	struct twofish_ctx *ctx = cx;
+	struct twofish_ctx *ctx = crypto_tfm_ctx(tfm);
 
 	int i, j, k;
 
@@ -802,9 +802,9 @@ static int twofish_setkey(void *cx, const u8 *key,
 }
 
 /* Encrypt one block.  in and out may be the same. */
-static void twofish_encrypt(void *cx, u8 *out, const u8 *in)
+static void twofish_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
 {
-	struct twofish_ctx *ctx = cx;
+	struct twofish_ctx *ctx = crypto_tfm_ctx(tfm);
 	const __le32 *src = (const __le32 *)in;
 	__le32 *dst = (__le32 *)out;
 
@@ -839,9 +839,9 @@ static void twofish_encrypt(void *cx, u8 *out, const u8 *in)
 }
 
 /* Decrypt one block.  in and out may be the same. */
-static void twofish_decrypt(void *cx, u8 *out, const u8 *in)
+static void twofish_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
 {
-	struct twofish_ctx *ctx = cx;
+	struct twofish_ctx *ctx = crypto_tfm_ctx(tfm);
 	const __le32 *src = (const __le32 *)in;
 	__le32 *dst = (__le32 *)out;
   
diff --git a/crypto/wp512.c b/crypto/wp512.c
index b226a126cfae..727d05a19ff4 100644
--- a/crypto/wp512.c
+++ b/crypto/wp512.c
@@ -981,9 +981,9 @@ static void wp512_process_buffer(struct wp512_ctx *wctx) {
 
 }
 
-static void wp512_init (void *ctx) {
+static void wp512_init(struct crypto_tfm *tfm) {
+	struct wp512_ctx *wctx = crypto_tfm_ctx(tfm);
 	int i;
-	struct wp512_ctx *wctx = ctx;
 
 	memset(wctx->bitLength, 0, 32);
 	wctx->bufferBits = wctx->bufferPos = 0;
@@ -993,10 +993,10 @@ static void wp512_init (void *ctx) {
 	}
 }
 
-static void wp512_update(void *ctx, const u8 *source, unsigned int len)
+static void wp512_update(struct crypto_tfm *tfm, const u8 *source,
+			 unsigned int len)
 {
-
-	struct wp512_ctx *wctx = ctx;
+	struct wp512_ctx *wctx = crypto_tfm_ctx(tfm);
 	int sourcePos    = 0;
 	unsigned int bits_len = len * 8; // convert to number of bits
 	int sourceGap    = (8 - ((int)bits_len & 7)) & 7;
@@ -1054,9 +1054,9 @@ static void wp512_update(void *ctx, const u8 *source, unsigned int len)
 
 }
 
-static void wp512_final(void *ctx, u8 *out)
+static void wp512_final(struct crypto_tfm *tfm, u8 *out)
 {
-	struct wp512_ctx *wctx = ctx;
+	struct wp512_ctx *wctx = crypto_tfm_ctx(tfm);
 	int i;
    	u8 *buffer      = wctx->buffer;
    	u8 *bitLength   = wctx->bitLength;
@@ -1087,22 +1087,20 @@ static void wp512_final(void *ctx, u8 *out)
    	wctx->bufferPos    = bufferPos;
 }
 
-static void wp384_final(void *ctx, u8 *out)
+static void wp384_final(struct crypto_tfm *tfm, u8 *out)
 {
-	struct wp512_ctx *wctx = ctx;
 	u8 D[64];
 
-	wp512_final (wctx, D);
+	wp512_final(tfm, D);
 	memcpy (out, D, WP384_DIGEST_SIZE);
 	memset (D, 0, WP512_DIGEST_SIZE);
 }
 
-static void wp256_final(void *ctx, u8 *out)
+static void wp256_final(struct crypto_tfm *tfm, u8 *out)
 {
-	struct wp512_ctx *wctx = ctx;
 	u8 D[64];
 
-	wp512_final (wctx, D);
+	wp512_final(tfm, D);
 	memcpy (out, D, WP256_DIGEST_SIZE);
 	memset (D, 0, WP512_DIGEST_SIZE);
 }
diff --git a/drivers/crypto/padlock-aes.c b/drivers/crypto/padlock-aes.c
index 5158a9db4bc5..b98ad203d6cb 100644
--- a/drivers/crypto/padlock-aes.c
+++ b/drivers/crypto/padlock-aes.c
@@ -282,19 +282,20 @@ aes_hw_extkey_available(uint8_t key_len)
 	return 0;
 }
 
-static inline struct aes_ctx *aes_ctx(void *ctx)
+static inline struct aes_ctx *aes_ctx(struct crypto_tfm *tfm)
 {
+	unsigned long addr = (unsigned long)crypto_tfm_ctx(tfm);
 	unsigned long align = PADLOCK_ALIGNMENT;
 
 	if (align <= crypto_tfm_ctx_alignment())
 		align = 1;
-	return (struct aes_ctx *)ALIGN((unsigned long)ctx, align);
+	return (struct aes_ctx *)ALIGN(addr, align);
 }
 
-static int
-aes_set_key(void *ctx_arg, const uint8_t *in_key, unsigned int key_len, uint32_t *flags)
+static int aes_set_key(struct crypto_tfm *tfm, const u8 *in_key,
+		       unsigned int key_len, u32 *flags)
 {
-	struct aes_ctx *ctx = aes_ctx(ctx_arg);
+	struct aes_ctx *ctx = aes_ctx(tfm);
 	const __le32 *key = (const __le32 *)in_key;
 	uint32_t i, t, u, v, w;
 	uint32_t P[AES_EXTENDED_KEY_SIZE];
@@ -414,24 +415,22 @@ static inline u8 *padlock_xcrypt_cbc(const u8 *input, u8 *output, void *key,
 	return iv;
 }
 
-static void
-aes_encrypt(void *ctx_arg, uint8_t *out, const uint8_t *in)
+static void aes_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
 {
-	struct aes_ctx *ctx = aes_ctx(ctx_arg);
+	struct aes_ctx *ctx = aes_ctx(tfm);
 	padlock_xcrypt_ecb(in, out, ctx->E, &ctx->cword.encrypt, 1);
 }
 
-static void
-aes_decrypt(void *ctx_arg, uint8_t *out, const uint8_t *in)
+static void aes_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
 {
-	struct aes_ctx *ctx = aes_ctx(ctx_arg);
+	struct aes_ctx *ctx = aes_ctx(tfm);
 	padlock_xcrypt_ecb(in, out, ctx->D, &ctx->cword.decrypt, 1);
 }
 
 static unsigned int aes_encrypt_ecb(const struct cipher_desc *desc, u8 *out,
 				    const u8 *in, unsigned int nbytes)
 {
-	struct aes_ctx *ctx = aes_ctx(crypto_tfm_ctx(desc->tfm));
+	struct aes_ctx *ctx = aes_ctx(desc->tfm);
 	padlock_xcrypt_ecb(in, out, ctx->E, &ctx->cword.encrypt,
 			   nbytes / AES_BLOCK_SIZE);
 	return nbytes & ~(AES_BLOCK_SIZE - 1);
@@ -440,7 +439,7 @@ static unsigned int aes_encrypt_ecb(const struct cipher_desc *desc, u8 *out,
 static unsigned int aes_decrypt_ecb(const struct cipher_desc *desc, u8 *out,
 				    const u8 *in, unsigned int nbytes)
 {
-	struct aes_ctx *ctx = aes_ctx(crypto_tfm_ctx(desc->tfm));
+	struct aes_ctx *ctx = aes_ctx(desc->tfm);
 	padlock_xcrypt_ecb(in, out, ctx->D, &ctx->cword.decrypt,
 			   nbytes / AES_BLOCK_SIZE);
 	return nbytes & ~(AES_BLOCK_SIZE - 1);
@@ -449,7 +448,7 @@ static unsigned int aes_decrypt_ecb(const struct cipher_desc *desc, u8 *out,
 static unsigned int aes_encrypt_cbc(const struct cipher_desc *desc, u8 *out,
 				    const u8 *in, unsigned int nbytes)
 {
-	struct aes_ctx *ctx = aes_ctx(crypto_tfm_ctx(desc->tfm));
+	struct aes_ctx *ctx = aes_ctx(desc->tfm);
 	u8 *iv;
 
 	iv = padlock_xcrypt_cbc(in, out, ctx->E, desc->info,
@@ -462,7 +461,7 @@ static unsigned int aes_encrypt_cbc(const struct cipher_desc *desc, u8 *out,
 static unsigned int aes_decrypt_cbc(const struct cipher_desc *desc, u8 *out,
 				    const u8 *in, unsigned int nbytes)
 {
-	struct aes_ctx *ctx = aes_ctx(crypto_tfm_ctx(desc->tfm));
+	struct aes_ctx *ctx = aes_ctx(desc->tfm);
 	padlock_xcrypt_cbc(in, out, ctx->D, desc->info, &ctx->cword.decrypt,
 			   nbytes / AES_BLOCK_SIZE);
 	return nbytes & ~(AES_BLOCK_SIZE - 1);
diff --git a/include/linux/crypto.h b/include/linux/crypto.h
index 5a0470e36111..ef918803ec30 100644
--- a/include/linux/crypto.h
+++ b/include/linux/crypto.h
@@ -66,7 +66,7 @@ struct crypto_tfm;
 
 struct cipher_desc {
 	struct crypto_tfm *tfm;
-	void (*crfn)(void *ctx, u8 *dst, const u8 *src);
+	void (*crfn)(struct crypto_tfm *tfm, u8 *dst, const u8 *src);
 	unsigned int (*prfn)(const struct cipher_desc *desc, u8 *dst,
 			     const u8 *src, unsigned int nbytes);
 	void *info;
@@ -79,10 +79,10 @@ struct cipher_desc {
 struct cipher_alg {
 	unsigned int cia_min_keysize;
 	unsigned int cia_max_keysize;
-	int (*cia_setkey)(void *ctx, const u8 *key,
+	int (*cia_setkey)(struct crypto_tfm *tfm, const u8 *key,
 	                  unsigned int keylen, u32 *flags);
-	void (*cia_encrypt)(void *ctx, u8 *dst, const u8 *src);
-	void (*cia_decrypt)(void *ctx, u8 *dst, const u8 *src);
+	void (*cia_encrypt)(struct crypto_tfm *tfm, u8 *dst, const u8 *src);
+	void (*cia_decrypt)(struct crypto_tfm *tfm, u8 *dst, const u8 *src);
 
 	unsigned int (*cia_encrypt_ecb)(const struct cipher_desc *desc,
 					u8 *dst, const u8 *src,
@@ -100,20 +100,21 @@ struct cipher_alg {
 
 struct digest_alg {
 	unsigned int dia_digestsize;
-	void (*dia_init)(void *ctx);
-	void (*dia_update)(void *ctx, const u8 *data, unsigned int len);
-	void (*dia_final)(void *ctx, u8 *out);
-	int (*dia_setkey)(void *ctx, const u8 *key,
+	void (*dia_init)(struct crypto_tfm *tfm);
+	void (*dia_update)(struct crypto_tfm *tfm, const u8 *data,
+			   unsigned int len);
+	void (*dia_final)(struct crypto_tfm *tfm, u8 *out);
+	int (*dia_setkey)(struct crypto_tfm *tfm, const u8 *key,
 	                  unsigned int keylen, u32 *flags);
 };
 
 struct compress_alg {
-	int (*coa_init)(void *ctx);
-	void (*coa_exit)(void *ctx);
-	int (*coa_compress)(void *ctx, const u8 *src, unsigned int slen,
-	                    u8 *dst, unsigned int *dlen);
-	int (*coa_decompress)(void *ctx, const u8 *src, unsigned int slen,
-	                      u8 *dst, unsigned int *dlen);
+	int (*coa_init)(struct crypto_tfm *tfm);
+	void (*coa_exit)(struct crypto_tfm *tfm);
+	int (*coa_compress)(struct crypto_tfm *tfm, const u8 *src,
+			    unsigned int slen, u8 *dst, unsigned int *dlen);
+	int (*coa_decompress)(struct crypto_tfm *tfm, const u8 *src,
+			      unsigned int slen, u8 *dst, unsigned int *dlen);
 };
 
 #define cra_cipher	cra_u.cipher
-- 
cgit v1.2.3


From c7fc05992afcf1d63d6d5fb6142c8d39094dbca9 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Wed, 24 May 2006 13:02:26 +1000
Subject: [CRYPTO] api: Added cra_init/cra_exit

This patch adds the hooks cra_init/cra_exit which are called during a tfm's
construction and destruction respectively.  This will be used by the instances
to allocate child tfm's.

For now this lets us get rid of the coa_init/coa_exit functions which are
used for exactly that purpose (unlike the dia_init function which is called
for each transaction).

In fact the coa_exit path is currently buggy as it may get called twice
when an error is encountered during initialisation.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/api.c           | 11 ++++++++---
 crypto/compress.c      |  9 +--------
 crypto/deflate.c       |  4 ++--
 include/linux/crypto.h |  5 +++--
 4 files changed, 14 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/crypto/api.c b/crypto/api.c
index 80bba637fba7..8145310d7985 100644
--- a/crypto/api.c
+++ b/crypto/api.c
@@ -188,13 +188,16 @@ struct crypto_tfm *crypto_alloc_tfm(const char *name, u32 flags)
 	if (crypto_init_flags(tfm, flags))
 		goto out_free_tfm;
 		
-	if (crypto_init_ops(tfm)) {
-		crypto_exit_ops(tfm);
+	if (crypto_init_ops(tfm))
 		goto out_free_tfm;
-	}
+
+	if (alg->cra_init && alg->cra_init(tfm))
+		goto cra_init_failed;
 
 	goto out;
 
+cra_init_failed:
+	crypto_exit_ops(tfm);
 out_free_tfm:
 	kfree(tfm);
 	tfm = NULL;
@@ -215,6 +218,8 @@ void crypto_free_tfm(struct crypto_tfm *tfm)
 	alg = tfm->__crt_alg;
 	size = sizeof(*tfm) + alg->cra_ctxsize;
 
+	if (alg->cra_exit)
+		alg->cra_exit(tfm);
 	crypto_exit_ops(tfm);
 	crypto_alg_put(alg);
 	memset(tfm, 0, size);
diff --git a/crypto/compress.c b/crypto/compress.c
index f3e07334afd0..eca182aa3380 100644
--- a/crypto/compress.c
+++ b/crypto/compress.c
@@ -41,21 +41,14 @@ int crypto_init_compress_flags(struct crypto_tfm *tfm, u32 flags)
 
 int crypto_init_compress_ops(struct crypto_tfm *tfm)
 {
-	int ret = 0;
 	struct compress_tfm *ops = &tfm->crt_compress;
-	
-	ret = tfm->__crt_alg->cra_compress.coa_init(tfm);
-	if (ret)
-		goto out;
 
 	ops->cot_compress = crypto_compress;
 	ops->cot_decompress = crypto_decompress;
 	
-out:
-	return ret;
+	return 0;
 }
 
 void crypto_exit_compress_ops(struct crypto_tfm *tfm)
 {
-	tfm->__crt_alg->cra_compress.coa_exit(tfm);
 }
diff --git a/crypto/deflate.c b/crypto/deflate.c
index 5dd2404ae5b2..6588bbf82e9b 100644
--- a/crypto/deflate.c
+++ b/crypto/deflate.c
@@ -201,9 +201,9 @@ static struct crypto_alg alg = {
 	.cra_ctxsize		= sizeof(struct deflate_ctx),
 	.cra_module		= THIS_MODULE,
 	.cra_list		= LIST_HEAD_INIT(alg.cra_list),
+	.cra_init		= deflate_init,
+	.cra_exit		= deflate_exit,
 	.cra_u			= { .compress = {
-	.coa_init		= deflate_init,
-	.coa_exit		= deflate_exit,
 	.coa_compress 		= deflate_compress,
 	.coa_decompress  	= deflate_decompress } }
 };
diff --git a/include/linux/crypto.h b/include/linux/crypto.h
index ef918803ec30..6c013c88080f 100644
--- a/include/linux/crypto.h
+++ b/include/linux/crypto.h
@@ -109,8 +109,6 @@ struct digest_alg {
 };
 
 struct compress_alg {
-	int (*coa_init)(struct crypto_tfm *tfm);
-	void (*coa_exit)(struct crypto_tfm *tfm);
 	int (*coa_compress)(struct crypto_tfm *tfm, const u8 *src,
 			    unsigned int slen, u8 *dst, unsigned int *dlen);
 	int (*coa_decompress)(struct crypto_tfm *tfm, const u8 *src,
@@ -138,6 +136,9 @@ struct crypto_alg {
 		struct digest_alg digest;
 		struct compress_alg compress;
 	} cra_u;
+
+	int (*cra_init)(struct crypto_tfm *tfm);
+	void (*cra_exit)(struct crypto_tfm *tfm);
 	
 	struct module *cra_module;
 };
-- 
cgit v1.2.3


From d913ea0d6b6a48dd6eed8fc5e299b8b10e049186 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Sun, 21 May 2006 08:45:26 +1000
Subject: [CRYPTO] api: Removed const from cra_name/cra_driver_name

We do need to change these names now and even more so in future with
instantiated algorithms.  So let's stop lying to the compiler and get
rid of the const modifiers.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/api.c           | 2 +-
 include/linux/crypto.h | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/crypto/api.c b/crypto/api.c
index 8145310d7985..735fdedd8217 100644
--- a/crypto/api.c
+++ b/crypto/api.c
@@ -229,7 +229,7 @@ void crypto_free_tfm(struct crypto_tfm *tfm)
 static inline int crypto_set_driver_name(struct crypto_alg *alg)
 {
 	static const char suffix[] = "-generic";
-	char *driver_name = (char *)alg->cra_driver_name;
+	char *driver_name = alg->cra_driver_name;
 	int len;
 
 	if (*driver_name)
diff --git a/include/linux/crypto.h b/include/linux/crypto.h
index 6c013c88080f..7f946241b879 100644
--- a/include/linux/crypto.h
+++ b/include/linux/crypto.h
@@ -128,8 +128,8 @@ struct crypto_alg {
 
 	int cra_priority;
 
-	const char cra_name[CRYPTO_MAX_ALG_NAME];
-	const char cra_driver_name[CRYPTO_MAX_ALG_NAME];
+	char cra_name[CRYPTO_MAX_ALG_NAME];
+	char cra_driver_name[CRYPTO_MAX_ALG_NAME];
 
 	union {
 		struct cipher_alg cipher;
-- 
cgit v1.2.3


From 91a972910df042a0a308b2ffd3aa6fd42b0242e3 Mon Sep 17 00:00:00 2001
From: Hans Verkuil <hverkuil@xs4all.nl>
Date: Sat, 24 Jun 2006 09:26:43 -0300
Subject: V4L/DVB (4221): Add HM12 YUV format define.

HM12 is a YUV 4:1:1 format used by the cx2341x MPEG encoder/decoder for
the raw YUV input/output. The Y and UV planes are broken up in 16x16
macroblocks and each macroblock is transmitted in turn (row by row).

Signed-off-by: Hans Verkuil <hverkuil@xs4all.nl>
Signed-off-by: Mauro Carvalho Chehab <mchehab@infradead.org>
---
 include/linux/videodev2.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/videodev2.h b/include/linux/videodev2.h
index 4f428547ec09..8b4745c5e975 100644
--- a/include/linux/videodev2.h
+++ b/include/linux/videodev2.h
@@ -245,6 +245,7 @@ struct v4l2_pix_format
 #define V4L2_PIX_FMT_YUV420  v4l2_fourcc('Y','U','1','2') /* 12  YUV 4:2:0     */
 #define V4L2_PIX_FMT_YYUV    v4l2_fourcc('Y','Y','U','V') /* 16  YUV 4:2:2     */
 #define V4L2_PIX_FMT_HI240   v4l2_fourcc('H','I','2','4') /*  8  8-bit color   */
+#define V4L2_PIX_FMT_HM12    v4l2_fourcc('H','M','1','2') /*  8  YUV 4:1:1 16x16 macroblocks */
 
 /* see http://www.siliconimaging.com/RGB%20Bayer.htm */
 #define V4L2_PIX_FMT_SBGGR8  v4l2_fourcc('B','A','8','1') /*  8  BGBG.. GRGR.. */
-- 
cgit v1.2.3


From 8cbde94be34f95c01515ba19ce32bcd51ab3949e Mon Sep 17 00:00:00 2001
From: Hans Verkuil <hverkuil@xs4all.nl>
Date: Sat, 24 Jun 2006 14:36:02 -0300
Subject: V4L/DVB (4223): Add V4L2_CID_MPEG_STREAM_VBI_FMT control

V4L2_CID_MPEG_STREAM_VBI_FMT controls if and how VBI data is embedded in
an MPEG stream. Currently only one format is supported: the format designed
for the ivtv driver. This should be extended with new standard formats
(such as defined for DVB) in the future.

Signed-off-by: Hans Verkuil <hverkuil@xs4all.nl>
Signed-off-by: Mauro Carvalho Chehab <mchehab@infradead.org>
---
 drivers/media/video/cx2341x.c     | 17 +++++++++++++++++
 drivers/media/video/v4l2-common.c | 14 ++++++++++++++
 include/linux/videodev2.h         |  5 +++++
 include/media/cx2341x.h           |  6 ++++++
 4 files changed, 42 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/media/video/cx2341x.c b/drivers/media/video/cx2341x.c
index 554813e6f65d..01b22eab5725 100644
--- a/drivers/media/video/cx2341x.c
+++ b/drivers/media/video/cx2341x.c
@@ -43,6 +43,7 @@ MODULE_PARM_DESC(debug, "Debug level (0-1)");
 const u32 cx2341x_mpeg_ctrls[] = {
 	V4L2_CID_MPEG_CLASS,
 	V4L2_CID_MPEG_STREAM_TYPE,
+	V4L2_CID_MPEG_STREAM_VBI_FMT,
 	V4L2_CID_MPEG_AUDIO_SAMPLING_FREQ,
 	V4L2_CID_MPEG_AUDIO_ENCODING,
 	V4L2_CID_MPEG_AUDIO_L2_BITRATE,
@@ -135,6 +136,9 @@ static int cx2341x_get_ctrl(struct cx2341x_mpeg_params *params,
 	case V4L2_CID_MPEG_STREAM_TYPE:
 		ctrl->value = params->stream_type;
 		break;
+	case V4L2_CID_MPEG_STREAM_VBI_FMT:
+		ctrl->value = params->stream_vbi_fmt;
+		break;
 	case V4L2_CID_MPEG_CX2341X_VIDEO_SPATIAL_FILTER_MODE:
 		ctrl->value = params->video_spatial_filter_mode;
 		break;
@@ -257,6 +261,9 @@ static int cx2341x_set_ctrl(struct cx2341x_mpeg_params *params,
 			params->video_bitrate_mode = V4L2_MPEG_VIDEO_BITRATE_MODE_CBR;
 		}
 		break;
+	case V4L2_CID_MPEG_STREAM_VBI_FMT:
+		params->stream_vbi_fmt = ctrl->value;
+		break;
 	case V4L2_CID_MPEG_CX2341X_VIDEO_SPATIAL_FILTER_MODE:
 		params->video_spatial_filter_mode = ctrl->value;
 		break;
@@ -418,6 +425,14 @@ int cx2341x_ctrl_query(struct cx2341x_mpeg_params *params, struct v4l2_queryctrl
 			qctrl->flags |= V4L2_CTRL_FLAG_INACTIVE;
 		return err;
 
+	case V4L2_CID_MPEG_STREAM_VBI_FMT:
+		if (params->capabilities & CX2341X_CAP_HAS_SLICED_VBI)
+			return v4l2_ctrl_query_fill_std(qctrl);
+		return cx2341x_ctrl_query_fill(qctrl,
+				V4L2_MPEG_STREAM_VBI_FMT_NONE,
+				V4L2_MPEG_STREAM_VBI_FMT_NONE, 1,
+				V4L2_MPEG_STREAM_VBI_FMT_NONE);
+
 	/* CX23415/6 specific */
 	case V4L2_CID_MPEG_CX2341X_VIDEO_SPATIAL_FILTER_MODE:
 		return cx2341x_ctrl_query_fill(qctrl,
@@ -639,6 +654,7 @@ void cx2341x_fill_defaults(struct cx2341x_mpeg_params *p)
 {
 	static struct cx2341x_mpeg_params default_params = {
 	/* misc */
+	.capabilities = 0,
 	.port = CX2341X_PORT_MEMORY,
 	.width = 720,
 	.height = 480,
@@ -646,6 +662,7 @@ void cx2341x_fill_defaults(struct cx2341x_mpeg_params *p)
 
 	/* stream */
 	.stream_type = V4L2_MPEG_STREAM_TYPE_MPEG2_PS,
+	.stream_vbi_fmt = V4L2_MPEG_STREAM_VBI_FMT_NONE,
 
 	/* audio */
 	.audio_sampling_freq = V4L2_MPEG_AUDIO_SAMPLING_FREQ_48000,
diff --git a/drivers/media/video/v4l2-common.c b/drivers/media/video/v4l2-common.c
index 14e523471354..f4b3d64ebf73 100644
--- a/drivers/media/video/v4l2-common.c
+++ b/drivers/media/video/v4l2-common.c
@@ -1101,6 +1101,11 @@ const char **v4l2_ctrl_get_menu(u32 id)
 		"MPEG-2 SVCD-compatible Stream",
 		NULL
 	};
+	static const char *mpeg_stream_vbi_fmt[] = {
+		"No VBI",
+		"VBI in private packets, IVTV format",
+		NULL
+	};
 
 	switch (id) {
 		case V4L2_CID_MPEG_AUDIO_SAMPLING_FREQ:
@@ -1129,6 +1134,8 @@ const char **v4l2_ctrl_get_menu(u32 id)
 			return mpeg_video_bitrate_mode;
 		case V4L2_CID_MPEG_STREAM_TYPE:
 			return mpeg_stream_type;
+		case V4L2_CID_MPEG_STREAM_VBI_FMT:
+			return mpeg_stream_vbi_fmt;
 		default:
 			return NULL;
 	}
@@ -1182,6 +1189,7 @@ int v4l2_ctrl_query_fill(struct v4l2_queryctrl *qctrl, s32 min, s32 max, s32 ste
 	case V4L2_CID_MPEG_STREAM_PID_PCR: 	name = "Stream PCR Program ID"; break;
 	case V4L2_CID_MPEG_STREAM_PES_ID_AUDIO: name = "Stream PES Audio ID"; break;
 	case V4L2_CID_MPEG_STREAM_PES_ID_VIDEO: name = "Stream PES Video ID"; break;
+	case V4L2_CID_MPEG_STREAM_VBI_FMT:	name = "Stream VBI Format"; break;
 
 	default:
 		return -EINVAL;
@@ -1208,6 +1216,7 @@ int v4l2_ctrl_query_fill(struct v4l2_queryctrl *qctrl, s32 min, s32 max, s32 ste
 	case V4L2_CID_MPEG_VIDEO_ASPECT:
 	case V4L2_CID_MPEG_VIDEO_BITRATE_MODE:
 	case V4L2_CID_MPEG_STREAM_TYPE:
+	case V4L2_CID_MPEG_STREAM_VBI_FMT:
 		qctrl->type = V4L2_CTRL_TYPE_MENU;
 		step = 1;
 		break;
@@ -1367,6 +1376,11 @@ int v4l2_ctrl_query_fill_std(struct v4l2_queryctrl *qctrl)
 		return v4l2_ctrl_query_fill(qctrl, 0, 255, 1, 0);
 	case V4L2_CID_MPEG_STREAM_PES_ID_VIDEO:
 		return v4l2_ctrl_query_fill(qctrl, 0, 255, 1, 0);
+	case V4L2_CID_MPEG_STREAM_VBI_FMT:
+		return v4l2_ctrl_query_fill(qctrl,
+				V4L2_MPEG_STREAM_VBI_FMT_NONE,
+				V4L2_MPEG_STREAM_VBI_FMT_IVTV, 1,
+				V4L2_MPEG_STREAM_VBI_FMT_NONE);
 	default:
 		return -EINVAL;
 	}
diff --git a/include/linux/videodev2.h b/include/linux/videodev2.h
index 8b4745c5e975..a62673dad76e 100644
--- a/include/linux/videodev2.h
+++ b/include/linux/videodev2.h
@@ -822,6 +822,11 @@ enum v4l2_mpeg_stream_type {
 #define V4L2_CID_MPEG_STREAM_PID_PCR 		(V4L2_CID_MPEG_BASE+4)
 #define V4L2_CID_MPEG_STREAM_PES_ID_AUDIO 	(V4L2_CID_MPEG_BASE+5)
 #define V4L2_CID_MPEG_STREAM_PES_ID_VIDEO 	(V4L2_CID_MPEG_BASE+6)
+#define V4L2_CID_MPEG_STREAM_VBI_FMT 		(V4L2_CID_MPEG_BASE+7)
+enum v4l2_mpeg_stream_vbi_fmt {
+	V4L2_MPEG_STREAM_VBI_FMT_NONE = 0,  /* No VBI in the MPEG stream */
+	V4L2_MPEG_STREAM_VBI_FMT_IVTV = 1,  /* VBI in private packets, IVTV format */
+};
 
 /*  MPEG audio */
 #define V4L2_CID_MPEG_AUDIO_SAMPLING_FREQ 	(V4L2_CID_MPEG_BASE+100)
diff --git a/include/media/cx2341x.h b/include/media/cx2341x.h
index 51fb06b4c394..074c4008ad52 100644
--- a/include/media/cx2341x.h
+++ b/include/media/cx2341x.h
@@ -25,8 +25,13 @@ enum cx2341x_port {
 	CX2341X_PORT_SERIAL    = 2
 };
 
+enum cx2341x_cap {
+	CX2341X_CAP_HAS_SLICED_VBI = 1 << 0,
+};
+
 struct cx2341x_mpeg_params {
 	/* misc */
+	u32 capabilities;
 	enum cx2341x_port port;
 	u16 width;
 	u16 height;
@@ -34,6 +39,7 @@ struct cx2341x_mpeg_params {
 
 	/* stream */
 	enum v4l2_mpeg_stream_type stream_type;
+	enum v4l2_mpeg_stream_vbi_fmt stream_vbi_fmt;
 
 	/* audio */
 	enum v4l2_mpeg_audio_sampling_freq audio_sampling_freq;
-- 
cgit v1.2.3


From d6e05edc59ecd79e8badf440c0d295a979bdfa3e Mon Sep 17 00:00:00 2001
From: Andreas Mohr <andi@lisas.de>
Date: Mon, 26 Jun 2006 18:35:02 +0200
Subject: spelling fixes

acquired (aquired)
contiguous (contigious)
successful (succesful, succesfull)
surprise (suprise)
whether (weather)
some other misspellings

Signed-off-by: Andreas Mohr <andi@lisas.de>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
---
 arch/i386/kernel/cpu/cyrix.c               |  2 +-
 arch/i386/kernel/i8259.c                   |  2 +-
 arch/mips/momentum/ocelot_g/gt-irq.c       |  4 ++--
 arch/powerpc/platforms/cell/spufs/switch.c |  2 +-
 arch/powerpc/platforms/pseries/eeh_cache.c |  2 +-
 arch/s390/kernel/vtime.c                   |  2 +-
 arch/um/drivers/ubd_kern.c                 |  2 +-
 arch/x86_64/kernel/i8259.c                 |  2 +-
 block/as-iosched.c                         |  2 +-
 block/ll_rw_blk.c                          |  4 ++--
 drivers/atm/firestream.c                   |  2 +-
 drivers/char/agp/amd-k7-agp.c              |  2 +-
 drivers/char/agp/ati-agp.c                 |  2 +-
 drivers/char/agp/efficeon-agp.c            |  2 +-
 drivers/char/rio/riointr.c                 |  2 +-
 drivers/hwmon/asb100.c                     |  2 +-
 drivers/hwmon/lm78.c                       |  2 +-
 drivers/hwmon/lm80.c                       |  2 +-
 drivers/hwmon/lm87.c                       |  2 +-
 drivers/hwmon/sis5595.c                    |  2 +-
 drivers/hwmon/smsc47m1.c                   |  2 +-
 drivers/hwmon/w83627hf.c                   |  2 +-
 drivers/hwmon/w83781d.c                    |  2 +-
 drivers/hwmon/w83792d.c                    |  2 +-
 drivers/ide/ide-disk.c                     |  2 +-
 drivers/ide/ide-io.c                       |  2 +-
 drivers/ieee1394/hosts.c                   |  2 +-
 drivers/ieee1394/ieee1394_core.h           |  2 +-
 drivers/isdn/divert/isdn_divert.c          |  2 +-
 drivers/media/video/bt8xx/bttv-cards.c     |  2 +-
 drivers/net/3c501.c                        |  4 ++--
 drivers/net/irda/irport.c                  |  2 +-
 drivers/net/pcmcia/smc91c92_cs.c           |  2 +-
 drivers/net/wireless/ipw2100.c             |  2 +-
 drivers/pci/pci.c                          |  2 +-
 drivers/s390/scsi/zfcp_erp.c               | 14 +++++++-------
 drivers/scsi/NCR5380.c                     |  2 +-
 drivers/scsi/advansys.c                    |  2 +-
 drivers/scsi/dc395x.c                      |  2 +-
 drivers/scsi/ibmmca.c                      |  8 ++++----
 drivers/scsi/ips.c                         |  4 ++--
 drivers/scsi/st.c                          |  2 +-
 fs/9p/mux.c                                |  2 +-
 fs/aio.c                                   |  2 +-
 fs/jffs2/summary.c                         |  2 +-
 fs/jfs/jfs_extent.c                        |  8 ++++----
 include/asm-i386/system.h                  |  2 +-
 include/asm-m32r/system.h                  |  2 +-
 include/linux/sunrpc/gss_api.h             |  2 +-
 lib/kernel_lock.c                          |  4 ++--
 mm/page_alloc.c                            |  2 +-
 mm/readahead.c                             |  4 ++--
 net/sunrpc/auth_gss/gss_krb5_mech.c        |  2 +-
 net/sunrpc/auth_gss/gss_spkm3_mech.c       |  2 +-
 sound/core/seq/seq_memory.h                |  2 +-
 sound/oss/sb_ess.c                         | 28 ++++++++++++++--------------
 sound/pci/cs5535audio/cs5535audio_pcm.c    |  2 +-
 57 files changed, 88 insertions(+), 88 deletions(-)

(limited to 'include/linux')

diff --git a/arch/i386/kernel/cpu/cyrix.c b/arch/i386/kernel/cpu/cyrix.c
index fc32c8028e24..f03b7f94c304 100644
--- a/arch/i386/kernel/cpu/cyrix.c
+++ b/arch/i386/kernel/cpu/cyrix.c
@@ -354,7 +354,7 @@ static void __init init_nsc(struct cpuinfo_x86 *c)
 	 * This function only handles the GX processor, and kicks every
 	 * thing else to the Cyrix init function above - that should
 	 * cover any processors that might have been branded differently
-	 * after NSC aquired Cyrix.
+	 * after NSC acquired Cyrix.
 	 *
 	 * If this breaks your GX1 horribly, please e-mail
 	 * info-linux@ldcmail.amd.com to tell us.
diff --git a/arch/i386/kernel/i8259.c b/arch/i386/kernel/i8259.c
index b7636b96e104..c1a42feba286 100644
--- a/arch/i386/kernel/i8259.c
+++ b/arch/i386/kernel/i8259.c
@@ -175,7 +175,7 @@ static void mask_and_ack_8259A(unsigned int irq)
 	 * Lightweight spurious IRQ detection. We do not want
 	 * to overdo spurious IRQ handling - it's usually a sign
 	 * of hardware problems, so we only do the checks we can
-	 * do without slowing down good hardware unnecesserily.
+	 * do without slowing down good hardware unnecessarily.
 	 *
 	 * Note that IRQ7 and IRQ15 (the two spurious IRQs
 	 * usually resulting from the 8259A-1|2 PICs) occur
diff --git a/arch/mips/momentum/ocelot_g/gt-irq.c b/arch/mips/momentum/ocelot_g/gt-irq.c
index e5eceed1beff..8bd9b844fa9e 100644
--- a/arch/mips/momentum/ocelot_g/gt-irq.c
+++ b/arch/mips/momentum/ocelot_g/gt-irq.c
@@ -59,7 +59,7 @@ void hook_irq_handler(int int_cause, int bit_num, void *isr_ptr)
  * bit_num   - Indicates which bit number in the cause register
  *
  * Outputs :
- * 1 if succesful, 0 if failure
+ * 1 if successful, 0 if failure
  */
 int enable_galileo_irq(int int_cause, int bit_num)
 {
@@ -83,7 +83,7 @@ int enable_galileo_irq(int int_cause, int bit_num)
  * bit_num   - Indicates which bit number in the cause register
  *
  * Outputs :
- * 1 if succesful, 0 if failure
+ * 1 if successful, 0 if failure
  */
 int disable_galileo_irq(int int_cause, int bit_num)
 {
diff --git a/arch/powerpc/platforms/cell/spufs/switch.c b/arch/powerpc/platforms/cell/spufs/switch.c
index b30e55dab832..3068b429b031 100644
--- a/arch/powerpc/platforms/cell/spufs/switch.c
+++ b/arch/powerpc/platforms/cell/spufs/switch.c
@@ -2100,7 +2100,7 @@ EXPORT_SYMBOL_GPL(spu_save);
  * @spu: pointer to SPU iomem structure.
  *
  * Perform harvest + restore, as we may not be coming
- * from a previous succesful save operation, and the
+ * from a previous successful save operation, and the
  * hardware state is unknown.
  */
 int spu_restore(struct spu_state *new, struct spu *spu)
diff --git a/arch/powerpc/platforms/pseries/eeh_cache.c b/arch/powerpc/platforms/pseries/eeh_cache.c
index 98c23aec85be..c37a8497c60f 100644
--- a/arch/powerpc/platforms/pseries/eeh_cache.c
+++ b/arch/powerpc/platforms/pseries/eeh_cache.c
@@ -287,7 +287,7 @@ void pci_addr_cache_remove_device(struct pci_dev *dev)
  * find the pci device that corresponds to a given address.
  * This routine scans all pci busses to build the cache.
  * Must be run late in boot process, after the pci controllers
- * have been scaned for devices (after all device resources are known).
+ * have been scanned for devices (after all device resources are known).
  */
 void __init pci_addr_cache_build(void)
 {
diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c
index dfe6f0856617..1f0439dc245a 100644
--- a/arch/s390/kernel/vtime.c
+++ b/arch/s390/kernel/vtime.c
@@ -356,7 +356,7 @@ static void internal_add_vtimer(struct vtimer_list *timer)
 
 	set_vtimer(event->expires);
 	spin_unlock_irqrestore(&vt_list->lock, flags);
-	/* release CPU aquired in prepare_vtimer or mod_virt_timer() */
+	/* release CPU acquired in prepare_vtimer or mod_virt_timer() */
 	put_cpu();
 }
 
diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c
index 0897852b09a3..290cec6d69e2 100644
--- a/arch/um/drivers/ubd_kern.c
+++ b/arch/um/drivers/ubd_kern.c
@@ -1222,7 +1222,7 @@ int open_ubd_file(char *file, struct openflags *openflags, int shared,
 		}
 	}
 
-	/* Succesful return case! */
+	/* Successful return case! */
 	if(backing_file_out == NULL)
 		return(fd);
 
diff --git a/arch/x86_64/kernel/i8259.c b/arch/x86_64/kernel/i8259.c
index 5ecd34ab8c2b..573990a6d668 100644
--- a/arch/x86_64/kernel/i8259.c
+++ b/arch/x86_64/kernel/i8259.c
@@ -278,7 +278,7 @@ static void mask_and_ack_8259A(unsigned int irq)
 	 * Lightweight spurious IRQ detection. We do not want
 	 * to overdo spurious IRQ handling - it's usually a sign
 	 * of hardware problems, so we only do the checks we can
-	 * do without slowing down good hardware unnecesserily.
+	 * do without slowing down good hardware unnecessarily.
 	 *
 	 * Note that IRQ7 and IRQ15 (the two spurious IRQs
 	 * usually resulting from the 8259A-1|2 PICs) occur
diff --git a/block/as-iosched.c b/block/as-iosched.c
index 1ec5df466708..3af31ed49a9c 100644
--- a/block/as-iosched.c
+++ b/block/as-iosched.c
@@ -892,7 +892,7 @@ static int as_can_break_anticipation(struct as_data *ad, struct as_rq *arq)
 }
 
 /*
- * as_can_anticipate indicates weather we should either run arq
+ * as_can_anticipate indicates whether we should either run arq
  * or keep anticipating a better request.
  */
 static int as_can_anticipate(struct as_data *ad, struct as_rq *arq)
diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c
index 0603ab2f3692..c04422a502da 100644
--- a/block/ll_rw_blk.c
+++ b/block/ll_rw_blk.c
@@ -2745,7 +2745,7 @@ static int attempt_merge(request_queue_t *q, struct request *req,
 		return 0;
 
 	/*
-	 * not contigious
+	 * not contiguous
 	 */
 	if (req->sector + req->nr_sectors != next->sector)
 		return 0;
@@ -3415,7 +3415,7 @@ static struct notifier_block blk_cpu_notifier = {
  *
  * Description:
  *     Ends all I/O on a request. It does not handle partial completions,
- *     unless the driver actually implements this in its completionc callback
+ *     unless the driver actually implements this in its completion callback
  *     through requeueing. Theh actual completion happens out-of-order,
  *     through a softirq handler. The user must have registered a completion
  *     callback through blk_queue_softirq_done().
diff --git a/drivers/atm/firestream.c b/drivers/atm/firestream.c
index 7f7ec288824d..f2eeaf9dc56a 100644
--- a/drivers/atm/firestream.c
+++ b/drivers/atm/firestream.c
@@ -951,7 +951,7 @@ static int fs_open(struct atm_vcc *atm_vcc)
 		   it most likely that the chip will notice it. It also prevents us
 		   from having to wait for completion. On the other hand, we may
 		   need to wait for completion anyway, to see if it completed
-		   succesfully. */
+		   successfully. */
 
 		switch (atm_vcc->qos.aal) {
 		case ATM_AAL2:
diff --git a/drivers/char/agp/amd-k7-agp.c b/drivers/char/agp/amd-k7-agp.c
index 1f776651ac64..51d0d562d01e 100644
--- a/drivers/char/agp/amd-k7-agp.c
+++ b/drivers/char/agp/amd-k7-agp.c
@@ -118,7 +118,7 @@ static int amd_create_gatt_pages(int nr_tables)
 	return retval;
 }
 
-/* Since we don't need contigious memory we just try
+/* Since we don't need contiguous memory we just try
  * to get the gatt table once
  */
 
diff --git a/drivers/char/agp/ati-agp.c b/drivers/char/agp/ati-agp.c
index 06fd10ba0c5e..160564345993 100644
--- a/drivers/char/agp/ati-agp.c
+++ b/drivers/char/agp/ati-agp.c
@@ -261,7 +261,7 @@ static int agp_ati_suspend(struct pci_dev *dev, pm_message_t state)
 #endif
 
 /*
- *Since we don't need contigious memory we just try
+ *Since we don't need contiguous memory we just try
  * to get the gatt table once
  */
 
diff --git a/drivers/char/agp/efficeon-agp.c b/drivers/char/agp/efficeon-agp.c
index 86a966b65236..b788b0a3bbf3 100644
--- a/drivers/char/agp/efficeon-agp.c
+++ b/drivers/char/agp/efficeon-agp.c
@@ -177,7 +177,7 @@ static int efficeon_free_gatt_table(struct agp_bridge_data *bridge)
 
 
 /*
- * Since we don't need contigious memory we just try
+ * Since we don't need contiguous memory we just try
  * to get the gatt table once
  */
 
diff --git a/drivers/char/rio/riointr.c b/drivers/char/rio/riointr.c
index eec1fea0cb92..0bd09040a5c0 100644
--- a/drivers/char/rio/riointr.c
+++ b/drivers/char/rio/riointr.c
@@ -546,7 +546,7 @@ static void RIOReceive(struct rio_info *p, struct Port *PortP)
 	 ** run out of space it will be set to the offset of the
 	 ** next byte to copy from the packet data area. The packet
 	 ** length field is decremented by the number of bytes that
-	 ** we succesfully removed from the packet. When this reaches
+	 ** we successfully removed from the packet. When this reaches
 	 ** zero, we reset the offset pointer to be zero, and free
 	 ** the packet from the front of the queue.
 	 */
diff --git a/drivers/hwmon/asb100.c b/drivers/hwmon/asb100.c
index 65b2709f750c..facc1ccb8338 100644
--- a/drivers/hwmon/asb100.c
+++ b/drivers/hwmon/asb100.c
@@ -341,7 +341,7 @@ static ssize_t set_fan_min(struct device *dev, const char *buf,
 
 /* Note: we save and restore the fan minimum here, because its value is
    determined in part by the fan divisor.  This follows the principle of
-   least suprise; the user doesn't expect the fan minimum to change just
+   least surprise; the user doesn't expect the fan minimum to change just
    because the divisor changed. */
 static ssize_t set_fan_div(struct device *dev, const char *buf,
 				size_t count, int nr)
diff --git a/drivers/hwmon/lm78.c b/drivers/hwmon/lm78.c
index 94be3d797e61..a6ce7abf8602 100644
--- a/drivers/hwmon/lm78.c
+++ b/drivers/hwmon/lm78.c
@@ -358,7 +358,7 @@ static ssize_t show_fan_div(struct device *dev, char *buf, int nr)
 
 /* Note: we save and restore the fan minimum here, because its value is
    determined in part by the fan divisor.  This follows the principle of
-   least suprise; the user doesn't expect the fan minimum to change just
+   least surprise; the user doesn't expect the fan minimum to change just
    because the divisor changed. */
 static ssize_t set_fan_div(struct device *dev, const char *buf,
 	size_t count, int nr)
diff --git a/drivers/hwmon/lm80.c b/drivers/hwmon/lm80.c
index f72120d08c4c..b4ccdfc01203 100644
--- a/drivers/hwmon/lm80.c
+++ b/drivers/hwmon/lm80.c
@@ -253,7 +253,7 @@ set_fan(min2, fan_min[1], LM80_REG_FAN_MIN(2), fan_div[1]);
 
 /* Note: we save and restore the fan minimum here, because its value is
    determined in part by the fan divisor.  This follows the principle of
-   least suprise; the user doesn't expect the fan minimum to change just
+   least surprise; the user doesn't expect the fan minimum to change just
    because the divisor changed. */
 static ssize_t set_fan_div(struct device *dev, const char *buf,
 	size_t count, int nr)
diff --git a/drivers/hwmon/lm87.c b/drivers/hwmon/lm87.c
index e229daf666de..e6c1b638c971 100644
--- a/drivers/hwmon/lm87.c
+++ b/drivers/hwmon/lm87.c
@@ -421,7 +421,7 @@ static void set_fan_min(struct device *dev, const char *buf, int nr)
 
 /* Note: we save and restore the fan minimum here, because its value is
    determined in part by the fan clock divider.  This follows the principle
-   of least suprise; the user doesn't expect the fan minimum to change just
+   of least surprise; the user doesn't expect the fan minimum to change just
    because the divider changed. */
 static ssize_t set_fan_div(struct device *dev, const char *buf,
 		size_t count, int nr)
diff --git a/drivers/hwmon/sis5595.c b/drivers/hwmon/sis5595.c
index 6f3fda73f70c..063f71c5f07e 100644
--- a/drivers/hwmon/sis5595.c
+++ b/drivers/hwmon/sis5595.c
@@ -380,7 +380,7 @@ static ssize_t show_fan_div(struct device *dev, char *buf, int nr)
 
 /* Note: we save and restore the fan minimum here, because its value is
    determined in part by the fan divisor.  This follows the principle of
-   least suprise; the user doesn't expect the fan minimum to change just
+   least surprise; the user doesn't expect the fan minimum to change just
    because the divisor changed. */
 static ssize_t set_fan_div(struct device *dev, const char *buf,
 	size_t count, int nr)
diff --git a/drivers/hwmon/smsc47m1.c b/drivers/hwmon/smsc47m1.c
index 7732aec54594..825e8f72698f 100644
--- a/drivers/hwmon/smsc47m1.c
+++ b/drivers/hwmon/smsc47m1.c
@@ -207,7 +207,7 @@ static ssize_t set_fan_min(struct device *dev, const char *buf,
 
 /* Note: we save and restore the fan minimum here, because its value is
    determined in part by the fan clock divider.  This follows the principle
-   of least suprise; the user doesn't expect the fan minimum to change just
+   of least surprise; the user doesn't expect the fan minimum to change just
    because the divider changed. */
 static ssize_t set_fan_div(struct device *dev, const char *buf,
 		size_t count, int nr)
diff --git a/drivers/hwmon/w83627hf.c b/drivers/hwmon/w83627hf.c
index 71fb7f1af8f5..79368d53c363 100644
--- a/drivers/hwmon/w83627hf.c
+++ b/drivers/hwmon/w83627hf.c
@@ -781,7 +781,7 @@ show_fan_div_reg(struct device *dev, char *buf, int nr)
 
 /* Note: we save and restore the fan minimum here, because its value is
    determined in part by the fan divisor.  This follows the principle of
-   least suprise; the user doesn't expect the fan minimum to change just
+   least surprise; the user doesn't expect the fan minimum to change just
    because the divisor changed. */
 static ssize_t
 store_fan_div_reg(struct device *dev, const char *buf, size_t count, int nr)
diff --git a/drivers/hwmon/w83781d.c b/drivers/hwmon/w83781d.c
index e4c700356c44..7be469ed0f8f 100644
--- a/drivers/hwmon/w83781d.c
+++ b/drivers/hwmon/w83781d.c
@@ -630,7 +630,7 @@ show_fan_div_reg(struct device *dev, char *buf, int nr)
 
 /* Note: we save and restore the fan minimum here, because its value is
    determined in part by the fan divisor.  This follows the principle of
-   least suprise; the user doesn't expect the fan minimum to change just
+   least surprise; the user doesn't expect the fan minimum to change just
    because the divisor changed. */
 static ssize_t
 store_fan_div_reg(struct device *dev, const char *buf, size_t count, int nr)
diff --git a/drivers/hwmon/w83792d.c b/drivers/hwmon/w83792d.c
index 4ef884c216e2..e407c74bda35 100644
--- a/drivers/hwmon/w83792d.c
+++ b/drivers/hwmon/w83792d.c
@@ -463,7 +463,7 @@ show_fan_div(struct device *dev, struct device_attribute *attr,
 
 /* Note: we save and restore the fan minimum here, because its value is
    determined in part by the fan divisor.  This follows the principle of
-   least suprise; the user doesn't expect the fan minimum to change just
+   least surprise; the user doesn't expect the fan minimum to change just
    because the divisor changed. */
 static ssize_t
 store_fan_div(struct device *dev, struct device_attribute *attr,
diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c
index a5017de72da5..f033d732f387 100644
--- a/drivers/ide/ide-disk.c
+++ b/drivers/ide/ide-disk.c
@@ -37,7 +37,7 @@
  * Version 1.15		convert all calls to ide_raw_taskfile
  *				since args will return register content.
  * Version 1.16		added suspend-resume-checkpower
- * Version 1.17		do flush on standy, do flush on ATA < ATA6
+ * Version 1.17		do flush on standby, do flush on ATA < ATA6
  *			fix wcache setup.
  */
 
diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c
index 622a55c72f03..05ba8e01e635 100644
--- a/drivers/ide/ide-io.c
+++ b/drivers/ide/ide-io.c
@@ -1665,7 +1665,7 @@ irqreturn_t ide_intr (int irq, void *dev_id, struct pt_regs *regs)
  *	Initialize a request before we fill it in and send it down to
  *	ide_do_drive_cmd. Commands must be set up by this function. Right
  *	now it doesn't do a lot, but if that changes abusers will have a
- *	nasty suprise.
+ *	nasty surprise.
  */
 
 void ide_init_drive_cmd (struct request *rq)
diff --git a/drivers/ieee1394/hosts.c b/drivers/ieee1394/hosts.c
index 2d47b11777a5..ad49c040b674 100644
--- a/drivers/ieee1394/hosts.c
+++ b/drivers/ieee1394/hosts.c
@@ -103,7 +103,7 @@ static int alloc_hostnum_cb(struct hpsb_host *host, void *__data)
  * driver specific parts, enable the controller and make it available
  * to the general subsystem using hpsb_add_host().
  *
- * Return Value: a pointer to the &hpsb_host if succesful, %NULL if
+ * Return Value: a pointer to the &hpsb_host if successful, %NULL if
  * no memory was available.
  */
 static DEFINE_MUTEX(host_num_alloc);
diff --git a/drivers/ieee1394/ieee1394_core.h b/drivers/ieee1394/ieee1394_core.h
index e7b55e895f50..0ecbf335c64f 100644
--- a/drivers/ieee1394/ieee1394_core.h
+++ b/drivers/ieee1394/ieee1394_core.h
@@ -139,7 +139,7 @@ int hpsb_bus_reset(struct hpsb_host *host);
 
 /*
  * Hand over received selfid packet to the core.  Complement check (second
- * quadlet is complement of first) is expected to be done and succesful.
+ * quadlet is complement of first) is expected to be done and successful.
  */
 void hpsb_selfid_received(struct hpsb_host *host, quadlet_t sid);
 
diff --git a/drivers/isdn/divert/isdn_divert.c b/drivers/isdn/divert/isdn_divert.c
index f1a1f9a9b88e..1f5ebe9ee72c 100644
--- a/drivers/isdn/divert/isdn_divert.c
+++ b/drivers/isdn/divert/isdn_divert.c
@@ -592,7 +592,7 @@ static int put_address(char *st, u_char *p, int len)
 } /* put_address */
 
 /*************************************/
-/* report a succesfull interrogation */
+/* report a successful interrogation */
 /*************************************/
 static int interrogate_success(isdn_ctrl *ic, struct call_struc *cs)
 { char *src = ic->parm.dss1_io.data;
diff --git a/drivers/media/video/bt8xx/bttv-cards.c b/drivers/media/video/bt8xx/bttv-cards.c
index 3116345c93b1..e68a6d2fff24 100644
--- a/drivers/media/video/bt8xx/bttv-cards.c
+++ b/drivers/media/video/bt8xx/bttv-cards.c
@@ -4848,7 +4848,7 @@ static void picolo_tetra_muxsel (struct bttv* btv, unsigned int input)
  *
  * The IVC120G security card has 4 i2c controlled TDA8540 matrix
  * swichers to provide 16 channels to MUX0. The TDA8540's have
- * 4 indepedant outputs and as such the IVC120G also has the
+ * 4 independent outputs and as such the IVC120G also has the
  * optional "Monitor Out" bus. This allows the card to be looking
  * at one input while the monitor is looking at another.
  *
diff --git a/drivers/net/3c501.c b/drivers/net/3c501.c
index bb44509fd404..07136ec423bd 100644
--- a/drivers/net/3c501.c
+++ b/drivers/net/3c501.c
@@ -508,11 +508,11 @@ static int el_start_xmit(struct sk_buff *skb, struct net_device *dev)
  * speak of. We simply pull the packet out of its PIO buffer (which is slow)
  * and queue it for the kernel. Then we reset the card for the next packet.
  *
- * We sometimes get suprise interrupts late both because the SMP IRQ delivery
+ * We sometimes get surprise interrupts late both because the SMP IRQ delivery
  * is message passing and because the card sometimes seems to deliver late. I
  * think if it is part way through a receive and the mode is changed it carries
  * on receiving and sends us an interrupt. We have to band aid all these cases
- * to get a sensible 150kbytes/second performance. Even then you want a small
+ * to get a sensible 150kBytes/second performance. Even then you want a small
  * TCP window.
  */
 
diff --git a/drivers/net/irda/irport.c b/drivers/net/irda/irport.c
index 98fa5319e5cc..44efd49bf4a9 100644
--- a/drivers/net/irda/irport.c
+++ b/drivers/net/irda/irport.c
@@ -386,7 +386,7 @@ static int __irport_change_speed(struct irda_task *task)
 	/* Locking notes : this function may be called from irq context with
 	 * spinlock, via irport_write_wakeup(), or from non-interrupt without
 	 * spinlock (from the task timer). Yuck !
-	 * This is ugly, and unsafe is the spinlock is not already aquired.
+	 * This is ugly, and unsafe is the spinlock is not already acquired.
 	 * This will be fixed when irda-task get rewritten.
 	 * Jean II */
 	if (!spin_is_locked(&self->lock)) {
diff --git a/drivers/net/pcmcia/smc91c92_cs.c b/drivers/net/pcmcia/smc91c92_cs.c
index e74bf5014ef6..a73d54553030 100644
--- a/drivers/net/pcmcia/smc91c92_cs.c
+++ b/drivers/net/pcmcia/smc91c92_cs.c
@@ -1883,7 +1883,7 @@ static void smc_reset(struct net_device *dev)
     /* Set the Window 1 control, configuration and station addr registers.
        No point in writing the I/O base register ;-> */
     SMC_SELECT_BANK(1);
-    /* Automatically release succesfully transmitted packets,
+    /* Automatically release successfully transmitted packets,
        Accept link errors, counter and Tx error interrupts. */
     outw(CTL_AUTO_RELEASE | CTL_TE_ENABLE | CTL_CR_ENABLE,
 	 ioaddr + CONTROL);
diff --git a/drivers/net/wireless/ipw2100.c b/drivers/net/wireless/ipw2100.c
index 72335c8eb97f..94aeb23a7729 100644
--- a/drivers/net/wireless/ipw2100.c
+++ b/drivers/net/wireless/ipw2100.c
@@ -1485,7 +1485,7 @@ static int ipw2100_hw_stop_adapter(struct ipw2100_priv *priv)
 		 *
 		 * Sending the PREPARE_FOR_POWER_DOWN will restrict the
 		 * hardware from going into standby mode and will transition
-		 * out of D0-standy if it is already in that state.
+		 * out of D0-standby if it is already in that state.
 		 *
 		 * STATUS_PREPARE_POWER_DOWN_COMPLETE will be sent by the
 		 * driver upon completion.  Once received, the driver can
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index d408a3c30426..23d3b17c8cad 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -369,7 +369,7 @@ pci_set_power_state(struct pci_dev *dev, pci_power_t state)
 
 	/*
 	 * Give firmware a chance to be called, such as ACPI _PRx, _PSx
-	 * Firmware method after natice method ?
+	 * Firmware method after native method ?
 	 */
 	if (platform_pci_set_power_state)
 		platform_pci_set_power_state(dev, state);
diff --git a/drivers/s390/scsi/zfcp_erp.c b/drivers/s390/scsi/zfcp_erp.c
index 4682c8b8bd24..909731b99d26 100644
--- a/drivers/s390/scsi/zfcp_erp.c
+++ b/drivers/s390/scsi/zfcp_erp.c
@@ -167,7 +167,7 @@ zfcp_fsf_scsi_er_timeout_handler(unsigned long data)
  *		initiates adapter recovery which is done
  *		asynchronously
  *
- * returns:	0	- initiated action succesfully
+ * returns:	0	- initiated action successfully
  *		<0	- failed to initiate action
  */
 int
@@ -203,7 +203,7 @@ zfcp_erp_adapter_reopen_internal(struct zfcp_adapter *adapter, int clear_mask)
  * purpose:	Wrappper for zfcp_erp_adapter_reopen_internal
  *              used to ensure the correct locking
  *
- * returns:	0	- initiated action succesfully
+ * returns:	0	- initiated action successfully
  *		<0	- failed to initiate action
  */
 int
@@ -469,7 +469,7 @@ zfcp_test_link(struct zfcp_port *port)
  *		initiates Forced Reopen recovery which is done
  *		asynchronously
  *
- * returns:	0	- initiated action succesfully
+ * returns:	0	- initiated action successfully
  *		<0	- failed to initiate action
  */
 static int
@@ -509,7 +509,7 @@ zfcp_erp_port_forced_reopen_internal(struct zfcp_port *port, int clear_mask)
  * purpose:	Wrappper for zfcp_erp_port_forced_reopen_internal
  *              used to ensure the correct locking
  *
- * returns:	0	- initiated action succesfully
+ * returns:	0	- initiated action successfully
  *		<0	- failed to initiate action
  */
 int
@@ -536,7 +536,7 @@ zfcp_erp_port_forced_reopen(struct zfcp_port *port, int clear_mask)
  *		initiates Reopen recovery which is done
  *		asynchronously
  *
- * returns:	0	- initiated action succesfully
+ * returns:	0	- initiated action successfully
  *		<0	- failed to initiate action
  */
 static int
@@ -605,7 +605,7 @@ zfcp_erp_port_reopen(struct zfcp_port *port, int clear_mask)
  *		initiates Reopen recovery which is done
  *		asynchronously
  *
- * returns:	0	- initiated action succesfully
+ * returns:	0	- initiated action successfully
  *		<0	- failed to initiate action
  */
 static int
@@ -1805,7 +1805,7 @@ zfcp_erp_modify_unit_status(struct zfcp_unit *unit, u32 mask, int set_or_clear)
  * purpose:	Wrappper for zfcp_erp_port_reopen_all_internal
  *              used to ensure the correct locking
  *
- * returns:	0	- initiated action succesfully
+ * returns:	0	- initiated action successfully
  *		<0	- failed to initiate action
  */
 int
diff --git a/drivers/scsi/NCR5380.c b/drivers/scsi/NCR5380.c
index fa57e0b4a5fd..75f2f7ae2a8e 100644
--- a/drivers/scsi/NCR5380.c
+++ b/drivers/scsi/NCR5380.c
@@ -500,7 +500,7 @@ static void NCR5380_print_phase(struct Scsi_Host *instance)
 /* 
  * Function : int should_disconnect (unsigned char cmd)
  *
- * Purpose : decide weather a command would normally disconnect or 
+ * Purpose : decide whether a command would normally disconnect or 
  *      not, since if it won't disconnect we should go to sleep.
  *
  * Input : cmd - opcode of SCSI command
diff --git a/drivers/scsi/advansys.c b/drivers/scsi/advansys.c
index 5ee47555a8af..dd9fb3d91000 100644
--- a/drivers/scsi/advansys.c
+++ b/drivers/scsi/advansys.c
@@ -12374,7 +12374,7 @@ AscInitFromEEP(ASC_DVC_VAR *asc_dvc)
                 ASC_PRINT1(
 "AscInitFromEEP: Failed to re-write EEPROM with %d errors.\n", i);
         } else {
-                ASC_PRINT("AscInitFromEEP: Succesfully re-wrote EEPROM.");
+                ASC_PRINT("AscInitFromEEP: Successfully re-wrote EEPROM.\n");
         }
     }
     return (warn_code);
diff --git a/drivers/scsi/dc395x.c b/drivers/scsi/dc395x.c
index 183245254931..58b0748045ee 100644
--- a/drivers/scsi/dc395x.c
+++ b/drivers/scsi/dc395x.c
@@ -3771,7 +3771,7 @@ static void request_sense(struct AdapterCtlBlk *acb, struct DeviceCtlBlk *dcb,
  * @target: The target for the new device.
  * @lun: The lun for the new device.
  *
- * Return the new device if succesfull or NULL on failure.
+ * Return the new device if successful or NULL on failure.
  **/
 static struct DeviceCtlBlk *device_alloc(struct AdapterCtlBlk *acb,
 		u8 target, u8 lun)
diff --git a/drivers/scsi/ibmmca.c b/drivers/scsi/ibmmca.c
index 115f55471ed3..497f6642b2dc 100644
--- a/drivers/scsi/ibmmca.c
+++ b/drivers/scsi/ibmmca.c
@@ -760,7 +760,7 @@ static int device_inquiry(int host_index, int ldn)
 		while (!got_interrupt(host_index))
 			barrier();
 
-		/*if command succesful, break */
+		/*if command successful, break */
 		if ((stat_result(host_index) == IM_SCB_CMD_COMPLETED) || (stat_result(host_index) == IM_SCB_CMD_COMPLETED_WITH_RETRIES))
 			return 1;
 	}
@@ -885,7 +885,7 @@ static int immediate_assign(int host_index, unsigned int pun, unsigned int lun,
 		while (!got_interrupt(host_index))
 			barrier();
 
-		/*if command succesful, break */
+		/*if command successful, break */
 		if (stat_result(host_index) == IM_IMMEDIATE_CMD_COMPLETED)
 			return 1;
 	}
@@ -921,7 +921,7 @@ static int immediate_feature(int host_index, unsigned int speed, unsigned int ti
 			return 2;
 		} else
 			global_command_error_excuse = 0;
-		/*if command succesful, break */
+		/*if command successful, break */
 		if (stat_result(host_index) == IM_IMMEDIATE_CMD_COMPLETED)
 			return 1;
 	}
@@ -959,7 +959,7 @@ static int immediate_reset(int host_index, unsigned int ldn)
 			/* did not work, finish */
 			return 1;
 		}
-		/*if command succesful, break */
+		/*if command successful, break */
 		if (stat_result(host_index) == IM_IMMEDIATE_CMD_COMPLETED)
 			return 1;
 	}
diff --git a/drivers/scsi/ips.c b/drivers/scsi/ips.c
index 5353b28b2939..78f2ff736c3e 100644
--- a/drivers/scsi/ips.c
+++ b/drivers/scsi/ips.c
@@ -6438,7 +6438,7 @@ ips_erase_bios(ips_ha_t * ha)
 		/* VPP failure */
 		return (1);
 
-	/* check for succesful flash */
+	/* check for successful flash */
 	if (status & 0x30)
 		/* sequence error */
 		return (1);
@@ -6550,7 +6550,7 @@ ips_erase_bios_memio(ips_ha_t * ha)
 		/* VPP failure */
 		return (1);
 
-	/* check for succesful flash */
+	/* check for successful flash */
 	if (status & 0x30)
 		/* sequence error */
 		return (1);
diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c
index 1272dd249af3..b5218fc0ac86 100644
--- a/drivers/scsi/st.c
+++ b/drivers/scsi/st.c
@@ -2818,7 +2818,7 @@ static int st_int_ioctl(struct scsi_tape *STp, unsigned int cmd_in, unsigned lon
 		    (cmdstatp->sense_hdr.sense_key == NO_SENSE ||
 		     cmdstatp->sense_hdr.sense_key == RECOVERED_ERROR) &&
 		    undone == 0) {
-			ioctl_result = 0;	/* EOF written succesfully at EOM */
+			ioctl_result = 0;	/* EOF written successfully at EOM */
 			if (fileno >= 0)
 				fileno++;
 			STps->drv_file = fileno;
diff --git a/fs/9p/mux.c b/fs/9p/mux.c
index f4407eb276c7..12e1baa4508d 100644
--- a/fs/9p/mux.c
+++ b/fs/9p/mux.c
@@ -712,7 +712,7 @@ static void v9fs_read_work(void *a)
  * v9fs_send_request - send 9P request
  * The function can sleep until the request is scheduled for sending.
  * The function can be interrupted. Return from the function is not
- * a guarantee that the request is sent succesfully. Can return errors
+ * a guarantee that the request is sent successfully. Can return errors
  * that can be retrieved by PTR_ERR macros.
  *
  * @m: mux data
diff --git a/fs/aio.c b/fs/aio.c
index 8c34a62df7d7..950630187acc 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -641,7 +641,7 @@ static inline int __queue_kicked_iocb(struct kiocb *iocb)
  *	invoked both for initial i/o submission and
  *	subsequent retries via the aio_kick_handler.
  *	Expects to be invoked with iocb->ki_ctx->lock
- *	already held. The lock is released and reaquired
+ *	already held. The lock is released and reacquired
  *	as needed during processing.
  *
  * Calls the iocb retry method (already setup for the
diff --git a/fs/jffs2/summary.c b/fs/jffs2/summary.c
index 0b02fc79e4d1..be1acc3dad97 100644
--- a/fs/jffs2/summary.c
+++ b/fs/jffs2/summary.c
@@ -43,7 +43,7 @@ int jffs2_sum_init(struct jffs2_sb_info *c)
 		return -ENOMEM;
 	}
 
-	dbg_summary("returned succesfully\n");
+	dbg_summary("returned successfully\n");
 
 	return 0;
 }
diff --git a/fs/jfs/jfs_extent.c b/fs/jfs/jfs_extent.c
index 5549378358bf..4d52593a5fc6 100644
--- a/fs/jfs/jfs_extent.c
+++ b/fs/jfs/jfs_extent.c
@@ -126,7 +126,7 @@ extAlloc(struct inode *ip, s64 xlen, s64 pno, xad_t * xp, boolean_t abnr)
 
 	/* allocate the disk blocks for the extent.  initially, extBalloc()
 	 * will try to allocate disk blocks for the requested size (xlen). 
-	 * if this fails (xlen contigious free blocks not avaliable), it'll
+	 * if this fails (xlen contiguous free blocks not avaliable), it'll
 	 * try to allocate a smaller number of blocks (producing a smaller
 	 * extent), with this smaller number of blocks consisting of the
 	 * requested number of blocks rounded down to the next smaller
@@ -493,7 +493,7 @@ int extFill(struct inode *ip, xad_t * xp)
  *
  *		initially, we will try to allocate disk blocks for the
  *		requested size (nblocks).  if this fails (nblocks 
- *		contigious free blocks not avaliable), we'll try to allocate
+ *		contiguous free blocks not avaliable), we'll try to allocate
  *		a smaller number of blocks (producing a smaller extent), with
  *		this smaller number of blocks consisting of the requested
  *		number of blocks rounded down to the next smaller power of 2
@@ -529,7 +529,7 @@ extBalloc(struct inode *ip, s64 hint, s64 * nblocks, s64 * blkno)
 
 	/* get the number of blocks to initially attempt to allocate.
 	 * we'll first try the number of blocks requested unless this
-	 * number is greater than the maximum number of contigious free
+	 * number is greater than the maximum number of contiguous free
 	 * blocks in the map. in that case, we'll start off with the 
 	 * maximum free.
 	 */
@@ -586,7 +586,7 @@ extBalloc(struct inode *ip, s64 hint, s64 * nblocks, s64 * blkno)
  *		in place.  if this fails, we'll try to move the extent
  *		to a new set of blocks. if moving the extent, we initially
  *		will try to allocate disk blocks for the requested size
- *		(nnew).  if this fails 	(nnew contigious free blocks not
+ *		(nnew).  if this fails 	(new contiguous free blocks not
  *		avaliable), we'll try  to allocate a smaller number of
  *		blocks (producing a smaller extent), with this smaller
  *		number of blocks consisting of the requested number of
diff --git a/include/asm-i386/system.h b/include/asm-i386/system.h
index 0249f912a29c..cab0180567f9 100644
--- a/include/asm-i386/system.h
+++ b/include/asm-i386/system.h
@@ -427,7 +427,7 @@ static inline unsigned long long __cmpxchg64(volatile void *ptr, unsigned long l
  * does not enforce ordering, since there is no data dependency between
  * the read of "a" and the read of "b".  Therefore, on some CPUs, such
  * as Alpha, "y" could be set to 3 and "x" to 0.  Use rmb()
- * in cases like thiswhere there are no data dependencies.
+ * in cases like this where there are no data dependencies.
  **/
 
 #define read_barrier_depends()	do { } while(0)
diff --git a/include/asm-m32r/system.h b/include/asm-m32r/system.h
index 33567e8bfe6b..66c4742f09e7 100644
--- a/include/asm-m32r/system.h
+++ b/include/asm-m32r/system.h
@@ -318,7 +318,7 @@ __cmpxchg(volatile void *ptr, unsigned long old, unsigned long new, int size)
  * does not enforce ordering, since there is no data dependency between
  * the read of "a" and the read of "b".  Therefore, on some CPUs, such
  * as Alpha, "y" could be set to 3 and "x" to 0.  Use rmb()
- * in cases like thiswhere there are no data dependencies.
+ * in cases like this where there are no data dependencies.
  **/
 
 #define read_barrier_depends()	do { } while (0)
diff --git a/include/linux/sunrpc/gss_api.h b/include/linux/sunrpc/gss_api.h
index 9b8bcf125c18..6e112cc5cdda 100644
--- a/include/linux/sunrpc/gss_api.h
+++ b/include/linux/sunrpc/gss_api.h
@@ -126,7 +126,7 @@ struct gss_api_mech *gss_mech_get_by_pseudoflavor(u32);
 /* Just increments the mechanism's reference count and returns its input: */
 struct gss_api_mech * gss_mech_get(struct gss_api_mech *);
 
-/* For every succesful gss_mech_get or gss_mech_get_by_* call there must be a
+/* For every successful gss_mech_get or gss_mech_get_by_* call there must be a
  * corresponding call to gss_mech_put. */
 void gss_mech_put(struct gss_api_mech *);
 
diff --git a/lib/kernel_lock.c b/lib/kernel_lock.c
index cb5490ec00f2..e713e86811ae 100644
--- a/lib/kernel_lock.c
+++ b/lib/kernel_lock.c
@@ -14,7 +14,7 @@
  * The 'big kernel semaphore'
  *
  * This mutex is taken and released recursively by lock_kernel()
- * and unlock_kernel().  It is transparently dropped and reaquired
+ * and unlock_kernel().  It is transparently dropped and reacquired
  * over schedule().  It is used to protect legacy code that hasn't
  * been migrated to a proper locking design yet.
  *
@@ -92,7 +92,7 @@ void __lockfunc unlock_kernel(void)
  * The 'big kernel lock'
  *
  * This spinlock is taken and released recursively by lock_kernel()
- * and unlock_kernel().  It is transparently dropped and reaquired
+ * and unlock_kernel().  It is transparently dropped and reacquired
  * over schedule().  It is used to protect legacy code that hasn't
  * been migrated to a proper locking design yet.
  *
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6c1174fcf52c..9f86191bb632 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -266,7 +266,7 @@ static inline void rmv_page_order(struct page *page)
  * satisfies the following equation:
  *     P = B & ~(1 << O)
  *
- * Assumption: *_mem_map is contigious at least up to MAX_ORDER
+ * Assumption: *_mem_map is contiguous at least up to MAX_ORDER
  */
 static inline struct page *
 __page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order)
diff --git a/mm/readahead.c b/mm/readahead.c
index e39e416860d7..aa7ec424656a 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -390,8 +390,8 @@ int do_page_cache_readahead(struct address_space *mapping, struct file *filp,
  * Read 'nr_to_read' pages starting at page 'offset'. If the flag 'block'
  * is set wait till the read completes.  Otherwise attempt to read without
  * blocking.
- * Returns 1 meaning 'success' if read is succesfull without switching off
- * readhaead mode. Otherwise return failure.
+ * Returns 1 meaning 'success' if read is successful without switching off
+ * readahead mode. Otherwise return failure.
  */
 static int
 blockable_page_cache_readahead(struct address_space *mapping, struct file *filp,
diff --git a/net/sunrpc/auth_gss/gss_krb5_mech.c b/net/sunrpc/auth_gss/gss_krb5_mech.c
index 129e2bd36aff..b8714a87b34c 100644
--- a/net/sunrpc/auth_gss/gss_krb5_mech.c
+++ b/net/sunrpc/auth_gss/gss_krb5_mech.c
@@ -169,7 +169,7 @@ gss_import_sec_context_kerberos(const void *p,
 	}
 
 	ctx_id->internal_ctx_id = ctx;
-	dprintk("RPC:      Succesfully imported new context.\n");
+	dprintk("RPC:      Successfully imported new context.\n");
 	return 0;
 
 out_err_free_key2:
diff --git a/net/sunrpc/auth_gss/gss_spkm3_mech.c b/net/sunrpc/auth_gss/gss_spkm3_mech.c
index 5bf11ccba7cd..3d0432aa45c1 100644
--- a/net/sunrpc/auth_gss/gss_spkm3_mech.c
+++ b/net/sunrpc/auth_gss/gss_spkm3_mech.c
@@ -201,7 +201,7 @@ gss_import_sec_context_spkm3(const void *p, size_t len,
 
 	ctx_id->internal_ctx_id = ctx;
 
-	dprintk("Succesfully imported new spkm context.\n");
+	dprintk("Successfully imported new spkm context.\n");
 	return 0;
 
 out_err_free_key2:
diff --git a/sound/core/seq/seq_memory.h b/sound/core/seq/seq_memory.h
index 39c60d9e1efc..63e91431a29f 100644
--- a/sound/core/seq/seq_memory.h
+++ b/sound/core/seq/seq_memory.h
@@ -31,7 +31,7 @@ struct snd_seq_event_cell {
 	struct snd_seq_event_cell *next;	/* next cell */
 };
 
-/* design note: the pool is a contigious block of memory, if we dynamicly
+/* design note: the pool is a contiguous block of memory, if we dynamicly
    want to add additional cells to the pool be better store this in another
    pool as we need to know the base address of the pool when releasing
    memory. */
diff --git a/sound/oss/sb_ess.c b/sound/oss/sb_ess.c
index fae05fe3de43..180e95c87e3e 100644
--- a/sound/oss/sb_ess.c
+++ b/sound/oss/sb_ess.c
@@ -97,19 +97,19 @@
  *
  * The documentation is an adventure: it's close but not fully accurate. I
  * found out that after a reset some registers are *NOT* reset, though the
- * docs say the would be. Interresting ones are 0x7f, 0x7d and 0x7a. They are
- * related to the Audio 2 channel. I also was suprised about the consequenses
+ * docs say the would be. Interesting ones are 0x7f, 0x7d and 0x7a. They are
+ * related to the Audio 2 channel. I also was surprised about the consequences
  * of writing 0x00 to 0x7f (which should be done by reset): The ES1887 moves
  * into ES1888 mode. This means that it claims IRQ 11, which happens to be my
  * ISDN adapter. Needless to say it no longer worked. I now understand why
  * after rebooting 0x7f already was 0x05, the value of my choice: the BIOS
  * did it.
  *
- * Oh, and this is another trap: in ES1887 docs mixer register 0x70 is decribed
- * as if it's exactly the same as register 0xa1. This is *NOT* true. The
- * description of 0x70 in ES1869 docs is accurate however.
+ * Oh, and this is another trap: in ES1887 docs mixer register 0x70 is
+ * described as if it's exactly the same as register 0xa1. This is *NOT* true.
+ * The description of 0x70 in ES1869 docs is accurate however.
  * Well, the assumption about ES1869 was wrong: register 0x70 is very much
- * like register 0xa1, except that bit 7 is allways 1, whatever you want
+ * like register 0xa1, except that bit 7 is always 1, whatever you want
  * it to be.
  *
  * When using audio 2 mixer register 0x72 seems te be meaningless. Only 0xa2
@@ -117,10 +117,10 @@
  *
  * Software reset not being able to reset all registers is great! Especially
  * the fact that register 0x78 isn't reset is great when you wanna change back
- * to single dma operation (simplex): audio 2 is still operation, and uses the
- * same dma as audio 1: your ess changes into a funny echo machine.
+ * to single dma operation (simplex): audio 2 is still operational, and uses
+ * the same dma as audio 1: your ess changes into a funny echo machine.
  *
- * Received the new that ES1688 is detected as a ES1788. Did some thinking:
+ * Received the news that ES1688 is detected as a ES1788. Did some thinking:
  * the ES1887 detection scheme suggests in step 2 to try if bit 3 of register
  * 0x64 can be changed. This is inaccurate, first I inverted the * check: "If
  * can be modified, it's a 1688", which lead to a correct detection
@@ -135,7 +135,7 @@
  * About recognition of ESS chips
  *
  * The distinction of ES688, ES1688, ES1788, ES1887 and ES1888 is described in
- * a (preliminary ??) datasheet on ES1887. It's aim is to identify ES1887, but
+ * a (preliminary ??) datasheet on ES1887. Its aim is to identify ES1887, but
  * during detection the text claims that "this chip may be ..." when a step
  * fails. This scheme is used to distinct between the above chips.
  * It appears however that some PnP chips like ES1868 are recognized as ES1788
@@ -156,9 +156,9 @@
  *
  * The existing ES1688 support didn't take care of the ES1688+ recording
  * levels very well. Whenever a device was selected (recmask) for recording
- * it's recording level was loud, and it couldn't be changed. The fact that
+ * its recording level was loud, and it couldn't be changed. The fact that
  * internal register 0xb4 could take care of RECLEV, didn't work meaning until
- * it's value was restored every time the chip was reset; this reset the
+ * its value was restored every time the chip was reset; this reset the
  * value of 0xb4 too. I guess that's what 4front also had (have?) trouble with.
  *
  * About ES1887 support:
@@ -169,9 +169,9 @@
  * the latter case the recording volumes are 0.
  * Now recording levels of inputs can be controlled, by changing the playback
  * levels. Futhermore several devices can be recorded together (which is not
- * possible with the ES1688.
+ * possible with the ES1688).
  * Besides the separate recording level control for each input, the common
- * recordig level can also be controlled by RECLEV as described above.
+ * recording level can also be controlled by RECLEV as described above.
  *
  * Not only ES1887 have this recording mixer. I know the following from the
  * documentation:
diff --git a/sound/pci/cs5535audio/cs5535audio_pcm.c b/sound/pci/cs5535audio/cs5535audio_pcm.c
index f0a48693d687..5450a9e8f133 100644
--- a/sound/pci/cs5535audio/cs5535audio_pcm.c
+++ b/sound/pci/cs5535audio/cs5535audio_pcm.c
@@ -143,7 +143,7 @@ static int cs5535audio_build_dma_packets(struct cs5535audio *cs5535au,
 	if (dma->periods == periods && dma->period_bytes == period_bytes)
 		return 0;
 
-	/* the u32 cast is okay because in snd*create we succesfully told
+	/* the u32 cast is okay because in snd*create we successfully told
    	   pci alloc that we're only 32 bit capable so the uppper will be 0 */
 	addr = (u32) substream->runtime->dma_addr;
 	desc_addr = (u32) dma->desc_buf.addr;
-- 
cgit v1.2.3


From 7e047ef5fe2d52e83020e856b1bf2556a6a2ce98 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Mon, 26 Jun 2006 00:24:50 -0700
Subject: [PATCH] keys: sort out key quota system

Add the ability for key creation to overrun the user's quota in some
circumstances - notably when a session keyring is created and assigned to a
process that didn't previously have one.

This means it's still possible to log in, should PAM require the creation of a
new session keyring, and fix an overburdened key quota.

Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/key.h              | 13 ++++++++++---
 include/linux/security.h         | 11 +++++++----
 security/dummy.c                 |  3 ++-
 security/keys/internal.h         |  3 ++-
 security/keys/key.c              | 26 ++++++++++++++------------
 security/keys/keyctl.c           |  5 +++--
 security/keys/keyring.c          |  4 ++--
 security/keys/process_keys.c     | 24 +++++++++++++++++-------
 security/keys/request_key.c      | 36 +++++++++++++++++++++---------------
 security/keys/request_key_auth.c |  2 +-
 security/selinux/hooks.c         |  9 ++++++---
 11 files changed, 85 insertions(+), 51 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/key.h b/include/linux/key.h
index e81ebf910d0b..e693e729bc92 100644
--- a/include/linux/key.h
+++ b/include/linux/key.h
@@ -248,7 +248,14 @@ extern struct key *key_alloc(struct key_type *type,
 			     const char *desc,
 			     uid_t uid, gid_t gid,
 			     struct task_struct *ctx,
-			     key_perm_t perm, int not_in_quota);
+			     key_perm_t perm,
+			     unsigned long flags);
+
+
+#define KEY_ALLOC_IN_QUOTA	0x0000	/* add to quota, reject if would overrun */
+#define KEY_ALLOC_QUOTA_OVERRUN	0x0001	/* add to quota, permit even if overrun */
+#define KEY_ALLOC_NOT_IN_QUOTA	0x0002	/* not in quota */
+
 extern int key_payload_reserve(struct key *key, size_t datalen);
 extern int key_instantiate_and_link(struct key *key,
 				    const void *data,
@@ -285,7 +292,7 @@ extern key_ref_t key_create_or_update(key_ref_t keyring,
 				      const char *description,
 				      const void *payload,
 				      size_t plen,
-				      int not_in_quota);
+				      unsigned long flags);
 
 extern int key_update(key_ref_t key,
 		      const void *payload,
@@ -299,7 +306,7 @@ extern int key_unlink(struct key *keyring,
 
 extern struct key *keyring_alloc(const char *description, uid_t uid, gid_t gid,
 				 struct task_struct *ctx,
-				 int not_in_quota,
+				 unsigned long flags,
 				 struct key *dest);
 
 extern int keyring_clear(struct key *keyring);
diff --git a/include/linux/security.h b/include/linux/security.h
index d2c17bd91a29..51805806f974 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -862,6 +862,7 @@ struct swap_info_struct;
  *	Permit allocation of a key and assign security data. Note that key does
  *	not have a serial number assigned at this point.
  *	@key points to the key.
+ *	@flags is the allocation flags
  *	Return 0 if permission is granted, -ve error otherwise.
  * @key_free:
  *	Notification of destruction; free security data.
@@ -1324,7 +1325,7 @@ struct security_operations {
 
 	/* key management security hooks */
 #ifdef CONFIG_KEYS
-	int (*key_alloc)(struct key *key, struct task_struct *tsk);
+	int (*key_alloc)(struct key *key, struct task_struct *tsk, unsigned long flags);
 	void (*key_free)(struct key *key);
 	int (*key_permission)(key_ref_t key_ref,
 			      struct task_struct *context,
@@ -3040,9 +3041,10 @@ static inline int security_xfrm_policy_lookup(struct xfrm_policy *xp, u32 sk_sid
 #ifdef CONFIG_KEYS
 #ifdef CONFIG_SECURITY
 static inline int security_key_alloc(struct key *key,
-				     struct task_struct *tsk)
+				     struct task_struct *tsk,
+				     unsigned long flags)
 {
-	return security_ops->key_alloc(key, tsk);
+	return security_ops->key_alloc(key, tsk, flags);
 }
 
 static inline void security_key_free(struct key *key)
@@ -3060,7 +3062,8 @@ static inline int security_key_permission(key_ref_t key_ref,
 #else
 
 static inline int security_key_alloc(struct key *key,
-				     struct task_struct *tsk)
+				     struct task_struct *tsk,
+				     unsigned long flags)
 {
 	return 0;
 }
diff --git a/security/dummy.c b/security/dummy.c
index c3c5493581e2..310fcdf7b749 100644
--- a/security/dummy.c
+++ b/security/dummy.c
@@ -870,7 +870,8 @@ static int dummy_setprocattr(struct task_struct *p, char *name, void *value, siz
 }
 
 #ifdef CONFIG_KEYS
-static inline int dummy_key_alloc(struct key *key, struct task_struct *ctx)
+static inline int dummy_key_alloc(struct key *key, struct task_struct *ctx,
+				  unsigned long flags)
 {
 	return 0;
 }
diff --git a/security/keys/internal.h b/security/keys/internal.h
index e066e6057955..3c2877f0663e 100644
--- a/security/keys/internal.h
+++ b/security/keys/internal.h
@@ -99,7 +99,8 @@ extern int install_process_keyring(struct task_struct *tsk);
 extern struct key *request_key_and_link(struct key_type *type,
 					const char *description,
 					const char *callout_info,
-					struct key *dest_keyring);
+					struct key *dest_keyring,
+					unsigned long flags);
 
 /*
  * request_key authorisation
diff --git a/security/keys/key.c b/security/keys/key.c
index 51f851557389..3601fddca9f2 100644
--- a/security/keys/key.c
+++ b/security/keys/key.c
@@ -248,7 +248,7 @@ static inline void key_alloc_serial(struct key *key)
  */
 struct key *key_alloc(struct key_type *type, const char *desc,
 		      uid_t uid, gid_t gid, struct task_struct *ctx,
-		      key_perm_t perm, int not_in_quota)
+		      key_perm_t perm, unsigned long flags)
 {
 	struct key_user *user = NULL;
 	struct key *key;
@@ -269,12 +269,14 @@ struct key *key_alloc(struct key_type *type, const char *desc,
 
 	/* check that the user's quota permits allocation of another key and
 	 * its description */
-	if (!not_in_quota) {
+	if (!(flags & KEY_ALLOC_NOT_IN_QUOTA)) {
 		spin_lock(&user->lock);
-		if (user->qnkeys + 1 >= KEYQUOTA_MAX_KEYS ||
-		    user->qnbytes + quotalen >= KEYQUOTA_MAX_BYTES
-		    )
-			goto no_quota;
+		if (!(flags & KEY_ALLOC_QUOTA_OVERRUN)) {
+			if (user->qnkeys + 1 >= KEYQUOTA_MAX_KEYS ||
+			    user->qnbytes + quotalen >= KEYQUOTA_MAX_BYTES
+			    )
+				goto no_quota;
+		}
 
 		user->qnkeys++;
 		user->qnbytes += quotalen;
@@ -308,7 +310,7 @@ struct key *key_alloc(struct key_type *type, const char *desc,
 	key->payload.data = NULL;
 	key->security = NULL;
 
-	if (!not_in_quota)
+	if (!(flags & KEY_ALLOC_NOT_IN_QUOTA))
 		key->flags |= 1 << KEY_FLAG_IN_QUOTA;
 
 	memset(&key->type_data, 0, sizeof(key->type_data));
@@ -318,7 +320,7 @@ struct key *key_alloc(struct key_type *type, const char *desc,
 #endif
 
 	/* let the security module know about the key */
-	ret = security_key_alloc(key, ctx);
+	ret = security_key_alloc(key, ctx, flags);
 	if (ret < 0)
 		goto security_error;
 
@@ -332,7 +334,7 @@ error:
 security_error:
 	kfree(key->description);
 	kmem_cache_free(key_jar, key);
-	if (!not_in_quota) {
+	if (!(flags & KEY_ALLOC_NOT_IN_QUOTA)) {
 		spin_lock(&user->lock);
 		user->qnkeys--;
 		user->qnbytes -= quotalen;
@@ -345,7 +347,7 @@ security_error:
 no_memory_3:
 	kmem_cache_free(key_jar, key);
 no_memory_2:
-	if (!not_in_quota) {
+	if (!(flags & KEY_ALLOC_NOT_IN_QUOTA)) {
 		spin_lock(&user->lock);
 		user->qnkeys--;
 		user->qnbytes -= quotalen;
@@ -761,7 +763,7 @@ key_ref_t key_create_or_update(key_ref_t keyring_ref,
 			       const char *description,
 			       const void *payload,
 			       size_t plen,
-			       int not_in_quota)
+			       unsigned long flags)
 {
 	struct key_type *ktype;
 	struct key *keyring, *key = NULL;
@@ -822,7 +824,7 @@ key_ref_t key_create_or_update(key_ref_t keyring_ref,
 
 	/* allocate a new key */
 	key = key_alloc(ktype, description, current->fsuid, current->fsgid,
-			current, perm, not_in_quota);
+			current, perm, flags);
 	if (IS_ERR(key)) {
 		key_ref = ERR_PTR(PTR_ERR(key));
 		goto error_3;
diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c
index ed71d86d2ce2..d74458522e98 100644
--- a/security/keys/keyctl.c
+++ b/security/keys/keyctl.c
@@ -102,7 +102,7 @@ asmlinkage long sys_add_key(const char __user *_type,
 	/* create or update the requested key and add it to the target
 	 * keyring */
 	key_ref = key_create_or_update(keyring_ref, type, description,
-				       payload, plen, 0);
+				       payload, plen, KEY_ALLOC_IN_QUOTA);
 	if (!IS_ERR(key_ref)) {
 		ret = key_ref_to_ptr(key_ref)->serial;
 		key_ref_put(key_ref);
@@ -184,7 +184,8 @@ asmlinkage long sys_request_key(const char __user *_type,
 
 	/* do the search */
 	key = request_key_and_link(ktype, description, callout_info,
-				   key_ref_to_ptr(dest_ref));
+				   key_ref_to_ptr(dest_ref),
+				   KEY_ALLOC_IN_QUOTA);
 	if (IS_ERR(key)) {
 		ret = PTR_ERR(key);
 		goto error5;
diff --git a/security/keys/keyring.c b/security/keys/keyring.c
index 1357207fc9df..6c282bd937e2 100644
--- a/security/keys/keyring.c
+++ b/security/keys/keyring.c
@@ -240,7 +240,7 @@ static long keyring_read(const struct key *keyring,
  * allocate a keyring and link into the destination keyring
  */
 struct key *keyring_alloc(const char *description, uid_t uid, gid_t gid,
-			  struct task_struct *ctx, int not_in_quota,
+			  struct task_struct *ctx, unsigned long flags,
 			  struct key *dest)
 {
 	struct key *keyring;
@@ -249,7 +249,7 @@ struct key *keyring_alloc(const char *description, uid_t uid, gid_t gid,
 	keyring = key_alloc(&key_type_keyring, description,
 			    uid, gid, ctx,
 			    (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_ALL,
-			    not_in_quota);
+			    flags);
 
 	if (!IS_ERR(keyring)) {
 		ret = key_instantiate_and_link(keyring, NULL, 0, dest, NULL);
diff --git a/security/keys/process_keys.c b/security/keys/process_keys.c
index 4d9825f9962c..32150cf7c37f 100644
--- a/security/keys/process_keys.c
+++ b/security/keys/process_keys.c
@@ -77,7 +77,8 @@ int alloc_uid_keyring(struct user_struct *user,
 	/* concoct a default session keyring */
 	sprintf(buf, "_uid_ses.%u", user->uid);
 
-	session_keyring = keyring_alloc(buf, user->uid, (gid_t) -1, ctx, 0, NULL);
+	session_keyring = keyring_alloc(buf, user->uid, (gid_t) -1, ctx,
+					KEY_ALLOC_IN_QUOTA, NULL);
 	if (IS_ERR(session_keyring)) {
 		ret = PTR_ERR(session_keyring);
 		goto error;
@@ -87,8 +88,8 @@ int alloc_uid_keyring(struct user_struct *user,
 	 * keyring */
 	sprintf(buf, "_uid.%u", user->uid);
 
-	uid_keyring = keyring_alloc(buf, user->uid, (gid_t) -1, ctx, 0,
-				    session_keyring);
+	uid_keyring = keyring_alloc(buf, user->uid, (gid_t) -1, ctx,
+				    KEY_ALLOC_IN_QUOTA, session_keyring);
 	if (IS_ERR(uid_keyring)) {
 		key_put(session_keyring);
 		ret = PTR_ERR(uid_keyring);
@@ -144,7 +145,8 @@ int install_thread_keyring(struct task_struct *tsk)
 
 	sprintf(buf, "_tid.%u", tsk->pid);
 
-	keyring = keyring_alloc(buf, tsk->uid, tsk->gid, tsk, 1, NULL);
+	keyring = keyring_alloc(buf, tsk->uid, tsk->gid, tsk,
+				KEY_ALLOC_QUOTA_OVERRUN, NULL);
 	if (IS_ERR(keyring)) {
 		ret = PTR_ERR(keyring);
 		goto error;
@@ -178,7 +180,8 @@ int install_process_keyring(struct task_struct *tsk)
 	if (!tsk->signal->process_keyring) {
 		sprintf(buf, "_pid.%u", tsk->tgid);
 
-		keyring = keyring_alloc(buf, tsk->uid, tsk->gid, tsk, 1, NULL);
+		keyring = keyring_alloc(buf, tsk->uid, tsk->gid, tsk,
+					KEY_ALLOC_QUOTA_OVERRUN, NULL);
 		if (IS_ERR(keyring)) {
 			ret = PTR_ERR(keyring);
 			goto error;
@@ -209,6 +212,7 @@ error:
 static int install_session_keyring(struct task_struct *tsk,
 				   struct key *keyring)
 {
+	unsigned long flags;
 	struct key *old;
 	char buf[20];
 
@@ -218,7 +222,12 @@ static int install_session_keyring(struct task_struct *tsk,
 	if (!keyring) {
 		sprintf(buf, "_ses.%u", tsk->tgid);
 
-		keyring = keyring_alloc(buf, tsk->uid, tsk->gid, tsk, 1, NULL);
+		flags = KEY_ALLOC_QUOTA_OVERRUN;
+		if (tsk->signal->session_keyring)
+			flags = KEY_ALLOC_IN_QUOTA;
+
+		keyring = keyring_alloc(buf, tsk->uid, tsk->gid, tsk,
+					flags, NULL);
 		if (IS_ERR(keyring))
 			return PTR_ERR(keyring);
 	}
@@ -728,7 +737,8 @@ long join_session_keyring(const char *name)
 	keyring = find_keyring_by_name(name, 0);
 	if (PTR_ERR(keyring) == -ENOKEY) {
 		/* not found - try and create a new one */
-		keyring = keyring_alloc(name, tsk->uid, tsk->gid, tsk, 0, NULL);
+		keyring = keyring_alloc(name, tsk->uid, tsk->gid, tsk,
+					KEY_ALLOC_IN_QUOTA, NULL);
 		if (IS_ERR(keyring)) {
 			ret = PTR_ERR(keyring);
 			goto error2;
diff --git a/security/keys/request_key.c b/security/keys/request_key.c
index eab66a06ca53..58d1efd4fc2c 100644
--- a/security/keys/request_key.c
+++ b/security/keys/request_key.c
@@ -48,8 +48,8 @@ static int call_sbin_request_key(struct key *key,
 	/* allocate a new session keyring */
 	sprintf(desc, "_req.%u", key->serial);
 
-	keyring = keyring_alloc(desc, current->fsuid, current->fsgid,
-				current, 1, NULL);
+	keyring = keyring_alloc(desc, current->fsuid, current->fsgid, current,
+				KEY_ALLOC_QUOTA_OVERRUN, NULL);
 	if (IS_ERR(keyring)) {
 		ret = PTR_ERR(keyring);
 		goto error_alloc;
@@ -126,7 +126,8 @@ error_alloc:
  */
 static struct key *__request_key_construction(struct key_type *type,
 					      const char *description,
-					      const char *callout_info)
+					      const char *callout_info,
+					      unsigned long flags)
 {
 	request_key_actor_t actor;
 	struct key_construction cons;
@@ -134,12 +135,12 @@ static struct key *__request_key_construction(struct key_type *type,
 	struct key *key, *authkey;
 	int ret, negated;
 
-	kenter("%s,%s,%s", type->name, description, callout_info);
+	kenter("%s,%s,%s,%lx", type->name, description, callout_info, flags);
 
 	/* create a key and add it to the queue */
 	key = key_alloc(type, description,
-			current->fsuid, current->fsgid,
-			current, KEY_POS_ALL, 0);
+			current->fsuid, current->fsgid, current, KEY_POS_ALL,
+			flags);
 	if (IS_ERR(key))
 		goto alloc_failed;
 
@@ -258,15 +259,16 @@ alloc_failed:
 static struct key *request_key_construction(struct key_type *type,
 					    const char *description,
 					    struct key_user *user,
-					    const char *callout_info)
+					    const char *callout_info,
+					    unsigned long flags)
 {
 	struct key_construction *pcons;
 	struct key *key, *ckey;
 
 	DECLARE_WAITQUEUE(myself, current);
 
-	kenter("%s,%s,{%d},%s",
-	       type->name, description, user->uid, callout_info);
+	kenter("%s,%s,{%d},%s,%lx",
+	       type->name, description, user->uid, callout_info, flags);
 
 	/* see if there's such a key under construction already */
 	down_write(&key_construction_sem);
@@ -282,7 +284,8 @@ static struct key *request_key_construction(struct key_type *type,
 	}
 
 	/* see about getting userspace to construct the key */
-	key = __request_key_construction(type, description, callout_info);
+	key = __request_key_construction(type, description, callout_info,
+					 flags);
  error:
 	kleave(" = %p", key);
 	return key;
@@ -389,14 +392,15 @@ static void request_key_link(struct key *key, struct key *dest_keyring)
 struct key *request_key_and_link(struct key_type *type,
 				 const char *description,
 				 const char *callout_info,
-				 struct key *dest_keyring)
+				 struct key *dest_keyring,
+				 unsigned long flags)
 {
 	struct key_user *user;
 	struct key *key;
 	key_ref_t key_ref;
 
-	kenter("%s,%s,%s,%p",
-	       type->name, description, callout_info, dest_keyring);
+	kenter("%s,%s,%s,%p,%lx",
+	       type->name, description, callout_info, dest_keyring, flags);
 
 	/* search all the process keyrings for a key */
 	key_ref = search_process_keyrings(type, description, type->match,
@@ -429,7 +433,8 @@ struct key *request_key_and_link(struct key_type *type,
 			/* ask userspace (returns NULL if it waited on a key
 			 * being constructed) */
 			key = request_key_construction(type, description,
-						       user, callout_info);
+						       user, callout_info,
+						       flags);
 			if (key)
 				break;
 
@@ -485,7 +490,8 @@ struct key *request_key(struct key_type *type,
 			const char *description,
 			const char *callout_info)
 {
-	return request_key_and_link(type, description, callout_info, NULL);
+	return request_key_and_link(type, description, callout_info, NULL,
+				    KEY_ALLOC_IN_QUOTA);
 
 } /* end request_key() */
 
diff --git a/security/keys/request_key_auth.c b/security/keys/request_key_auth.c
index cb9817ced3fd..cbf58a91b00a 100644
--- a/security/keys/request_key_auth.c
+++ b/security/keys/request_key_auth.c
@@ -187,7 +187,7 @@ struct key *request_key_auth_new(struct key *target, const char *callout_info)
 	authkey = key_alloc(&key_type_request_key_auth, desc,
 			    current->fsuid, current->fsgid, current,
 			    KEY_POS_VIEW | KEY_POS_READ | KEY_POS_SEARCH |
-			    KEY_USR_VIEW, 1);
+			    KEY_USR_VIEW, KEY_ALLOC_NOT_IN_QUOTA);
 	if (IS_ERR(authkey)) {
 		ret = PTR_ERR(authkey);
 		goto error_alloc;
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 79c16e31c884..13384fef0d60 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -4264,7 +4264,8 @@ static int selinux_setprocattr(struct task_struct *p,
 
 #ifdef CONFIG_KEYS
 
-static int selinux_key_alloc(struct key *k, struct task_struct *tsk)
+static int selinux_key_alloc(struct key *k, struct task_struct *tsk,
+			     unsigned long flags)
 {
 	struct task_security_struct *tsec = tsk->security;
 	struct key_security_struct *ksec;
@@ -4513,8 +4514,10 @@ static __init int selinux_init(void)
 
 #ifdef CONFIG_KEYS
 	/* Add security information to initial keyrings */
-	security_key_alloc(&root_user_keyring, current);
-	security_key_alloc(&root_session_keyring, current);
+	security_key_alloc(&root_user_keyring, current,
+			   KEY_ALLOC_NOT_IN_QUOTA);
+	security_key_alloc(&root_session_keyring, current,
+			   KEY_ALLOC_NOT_IN_QUOTA);
 #endif
 
 	return 0;
-- 
cgit v1.2.3


From 844dd05fec172d98b0dacecd9b9e9f6595204c13 Mon Sep 17 00:00:00 2001
From: Michael Buesch <mb@bu3sch.de>
Date: Mon, 26 Jun 2006 00:24:59 -0700
Subject: [PATCH] Add new generic HW RNG core

Signed-off-by: Michael Buesch <mb@bu3sch.de>
Cc: Jeff Garzik <jeff@garzik.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 MAINTAINERS                     |  15 ++
 drivers/char/Kconfig            |   2 +
 drivers/char/Makefile           |   1 +
 drivers/char/hw_random/Kconfig  |  11 ++
 drivers/char/hw_random/Makefile |   5 +
 drivers/char/hw_random/core.c   | 354 ++++++++++++++++++++++++++++++++++++++++
 include/linux/hw_random.h       |  50 ++++++
 7 files changed, 438 insertions(+)
 create mode 100644 drivers/char/hw_random/Kconfig
 create mode 100644 drivers/char/hw_random/Makefile
 create mode 100644 drivers/char/hw_random/core.c
 create mode 100644 include/linux/hw_random.h

(limited to 'include/linux')

diff --git a/MAINTAINERS b/MAINTAINERS
index 4dcd2f1f14d6..28c0a9676927 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1118,6 +1118,11 @@ L:	lm-sensors@lm-sensors.org
 W:	http://www.lm-sensors.nu/
 S:	Maintained
 
+HARDWARE RANDOM NUMBER GENERATOR CORE
+P:	Michael Buesch
+M:	mb@bu3sch.de
+S:	Maintained
+
 HARD DRIVE ACTIVE PROTECTION SYSTEM (HDAPS) DRIVER
 P:	Robert Love
 M:	rlove@rlove.org
@@ -1436,6 +1441,11 @@ P:	Tigran Aivazian
 M:	tigran@veritas.com
 S:	Maintained
 
+INTEL IXP4XX RANDOM NUMBER GENERATOR SUPPORT
+P:	Deepak Saxena
+M:	dsaxena@plexity.net
+S:	Maintained
+
 INTEL PRO/100 ETHERNET SUPPORT
 P:	John Ronciak
 M:	john.ronciak@intel.com
@@ -2725,6 +2735,11 @@ P:	Christoph Hellwig
 M:	hch@infradead.org
 S:	Maintained
 
+TI OMAP RANDOM NUMBER GENERATOR SUPPORT
+P:	Deepak Saxena
+M:	dsaxena@plexity.net
+S:	Maintained
+
 TI PARALLEL LINK CABLE DRIVER
 P:     Romain Lievin
 M:     roms@lpg.ticalc.org
diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig
index e4b44c68ad0f..ed31638bd75c 100644
--- a/drivers/char/Kconfig
+++ b/drivers/char/Kconfig
@@ -670,6 +670,8 @@ config NWFLASH
 
 	  If you're not sure, say N.
 
+source "drivers/char/hw_random/Kconfig"
+
 config NVRAM
 	tristate "/dev/nvram support"
 	depends on ATARI || X86 || ARM || GENERIC_NVRAM
diff --git a/drivers/char/Makefile b/drivers/char/Makefile
index 79aecef93b60..524105597ea7 100644
--- a/drivers/char/Makefile
+++ b/drivers/char/Makefile
@@ -75,6 +75,7 @@ endif
 obj-$(CONFIG_TOSHIBA)		+= toshiba.o
 obj-$(CONFIG_I8K)		+= i8k.o
 obj-$(CONFIG_DS1620)		+= ds1620.o
+obj-$(CONFIG_HW_RANDOM)		+= hw_random/
 obj-$(CONFIG_FTAPE)		+= ftape/
 obj-$(CONFIG_COBALT_LCD)	+= lcd.o
 obj-$(CONFIG_PPDEV)		+= ppdev.o
diff --git a/drivers/char/hw_random/Kconfig b/drivers/char/hw_random/Kconfig
new file mode 100644
index 000000000000..fa4a245127ad
--- /dev/null
+++ b/drivers/char/hw_random/Kconfig
@@ -0,0 +1,11 @@
+#
+# Hardware Random Number Generator (RNG) configuration
+#
+
+config HW_RANDOM
+	bool "Hardware Random Number Generator Core support"
+	default y
+	---help---
+	  Hardware Random Number Generator Core infrastructure.
+
+	  If unsure, say Y.
diff --git a/drivers/char/hw_random/Makefile b/drivers/char/hw_random/Makefile
new file mode 100644
index 000000000000..aa752af468ce
--- /dev/null
+++ b/drivers/char/hw_random/Makefile
@@ -0,0 +1,5 @@
+#
+# Makefile for HW Random Number Generator (RNG) device drivers.
+#
+
+obj-$(CONFIG_HW_RANDOM) += core.o
diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c
new file mode 100644
index 000000000000..88b026639f10
--- /dev/null
+++ b/drivers/char/hw_random/core.c
@@ -0,0 +1,354 @@
+/*
+        Added support for the AMD Geode LX RNG
+	(c) Copyright 2004-2005 Advanced Micro Devices, Inc.
+
+	derived from
+
+ 	Hardware driver for the Intel/AMD/VIA Random Number Generators (RNG)
+	(c) Copyright 2003 Red Hat Inc <jgarzik@redhat.com>
+
+ 	derived from
+
+        Hardware driver for the AMD 768 Random Number Generator (RNG)
+        (c) Copyright 2001 Red Hat Inc <alan@redhat.com>
+
+ 	derived from
+
+	Hardware driver for Intel i810 Random Number Generator (RNG)
+	Copyright 2000,2001 Jeff Garzik <jgarzik@pobox.com>
+	Copyright 2000,2001 Philipp Rumpf <prumpf@mandrakesoft.com>
+
+	Added generic RNG API
+	Copyright 2006 Michael Buesch <mbuesch@freenet.de>
+	Copyright 2005 (c) MontaVista Software, Inc.
+
+	Please read Documentation/hw_random.txt for details on use.
+
+	----------------------------------------------------------
+	This software may be used and distributed according to the terms
+        of the GNU General Public License, incorporated herein by reference.
+
+ */
+
+
+#include <linux/device.h>
+#include <linux/hw_random.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/miscdevice.h>
+#include <linux/delay.h>
+#include <asm/uaccess.h>
+
+
+#define RNG_MODULE_NAME		"hw_random"
+#define PFX			RNG_MODULE_NAME ": "
+#define RNG_MISCDEV_MINOR	183 /* official */
+
+
+static struct hwrng *current_rng;
+static LIST_HEAD(rng_list);
+static DEFINE_MUTEX(rng_mutex);
+
+
+static inline int hwrng_init(struct hwrng *rng)
+{
+	if (!rng->init)
+		return 0;
+	return rng->init(rng);
+}
+
+static inline void hwrng_cleanup(struct hwrng *rng)
+{
+	if (rng && rng->cleanup)
+		rng->cleanup(rng);
+}
+
+static inline int hwrng_data_present(struct hwrng *rng)
+{
+	if (!rng->data_present)
+		return 1;
+	return rng->data_present(rng);
+}
+
+static inline int hwrng_data_read(struct hwrng *rng, u32 *data)
+{
+	return rng->data_read(rng, data);
+}
+
+
+static int rng_dev_open(struct inode *inode, struct file *filp)
+{
+	/* enforce read-only access to this chrdev */
+	if ((filp->f_mode & FMODE_READ) == 0)
+		return -EINVAL;
+	if (filp->f_mode & FMODE_WRITE)
+		return -EINVAL;
+	return 0;
+}
+
+static ssize_t rng_dev_read(struct file *filp, char __user *buf,
+			    size_t size, loff_t *offp)
+{
+	u32 data;
+	ssize_t ret = 0;
+	int i, err = 0;
+	int data_present;
+	int bytes_read;
+
+	while (size) {
+		err = -ERESTARTSYS;
+		if (mutex_lock_interruptible(&rng_mutex))
+			goto out;
+		if (!current_rng) {
+			mutex_unlock(&rng_mutex);
+			err = -ENODEV;
+			goto out;
+		}
+		if (filp->f_flags & O_NONBLOCK) {
+			data_present = hwrng_data_present(current_rng);
+		} else {
+			/* Some RNG require some time between data_reads to gather
+			 * new entropy. Poll it.
+			 */
+			for (i = 0; i < 20; i++) {
+				data_present = hwrng_data_present(current_rng);
+				if (data_present)
+					break;
+				udelay(10);
+			}
+		}
+		bytes_read = 0;
+		if (data_present)
+			bytes_read = hwrng_data_read(current_rng, &data);
+		mutex_unlock(&rng_mutex);
+
+		err = -EAGAIN;
+		if (!bytes_read && (filp->f_flags & O_NONBLOCK))
+			goto out;
+
+		err = -EFAULT;
+		while (bytes_read && size) {
+			if (put_user((u8)data, buf++))
+				goto out;
+			size--;
+			ret++;
+			bytes_read--;
+			data >>= 8;
+		}
+
+		if (need_resched())
+			schedule_timeout_interruptible(1);
+		err = -ERESTARTSYS;
+		if (signal_pending(current))
+			goto out;
+	}
+out:
+	return ret ? : err;
+}
+
+
+static struct file_operations rng_chrdev_ops = {
+	.owner		= THIS_MODULE,
+	.open		= rng_dev_open,
+	.read		= rng_dev_read,
+};
+
+static struct miscdevice rng_miscdev = {
+	.minor		= RNG_MISCDEV_MINOR,
+	.name		= RNG_MODULE_NAME,
+	.fops		= &rng_chrdev_ops,
+};
+
+
+static ssize_t hwrng_attr_current_store(struct class_device *class,
+					const char *buf, size_t len)
+{
+	int err;
+	struct hwrng *rng;
+
+	err = mutex_lock_interruptible(&rng_mutex);
+	if (err)
+		return -ERESTARTSYS;
+	err = -ENODEV;
+	list_for_each_entry(rng, &rng_list, list) {
+		if (strcmp(rng->name, buf) == 0) {
+			if (rng == current_rng) {
+				err = 0;
+				break;
+			}
+			err = hwrng_init(rng);
+			if (err)
+				break;
+			hwrng_cleanup(current_rng);
+			current_rng = rng;
+			err = 0;
+			break;
+		}
+	}
+	mutex_unlock(&rng_mutex);
+
+	return err ? : len;
+}
+
+static ssize_t hwrng_attr_current_show(struct class_device *class,
+				       char *buf)
+{
+	int err;
+	ssize_t ret;
+	const char *name = "none";
+
+	err = mutex_lock_interruptible(&rng_mutex);
+	if (err)
+		return -ERESTARTSYS;
+	if (current_rng)
+		name = current_rng->name;
+	ret = snprintf(buf, PAGE_SIZE, "%s\n", name);
+	mutex_unlock(&rng_mutex);
+
+	return ret;
+}
+
+static ssize_t hwrng_attr_available_show(struct class_device *class,
+					 char *buf)
+{
+	int err;
+	ssize_t ret = 0;
+	struct hwrng *rng;
+
+	err = mutex_lock_interruptible(&rng_mutex);
+	if (err)
+		return -ERESTARTSYS;
+	buf[0] = '\0';
+	list_for_each_entry(rng, &rng_list, list) {
+		strncat(buf, rng->name, PAGE_SIZE - ret - 1);
+		ret += strlen(rng->name);
+		strncat(buf, " ", PAGE_SIZE - ret - 1);
+		ret++;
+	}
+	strncat(buf, "\n", PAGE_SIZE - ret - 1);
+	ret++;
+	mutex_unlock(&rng_mutex);
+
+	return ret;
+}
+
+static CLASS_DEVICE_ATTR(rng_current, S_IRUGO | S_IWUSR,
+			 hwrng_attr_current_show,
+			 hwrng_attr_current_store);
+static CLASS_DEVICE_ATTR(rng_available, S_IRUGO,
+			 hwrng_attr_available_show,
+			 NULL);
+
+
+static void unregister_miscdev(void)
+{
+	class_device_remove_file(rng_miscdev.class,
+				 &class_device_attr_rng_available);
+	class_device_remove_file(rng_miscdev.class,
+				 &class_device_attr_rng_current);
+	misc_deregister(&rng_miscdev);
+}
+
+static int register_miscdev(void)
+{
+	int err;
+
+	err = misc_register(&rng_miscdev);
+	if (err)
+		goto out;
+	err = class_device_create_file(rng_miscdev.class,
+				       &class_device_attr_rng_current);
+	if (err)
+		goto err_misc_dereg;
+	err = class_device_create_file(rng_miscdev.class,
+				       &class_device_attr_rng_available);
+	if (err)
+		goto err_remove_current;
+out:
+	return err;
+
+err_remove_current:
+	class_device_remove_file(rng_miscdev.class,
+				 &class_device_attr_rng_current);
+err_misc_dereg:
+	misc_deregister(&rng_miscdev);
+	goto out;
+}
+
+int hwrng_register(struct hwrng *rng)
+{
+	int must_register_misc;
+	int err = -EINVAL;
+	struct hwrng *old_rng, *tmp;
+
+	if (rng->name == NULL ||
+	    rng->data_read == NULL)
+		goto out;
+
+	mutex_lock(&rng_mutex);
+
+	/* Must not register two RNGs with the same name. */
+	err = -EEXIST;
+	list_for_each_entry(tmp, &rng_list, list) {
+		if (strcmp(tmp->name, rng->name) == 0)
+			goto out_unlock;
+	}
+
+	must_register_misc = (current_rng == NULL);
+	old_rng = current_rng;
+	if (!old_rng) {
+		err = hwrng_init(rng);
+		if (err)
+			goto out_unlock;
+		current_rng = rng;
+	}
+	err = 0;
+	if (must_register_misc) {
+		err = register_miscdev();
+		if (err) {
+			if (!old_rng) {
+				hwrng_cleanup(rng);
+				current_rng = NULL;
+			}
+			goto out_unlock;
+		}
+	}
+	INIT_LIST_HEAD(&rng->list);
+	list_add_tail(&rng->list, &rng_list);
+out_unlock:
+	mutex_unlock(&rng_mutex);
+out:
+	return err;
+}
+EXPORT_SYMBOL_GPL(hwrng_register);
+
+void hwrng_unregister(struct hwrng *rng)
+{
+	int err;
+
+	mutex_lock(&rng_mutex);
+
+	list_del(&rng->list);
+	if (current_rng == rng) {
+		hwrng_cleanup(rng);
+		if (list_empty(&rng_list)) {
+			current_rng = NULL;
+		} else {
+			current_rng = list_entry(rng_list.prev, struct hwrng, list);
+			err = hwrng_init(current_rng);
+			if (err)
+				current_rng = NULL;
+		}
+	}
+	if (list_empty(&rng_list))
+		unregister_miscdev();
+
+	mutex_unlock(&rng_mutex);
+}
+EXPORT_SYMBOL_GPL(hwrng_unregister);
+
+
+MODULE_DESCRIPTION("H/W Random Number Generator (RNG) driver");
+MODULE_LICENSE("GPL");
diff --git a/include/linux/hw_random.h b/include/linux/hw_random.h
new file mode 100644
index 000000000000..21ea7610e177
--- /dev/null
+++ b/include/linux/hw_random.h
@@ -0,0 +1,50 @@
+/*
+	Hardware Random Number Generator
+
+	Please read Documentation/hw_random.txt for details on use.
+
+	----------------------------------------------------------
+	This software may be used and distributed according to the terms
+        of the GNU General Public License, incorporated herein by reference.
+
+ */
+
+#ifndef LINUX_HWRANDOM_H_
+#define LINUX_HWRANDOM_H_
+#ifdef __KERNEL__
+
+#include <linux/types.h>
+#include <linux/list.h>
+
+/**
+ * struct hwrng - Hardware Random Number Generator driver
+ * @name:		Unique RNG name.
+ * @init:		Initialization callback (can be NULL).
+ * @cleanup:		Cleanup callback (can be NULL).
+ * @data_present:	Callback to determine if data is available
+ *			on the RNG. If NULL, it is assumed that
+ *			there is always data available.
+ * @data_read:		Read data from the RNG device.
+ *			Returns the number of lower random bytes in "data".
+ *			Must not be NULL.
+ * @priv:		Private data, for use by the RNG driver.
+ */
+struct hwrng {
+	const char *name;
+	int (*init)(struct hwrng *rng);
+	void (*cleanup)(struct hwrng *rng);
+	int (*data_present)(struct hwrng *rng);
+	int (*data_read)(struct hwrng *rng, u32 *data);
+	unsigned long priv;
+
+	/* internal. */
+	struct list_head list;
+};
+
+/** Register a new Hardware Random Number Generator driver. */
+extern int hwrng_register(struct hwrng *rng);
+/** Unregister a Hardware Random Number Generator driver. */
+extern void hwrng_unregister(struct hwrng *rng);
+
+#endif /* __KERNEL__ */
+#endif /* LINUX_HWRANDOM_H_ */
-- 
cgit v1.2.3


From 734efb467b31e56c2f9430590a9aa867ecf3eea1 Mon Sep 17 00:00:00 2001
From: john stultz <johnstul@us.ibm.com>
Date: Mon, 26 Jun 2006 00:25:05 -0700
Subject: [PATCH] Time: Clocksource Infrastructure

This introduces the clocksource management infrastructure.  A clocksource is a
driver-like architecture generic abstraction of a free-running counter.  This
code defines the clocksource structure, and provides management code for
registering, selecting, accessing and scaling clocksources.

Additionally, this includes the trivial jiffies clocksource, a lowest common
denominator clocksource, provided mainly for use as an example.

[hirofumi@mail.parknet.co.jp: Don't enable IRQ too early]
Signed-off-by: John Stultz <johnstul@us.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
Signed-off-by: John Stultz <johnstul@us.ibm.com>
Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/kernel-parameters.txt |  14 +-
 include/linux/clocksource.h         | 181 +++++++++++++++++++
 kernel/time/Makefile                |   1 +
 kernel/time/clocksource.c           | 344 ++++++++++++++++++++++++++++++++++++
 kernel/time/jiffies.c               |  73 ++++++++
 5 files changed, 609 insertions(+), 4 deletions(-)
 create mode 100644 include/linux/clocksource.h
 create mode 100644 kernel/time/Makefile
 create mode 100644 kernel/time/clocksource.c
 create mode 100644 kernel/time/jiffies.c

(limited to 'include/linux')

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index bca6f389da66..968631678d4a 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -61,6 +61,7 @@ parameter is applicable:
 	MTD	MTD support is enabled.
 	NET	Appropriate network support is enabled.
 	NUMA	NUMA support is enabled.
+	GENERIC_TIME The generic timeofday code is enabled.
 	NFS	Appropriate NFS support is enabled.
 	OSS	OSS sound support is enabled.
 	PARIDE	The ParIDE subsystem is enabled.
@@ -341,10 +342,11 @@ running once the system is up.
 			Value can be changed at runtime via
 				/selinux/checkreqprot.
 
- 	clock=		[BUGS=IA-32,HW] gettimeofday timesource override.
-			Forces specified timesource (if avaliable) to be used
-			when calculating gettimeofday(). If specicified
-			timesource is not avalible, it defaults to PIT.
+	clock=		[BUGS=IA-32, HW] gettimeofday clocksource override.
+			[Deprecated]
+			Forces specified clocksource (if avaliable) to be used
+			when calculating gettimeofday(). If specified
+			clocksource is not avalible, it defaults to PIT.
 			Format: { pit | tsc | cyclone | pmtmr }
 
 	disable_8254_timer
@@ -1617,6 +1619,10 @@ running once the system is up.
 
 	time		Show timing data prefixed to each printk message line
 
+	clocksource=	[GENERIC_TIME] Override the default clocksource
+			Override the default clocksource and use the clocksource
+			with the name specified.
+
 	tipar.timeout=	[HW,PPT]
 			Set communications timeout in tenths of a second
 			(default 15).
diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h
new file mode 100644
index 000000000000..c4187cda0ee4
--- /dev/null
+++ b/include/linux/clocksource.h
@@ -0,0 +1,181 @@
+/*  linux/include/linux/clocksource.h
+ *
+ *  This file contains the structure definitions for clocksources.
+ *
+ *  If you are not a clocksource, or timekeeping code, you should
+ *  not be including this file!
+ */
+#ifndef _LINUX_CLOCKSOURCE_H
+#define _LINUX_CLOCKSOURCE_H
+
+#include <linux/types.h>
+#include <linux/timex.h>
+#include <linux/time.h>
+#include <linux/list.h>
+#include <asm/div64.h>
+#include <asm/io.h>
+
+/* clocksource cycle base type */
+typedef u64 cycle_t;
+
+/**
+ * struct clocksource - hardware abstraction for a free running counter
+ *	Provides mostly state-free accessors to the underlying hardware.
+ *
+ * @name:		ptr to clocksource name
+ * @list:		list head for registration
+ * @rating:		rating value for selection (higher is better)
+ *			To avoid rating inflation the following
+ *			list should give you a guide as to how
+ *			to assign your clocksource a rating
+ *			1-99: Unfit for real use
+ *				Only available for bootup and testing purposes.
+ *			100-199: Base level usability.
+ *				Functional for real use, but not desired.
+ *			200-299: Good.
+ *				A correct and usable clocksource.
+ *			300-399: Desired.
+ *				A reasonably fast and accurate clocksource.
+ *			400-499: Perfect
+ *				The ideal clocksource. A must-use where
+ *				available.
+ * @read:		returns a cycle value
+ * @mask:		bitmask for two's complement
+ *			subtraction of non 64 bit counters
+ * @mult:		cycle to nanosecond multiplier
+ * @shift:		cycle to nanosecond divisor (power of two)
+ * @update_callback:	called when safe to alter clocksource values
+ * @is_continuous:	defines if clocksource is free-running.
+ * @interval_cycles:	Used internally by timekeeping core, please ignore.
+ * @interval_snsecs:	Used internally by timekeeping core, please ignore.
+ */
+struct clocksource {
+	char *name;
+	struct list_head list;
+	int rating;
+	cycle_t (*read)(void);
+	cycle_t mask;
+	u32 mult;
+	u32 shift;
+	int (*update_callback)(void);
+	int is_continuous;
+
+	/* timekeeping specific data, ignore */
+	cycle_t interval_cycles;
+	u64 interval_snsecs;
+};
+
+
+/**
+ * clocksource_khz2mult - calculates mult from khz and shift
+ * @khz:		Clocksource frequency in KHz
+ * @shift_constant:	Clocksource shift factor
+ *
+ * Helper functions that converts a khz counter frequency to a timsource
+ * multiplier, given the clocksource shift value
+ */
+static inline u32 clocksource_khz2mult(u32 khz, u32 shift_constant)
+{
+	/*  khz = cyc/(Million ns)
+	 *  mult/2^shift  = ns/cyc
+	 *  mult = ns/cyc * 2^shift
+	 *  mult = 1Million/khz * 2^shift
+	 *  mult = 1000000 * 2^shift / khz
+	 *  mult = (1000000<<shift) / khz
+	 */
+	u64 tmp = ((u64)1000000) << shift_constant;
+
+	tmp += khz/2; /* round for do_div */
+	do_div(tmp, khz);
+
+	return (u32)tmp;
+}
+
+/**
+ * clocksource_hz2mult - calculates mult from hz and shift
+ * @hz:			Clocksource frequency in Hz
+ * @shift_constant:	Clocksource shift factor
+ *
+ * Helper functions that converts a hz counter
+ * frequency to a timsource multiplier, given the
+ * clocksource shift value
+ */
+static inline u32 clocksource_hz2mult(u32 hz, u32 shift_constant)
+{
+	/*  hz = cyc/(Billion ns)
+	 *  mult/2^shift  = ns/cyc
+	 *  mult = ns/cyc * 2^shift
+	 *  mult = 1Billion/hz * 2^shift
+	 *  mult = 1000000000 * 2^shift / hz
+	 *  mult = (1000000000<<shift) / hz
+	 */
+	u64 tmp = ((u64)1000000000) << shift_constant;
+
+	tmp += hz/2; /* round for do_div */
+	do_div(tmp, hz);
+
+	return (u32)tmp;
+}
+
+/**
+ * read_clocksource: - Access the clocksource's current cycle value
+ * @cs:		pointer to clocksource being read
+ *
+ * Uses the clocksource to return the current cycle_t value
+ */
+static inline cycle_t read_clocksource(struct clocksource *cs)
+{
+	return cs->read();
+}
+
+/**
+ * cyc2ns - converts clocksource cycles to nanoseconds
+ * @cs:		Pointer to clocksource
+ * @cycles:	Cycles
+ *
+ * Uses the clocksource and ntp ajdustment to convert cycle_ts to nanoseconds.
+ *
+ * XXX - This could use some mult_lxl_ll() asm optimization
+ */
+static inline s64 cyc2ns(struct clocksource *cs, cycle_t cycles)
+{
+	u64 ret = (u64)cycles;
+	ret = (ret * cs->mult) >> cs->shift;
+	return ret;
+}
+
+/**
+ * calculate_clocksource_interval - Calculates a clocksource interval struct
+ *
+ * @c:		Pointer to clocksource.
+ * @length_nsec: Desired interval length in nanoseconds.
+ *
+ * Calculates a fixed cycle/nsec interval for a given clocksource/adjustment
+ * pair and interval request.
+ *
+ * Unless you're the timekeeping code, you should not be using this!
+ */
+static inline void calculate_clocksource_interval(struct clocksource *c,
+						unsigned long length_nsec)
+{
+	u64 tmp;
+
+	/* XXX - All of this could use a whole lot of optimization */
+	tmp = length_nsec;
+	tmp <<= c->shift;
+	tmp += c->mult/2;
+	do_div(tmp, c->mult);
+
+	c->interval_cycles = (cycle_t)tmp;
+	if(c->interval_cycles == 0)
+		c->interval_cycles = 1;
+
+	c->interval_snsecs = (u64)c->interval_cycles * c->mult;
+}
+
+/* used to install a new clocksource */
+int register_clocksource(struct clocksource*);
+void reselect_clocksource(void);
+struct clocksource* get_next_clocksource(void);
+
+#endif /* _LINUX_CLOCKSOURCE_H */
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
new file mode 100644
index 000000000000..e1dfd8e86cce
--- /dev/null
+++ b/kernel/time/Makefile
@@ -0,0 +1 @@
+obj-y += clocksource.o jiffies.o
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
new file mode 100644
index 000000000000..95dd2200a109
--- /dev/null
+++ b/kernel/time/clocksource.c
@@ -0,0 +1,344 @@
+/*
+ * linux/kernel/time/clocksource.c
+ *
+ * This file contains the functions which manage clocksource drivers.
+ *
+ * Copyright (C) 2004, 2005 IBM, John Stultz (johnstul@us.ibm.com)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * TODO WishList:
+ *   o Allow clocksource drivers to be unregistered
+ *   o get rid of clocksource_jiffies extern
+ */
+
+#include <linux/clocksource.h>
+#include <linux/sysdev.h>
+#include <linux/init.h>
+#include <linux/module.h>
+
+/* XXX - Would like a better way for initializing curr_clocksource */
+extern struct clocksource clocksource_jiffies;
+
+/*[Clocksource internal variables]---------
+ * curr_clocksource:
+ *	currently selected clocksource. Initialized to clocksource_jiffies.
+ * next_clocksource:
+ *	pending next selected clocksource.
+ * clocksource_list:
+ *	linked list with the registered clocksources
+ * clocksource_lock:
+ *	protects manipulations to curr_clocksource and next_clocksource
+ *	and the clocksource_list
+ * override_name:
+ *	Name of the user-specified clocksource.
+ */
+static struct clocksource *curr_clocksource = &clocksource_jiffies;
+static struct clocksource *next_clocksource;
+static LIST_HEAD(clocksource_list);
+static DEFINE_SPINLOCK(clocksource_lock);
+static char override_name[32];
+static int finished_booting;
+
+/* clocksource_done_booting - Called near the end of bootup
+ *
+ * Hack to avoid lots of clocksource churn at boot time
+ */
+static int clocksource_done_booting(void)
+{
+	finished_booting = 1;
+	return 0;
+}
+
+late_initcall(clocksource_done_booting);
+
+/**
+ * get_next_clocksource - Returns the selected clocksource
+ *
+ */
+struct clocksource *get_next_clocksource(void)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&clocksource_lock, flags);
+	if (next_clocksource && finished_booting) {
+		curr_clocksource = next_clocksource;
+		next_clocksource = NULL;
+	}
+	spin_unlock_irqrestore(&clocksource_lock, flags);
+
+	return curr_clocksource;
+}
+
+/**
+ * select_clocksource - Finds the best registered clocksource.
+ *
+ * Private function. Must hold clocksource_lock when called.
+ *
+ * Looks through the list of registered clocksources, returning
+ * the one with the highest rating value. If there is a clocksource
+ * name that matches the override string, it returns that clocksource.
+ */
+static struct clocksource *select_clocksource(void)
+{
+	struct clocksource *best = NULL;
+	struct list_head *tmp;
+
+	list_for_each(tmp, &clocksource_list) {
+		struct clocksource *src;
+
+		src = list_entry(tmp, struct clocksource, list);
+		if (!best)
+			best = src;
+
+		/* check for override: */
+		if (strlen(src->name) == strlen(override_name) &&
+		    !strcmp(src->name, override_name)) {
+			best = src;
+			break;
+		}
+		/* pick the highest rating: */
+		if (src->rating > best->rating)
+		 	best = src;
+	}
+
+	return best;
+}
+
+/**
+ * is_registered_source - Checks if clocksource is registered
+ * @c:		pointer to a clocksource
+ *
+ * Private helper function. Must hold clocksource_lock when called.
+ *
+ * Returns one if the clocksource is already registered, zero otherwise.
+ */
+static int is_registered_source(struct clocksource *c)
+{
+	int len = strlen(c->name);
+	struct list_head *tmp;
+
+	list_for_each(tmp, &clocksource_list) {
+		struct clocksource *src;
+
+		src = list_entry(tmp, struct clocksource, list);
+		if (strlen(src->name) == len &&	!strcmp(src->name, c->name))
+			return 1;
+	}
+
+	return 0;
+}
+
+/**
+ * register_clocksource - Used to install new clocksources
+ * @t:		clocksource to be registered
+ *
+ * Returns -EBUSY if registration fails, zero otherwise.
+ */
+int register_clocksource(struct clocksource *c)
+{
+	int ret = 0;
+	unsigned long flags;
+
+	spin_lock_irqsave(&clocksource_lock, flags);
+	/* check if clocksource is already registered */
+	if (is_registered_source(c)) {
+		printk("register_clocksource: Cannot register %s. "
+			"Already registered!", c->name);
+		ret = -EBUSY;
+	} else {
+		/* register it */
+ 		list_add(&c->list, &clocksource_list);
+		/* scan the registered clocksources, and pick the best one */
+		next_clocksource = select_clocksource();
+	}
+	spin_unlock_irqrestore(&clocksource_lock, flags);
+	return ret;
+}
+
+EXPORT_SYMBOL(register_clocksource);
+
+/**
+ * reselect_clocksource - Rescan list for next clocksource
+ *
+ * A quick helper function to be used if a clocksource changes its
+ * rating. Forces the clocksource list to be re-scaned for the best
+ * clocksource.
+ */
+void reselect_clocksource(void)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&clocksource_lock, flags);
+	next_clocksource = select_clocksource();
+	spin_unlock_irqrestore(&clocksource_lock, flags);
+}
+
+/**
+ * sysfs_show_current_clocksources - sysfs interface for current clocksource
+ * @dev:	unused
+ * @buf:	char buffer to be filled with clocksource list
+ *
+ * Provides sysfs interface for listing current clocksource.
+ */
+static ssize_t
+sysfs_show_current_clocksources(struct sys_device *dev, char *buf)
+{
+	char *curr = buf;
+
+	spin_lock_irq(&clocksource_lock);
+	curr += sprintf(curr, "%s ", curr_clocksource->name);
+	spin_unlock_irq(&clocksource_lock);
+
+	curr += sprintf(curr, "\n");
+
+	return curr - buf;
+}
+
+/**
+ * sysfs_override_clocksource - interface for manually overriding clocksource
+ * @dev:	unused
+ * @buf:	name of override clocksource
+ * @count:	length of buffer
+ *
+ * Takes input from sysfs interface for manually overriding the default
+ * clocksource selction.
+ */
+static ssize_t sysfs_override_clocksource(struct sys_device *dev,
+					  const char *buf, size_t count)
+{
+	size_t ret = count;
+	/* strings from sysfs write are not 0 terminated! */
+	if (count >= sizeof(override_name))
+		return -EINVAL;
+
+	/* strip of \n: */
+	if (buf[count-1] == '\n')
+		count--;
+	if (count < 1)
+		return -EINVAL;
+
+	spin_lock_irq(&clocksource_lock);
+
+	/* copy the name given: */
+	memcpy(override_name, buf, count);
+	override_name[count] = 0;
+
+	/* try to select it: */
+	next_clocksource = select_clocksource();
+
+	spin_unlock_irq(&clocksource_lock);
+
+	return ret;
+}
+
+/**
+ * sysfs_show_available_clocksources - sysfs interface for listing clocksource
+ * @dev:	unused
+ * @buf:	char buffer to be filled with clocksource list
+ *
+ * Provides sysfs interface for listing registered clocksources
+ */
+static ssize_t
+sysfs_show_available_clocksources(struct sys_device *dev, char *buf)
+{
+	struct list_head *tmp;
+	char *curr = buf;
+
+	spin_lock_irq(&clocksource_lock);
+	list_for_each(tmp, &clocksource_list) {
+		struct clocksource *src;
+
+		src = list_entry(tmp, struct clocksource, list);
+		curr += sprintf(curr, "%s ", src->name);
+	}
+	spin_unlock_irq(&clocksource_lock);
+
+	curr += sprintf(curr, "\n");
+
+	return curr - buf;
+}
+
+/*
+ * Sysfs setup bits:
+ */
+static SYSDEV_ATTR(current_clocksource, 0600, sysfs_show_current_clocksources,
+			sysfs_override_clocksource);
+
+static SYSDEV_ATTR(available_clocksource, 0600,
+			sysfs_show_available_clocksources, NULL);
+
+static struct sysdev_class clocksource_sysclass = {
+	set_kset_name("clocksource"),
+};
+
+static struct sys_device device_clocksource = {
+	.id	= 0,
+	.cls	= &clocksource_sysclass,
+};
+
+static int init_clocksource_sysfs(void)
+{
+	int error = sysdev_class_register(&clocksource_sysclass);
+
+	if (!error)
+		error = sysdev_register(&device_clocksource);
+	if (!error)
+		error = sysdev_create_file(
+				&device_clocksource,
+				&attr_current_clocksource);
+	if (!error)
+		error = sysdev_create_file(
+				&device_clocksource,
+				&attr_available_clocksource);
+	return error;
+}
+
+device_initcall(init_clocksource_sysfs);
+
+/**
+ * boot_override_clocksource - boot clock override
+ * @str:	override name
+ *
+ * Takes a clocksource= boot argument and uses it
+ * as the clocksource override name.
+ */
+static int __init boot_override_clocksource(char* str)
+{
+	unsigned long flags;
+	spin_lock_irqsave(&clocksource_lock, flags);
+	if (str)
+		strlcpy(override_name, str, sizeof(override_name));
+	spin_unlock_irqrestore(&clocksource_lock, flags);
+	return 1;
+}
+
+__setup("clocksource=", boot_override_clocksource);
+
+/**
+ * boot_override_clock - Compatibility layer for deprecated boot option
+ * @str:	override name
+ *
+ * DEPRECATED! Takes a clock= boot argument and uses it
+ * as the clocksource override name
+ */
+static int __init boot_override_clock(char* str)
+{
+	printk("Warning! clock= boot option is deprecated.\n");
+
+	return boot_override_clocksource(str);
+}
+
+__setup("clock=", boot_override_clock);
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
new file mode 100644
index 000000000000..1fe8376e717b
--- /dev/null
+++ b/kernel/time/jiffies.c
@@ -0,0 +1,73 @@
+/***********************************************************************
+* linux/kernel/time/jiffies.c
+*
+* This file contains the jiffies based clocksource.
+*
+* Copyright (C) 2004, 2005 IBM, John Stultz (johnstul@us.ibm.com)
+*
+* This program is free software; you can redistribute it and/or modify
+* it under the terms of the GNU General Public License as published by
+* the Free Software Foundation; either version 2 of the License, or
+* (at your option) any later version.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+* GNU General Public License for more details.
+*
+* You should have received a copy of the GNU General Public License
+* along with this program; if not, write to the Free Software
+* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+*
+************************************************************************/
+#include <linux/clocksource.h>
+#include <linux/jiffies.h>
+#include <linux/init.h>
+
+/* The Jiffies based clocksource is the lowest common
+ * denominator clock source which should function on
+ * all systems. It has the same coarse resolution as
+ * the timer interrupt frequency HZ and it suffers
+ * inaccuracies caused by missed or lost timer
+ * interrupts and the inability for the timer
+ * interrupt hardware to accuratly tick at the
+ * requested HZ value. It is also not reccomended
+ * for "tick-less" systems.
+ */
+#define NSEC_PER_JIFFY	((u32)((((u64)NSEC_PER_SEC)<<8)/ACTHZ))
+
+/* Since jiffies uses a simple NSEC_PER_JIFFY multiplier
+ * conversion, the .shift value could be zero. However
+ * this would make NTP adjustments impossible as they are
+ * in units of 1/2^.shift. Thus we use JIFFIES_SHIFT to
+ * shift both the nominator and denominator the same
+ * amount, and give ntp adjustments in units of 1/2^8
+ *
+ * The value 8 is somewhat carefully chosen, as anything
+ * larger can result in overflows. NSEC_PER_JIFFY grows as
+ * HZ shrinks, so values greater then 8 overflow 32bits when
+ * HZ=100.
+ */
+#define JIFFIES_SHIFT	8
+
+static cycle_t jiffies_read(void)
+{
+	return (cycle_t) jiffies;
+}
+
+struct clocksource clocksource_jiffies = {
+	.name		= "jiffies",
+	.rating		= 0, /* lowest rating*/
+	.read		= jiffies_read,
+	.mask		= 0xffffffff, /*32bits*/
+	.mult		= NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */
+	.shift		= JIFFIES_SHIFT,
+	.is_continuous	= 0, /* tick based, not free running */
+};
+
+static int __init init_jiffies_clocksource(void)
+{
+	return register_clocksource(&clocksource_jiffies);
+}
+
+module_init(init_jiffies_clocksource);
-- 
cgit v1.2.3


From ad596171ed635c51a9eef829187af100cbf8dcf7 Mon Sep 17 00:00:00 2001
From: john stultz <johnstul@us.ibm.com>
Date: Mon, 26 Jun 2006 00:25:06 -0700
Subject: [PATCH] Time: Use clocksource infrastructure for update_wall_time

Modify the update_wall_time function so it increments time using the
clocksource abstraction instead of jiffies.  Since the only clocksource driver
currently provided is the jiffies clocksource, this should result in no
functional change.  Additionally, a timekeeping_init and timekeeping_resume
function has been added to initialize and maintain some of the new timekeping
state.

[hirofumi@mail.parknet.co.jp: fixlet]
Signed-off-by: John Stultz <johnstul@us.ibm.com>
Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/time.h      |  2 +
 init/main.c               |  1 +
 kernel/Makefile           |  1 +
 kernel/time/clocksource.c |  4 +-
 kernel/timer.c            | 93 ++++++++++++++++++++++++++++++++++++++++-------
 5 files changed, 86 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/time.h b/include/linux/time.h
index 0cd696cee998..88d3b812841e 100644
--- a/include/linux/time.h
+++ b/include/linux/time.h
@@ -77,6 +77,8 @@ extern struct timespec xtime;
 extern struct timespec wall_to_monotonic;
 extern seqlock_t xtime_lock;
 
+void timekeeping_init(void);
+
 static inline unsigned long get_seconds(void)
 {
 	return xtime.tv_sec;
diff --git a/init/main.c b/init/main.c
index f715b9b89753..9a970d317ea5 100644
--- a/init/main.c
+++ b/init/main.c
@@ -490,6 +490,7 @@ asmlinkage void __init start_kernel(void)
 	hrtimers_init();
 	softirq_init();
 	time_init();
+	timekeeping_init();
 
 	/*
 	 * HACK ALERT! This is early. We're enabling the console before
diff --git a/kernel/Makefile b/kernel/Makefile
index f6ef00f4f90f..bc4b8a7161ff 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -10,6 +10,7 @@ obj-y     = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
 	    kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
 	    hrtimer.o
 
+obj-y += time/
 obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
 obj-$(CONFIG_FUTEX) += futex.o
 ifeq ($(CONFIG_COMPAT),y)
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 95dd2200a109..4288bfa12c3f 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -56,7 +56,7 @@ static int finished_booting;
  *
  * Hack to avoid lots of clocksource churn at boot time
  */
-static int clocksource_done_booting(void)
+static int __init clocksource_done_booting(void)
 {
 	finished_booting = 1;
 	return 0;
@@ -289,7 +289,7 @@ static struct sys_device device_clocksource = {
 	.cls	= &clocksource_sysclass,
 };
 
-static int init_clocksource_sysfs(void)
+static int __init init_clocksource_sysfs(void)
 {
 	int error = sysdev_class_register(&clocksource_sysclass);
 
diff --git a/kernel/timer.c b/kernel/timer.c
index eb97371b87d8..524c7f638365 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -792,24 +792,93 @@ u64 current_tick_length(void)
 	return ((u64) delta_nsec << (SHIFT_SCALE - 10)) + time_adj;
 }
 
+/* XXX - all of this timekeeping code should be later moved to time.c */
+#include <linux/clocksource.h>
+static struct clocksource *clock; /* pointer to current clocksource */
+static cycle_t last_clock_cycle;  /* cycle value at last update_wall_time */
 /*
- * Using a loop looks inefficient, but "ticks" is
- * usually just one (we shouldn't be losing ticks,
- * we're doing this this way mainly for interrupt
- * latency reasons, not because we think we'll
- * have lots of lost timer ticks
+ * timekeeping_init - Initializes the clocksource and common timekeeping values
  */
-static void update_wall_time(unsigned long ticks)
+void __init timekeeping_init(void)
 {
-	do {
-		ticks--;
+	unsigned long flags;
+
+	write_seqlock_irqsave(&xtime_lock, flags);
+	clock = get_next_clocksource();
+	calculate_clocksource_interval(clock, tick_nsec);
+	last_clock_cycle = read_clocksource(clock);
+	ntp_clear();
+	write_sequnlock_irqrestore(&xtime_lock, flags);
+}
+
+
+/*
+ * timekeeping_resume - Resumes the generic timekeeping subsystem.
+ * @dev:	unused
+ *
+ * This is for the generic clocksource timekeeping.
+ * xtime/wall_to_monotonic/jiffies/wall_jiffies/etc are
+ * still managed by arch specific suspend/resume code.
+ */
+static int timekeeping_resume(struct sys_device *dev)
+{
+	unsigned long flags;
+
+	write_seqlock_irqsave(&xtime_lock, flags);
+	/* restart the last cycle value */
+	last_clock_cycle = read_clocksource(clock);
+	write_sequnlock_irqrestore(&xtime_lock, flags);
+	return 0;
+}
+
+/* sysfs resume/suspend bits for timekeeping */
+static struct sysdev_class timekeeping_sysclass = {
+	.resume		= timekeeping_resume,
+	set_kset_name("timekeeping"),
+};
+
+static struct sys_device device_timer = {
+	.id		= 0,
+	.cls		= &timekeeping_sysclass,
+};
+
+static int __init timekeeping_init_device(void)
+{
+	int error = sysdev_class_register(&timekeeping_sysclass);
+	if (!error)
+		error = sysdev_register(&device_timer);
+	return error;
+}
+
+device_initcall(timekeeping_init_device);
+
+/*
+ * update_wall_time - Uses the current clocksource to increment the wall time
+ *
+ * Called from the timer interrupt, must hold a write on xtime_lock.
+ */
+static void update_wall_time(void)
+{
+	cycle_t now, offset;
+
+	now = read_clocksource(clock);
+	offset = (now - last_clock_cycle)&clock->mask;
+
+	/* normally this loop will run just once, however in the
+	 * case of lost or late ticks, it will accumulate correctly.
+	 */
+	while (offset > clock->interval_cycles) {
+		/* accumulate one interval */
+		last_clock_cycle += clock->interval_cycles;
+		offset -= clock->interval_cycles;
+
 		update_wall_time_one_tick();
 		if (xtime.tv_nsec >= 1000000000) {
 			xtime.tv_nsec -= 1000000000;
 			xtime.tv_sec++;
 			second_overflow();
 		}
-	} while (ticks);
+	}
 }
 
 /*
@@ -915,10 +984,8 @@ static inline void update_times(void)
 	unsigned long ticks;
 
 	ticks = jiffies - wall_jiffies;
-	if (ticks) {
-		wall_jiffies += ticks;
-		update_wall_time(ticks);
-	}
+	wall_jiffies += ticks;
+	update_wall_time();
 	calc_load(ticks);
 }
   
-- 
cgit v1.2.3


From 260a42309b31cbc54eb4b6b85649e412bcad053f Mon Sep 17 00:00:00 2001
From: john stultz <johnstul@us.ibm.com>
Date: Mon, 26 Jun 2006 00:25:07 -0700
Subject: [PATCH] Time: Let user request precision from current_tick_length()

Change the current_tick_length() function so it takes an argument which
specifies how much precision to return in shifted nanoseconds.  This provides
a simple way to convert between NTPs internal nanoseconds shifted by
(SHIFT_SCALE - 10) to other shifted nanosecond units that are used by the
clocksource abstraction.

Signed-off-by: John Stultz <johnstul@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/powerpc/kernel/time.c |  2 +-
 include/linux/timex.h      |  2 +-
 kernel/timer.c             | 21 +++++++++++++++++----
 3 files changed, 19 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index d20907561f46..742f07a63161 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -534,7 +534,7 @@ static __inline__ void timer_recalc_offset(u64 cur_tb)
 
 	if (__USE_RTC())
 		return;
-	tlen = current_tick_length();
+	tlen = current_tick_length(SHIFT_SCALE - 10);
 	offset = cur_tb - do_gtod.varp->tb_orig_stamp;
 	if (tlen == last_tick_len && offset < 0x80000000u)
 		return;
diff --git a/include/linux/timex.h b/include/linux/timex.h
index 34d3ccff7bbb..1ba3071fcb82 100644
--- a/include/linux/timex.h
+++ b/include/linux/timex.h
@@ -304,7 +304,7 @@ time_interpolator_reset(void)
 #endif /* !CONFIG_TIME_INTERPOLATION */
 
 /* Returns how long ticks are at present, in ns / 2^(SHIFT_SCALE-10). */
-extern u64 current_tick_length(void);
+extern u64 current_tick_length(long);
 
 extern int do_adjtimex(struct timex *);
 
diff --git a/kernel/timer.c b/kernel/timer.c
index 524c7f638365..623f9ea198d8 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -780,16 +780,29 @@ static void update_wall_time_one_tick(void)
  * Return how long ticks are at the moment, that is, how much time
  * update_wall_time_one_tick will add to xtime next time we call it
  * (assuming no calls to do_adjtimex in the meantime).
- * The return value is in fixed-point nanoseconds with SHIFT_SCALE-10
- * bits to the right of the binary point.
+ * The return value is in fixed-point nanoseconds shifted by the
+ * specified number of bits to the right of the binary point.
  * This function has no side-effects.
  */
-u64 current_tick_length(void)
+u64 current_tick_length(long shift)
 {
 	long delta_nsec;
+	u64 ret;
 
+	/* calculate the finest interval NTP will allow.
+	 *    ie: nanosecond value shifted by (SHIFT_SCALE - 10)
+	 */
 	delta_nsec = tick_nsec + adjtime_adjustment() * 1000;
-	return ((u64) delta_nsec << (SHIFT_SCALE - 10)) + time_adj;
+	ret = ((u64) delta_nsec << (SHIFT_SCALE - 10)) + time_adj;
+
+	/* convert from (SHIFT_SCALE - 10) to specified shift scale: */
+	shift = shift - (SHIFT_SCALE - 10);
+	if (shift < 0)
+		ret >>= -shift;
+	else
+		ret <<= shift;
+
+	return ret;
 }
 
 /* XXX - all of this timekeeping code should be later moved to time.c */
-- 
cgit v1.2.3


From 5eb6d20533d14a432df714520939a6181e28f099 Mon Sep 17 00:00:00 2001
From: john stultz <johnstul@us.ibm.com>
Date: Mon, 26 Jun 2006 00:25:07 -0700
Subject: [PATCH] Time: Use clocksource abstraction for NTP adjustments

Instead of incrementing xtime by tick_nsec + ntp adjustments, use the
clocksource abstraction to increment and scale time.  Using the clocksource
abstraction allows other clocksources to be used consistently in the face of
late or lost ticks, while preserving the existing behavior via the jiffies
clocksource.

This removes the need to keep time_phase adjustments as we just use the
current_tick_length() function as the NTP interface and accumulate time using
shifted nanoseconds.

The basics of this design was by Roman Zippel, however it is my own
interpretation and implementation, so the credit should go to him and the
blame to me.

Signed-off-by: John Stultz <johnstul@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/clocksource.h | 97 +++++++++++++++++++++++++++++++++++++++++++++
 kernel/timer.c              | 47 +++++++++++++---------
 2 files changed, 125 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h
index c4187cda0ee4..c4739c4e3039 100644
--- a/include/linux/clocksource.h
+++ b/include/linux/clocksource.h
@@ -173,6 +173,103 @@ static inline void calculate_clocksource_interval(struct clocksource *c,
 	c->interval_snsecs = (u64)c->interval_cycles * c->mult;
 }
 
+
+/**
+ * error_aproximation - calculates an error adjustment for a given error
+ *
+ * @error:	Error value (unsigned)
+ * @unit:	Adjustment unit
+ *
+ * For a given error value, this function takes the adjustment unit
+ * and uses binary approximation to return a power of two adjustment value.
+ *
+ * This function is only for use by the the make_ntp_adj() function
+ * and you must hold a write on the xtime_lock when calling.
+ */
+static inline int error_aproximation(u64 error, u64 unit)
+{
+	static int saved_adj = 0;
+	u64 adjusted_unit = unit << saved_adj;
+
+	if (error > (adjusted_unit * 2)) {
+		/* large error, so increment the adjustment factor */
+		saved_adj++;
+	} else if (error > adjusted_unit) {
+		/* just right, don't touch it */
+	} else if (saved_adj) {
+		/* small error, so drop the adjustment factor */
+		saved_adj--;
+		return 0;
+	}
+
+	return saved_adj;
+}
+
+
+/**
+ * make_ntp_adj - Adjusts the specified clocksource for a given error
+ *
+ * @clock:		Pointer to clock to be adjusted
+ * @cycles_delta:	Current unacounted cycle delta
+ * @error:		Pointer to current error value
+ *
+ * Returns clock shifted nanosecond adjustment to be applied against
+ * the accumulated time value (ie: xtime).
+ *
+ * If the error value is large enough, this function calulates the
+ * (power of two) adjustment value, and adjusts the clock's mult and
+ * interval_snsecs values accordingly.
+ *
+ * However, since there may be some unaccumulated cycles, to avoid
+ * time inconsistencies we must adjust the accumulation value
+ * accordingly.
+ *
+ * This is not very intuitive, so the following proof should help:
+ * The basic timeofday algorithm:  base + cycle * mult
+ * Thus:
+ *    new_base + cycle * new_mult = old_base + cycle * old_mult
+ *    new_base = old_base + cycle * old_mult - cycle * new_mult
+ *    new_base = old_base + cycle * (old_mult - new_mult)
+ *    new_base - old_base = cycle * (old_mult - new_mult)
+ *    base_delta = cycle * (old_mult - new_mult)
+ *    base_delta = cycle * (mult_delta)
+ *
+ * Where mult_delta is the adjustment value made to mult
+ *
+ */
+static inline s64 make_ntp_adj(struct clocksource *clock,
+				cycles_t cycles_delta, s64* error)
+{
+	s64 ret = 0;
+	if (*error  > ((s64)clock->interval_cycles+1)/2) {
+		/* calculate adjustment value */
+		int adjustment = error_aproximation(*error,
+						clock->interval_cycles);
+		/* adjust clock */
+		clock->mult += 1 << adjustment;
+		clock->interval_snsecs += clock->interval_cycles << adjustment;
+
+		/* adjust the base and error for the adjustment */
+		ret =  -(cycles_delta << adjustment);
+		*error -= clock->interval_cycles << adjustment;
+		/* XXX adj error for cycle_delta offset? */
+	} else if ((-(*error))  > ((s64)clock->interval_cycles+1)/2) {
+		/* calculate adjustment value */
+		int adjustment = error_aproximation(-(*error),
+						clock->interval_cycles);
+		/* adjust clock */
+		clock->mult -= 1 << adjustment;
+		clock->interval_snsecs -= clock->interval_cycles << adjustment;
+
+		/* adjust the base and error for the adjustment */
+		ret =  cycles_delta << adjustment;
+		*error += clock->interval_cycles << adjustment;
+		/* XXX adj error for cycle_delta offset? */
+	}
+	return ret;
+}
+
+
 /* used to install a new clocksource */
 int register_clocksource(struct clocksource*);
 void reselect_clocksource(void);
diff --git a/kernel/timer.c b/kernel/timer.c
index 623f9ea198d8..6811436a031d 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -597,7 +597,6 @@ long time_tolerance = MAXFREQ;		/* frequency tolerance (ppm)	*/
 long time_precision = 1;		/* clock precision (us)		*/
 long time_maxerror = NTP_PHASE_LIMIT;	/* maximum error (us)		*/
 long time_esterror = NTP_PHASE_LIMIT;	/* estimated error (us)		*/
-static long time_phase;			/* phase offset (scaled us)	*/
 long time_freq = (((NSEC_PER_SEC + HZ/2) % HZ - HZ/2) << SHIFT_USEC) / NSEC_PER_USEC;
 					/* frequency offset (scaled ppm)*/
 static long time_adj;			/* tick adjust (scaled 1 / HZ)	*/
@@ -747,27 +746,14 @@ static long adjtime_adjustment(void)
 }
 
 /* in the NTP reference this is called "hardclock()" */
-static void update_wall_time_one_tick(void)
+static void update_ntp_one_tick(void)
 {
-	long time_adjust_step, delta_nsec;
+	long time_adjust_step;
 
 	time_adjust_step = adjtime_adjustment();
 	if (time_adjust_step)
 		/* Reduce by this step the amount of time left  */
 		time_adjust -= time_adjust_step;
-	delta_nsec = tick_nsec + time_adjust_step * 1000;
-	/*
-	 * Advance the phase, once it gets to one microsecond, then
-	 * advance the tick more.
-	 */
-	time_phase += time_adj;
-	if ((time_phase >= FINENSEC) || (time_phase <= -FINENSEC)) {
-		long ltemp = shift_right(time_phase, (SHIFT_SCALE - 10));
-		time_phase -= ltemp << (SHIFT_SCALE - 10);
-		delta_nsec += ltemp;
-	}
-	xtime.tv_nsec += delta_nsec;
-	time_interpolator_update(delta_nsec);
 
 	/* Changes by adjtime() do not take effect till next tick. */
 	if (time_next_adjust != 0) {
@@ -872,8 +858,13 @@ device_initcall(timekeeping_init_device);
  */
 static void update_wall_time(void)
 {
+	static s64 remainder_snsecs, error;
+	s64 snsecs_per_sec;
 	cycle_t now, offset;
 
+	snsecs_per_sec = (s64)NSEC_PER_SEC << clock->shift;
+	remainder_snsecs += (s64)xtime.tv_nsec << clock->shift;
+
 	now = read_clocksource(clock);
 	offset = (now - last_clock_cycle)&clock->mask;
 
@@ -881,17 +872,35 @@ static void update_wall_time(void)
 	 * case of lost or late ticks, it will accumulate correctly.
 	 */
 	while (offset > clock->interval_cycles) {
+		/* get the ntp interval in clock shifted nanoseconds */
+		s64 ntp_snsecs	= current_tick_length(clock->shift);
+
 		/* accumulate one interval */
+		remainder_snsecs += clock->interval_snsecs;
 		last_clock_cycle += clock->interval_cycles;
 		offset -= clock->interval_cycles;
 
-		update_wall_time_one_tick();
-		if (xtime.tv_nsec >= 1000000000) {
-			xtime.tv_nsec -= 1000000000;
+		/* interpolator bits */
+		time_interpolator_update(clock->interval_snsecs
+						>> clock->shift);
+		/* increment the NTP state machine */
+		update_ntp_one_tick();
+
+		/* accumulate error between NTP and clock interval */
+		error += (ntp_snsecs - (s64)clock->interval_snsecs);
+
+		/* correct the clock when NTP error is too big */
+		remainder_snsecs += make_ntp_adj(clock, offset, &error);
+
+		if (remainder_snsecs >= snsecs_per_sec) {
+			remainder_snsecs -= snsecs_per_sec;
 			xtime.tv_sec++;
 			second_overflow();
 		}
 	}
+	/* store full nanoseconds into xtime */
+	xtime.tv_nsec = remainder_snsecs >> clock->shift;
+	remainder_snsecs -= (s64)xtime.tv_nsec << clock->shift;
 }
 
 /*
-- 
cgit v1.2.3


From cf3c769b4b0dd1146da84d5cf045dcfe53bd0f13 Mon Sep 17 00:00:00 2001
From: john stultz <johnstul@us.ibm.com>
Date: Mon, 26 Jun 2006 00:25:08 -0700
Subject: [PATCH] Time: Introduce arch generic time accessors

Introduces clocksource switching code and the arch generic time accessor
functions that use the clocksource infrastructure.

Signed-off-by: John Stultz <johnstul@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/time.h |  15 +++++
 kernel/time.c        |   2 +
 kernel/timer.c       | 170 +++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 187 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/time.h b/include/linux/time.h
index 88d3b812841e..65dd85b2105e 100644
--- a/include/linux/time.h
+++ b/include/linux/time.h
@@ -102,6 +102,7 @@ extern int do_getitimer(int which, struct itimerval *value);
 extern void getnstimeofday(struct timespec *tv);
 
 extern struct timespec timespec_trunc(struct timespec t, unsigned gran);
+extern int timekeeping_is_continuous(void);
 
 /**
  * timespec_to_ns - Convert timespec to nanoseconds
@@ -144,6 +145,20 @@ extern struct timespec ns_to_timespec(const s64 nsec);
  */
 extern struct timeval ns_to_timeval(const s64 nsec);
 
+/**
+ * timespec_add_ns - Adds nanoseconds to a timespec
+ * @a:		pointer to timespec to be incremented
+ * @ns:		unsigned nanoseconds value to be added
+ */
+static inline void timespec_add_ns(struct timespec *a, u64 ns)
+{
+	ns += a->tv_nsec;
+	while(unlikely(ns >= NSEC_PER_SEC)) {
+		ns -= NSEC_PER_SEC;
+		a->tv_sec++;
+	}
+	a->tv_nsec = ns;
+}
 #endif /* __KERNEL__ */
 
 #define NFDBITS			__NFDBITS
diff --git a/kernel/time.c b/kernel/time.c
index b00ddc71cedb..5bd489747643 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -523,6 +523,7 @@ EXPORT_SYMBOL(do_gettimeofday);
 
 
 #else
+#ifndef CONFIG_GENERIC_TIME
 /*
  * Simulate gettimeofday using do_gettimeofday which only allows a timeval
  * and therefore only yields usec accuracy
@@ -537,6 +538,7 @@ void getnstimeofday(struct timespec *tv)
 }
 EXPORT_SYMBOL_GPL(getnstimeofday);
 #endif
+#endif
 
 /* Converts Gregorian date to seconds since 1970-01-01 00:00:00.
  * Assumes input in normal date format, i.e. 1980-12-31 23:59:59
diff --git a/kernel/timer.c b/kernel/timer.c
index 6811436a031d..e5adb9e2e7a7 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -795,6 +795,169 @@ u64 current_tick_length(long shift)
 #include <linux/clocksource.h>
 static struct clocksource *clock; /* pointer to current clocksource */
 static cycle_t last_clock_cycle;  /* cycle value at last update_wall_time */
+
+#ifdef CONFIG_GENERIC_TIME
+/**
+ * __get_nsec_offset - Returns nanoseconds since last call to periodic_hook
+ *
+ * private function, must hold xtime_lock lock when being
+ * called. Returns the number of nanoseconds since the
+ * last call to update_wall_time() (adjusted by NTP scaling)
+ */
+static inline s64 __get_nsec_offset(void)
+{
+	cycle_t cycle_now, cycle_delta;
+	s64 ns_offset;
+
+	/* read clocksource: */
+	cycle_now = read_clocksource(clock);
+
+	/* calculate the delta since the last update_wall_time: */
+	cycle_delta = (cycle_now - last_clock_cycle) & clock->mask;
+
+	/* convert to nanoseconds: */
+	ns_offset = cyc2ns(clock, cycle_delta);
+
+	return ns_offset;
+}
+
+/**
+ * __get_realtime_clock_ts - Returns the time of day in a timespec
+ * @ts:		pointer to the timespec to be set
+ *
+ * Returns the time of day in a timespec. Used by
+ * do_gettimeofday() and get_realtime_clock_ts().
+ */
+static inline void __get_realtime_clock_ts(struct timespec *ts)
+{
+	unsigned long seq;
+	s64 nsecs;
+
+	do {
+		seq = read_seqbegin(&xtime_lock);
+
+		*ts = xtime;
+		nsecs = __get_nsec_offset();
+
+	} while (read_seqretry(&xtime_lock, seq));
+
+	timespec_add_ns(ts, nsecs);
+}
+
+/**
+ * get_realtime_clock_ts - Returns the time of day in a timespec
+ * @ts:		pointer to the timespec to be set
+ *
+ * Returns the time of day in a timespec.
+ */
+void getnstimeofday(struct timespec *ts)
+{
+	__get_realtime_clock_ts(ts);
+}
+
+EXPORT_SYMBOL(getnstimeofday);
+
+/**
+ * do_gettimeofday - Returns the time of day in a timeval
+ * @tv:		pointer to the timeval to be set
+ *
+ * NOTE: Users should be converted to using get_realtime_clock_ts()
+ */
+void do_gettimeofday(struct timeval *tv)
+{
+	struct timespec now;
+
+	__get_realtime_clock_ts(&now);
+	tv->tv_sec = now.tv_sec;
+	tv->tv_usec = now.tv_nsec/1000;
+}
+
+EXPORT_SYMBOL(do_gettimeofday);
+/**
+ * do_settimeofday - Sets the time of day
+ * @tv:		pointer to the timespec variable containing the new time
+ *
+ * Sets the time of day to the new time and update NTP and notify hrtimers
+ */
+int do_settimeofday(struct timespec *tv)
+{
+	unsigned long flags;
+	time_t wtm_sec, sec = tv->tv_sec;
+	long wtm_nsec, nsec = tv->tv_nsec;
+
+	if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
+		return -EINVAL;
+
+	write_seqlock_irqsave(&xtime_lock, flags);
+
+	nsec -= __get_nsec_offset();
+
+	wtm_sec  = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec);
+	wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec);
+
+	set_normalized_timespec(&xtime, sec, nsec);
+	set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
+
+	ntp_clear();
+
+	write_sequnlock_irqrestore(&xtime_lock, flags);
+
+	/* signal hrtimers about time change */
+	clock_was_set();
+
+	return 0;
+}
+
+EXPORT_SYMBOL(do_settimeofday);
+
+/**
+ * change_clocksource - Swaps clocksources if a new one is available
+ *
+ * Accumulates current time interval and initializes new clocksource
+ */
+static int change_clocksource(void)
+{
+	struct clocksource *new;
+	cycle_t now;
+	u64 nsec;
+	new = get_next_clocksource();
+	if (clock != new) {
+		now = read_clocksource(new);
+		nsec =  __get_nsec_offset();
+		timespec_add_ns(&xtime, nsec);
+
+		clock = new;
+		last_clock_cycle = now;
+		printk(KERN_INFO "Time: %s clocksource has been installed.\n",
+					clock->name);
+		return 1;
+	} else if (clock->update_callback) {
+		return clock->update_callback();
+	}
+	return 0;
+}
+#else
+#define change_clocksource() (0)
+#endif
+
+/**
+ * timeofday_is_continuous - check to see if timekeeping is free running
+ */
+int timekeeping_is_continuous(void)
+{
+	unsigned long seq;
+	int ret;
+
+	do {
+		seq = read_seqbegin(&xtime_lock);
+
+		ret = clock->is_continuous;
+
+	} while (read_seqretry(&xtime_lock, seq));
+
+	return ret;
+}
+
 /*
  * timekeeping_init - Initializes the clocksource and common timekeeping values
  */
@@ -901,6 +1064,13 @@ static void update_wall_time(void)
 	/* store full nanoseconds into xtime */
 	xtime.tv_nsec = remainder_snsecs >> clock->shift;
 	remainder_snsecs -= (s64)xtime.tv_nsec << clock->shift;
+
+	/* check to see if there is a new clocksource to use */
+	if (change_clocksource()) {
+		error = 0;
+		remainder_snsecs = 0;
+		calculate_clocksource_interval(clock, tick_nsec);
+	}
 }
 
 /*
-- 
cgit v1.2.3


From a275254975a29c51929ee175b92ac471ac2a0043 Mon Sep 17 00:00:00 2001
From: john stultz <johnstul@us.ibm.com>
Date: Mon, 26 Jun 2006 00:25:14 -0700
Subject: [PATCH] time: rename clocksource functions

As suggested by Roman Zippel, change clocksource functions to use
clocksource_xyz rather then xyz_clocksource to avoid polluting the
namespace.

Signed-off-by: John Stultz <johnstul@us.ibm.com>
Cc: Roman Zippel <zippel@linux-m68k.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/hpet.c       |  2 +-
 arch/i386/kernel/i8253.c      |  2 +-
 arch/i386/kernel/tsc.c        |  4 ++--
 drivers/clocksource/acpi_pm.c |  2 +-
 drivers/clocksource/cyclone.c |  2 +-
 include/linux/clocksource.h   | 14 +++++++-------
 kernel/time/clocksource.c     | 16 ++++++++--------
 kernel/time/jiffies.c         |  2 +-
 kernel/timer.c                | 20 ++++++++++----------
 9 files changed, 32 insertions(+), 32 deletions(-)

(limited to 'include/linux')

diff --git a/arch/i386/kernel/hpet.c b/arch/i386/kernel/hpet.c
index 91a5bdd9f604..23e7d2c5d253 100644
--- a/arch/i386/kernel/hpet.c
+++ b/arch/i386/kernel/hpet.c
@@ -61,7 +61,7 @@ static int __init init_hpet_clocksource(void)
 	do_div(tmp, FSEC_PER_NSEC);
 	clocksource_hpet.mult = (u32)tmp;
 
-	return register_clocksource(&clocksource_hpet);
+	return clocksource_register(&clocksource_hpet);
 }
 
 module_init(init_hpet_clocksource);
diff --git a/arch/i386/kernel/i8253.c b/arch/i386/kernel/i8253.c
index a276bceade68..5b13739a7ff4 100644
--- a/arch/i386/kernel/i8253.c
+++ b/arch/i386/kernel/i8253.c
@@ -80,6 +80,6 @@ static int __init init_pit_clocksource(void)
 		return 0;
 
 	clocksource_pit.mult = clocksource_hz2mult(CLOCK_TICK_RATE, 20);
-	return register_clocksource(&clocksource_pit);
+	return clocksource_register(&clocksource_pit);
 }
 module_init(init_pit_clocksource);
diff --git a/arch/i386/kernel/tsc.c b/arch/i386/kernel/tsc.c
index 7713f86389af..1c3c927755de 100644
--- a/arch/i386/kernel/tsc.c
+++ b/arch/i386/kernel/tsc.c
@@ -351,7 +351,7 @@ static int tsc_update_callback(void)
 	/* check to see if we should switch to the safe clocksource: */
 	if (clocksource_tsc.rating != 50 && check_tsc_unstable()) {
 		clocksource_tsc.rating = 50;
-		reselect_clocksource();
+		clocksource_reselect();
 		change = 1;
 	}
 
@@ -469,7 +469,7 @@ static int __init init_tsc_clocksource(void)
 			jiffies + msecs_to_jiffies(TSC_FREQ_CHECK_INTERVAL);
 		add_timer(&verify_tsc_freq_timer);
 
-		return register_clocksource(&clocksource_tsc);
+		return clocksource_register(&clocksource_tsc);
 	}
 
 	return 0;
diff --git a/drivers/clocksource/acpi_pm.c b/drivers/clocksource/acpi_pm.c
index a0e5cde2fa71..9217be5048d5 100644
--- a/drivers/clocksource/acpi_pm.c
+++ b/drivers/clocksource/acpi_pm.c
@@ -171,7 +171,7 @@ static int __init init_acpi_pm_clocksource(void)
 	return -ENODEV;
 
 pm_good:
-	return register_clocksource(&clocksource_acpi_pm);
+	return clocksource_register(&clocksource_acpi_pm);
 }
 
 module_init(init_acpi_pm_clocksource);
diff --git a/drivers/clocksource/cyclone.c b/drivers/clocksource/cyclone.c
index 444eb11b9b4f..5906a0af825d 100644
--- a/drivers/clocksource/cyclone.c
+++ b/drivers/clocksource/cyclone.c
@@ -113,7 +113,7 @@ static int __init init_cyclone_clocksource(void)
 	clocksource_cyclone.mult = clocksource_hz2mult(CYCLONE_TIMER_FREQ,
 						clocksource_cyclone.shift);
 
-	return register_clocksource(&clocksource_cyclone);
+	return clocksource_register(&clocksource_cyclone);
 }
 
 module_init(init_cyclone_clocksource);
diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h
index c4739c4e3039..5f4a7f72f3ee 100644
--- a/include/linux/clocksource.h
+++ b/include/linux/clocksource.h
@@ -118,12 +118,12 @@ static inline u32 clocksource_hz2mult(u32 hz, u32 shift_constant)
 }
 
 /**
- * read_clocksource: - Access the clocksource's current cycle value
+ * clocksource_read: - Access the clocksource's current cycle value
  * @cs:		pointer to clocksource being read
  *
  * Uses the clocksource to return the current cycle_t value
  */
-static inline cycle_t read_clocksource(struct clocksource *cs)
+static inline cycle_t clocksource_read(struct clocksource *cs)
 {
 	return cs->read();
 }
@@ -145,7 +145,7 @@ static inline s64 cyc2ns(struct clocksource *cs, cycle_t cycles)
 }
 
 /**
- * calculate_clocksource_interval - Calculates a clocksource interval struct
+ * clocksource_calculate_interval - Calculates a clocksource interval struct
  *
  * @c:		Pointer to clocksource.
  * @length_nsec: Desired interval length in nanoseconds.
@@ -155,7 +155,7 @@ static inline s64 cyc2ns(struct clocksource *cs, cycle_t cycles)
  *
  * Unless you're the timekeeping code, you should not be using this!
  */
-static inline void calculate_clocksource_interval(struct clocksource *c,
+static inline void clocksource_calculate_interval(struct clocksource *c,
 						unsigned long length_nsec)
 {
 	u64 tmp;
@@ -271,8 +271,8 @@ static inline s64 make_ntp_adj(struct clocksource *clock,
 
 
 /* used to install a new clocksource */
-int register_clocksource(struct clocksource*);
-void reselect_clocksource(void);
-struct clocksource* get_next_clocksource(void);
+int clocksource_register(struct clocksource*);
+void clocksource_reselect(void);
+struct clocksource* clocksource_get_next(void);
 
 #endif /* _LINUX_CLOCKSOURCE_H */
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index a9f387ea83b0..74eca5939bd9 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -65,10 +65,10 @@ static int __init clocksource_done_booting(void)
 late_initcall(clocksource_done_booting);
 
 /**
- * get_next_clocksource - Returns the selected clocksource
+ * clocksource_get_next - Returns the selected clocksource
  *
  */
-struct clocksource *get_next_clocksource(void)
+struct clocksource *clocksource_get_next(void)
 {
 	unsigned long flags;
 
@@ -142,12 +142,12 @@ static int is_registered_source(struct clocksource *c)
 }
 
 /**
- * register_clocksource - Used to install new clocksources
+ * clocksource_register - Used to install new clocksources
  * @t:		clocksource to be registered
  *
  * Returns -EBUSY if registration fails, zero otherwise.
  */
-int register_clocksource(struct clocksource *c)
+int clocksource_register(struct clocksource *c)
 {
 	int ret = 0;
 	unsigned long flags;
@@ -167,17 +167,16 @@ int register_clocksource(struct clocksource *c)
 	spin_unlock_irqrestore(&clocksource_lock, flags);
 	return ret;
 }
-
-EXPORT_SYMBOL(register_clocksource);
+EXPORT_SYMBOL(clocksource_register);
 
 /**
- * reselect_clocksource - Rescan list for next clocksource
+ * clocksource_reselect - Rescan list for next clocksource
  *
  * A quick helper function to be used if a clocksource changes its
  * rating. Forces the clocksource list to be re-scanned for the best
  * clocksource.
  */
-void reselect_clocksource(void)
+void clocksource_reselect(void)
 {
 	unsigned long flags;
 
@@ -185,6 +184,7 @@ void reselect_clocksource(void)
 	next_clocksource = select_clocksource();
 	spin_unlock_irqrestore(&clocksource_lock, flags);
 }
+EXPORT_SYMBOL(clocksource_reselect);
 
 /**
  * sysfs_show_current_clocksources - sysfs interface for current clocksource
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index 1fe8376e717b..126bb30c4afe 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -67,7 +67,7 @@ struct clocksource clocksource_jiffies = {
 
 static int __init init_jiffies_clocksource(void)
 {
-	return register_clocksource(&clocksource_jiffies);
+	return clocksource_register(&clocksource_jiffies);
 }
 
 module_init(init_jiffies_clocksource);
diff --git a/kernel/timer.c b/kernel/timer.c
index e5adb9e2e7a7..890a56937cfa 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -810,7 +810,7 @@ static inline s64 __get_nsec_offset(void)
 	s64 ns_offset;
 
 	/* read clocksource: */
-	cycle_now = read_clocksource(clock);
+	cycle_now = clocksource_read(clock);
 
 	/* calculate the delta since the last update_wall_time: */
 	cycle_delta = (cycle_now - last_clock_cycle) & clock->mask;
@@ -845,7 +845,7 @@ static inline void __get_realtime_clock_ts(struct timespec *ts)
 }
 
 /**
- * get_realtime_clock_ts - Returns the time of day in a timespec
+ * getnstimeofday - Returns the time of day in a timespec
  * @ts:		pointer to the timespec to be set
  *
  * Returns the time of day in a timespec.
@@ -920,9 +920,9 @@ static int change_clocksource(void)
 	struct clocksource *new;
 	cycle_t now;
 	u64 nsec;
-	new = get_next_clocksource();
+	new = clocksource_get_next();
 	if (clock != new) {
-		now = read_clocksource(new);
+		now = clocksource_read(new);
 		nsec =  __get_nsec_offset();
 		timespec_add_ns(&xtime, nsec);
 
@@ -966,9 +966,9 @@ void __init timekeeping_init(void)
 	unsigned long flags;
 
 	write_seqlock_irqsave(&xtime_lock, flags);
-	clock = get_next_clocksource();
-	calculate_clocksource_interval(clock, tick_nsec);
-	last_clock_cycle = read_clocksource(clock);
+	clock = clocksource_get_next();
+	clocksource_calculate_interval(clock, tick_nsec);
+	last_clock_cycle = clocksource_read(clock);
 	ntp_clear();
 	write_sequnlock_irqrestore(&xtime_lock, flags);
 }
@@ -988,7 +988,7 @@ static int timekeeping_resume(struct sys_device *dev)
 
 	write_seqlock_irqsave(&xtime_lock, flags);
 	/* restart the last cycle value */
-	last_clock_cycle = read_clocksource(clock);
+	last_clock_cycle = clocksource_read(clock);
 	write_sequnlock_irqrestore(&xtime_lock, flags);
 	return 0;
 }
@@ -1028,7 +1028,7 @@ static void update_wall_time(void)
 	snsecs_per_sec = (s64)NSEC_PER_SEC << clock->shift;
 	remainder_snsecs += (s64)xtime.tv_nsec << clock->shift;
 
-	now = read_clocksource(clock);
+	now = clocksource_read(clock);
 	offset = (now - last_clock_cycle)&clock->mask;
 
 	/* normally this loop will run just once, however in the
@@ -1069,7 +1069,7 @@ static void update_wall_time(void)
 	if (change_clocksource()) {
 		error = 0;
 		remainder_snsecs = 0;
-		calculate_clocksource_interval(clock, tick_nsec);
+		clocksource_calculate_interval(clock, tick_nsec);
 	}
 }
 
-- 
cgit v1.2.3


From 7f9f303aa33c7acc7b4aa9ebea25cbd990bc707b Mon Sep 17 00:00:00 2001
From: Jim Cromie <jim.cromie@gmail.com>
Date: Mon, 26 Jun 2006 00:25:15 -0700
Subject: [PATCH] generic-time: add macro to simplify/hide mask constants

Add a CLOCKSOURCE_MASK macro to simplify initializing the mask for a struct
clocksource, and use it to replace literal mask constants in the various
clocksource drivers.

Signed-off-by: Jim Cromie <jim.cromie@gmail.com>
Acked-by: John Stultz <johnstul@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/hpet.c       | 4 ++--
 arch/i386/kernel/i8253.c      | 2 +-
 arch/i386/kernel/tsc.c        | 2 +-
 drivers/clocksource/acpi_pm.c | 2 +-
 drivers/clocksource/cyclone.c | 4 ++--
 include/linux/clocksource.h   | 2 ++
 6 files changed, 9 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/arch/i386/kernel/hpet.c b/arch/i386/kernel/hpet.c
index 23e7d2c5d253..c6737c35815d 100644
--- a/arch/i386/kernel/hpet.c
+++ b/arch/i386/kernel/hpet.c
@@ -6,7 +6,7 @@
 #include <asm/hpet.h>
 #include <asm/io.h>
 
-#define HPET_MASK	0xFFFFFFFF
+#define HPET_MASK	CLOCKSOURCE_MASK(32)
 #define HPET_SHIFT	22
 
 /* FSEC = 10^-15 NSEC = 10^-9 */
@@ -23,7 +23,7 @@ static struct clocksource clocksource_hpet = {
 	.name		= "hpet",
 	.rating		= 250,
 	.read		= read_hpet,
-	.mask		= (cycle_t)HPET_MASK,
+	.mask		= HPET_MASK,
 	.mult		= 0, /* set below */
 	.shift		= HPET_SHIFT,
 	.is_continuous	= 1,
diff --git a/arch/i386/kernel/i8253.c b/arch/i386/kernel/i8253.c
index 5b13739a7ff4..38aa7f19f1d3 100644
--- a/arch/i386/kernel/i8253.c
+++ b/arch/i386/kernel/i8253.c
@@ -69,7 +69,7 @@ static struct clocksource clocksource_pit = {
 	.name	= "pit",
 	.rating = 110,
 	.read	= pit_read,
-	.mask	= (cycle_t)-1,
+	.mask	= CLOCKSOURCE_MASK(64),
 	.mult	= 0,
 	.shift	= 20,
 };
diff --git a/arch/i386/kernel/tsc.c b/arch/i386/kernel/tsc.c
index 1c3c927755de..7e0d8dab2075 100644
--- a/arch/i386/kernel/tsc.c
+++ b/arch/i386/kernel/tsc.c
@@ -337,7 +337,7 @@ static struct clocksource clocksource_tsc = {
 	.name			= "tsc",
 	.rating			= 300,
 	.read			= read_tsc,
-	.mask			= (cycle_t)-1,
+	.mask			= CLOCKSOURCE_MASK(64),
 	.mult			= 0, /* to be set */
 	.shift			= 22,
 	.update_callback	= tsc_update_callback,
diff --git a/drivers/clocksource/acpi_pm.c b/drivers/clocksource/acpi_pm.c
index 066dc77433d5..7ad3be8c0f49 100644
--- a/drivers/clocksource/acpi_pm.c
+++ b/drivers/clocksource/acpi_pm.c
@@ -32,7 +32,7 @@
  */
 u32 pmtmr_ioport __read_mostly;
 
-#define ACPI_PM_MASK 0xFFFFFF /* limit it to 24 bits */
+#define ACPI_PM_MASK CLOCKSOURCE_MASK(24) /* limit it to 24 bits */
 
 static inline u32 read_pmtmr(void)
 {
diff --git a/drivers/clocksource/cyclone.c b/drivers/clocksource/cyclone.c
index 5906a0af825d..bf4d3d50d1c4 100644
--- a/drivers/clocksource/cyclone.c
+++ b/drivers/clocksource/cyclone.c
@@ -14,7 +14,7 @@
 #define CYCLONE_MPCS_OFFSET	0x51A8		/* offset to select register */
 #define CYCLONE_MPMC_OFFSET	0x51D0		/* offset to count register */
 #define CYCLONE_TIMER_FREQ	99780000	/* 100Mhz, but not really */
-#define CYCLONE_TIMER_MASK	0xFFFFFFFF	/* 32 bit mask */
+#define CYCLONE_TIMER_MASK	CLOCKSOURCE_MASK(32) /* 32 bit mask */
 
 int use_cyclone = 0;
 static void __iomem *cyclone_ptr;
@@ -28,7 +28,7 @@ static struct clocksource clocksource_cyclone = {
 	.name		= "cyclone",
 	.rating		= 250,
 	.read		= read_cyclone,
-	.mask		= (cycle_t)CYCLONE_TIMER_MASK,
+	.mask		= CYCLONE_TIMER_MASK,
 	.mult		= 10,
 	.shift		= 0,
 	.is_continuous	= 1,
diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h
index 5f4a7f72f3ee..4bc94282c364 100644
--- a/include/linux/clocksource.h
+++ b/include/linux/clocksource.h
@@ -65,6 +65,8 @@ struct clocksource {
 	u64 interval_snsecs;
 };
 
+/* simplify initialization of mask field */
+#define CLOCKSOURCE_MASK(bits) (cycle_t)(bits<64 ? ((1ULL<<bits)-1) : -1)
 
 /**
  * clocksource_khz2mult - calculates mult from khz and shift
-- 
cgit v1.2.3


From 19923c190e0932bf0ac1e1d06a48f5c3678dd0de Mon Sep 17 00:00:00 2001
From: Roman Zippel <zippel@linux-m68k.org>
Date: Mon, 26 Jun 2006 00:25:18 -0700
Subject: [PATCH] fix and optimize clock source update

This fixes the clock source updates in update_wall_time() to correctly
track the time coming in via current_tick_length().  Optimize the fast
paths to be as short as possible to keep the overhead low.

Signed-off-by: Roman Zippel <zippel@linux-m68k.org>
Acked-by: John Stultz <johnstul@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/powerpc/kernel/time.c  |   4 +-
 include/linux/clocksource.h | 113 +++------------------------------
 include/linux/timex.h       |   4 +-
 kernel/timer.c              | 151 ++++++++++++++++++++++++++++++++------------
 4 files changed, 123 insertions(+), 149 deletions(-)

(limited to 'include/linux')

diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 742f07a63161..7dd5dab789a1 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -102,7 +102,7 @@ EXPORT_SYMBOL(tb_ticks_per_sec);	/* for cputime_t conversions */
 u64 tb_to_xs;
 unsigned tb_to_us;
 
-#define TICKLEN_SCALE	(SHIFT_SCALE - 10)
+#define TICKLEN_SCALE	TICK_LENGTH_SHIFT
 u64 last_tick_len;	/* units are ns / 2^TICKLEN_SCALE */
 u64 ticklen_to_xs;	/* 0.64 fraction */
 
@@ -534,7 +534,7 @@ static __inline__ void timer_recalc_offset(u64 cur_tb)
 
 	if (__USE_RTC())
 		return;
-	tlen = current_tick_length(SHIFT_SCALE - 10);
+	tlen = current_tick_length();
 	offset = cur_tb - do_gtod.varp->tb_orig_stamp;
 	if (tlen == last_tick_len && offset < 0x80000000u)
 		return;
diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h
index 4bc94282c364..d852024ed095 100644
--- a/include/linux/clocksource.h
+++ b/include/linux/clocksource.h
@@ -46,8 +46,8 @@ typedef u64 cycle_t;
  * @shift:		cycle to nanosecond divisor (power of two)
  * @update_callback:	called when safe to alter clocksource values
  * @is_continuous:	defines if clocksource is free-running.
- * @interval_cycles:	Used internally by timekeeping core, please ignore.
- * @interval_snsecs:	Used internally by timekeeping core, please ignore.
+ * @cycle_interval:	Used internally by timekeeping core, please ignore.
+ * @xtime_interval:	Used internally by timekeeping core, please ignore.
  */
 struct clocksource {
 	char *name;
@@ -61,8 +61,9 @@ struct clocksource {
 	int is_continuous;
 
 	/* timekeeping specific data, ignore */
-	cycle_t interval_cycles;
-	u64 interval_snsecs;
+	cycle_t cycle_last, cycle_interval;
+	u64 xtime_nsec, xtime_interval;
+	s64 error;
 };
 
 /* simplify initialization of mask field */
@@ -168,107 +169,11 @@ static inline void clocksource_calculate_interval(struct clocksource *c,
 	tmp += c->mult/2;
 	do_div(tmp, c->mult);
 
-	c->interval_cycles = (cycle_t)tmp;
-	if(c->interval_cycles == 0)
-		c->interval_cycles = 1;
+	c->cycle_interval = (cycle_t)tmp;
+	if (c->cycle_interval == 0)
+		c->cycle_interval = 1;
 
-	c->interval_snsecs = (u64)c->interval_cycles * c->mult;
-}
-
-
-/**
- * error_aproximation - calculates an error adjustment for a given error
- *
- * @error:	Error value (unsigned)
- * @unit:	Adjustment unit
- *
- * For a given error value, this function takes the adjustment unit
- * and uses binary approximation to return a power of two adjustment value.
- *
- * This function is only for use by the the make_ntp_adj() function
- * and you must hold a write on the xtime_lock when calling.
- */
-static inline int error_aproximation(u64 error, u64 unit)
-{
-	static int saved_adj = 0;
-	u64 adjusted_unit = unit << saved_adj;
-
-	if (error > (adjusted_unit * 2)) {
-		/* large error, so increment the adjustment factor */
-		saved_adj++;
-	} else if (error > adjusted_unit) {
-		/* just right, don't touch it */
-	} else if (saved_adj) {
-		/* small error, so drop the adjustment factor */
-		saved_adj--;
-		return 0;
-	}
-
-	return saved_adj;
-}
-
-
-/**
- * make_ntp_adj - Adjusts the specified clocksource for a given error
- *
- * @clock:		Pointer to clock to be adjusted
- * @cycles_delta:	Current unacounted cycle delta
- * @error:		Pointer to current error value
- *
- * Returns clock shifted nanosecond adjustment to be applied against
- * the accumulated time value (ie: xtime).
- *
- * If the error value is large enough, this function calulates the
- * (power of two) adjustment value, and adjusts the clock's mult and
- * interval_snsecs values accordingly.
- *
- * However, since there may be some unaccumulated cycles, to avoid
- * time inconsistencies we must adjust the accumulation value
- * accordingly.
- *
- * This is not very intuitive, so the following proof should help:
- * The basic timeofday algorithm:  base + cycle * mult
- * Thus:
- *    new_base + cycle * new_mult = old_base + cycle * old_mult
- *    new_base = old_base + cycle * old_mult - cycle * new_mult
- *    new_base = old_base + cycle * (old_mult - new_mult)
- *    new_base - old_base = cycle * (old_mult - new_mult)
- *    base_delta = cycle * (old_mult - new_mult)
- *    base_delta = cycle * (mult_delta)
- *
- * Where mult_delta is the adjustment value made to mult
- *
- */
-static inline s64 make_ntp_adj(struct clocksource *clock,
-				cycles_t cycles_delta, s64* error)
-{
-	s64 ret = 0;
-	if (*error  > ((s64)clock->interval_cycles+1)/2) {
-		/* calculate adjustment value */
-		int adjustment = error_aproximation(*error,
-						clock->interval_cycles);
-		/* adjust clock */
-		clock->mult += 1 << adjustment;
-		clock->interval_snsecs += clock->interval_cycles << adjustment;
-
-		/* adjust the base and error for the adjustment */
-		ret =  -(cycles_delta << adjustment);
-		*error -= clock->interval_cycles << adjustment;
-		/* XXX adj error for cycle_delta offset? */
-	} else if ((-(*error))  > ((s64)clock->interval_cycles+1)/2) {
-		/* calculate adjustment value */
-		int adjustment = error_aproximation(-(*error),
-						clock->interval_cycles);
-		/* adjust clock */
-		clock->mult -= 1 << adjustment;
-		clock->interval_snsecs -= clock->interval_cycles << adjustment;
-
-		/* adjust the base and error for the adjustment */
-		ret =  cycles_delta << adjustment;
-		*error += clock->interval_cycles << adjustment;
-		/* XXX adj error for cycle_delta offset? */
-	}
-	return ret;
+	c->xtime_interval = (u64)c->cycle_interval * c->mult;
 }
 
 
diff --git a/include/linux/timex.h b/include/linux/timex.h
index 1ba3071fcb82..19bb6538b49e 100644
--- a/include/linux/timex.h
+++ b/include/linux/timex.h
@@ -303,8 +303,10 @@ time_interpolator_reset(void)
 
 #endif /* !CONFIG_TIME_INTERPOLATION */
 
+#define TICK_LENGTH_SHIFT	32
+
 /* Returns how long ticks are at present, in ns / 2^(SHIFT_SCALE-10). */
-extern u64 current_tick_length(long);
+extern u64 current_tick_length(void);
 
 extern int do_adjtimex(struct timex *);
 
diff --git a/kernel/timer.c b/kernel/timer.c
index 890a56937cfa..5bb6b7976eec 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -770,7 +770,7 @@ static void update_ntp_one_tick(void)
  * specified number of bits to the right of the binary point.
  * This function has no side-effects.
  */
-u64 current_tick_length(long shift)
+u64 current_tick_length(void)
 {
 	long delta_nsec;
 	u64 ret;
@@ -779,14 +779,8 @@ u64 current_tick_length(long shift)
 	 *    ie: nanosecond value shifted by (SHIFT_SCALE - 10)
 	 */
 	delta_nsec = tick_nsec + adjtime_adjustment() * 1000;
-	ret = ((u64) delta_nsec << (SHIFT_SCALE - 10)) + time_adj;
-
-	/* convert from (SHIFT_SCALE - 10) to specified shift scale: */
-	shift = shift - (SHIFT_SCALE - 10);
-	if (shift < 0)
-		ret >>= -shift;
-	else
-		ret <<= shift;
+	ret = (u64)delta_nsec << TICK_LENGTH_SHIFT;
+	ret += (s64)time_adj << (TICK_LENGTH_SHIFT - (SHIFT_SCALE - 10));
 
 	return ret;
 }
@@ -794,7 +788,6 @@ u64 current_tick_length(long shift)
 /* XXX - all of this timekeeping code should be later moved to time.c */
 #include <linux/clocksource.h>
 static struct clocksource *clock; /* pointer to current clocksource */
-static cycle_t last_clock_cycle;  /* cycle value at last update_wall_time */
 
 #ifdef CONFIG_GENERIC_TIME
 /**
@@ -813,7 +806,7 @@ static inline s64 __get_nsec_offset(void)
 	cycle_now = clocksource_read(clock);
 
 	/* calculate the delta since the last update_wall_time: */
-	cycle_delta = (cycle_now - last_clock_cycle) & clock->mask;
+	cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
 
 	/* convert to nanoseconds: */
 	ns_offset = cyc2ns(clock, cycle_delta);
@@ -927,7 +920,7 @@ static int change_clocksource(void)
 		timespec_add_ns(&xtime, nsec);
 
 		clock = new;
-		last_clock_cycle = now;
+		clock->cycle_last = now;
 		printk(KERN_INFO "Time: %s clocksource has been installed.\n",
 					clock->name);
 		return 1;
@@ -968,7 +961,7 @@ void __init timekeeping_init(void)
 	write_seqlock_irqsave(&xtime_lock, flags);
 	clock = clocksource_get_next();
 	clocksource_calculate_interval(clock, tick_nsec);
-	last_clock_cycle = clocksource_read(clock);
+	clock->cycle_last = clocksource_read(clock);
 	ntp_clear();
 	write_sequnlock_irqrestore(&xtime_lock, flags);
 }
@@ -988,7 +981,7 @@ static int timekeeping_resume(struct sys_device *dev)
 
 	write_seqlock_irqsave(&xtime_lock, flags);
 	/* restart the last cycle value */
-	last_clock_cycle = clocksource_read(clock);
+	clock->cycle_last = clocksource_read(clock);
 	write_sequnlock_irqrestore(&xtime_lock, flags);
 	return 0;
 }
@@ -1014,6 +1007,81 @@ static int __init timekeeping_init_device(void)
 
 device_initcall(timekeeping_init_device);
 
+/*
+ * If the error is already larger, we look ahead another tick,
+ * to compensate for late or lost adjustments.
+ */
+static __always_inline int clocksource_bigadjust(int sign, s64 error, s64 *interval, s64 *offset)
+{
+	int adj;
+
+	/*
+	 * As soon as the machine is synchronized to the external time
+	 * source this should be the common case.
+	 */
+	error >>= 2;
+	if (likely(sign > 0 ? error <= *interval : error >= *interval))
+		return sign;
+
+	/*
+	 * An extra look ahead dampens the effect of the current error,
+	 * which can grow quite large with continously late updates, as
+	 * it would dominate the adjustment value and can lead to
+	 * oscillation.
+	 */
+	error += current_tick_length() >> (TICK_LENGTH_SHIFT - clock->shift + 1);
+	error -= clock->xtime_interval >> 1;
+
+	adj = 0;
+	while (1) {
+		error >>= 1;
+		if (sign > 0 ? error <= *interval : error >= *interval)
+			break;
+		adj++;
+	}
+
+	/*
+	 * Add the current adjustments to the error and take the offset
+	 * into account, the latter can cause the error to be hardly
+	 * reduced at the next tick. Check the error again if there's
+	 * room for another adjustment, thus further reducing the error
+	 * which otherwise had to be corrected at the next update.
+	 */
+	error = (error << 1) - *interval + *offset;
+	if (sign > 0 ? error > *interval : error < *interval)
+		adj++;
+
+	*interval <<= adj;
+	*offset <<= adj;
+	return sign << adj;
+}
+
+/*
+ * Adjust the multiplier to reduce the error value,
+ * this is optimized for the most common adjustments of -1,0,1,
+ * for other values we can do a bit more work.
+ */
+static void clocksource_adjust(struct clocksource *clock, s64 offset)
+{
+	s64 error, interval = clock->cycle_interval;
+	int adj;
+
+	error = clock->error >> (TICK_LENGTH_SHIFT - clock->shift - 1);
+	if (error > interval) {
+		adj = clocksource_bigadjust(1, error, &interval, &offset);
+	} else if (error < -interval) {
+		interval = -interval;
+		offset = -offset;
+		adj = clocksource_bigadjust(-1, error, &interval, &offset);
+	} else
+		return;
+
+	clock->mult += adj;
+	clock->xtime_interval += interval;
+	clock->xtime_nsec -= offset;
+	clock->error -= (interval - offset) << (TICK_LENGTH_SHIFT - clock->shift);
+}
+
 /*
  * update_wall_time - Uses the current clocksource to increment the wall time
  *
@@ -1021,54 +1089,53 @@ device_initcall(timekeeping_init_device);
  */
 static void update_wall_time(void)
 {
-	static s64 remainder_snsecs, error;
-	s64 snsecs_per_sec;
-	cycle_t now, offset;
+	cycle_t offset;
 
-	snsecs_per_sec = (s64)NSEC_PER_SEC << clock->shift;
-	remainder_snsecs += (s64)xtime.tv_nsec << clock->shift;
+	clock->xtime_nsec += (s64)xtime.tv_nsec << clock->shift;
 
-	now = clocksource_read(clock);
-	offset = (now - last_clock_cycle)&clock->mask;
+#ifdef CONFIG_GENERIC_TIME
+	offset = (clocksource_read(clock) - clock->cycle_last) & clock->mask;
+#else
+	offset = clock->cycle_interval;
+#endif
 
 	/* normally this loop will run just once, however in the
 	 * case of lost or late ticks, it will accumulate correctly.
 	 */
-	while (offset > clock->interval_cycles) {
-		/* get the ntp interval in clock shifted nanoseconds */
-		s64 ntp_snsecs	= current_tick_length(clock->shift);
-
+	while (offset >= clock->cycle_interval) {
 		/* accumulate one interval */
-		remainder_snsecs += clock->interval_snsecs;
-		last_clock_cycle += clock->interval_cycles;
-		offset -= clock->interval_cycles;
+		clock->xtime_nsec += clock->xtime_interval;
+		clock->cycle_last += clock->cycle_interval;
+		offset -= clock->cycle_interval;
+
+		if (clock->xtime_nsec >= (u64)NSEC_PER_SEC << clock->shift) {
+			clock->xtime_nsec -= (u64)NSEC_PER_SEC << clock->shift;
+			xtime.tv_sec++;
+			second_overflow();
+		}
 
 		/* interpolator bits */
-		time_interpolator_update(clock->interval_snsecs
+		time_interpolator_update(clock->xtime_interval
 						>> clock->shift);
 		/* increment the NTP state machine */
 		update_ntp_one_tick();
 
 		/* accumulate error between NTP and clock interval */
-		error += (ntp_snsecs - (s64)clock->interval_snsecs);
+		clock->error += current_tick_length();
+		clock->error -= clock->xtime_interval << (TICK_LENGTH_SHIFT - clock->shift);
+	}
 
-		/* correct the clock when NTP error is too big */
-		remainder_snsecs += make_ntp_adj(clock, offset, &error);
+	/* correct the clock when NTP error is too big */
+	clocksource_adjust(clock, offset);
 
-		if (remainder_snsecs >= snsecs_per_sec) {
-			remainder_snsecs -= snsecs_per_sec;
-			xtime.tv_sec++;
-			second_overflow();
-		}
-	}
 	/* store full nanoseconds into xtime */
-	xtime.tv_nsec = remainder_snsecs >> clock->shift;
-	remainder_snsecs -= (s64)xtime.tv_nsec << clock->shift;
+	xtime.tv_nsec = clock->xtime_nsec >> clock->shift;
+	clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift;
 
 	/* check to see if there is a new clocksource to use */
 	if (change_clocksource()) {
-		error = 0;
-		remainder_snsecs = 0;
+		clock->error = 0;
+		clock->xtime_nsec = 0;
 		clocksource_calculate_interval(clock, tick_nsec);
 	}
 }
-- 
cgit v1.2.3


From 5024ad4af69b3570e18d312786dc46318a1bad1a Mon Sep 17 00:00:00 2001
From: Hansjoerg Lipp <hjlipp@web.de>
Date: Mon, 26 Jun 2006 00:25:35 -0700
Subject: [PATCH] i4l: Gigaset drivers: add IOCTLs to compat_ioctl.h

Add the IOCTLs of the Gigaset drivers to compat_ioctl.h in order to make
them available for 32 bit programs on 64 bit platforms.  Please merge.

Signed-off-by: Hansjoerg Lipp <hjlipp@web.de>
Acked-by: Tilman Schmidt <tilman@imap.cc>
Cc: Karsten Keil <kkeil@suse.de>
Cc: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/compat_ioctl.c            | 1 +
 include/linux/compat_ioctl.h | 5 +++++
 2 files changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 9eb9824dd332..d8ecfedef189 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -80,6 +80,7 @@
 #include <net/bluetooth/rfcomm.h>
 
 #include <linux/capi.h>
+#include <linux/gigaset_dev.h>
 
 #include <scsi/scsi.h>
 #include <scsi/scsi_ioctl.h>
diff --git a/include/linux/compat_ioctl.h b/include/linux/compat_ioctl.h
index 89ab677cb993..917d62e41480 100644
--- a/include/linux/compat_ioctl.h
+++ b/include/linux/compat_ioctl.h
@@ -673,6 +673,11 @@ COMPATIBLE_IOCTL(CAPI_SET_FLAGS)
 COMPATIBLE_IOCTL(CAPI_CLR_FLAGS)
 COMPATIBLE_IOCTL(CAPI_NCCI_OPENCOUNT)
 COMPATIBLE_IOCTL(CAPI_NCCI_GETUNIT)
+/* Siemens Gigaset */
+COMPATIBLE_IOCTL(GIGASET_REDIR)
+COMPATIBLE_IOCTL(GIGASET_CONFIG)
+COMPATIBLE_IOCTL(GIGASET_BRKCHARS)
+COMPATIBLE_IOCTL(GIGASET_VERSION)
 /* Misc. */
 COMPATIBLE_IOCTL(0x41545900)		/* ATYIO_CLKR */
 COMPATIBLE_IOCTL(0x41545901)		/* ATYIO_CLKW */
-- 
cgit v1.2.3


From aed7a6c476d90660ac5af860158407ae9fe61c68 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Mon, 26 Jun 2006 00:25:44 -0700
Subject: [PATCH] proc: Replace proc_inode.type with proc_inode.fd

The sole renaming use of proc_inode.type is to discover the file descriptor
number, so just store the file descriptor number and don't wory about
processing this field.  This removes any /proc limits on the maximum number of
file descriptors, and clears the path to make the hard coded /proc inode
numbers go away.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/proc/base.c          | 6 +++---
 fs/proc/inode.c         | 2 +-
 fs/proc/internal.h      | 4 ++--
 include/linux/proc_fs.h | 2 +-
 4 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index fa0e6bee40fa..9562df760901 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -297,7 +297,7 @@ static int proc_fd_link(struct inode *inode, struct dentry **dentry, struct vfsm
 	struct task_struct *task = proc_task(inode);
 	struct files_struct *files;
 	struct file *file;
-	int fd = proc_type(inode) - PROC_TID_FD_DIR;
+	int fd = proc_fd(inode);
 
 	files = get_files_struct(task);
 	if (files) {
@@ -1368,7 +1368,6 @@ static struct inode *proc_pid_make_inode(struct super_block * sb, struct task_st
 	 */
 	get_task_struct(task);
 	ei->task = task;
-	ei->type = ino;
 	inode->i_uid = 0;
 	inode->i_gid = 0;
 	if (task_dumpable(task)) {
@@ -1418,7 +1417,7 @@ static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
 	struct inode *inode = dentry->d_inode;
 	struct task_struct *task = proc_task(inode);
-	int fd = proc_type(inode) - PROC_TID_FD_DIR;
+	int fd = proc_fd(inode);
 	struct files_struct *files;
 
 	files = get_files_struct(task);
@@ -1525,6 +1524,7 @@ static struct dentry *proc_lookupfd(struct inode * dir, struct dentry * dentry,
 	if (!inode)
 		goto out;
 	ei = PROC_I(inode);
+	ei->fd = fd;
 	files = get_files_struct(task);
 	if (!files)
 		goto out_unlock;
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 722b9c463111..fbc94df138a7 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -95,7 +95,7 @@ static struct inode *proc_alloc_inode(struct super_block *sb)
 	if (!ei)
 		return NULL;
 	ei->task = NULL;
-	ei->type = 0;
+	ei->fd = 0;
 	ei->op.proc_get_link = NULL;
 	ei->pde = NULL;
 	inode = &ei->vfs_inode;
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 0502f17b860d..6264b7a3a9f0 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -46,7 +46,7 @@ static inline struct task_struct *proc_task(struct inode *inode)
 	return PROC_I(inode)->task;
 }
 
-static inline int proc_type(struct inode *inode)
+static inline int proc_fd(struct inode *inode)
 {
-	return PROC_I(inode)->type;
+	return PROC_I(inode)->fd;
 }
diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
index 5810d28fbed9..9dd84884abb1 100644
--- a/include/linux/proc_fs.h
+++ b/include/linux/proc_fs.h
@@ -249,7 +249,7 @@ extern void kclist_add(struct kcore_list *, void *, size_t);
 
 struct proc_inode {
 	struct task_struct *task;
-	int type;
+	int fd;
 	union {
 		int (*proc_get_link)(struct inode *, struct dentry **, struct vfsmount **);
 		int (*proc_read)(struct task_struct *task, char *page);
-- 
cgit v1.2.3


From 48e6484d49020dba3578ad117b461e8a391e8f0f Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Mon, 26 Jun 2006 00:25:48 -0700
Subject: [PATCH] proc: Rewrite the proc dentry flush on exit optimization

To keep the dcache from filling up with dead /proc entries we flush them on
process exit.  However over the years that code has gotten hairy with a
dentry_pointer and a lock in task_struct and misdocumented as a correctness
feature.

I have rewritten this code to look and see if we have a corresponding entry in
the dcache and if so flush it on process exit.  This removes the extra fields
in the task_struct and allows me to trivially handle the case of a
/proc/<tgid>/task/<pid> entry as well as the current /proc/<pid> entries.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/exec.c                 |  10 ----
 fs/proc/base.c            | 134 +++++++++++++++++++++-------------------------
 include/linux/init_task.h |   1 -
 include/linux/proc_fs.h   |   6 +--
 include/linux/sched.h     |   3 --
 kernel/exit.c             |   7 +--
 kernel/fork.c             |   3 --
 7 files changed, 64 insertions(+), 100 deletions(-)

(limited to 'include/linux')

diff --git a/fs/exec.c b/fs/exec.c
index 0b88bf646143..8c5196087f31 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -666,8 +666,6 @@ static int de_thread(struct task_struct *tsk)
 	 * and to assume its PID:
 	 */
 	if (!thread_group_leader(current)) {
-		struct dentry *proc_dentry1, *proc_dentry2;
-
 		/*
 		 * Wait for the thread group leader to be a zombie.
 		 * It should already be zombie at this point, most
@@ -689,10 +687,6 @@ static int de_thread(struct task_struct *tsk)
 		 */
 		current->start_time = leader->start_time;
 
-		spin_lock(&leader->proc_lock);
-		spin_lock(&current->proc_lock);
-		proc_dentry1 = proc_pid_unhash(current);
-		proc_dentry2 = proc_pid_unhash(leader);
 		write_lock_irq(&tasklist_lock);
 
 		BUG_ON(leader->tgid != current->tgid);
@@ -729,10 +723,6 @@ static int de_thread(struct task_struct *tsk)
 		leader->exit_state = EXIT_DEAD;
 
 		write_unlock_irq(&tasklist_lock);
-		spin_unlock(&leader->proc_lock);
-		spin_unlock(&current->proc_lock);
-		proc_pid_flush(proc_dentry1);
-		proc_pid_flush(proc_dentry2);
         }
 
 	/*
diff --git a/fs/proc/base.c b/fs/proc/base.c
index c8636841bbcf..f435932e6432 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1352,16 +1352,6 @@ static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd)
 	return 0;
 }
 
-static void pid_base_iput(struct dentry *dentry, struct inode *inode)
-{
-	struct task_struct *task = proc_task(inode);
-	spin_lock(&task->proc_lock);
-	if (task->proc_dentry == dentry)
-		task->proc_dentry = NULL;
-	spin_unlock(&task->proc_lock);
-	iput(inode);
-}
-
 static int pid_delete_dentry(struct dentry * dentry)
 {
 	/* Is the task we represent dead?
@@ -1383,13 +1373,6 @@ static struct dentry_operations pid_dentry_operations =
 	.d_delete	= pid_delete_dentry,
 };
 
-static struct dentry_operations pid_base_dentry_operations =
-{
-	.d_revalidate	= pid_revalidate,
-	.d_iput		= pid_base_iput,
-	.d_delete	= pid_delete_dentry,
-};
-
 /* Lookups */
 
 static unsigned name_to_int(struct dentry *dentry)
@@ -1859,57 +1842,70 @@ static struct inode_operations proc_self_inode_operations = {
 };
 
 /**
- * proc_pid_unhash -  Unhash /proc/@pid entry from the dcache.
- * @p: task that should be flushed.
+ * proc_flush_task -  Remove dcache entries for @task from the /proc dcache.
+ *
+ * @task: task that should be flushed.
+ *
+ * Looks in the dcache for
+ * /proc/@pid
+ * /proc/@tgid/task/@pid
+ * if either directory is present flushes it and all of it'ts children
+ * from the dcache.
  *
- * Drops the /proc/@pid dcache entry from the hash chains.
+ * It is safe and reasonable to cache /proc entries for a task until
+ * that task exits.  After that they just clog up the dcache with
+ * useless entries, possibly causing useful dcache entries to be
+ * flushed instead.  This routine is proved to flush those useless
+ * dcache entries at process exit time.
  *
- * Dropping /proc/@pid entries and detach_pid must be synchroneous,
- * otherwise e.g. /proc/@pid/exe might point to the wrong executable,
- * if the pid value is immediately reused. This is enforced by
- * - caller must acquire spin_lock(p->proc_lock)
- * - must be called before detach_pid()
- * - proc_pid_lookup acquires proc_lock, and checks that
- *   the target is not dead by looking at the attach count
- *   of PIDTYPE_PID.
+ * NOTE: This routine is just an optimization so it does not guarantee
+ *       that no dcache entries will exist at process exit time it
+ *       just makes it very unlikely that any will persist.
  */
-
-struct dentry *proc_pid_unhash(struct task_struct *p)
+void proc_flush_task(struct task_struct *task)
 {
-	struct dentry *proc_dentry;
+	struct dentry *dentry, *leader, *dir;
+	char buf[30];
+	struct qstr name;
+
+	name.name = buf;
+	name.len = snprintf(buf, sizeof(buf), "%d", task->pid);
+	dentry = d_hash_and_lookup(proc_mnt->mnt_root, &name);
+	if (dentry) {
+		shrink_dcache_parent(dentry);
+		d_drop(dentry);
+		dput(dentry);
+	}
 
-	proc_dentry = p->proc_dentry;
-	if (proc_dentry != NULL) {
+	if (thread_group_leader(task))
+		goto out;
 
-		spin_lock(&dcache_lock);
-		spin_lock(&proc_dentry->d_lock);
-		if (!d_unhashed(proc_dentry)) {
-			dget_locked(proc_dentry);
-			__d_drop(proc_dentry);
-			spin_unlock(&proc_dentry->d_lock);
-		} else {
-			spin_unlock(&proc_dentry->d_lock);
-			proc_dentry = NULL;
-		}
-		spin_unlock(&dcache_lock);
-	}
-	return proc_dentry;
-}
+	name.name = buf;
+	name.len = snprintf(buf, sizeof(buf), "%d", task->tgid);
+	leader = d_hash_and_lookup(proc_mnt->mnt_root, &name);
+	if (!leader)
+		goto out;
 
-/**
- * proc_pid_flush - recover memory used by stale /proc/@pid/x entries
- * @proc_dentry: directoy to prune.
- *
- * Shrink the /proc directory that was used by the just killed thread.
- */
-	
-void proc_pid_flush(struct dentry *proc_dentry)
-{
-	might_sleep();
-	if(proc_dentry != NULL) {
-		shrink_dcache_parent(proc_dentry);
-		dput(proc_dentry);
+	name.name = "task";
+	name.len = strlen(name.name);
+	dir = d_hash_and_lookup(leader, &name);
+	if (!dir)
+		goto out_put_leader;
+
+	name.name = buf;
+	name.len = snprintf(buf, sizeof(buf), "%d", task->pid);
+	dentry = d_hash_and_lookup(dir, &name);
+	if (dentry) {
+		shrink_dcache_parent(dentry);
+		d_drop(dentry);
+		dput(dentry);
 	}
+
+	dput(dir);
+out_put_leader:
+	dput(leader);
+out:
+	return;
 }
 
 /* SMP-safe */
@@ -1919,7 +1915,6 @@ struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct
 	struct inode *inode;
 	struct proc_inode *ei;
 	unsigned tgid;
-	int died;
 
 	if (dentry->d_name.len == 4 && !memcmp(dentry->d_name.name,"self",4)) {
 		inode = new_inode(dir->i_sb);
@@ -1965,23 +1960,16 @@ struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct
 	inode->i_nlink = 4;
 #endif
 
-	dentry->d_op = &pid_base_dentry_operations;
+	dentry->d_op = &pid_dentry_operations;
 
-	died = 0;
 	d_add(dentry, inode);
-	spin_lock(&task->proc_lock);
-	task->proc_dentry = dentry;
 	if (!pid_alive(task)) {
-		dentry = proc_pid_unhash(task);
-		died = 1;
+		d_drop(dentry);
+		shrink_dcache_parent(dentry);
+		goto out;
 	}
-	spin_unlock(&task->proc_lock);
 
 	put_task_struct(task);
-	if (died) {
-		proc_pid_flush(dentry);
-		goto out;
-	}
 	return NULL;
 out:
 	return ERR_PTR(-ENOENT);
@@ -2024,7 +2012,7 @@ static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry
 	inode->i_nlink = 3;
 #endif
 
-	dentry->d_op = &pid_base_dentry_operations;
+	dentry->d_op = &pid_dentry_operations;
 
 	d_add(dentry, inode);
 
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 41ecbb847f32..e127ef7e8da8 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -119,7 +119,6 @@ extern struct group_info init_groups;
 		.signal = {{0}}},					\
 	.blocked	= {{0}},					\
 	.alloc_lock	= SPIN_LOCK_UNLOCKED,				\
-	.proc_lock	= SPIN_LOCK_UNLOCKED,				\
 	.journal_info	= NULL,						\
 	.cpu_timers	= INIT_CPU_TIMERS(tsk.cpu_timers),		\
 	.fs_excl	= ATOMIC_INIT(0),				\
diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
index 9dd84884abb1..d4d2081dbaf7 100644
--- a/include/linux/proc_fs.h
+++ b/include/linux/proc_fs.h
@@ -99,9 +99,8 @@ extern void proc_misc_init(void);
 
 struct mm_struct;
 
+void proc_flush_task(struct task_struct *task);
 struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *);
-struct dentry *proc_pid_unhash(struct task_struct *p);
-void proc_pid_flush(struct dentry *proc_dentry);
 int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir);
 unsigned long task_vsize(struct mm_struct *);
 int task_statm(struct mm_struct *, int *, int *, int *, int *);
@@ -211,8 +210,7 @@ static inline void proc_net_remove(const char *name)
 #define proc_net_create(name, mode, info)	({ (void)(mode), NULL; })
 static inline void proc_net_remove(const char *name) {}
 
-static inline struct dentry *proc_pid_unhash(struct task_struct *p) { return NULL; }
-static inline void proc_pid_flush(struct dentry *proc_dentry) { }
+static inline void proc_flush_task(struct task_struct *task) { }
 
 static inline struct proc_dir_entry *create_proc_entry(const char *name,
 	mode_t mode, struct proc_dir_entry *parent) { return NULL; }
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8d11d9310db0..122a25c1b997 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -842,8 +842,6 @@ struct task_struct {
    	u32 self_exec_id;
 /* Protection of (de-)allocation: mm, files, fs, tty, keyrings */
 	spinlock_t alloc_lock;
-/* Protection of proc_dentry: nesting proc_lock, dcache_lock, write_lock_irq(&tasklist_lock); */
-	spinlock_t proc_lock;
 
 #ifdef CONFIG_DEBUG_MUTEXES
 	/* mutex deadlock detection */
@@ -856,7 +854,6 @@ struct task_struct {
 /* VM state */
 	struct reclaim_state *reclaim_state;
 
-	struct dentry *proc_dentry;
 	struct backing_dev_info *backing_dev_info;
 
 	struct io_context *io_context;
diff --git a/kernel/exit.c b/kernel/exit.c
index e76bd02e930e..304ef637be6c 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -137,12 +137,8 @@ void release_task(struct task_struct * p)
 {
 	int zap_leader;
 	task_t *leader;
-	struct dentry *proc_dentry;
-
 repeat:
 	atomic_dec(&p->user->processes);
-	spin_lock(&p->proc_lock);
-	proc_dentry = proc_pid_unhash(p);
 	write_lock_irq(&tasklist_lock);
 	ptrace_unlink(p);
 	BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children));
@@ -171,8 +167,7 @@ repeat:
 
 	sched_exit(p);
 	write_unlock_irq(&tasklist_lock);
-	spin_unlock(&p->proc_lock);
-	proc_pid_flush(proc_dentry);
+	proc_flush_task(p);
 	release_thread(p);
 	call_rcu(&p->rcu, delayed_put_task_struct);
 
diff --git a/kernel/fork.c b/kernel/fork.c
index dfd10cb370c3..79e91046f36e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -993,13 +993,10 @@ static task_t *copy_process(unsigned long clone_flags,
 		if (put_user(p->pid, parent_tidptr))
 			goto bad_fork_cleanup;
 
-	p->proc_dentry = NULL;
-
 	INIT_LIST_HEAD(&p->children);
 	INIT_LIST_HEAD(&p->sibling);
 	p->vfork_done = NULL;
 	spin_lock_init(&p->alloc_lock);
-	spin_lock_init(&p->proc_lock);
 
 	clear_tsk_thread_flag(p, TIF_SIGPENDING);
 	init_sigpending(&p->pending);
-- 
cgit v1.2.3


From 99f895518368252ba862cc15ce4eb98ebbe1bec6 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Mon, 26 Jun 2006 00:25:55 -0700
Subject: [PATCH] proc: don't lock task_structs indefinitely

Every inode in /proc holds a reference to a struct task_struct.  If a
directory or file is opened and remains open after the the task exits this
pinning continues.  With 8K stacks on a 32bit machine the amount pinned per
file descriptor is about 10K.

Normally I would figure a reasonable per user process limit is about 100
processes.  With 80 processes, with a 1000 file descriptors each I can trigger
the 00M killer on a 32bit kernel, because I have pinned about 800MB of useless
data.

This patch replaces the struct task_struct pointer with a pointer to a struct
task_ref which has a struct task_struct pointer.  The so the pinning of dead
tasks does not happen.

The code now has to contend with the fact that the task may now exit at any
time.  Which is a little but not muh more complicated.

With this change it takes about 1000 processes each opening up 1000 file
descriptors before I can trigger the OOM killer.  Much better.

[mlp@google.com: task_mmu small fixes]
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Cc: Paul Jackson <pj@sgi.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Albert Cahalan <acahalan@gmail.com>
Signed-off-by: Prasanna Meda <mlp@google.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/proc/base.c          | 355 ++++++++++++++++++++++++++++++++++--------------
 fs/proc/inode.c         |   9 +-
 fs/proc/internal.h      |  15 +-
 fs/proc/task_mmu.c      |  72 +++++++---
 include/linux/proc_fs.h |   8 +-
 kernel/cpuset.c         |  27 +++-
 mm/mempolicy.c          |   6 +-
 7 files changed, 349 insertions(+), 143 deletions(-)

(limited to 'include/linux')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 20746e124409..489810abc72d 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -307,12 +307,15 @@ static struct pid_entry tid_attr_stuff[] = {
 
 static int proc_fd_link(struct inode *inode, struct dentry **dentry, struct vfsmount **mnt)
 {
-	struct task_struct *task = proc_task(inode);
-	struct files_struct *files;
+	struct task_struct *task = get_proc_task(inode);
+	struct files_struct *files = NULL;
 	struct file *file;
 	int fd = proc_fd(inode);
 
-	files = get_files_struct(task);
+	if (task) {
+		files = get_files_struct(task);
+		put_task_struct(task);
+	}
 	if (files) {
 		/*
 		 * We are not taking a ref to the file structure, so we must
@@ -344,10 +347,29 @@ static struct fs_struct *get_fs_struct(struct task_struct *task)
 	return fs;
 }
 
+static int get_nr_threads(struct task_struct *tsk)
+{
+	/* Must be called with the rcu_read_lock held */
+	unsigned long flags;
+	int count = 0;
+
+	if (lock_task_sighand(tsk, &flags)) {
+		count = atomic_read(&tsk->signal->count);
+		unlock_task_sighand(tsk, &flags);
+	}
+	return count;
+}
+
 static int proc_cwd_link(struct inode *inode, struct dentry **dentry, struct vfsmount **mnt)
 {
-	struct fs_struct *fs = get_fs_struct(proc_task(inode));
+	struct task_struct *task = get_proc_task(inode);
+	struct fs_struct *fs = NULL;
 	int result = -ENOENT;
+
+	if (task) {
+		fs = get_fs_struct(task);
+		put_task_struct(task);
+	}
 	if (fs) {
 		read_lock(&fs->lock);
 		*mnt = mntget(fs->pwdmnt);
@@ -361,8 +383,14 @@ static int proc_cwd_link(struct inode *inode, struct dentry **dentry, struct vfs
 
 static int proc_root_link(struct inode *inode, struct dentry **dentry, struct vfsmount **mnt)
 {
-	struct fs_struct *fs = get_fs_struct(proc_task(inode));
+	struct task_struct *task = get_proc_task(inode);
+	struct fs_struct *fs = NULL;
 	int result = -ENOENT;
+
+	if (task) {
+		fs = get_fs_struct(task);
+		put_task_struct(task);
+	}
 	if (fs) {
 		read_lock(&fs->lock);
 		*mnt = mntget(fs->rootmnt);
@@ -550,16 +578,19 @@ struct proc_mounts {
 
 static int mounts_open(struct inode *inode, struct file *file)
 {
-	struct task_struct *task = proc_task(inode);
-	struct namespace *namespace;
+	struct task_struct *task = get_proc_task(inode);
+	struct namespace *namespace = NULL;
 	struct proc_mounts *p;
 	int ret = -EINVAL;
 
-	task_lock(task);
-	namespace = task->namespace;
-	if (namespace)
-		get_namespace(namespace);
-	task_unlock(task);
+	if (task) {
+		task_lock(task);
+		namespace = task->namespace;
+		if (namespace)
+			get_namespace(namespace);
+		task_unlock(task);
+		put_task_struct(task);
+	}
 
 	if (namespace) {
 		ret = -ENOMEM;
@@ -616,17 +647,21 @@ static struct file_operations proc_mounts_operations = {
 extern struct seq_operations mountstats_op;
 static int mountstats_open(struct inode *inode, struct file *file)
 {
-	struct task_struct *task = proc_task(inode);
 	int ret = seq_open(file, &mountstats_op);
 
 	if (!ret) {
 		struct seq_file *m = file->private_data;
-		struct namespace *namespace;
-		task_lock(task);
-		namespace = task->namespace;
-		if (namespace)
-			get_namespace(namespace);
-		task_unlock(task);
+		struct namespace *namespace = NULL;
+		struct task_struct *task = get_proc_task(inode);
+
+		if (task) {
+			task_lock(task);
+			namespace = task->namespace;
+			if (namespace)
+				get_namespace(namespace);
+			task_unlock(task);
+			put_task_struct(task);
+		}
 
 		if (namespace)
 			m->private = namespace;
@@ -653,18 +688,27 @@ static ssize_t proc_info_read(struct file * file, char __user * buf,
 	struct inode * inode = file->f_dentry->d_inode;
 	unsigned long page;
 	ssize_t length;
-	struct task_struct *task = proc_task(inode);
+	struct task_struct *task = get_proc_task(inode);
+
+	length = -ESRCH;
+	if (!task)
+		goto out_no_task;
 
 	if (count > PROC_BLOCK_SIZE)
 		count = PROC_BLOCK_SIZE;
+
+	length = -ENOMEM;
 	if (!(page = __get_free_page(GFP_KERNEL)))
-		return -ENOMEM;
+		goto out;
 
 	length = PROC_I(inode)->op.proc_read(task, (char*)page);
 
 	if (length >= 0)
 		length = simple_read_from_buffer(buf, count, ppos, (char *)page, length);
 	free_page(page);
+out:
+	put_task_struct(task);
+out_no_task:
 	return length;
 }
 
@@ -681,12 +725,15 @@ static int mem_open(struct inode* inode, struct file* file)
 static ssize_t mem_read(struct file * file, char __user * buf,
 			size_t count, loff_t *ppos)
 {
-	struct task_struct *task = proc_task(file->f_dentry->d_inode);
+	struct task_struct *task = get_proc_task(file->f_dentry->d_inode);
 	char *page;
 	unsigned long src = *ppos;
 	int ret = -ESRCH;
 	struct mm_struct *mm;
 
+	if (!task)
+		goto out_no_task;
+
 	if (!MAY_PTRACE(task) || !ptrace_may_attach(task))
 		goto out;
 
@@ -736,6 +783,8 @@ out_put:
 out_free:
 	free_page((unsigned long) page);
 out:
+	put_task_struct(task);
+out_no_task:
 	return ret;
 }
 
@@ -748,15 +797,20 @@ static ssize_t mem_write(struct file * file, const char * buf,
 {
 	int copied = 0;
 	char *page;
-	struct task_struct *task = proc_task(file->f_dentry->d_inode);
+	struct task_struct *task = get_proc_task(file->f_dentry->d_inode);
 	unsigned long dst = *ppos;
 
+	copied = -ESRCH;
+	if (!task)
+		goto out_no_task;
+
 	if (!MAY_PTRACE(task) || !ptrace_may_attach(task))
-		return -ESRCH;
+		goto out;
 
+	copied = -ENOMEM;
 	page = (char *)__get_free_page(GFP_USER);
 	if (!page)
-		return -ENOMEM;
+		goto out;
 
 	while (count > 0) {
 		int this_len, retval;
@@ -779,6 +833,9 @@ static ssize_t mem_write(struct file * file, const char * buf,
 	}
 	*ppos = dst;
 	free_page((unsigned long) page);
+out:
+	put_task_struct(task);
+out_no_task:
 	return copied;
 }
 #endif
@@ -809,12 +866,17 @@ static struct file_operations proc_mem_operations = {
 static ssize_t oom_adjust_read(struct file *file, char __user *buf,
 				size_t count, loff_t *ppos)
 {
-	struct task_struct *task = proc_task(file->f_dentry->d_inode);
+	struct task_struct *task = get_proc_task(file->f_dentry->d_inode);
 	char buffer[PROC_NUMBUF];
 	size_t len;
-	int oom_adjust = task->oomkilladj;
+	int oom_adjust;
 	loff_t __ppos = *ppos;
 
+	if (!task)
+		return -ESRCH;
+	oom_adjust = task->oomkilladj;
+	put_task_struct(task);
+
 	len = snprintf(buffer, sizeof(buffer), "%i\n", oom_adjust);
 	if (__ppos >= len)
 		return 0;
@@ -829,7 +891,7 @@ static ssize_t oom_adjust_read(struct file *file, char __user *buf,
 static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
 				size_t count, loff_t *ppos)
 {
-	struct task_struct *task = proc_task(file->f_dentry->d_inode);
+	struct task_struct *task;
 	char buffer[PROC_NUMBUF], *end;
 	int oom_adjust;
 
@@ -845,7 +907,11 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
 		return -EINVAL;
 	if (*end == '\n')
 		end++;
+	task = get_proc_task(file->f_dentry->d_inode);
+	if (!task)
+		return -ESRCH;
 	task->oomkilladj = oom_adjust;
+	put_task_struct(task);
 	if (end - buffer == 0)
 		return -EIO;
 	return end - buffer;
@@ -862,12 +928,15 @@ static ssize_t proc_loginuid_read(struct file * file, char __user * buf,
 				  size_t count, loff_t *ppos)
 {
 	struct inode * inode = file->f_dentry->d_inode;
-	struct task_struct *task = proc_task(inode);
+	struct task_struct *task = get_proc_task(inode);
 	ssize_t length;
 	char tmpbuf[TMPBUFLEN];
 
+	if (!task)
+		return -ESRCH;
 	length = scnprintf(tmpbuf, TMPBUFLEN, "%u",
 				audit_get_loginuid(task->audit_context));
+	put_task_struct(task);
 	return simple_read_from_buffer(buf, count, ppos, tmpbuf, length);
 }
 
@@ -877,13 +946,12 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
 	struct inode * inode = file->f_dentry->d_inode;
 	char *page, *tmp;
 	ssize_t length;
-	struct task_struct *task = proc_task(inode);
 	uid_t loginuid;
 
 	if (!capable(CAP_AUDIT_CONTROL))
 		return -EPERM;
 
-	if (current != task)
+	if (current != proc_tref(inode)->task)
 		return -EPERM;
 
 	if (count >= PAGE_SIZE)
@@ -907,7 +975,7 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
 		goto out_free_page;
 
 	}
-	length = audit_set_loginuid(task, loginuid);
+	length = audit_set_loginuid(current, loginuid);
 	if (likely(length == 0))
 		length = count;
 
@@ -926,13 +994,16 @@ static struct file_operations proc_loginuid_operations = {
 static ssize_t seccomp_read(struct file *file, char __user *buf,
 			    size_t count, loff_t *ppos)
 {
-	struct task_struct *tsk = proc_task(file->f_dentry->d_inode);
+	struct task_struct *tsk = get_proc_task(file->f_dentry->d_inode);
 	char __buf[20];
 	loff_t __ppos = *ppos;
 	size_t len;
 
+	if (!tsk)
+		return -ESRCH;
 	/* no need to print the trailing zero, so use only len */
 	len = sprintf(__buf, "%u\n", tsk->seccomp.mode);
+	put_task_struct(tsk);
 	if (__ppos >= len)
 		return 0;
 	if (count > len - __ppos)
@@ -946,29 +1017,43 @@ static ssize_t seccomp_read(struct file *file, char __user *buf,
 static ssize_t seccomp_write(struct file *file, const char __user *buf,
 			     size_t count, loff_t *ppos)
 {
-	struct task_struct *tsk = proc_task(file->f_dentry->d_inode);
+	struct task_struct *tsk = get_proc_task(file->f_dentry->d_inode);
 	char __buf[20], *end;
 	unsigned int seccomp_mode;
+	ssize_t result;
+
+	result = -ESRCH;
+	if (!tsk)
+		goto out_no_task;
 
 	/* can set it only once to be even more secure */
+	result = -EPERM;
 	if (unlikely(tsk->seccomp.mode))
-		return -EPERM;
+		goto out;
 
+	result = -EFAULT;
 	memset(__buf, 0, sizeof(__buf));
 	count = min(count, sizeof(__buf) - 1);
 	if (copy_from_user(__buf, buf, count))
-		return -EFAULT;
+		goto out;
+
 	seccomp_mode = simple_strtoul(__buf, &end, 0);
 	if (*end == '\n')
 		end++;
+	result = -EINVAL;
 	if (seccomp_mode && seccomp_mode <= NR_SECCOMP_MODES) {
 		tsk->seccomp.mode = seccomp_mode;
 		set_tsk_thread_flag(tsk, TIF_SECCOMP);
 	} else
-		return -EINVAL;
+		goto out;
+	result = -EIO;
 	if (unlikely(!(end - __buf)))
-		return -EIO;
-	return end - __buf;
+		goto out;
+	result = end - __buf;
+out:
+	put_task_struct(tsk);
+out_no_task:
+	return result;
 }
 
 static struct file_operations proc_seccomp_operations = {
@@ -995,7 +1080,7 @@ static int proc_check_dentry_visible(struct inode *inode,
 	/* See if the the two tasks share a commone set of
 	 * file descriptors.  If so everything is visible.
 	 */
-	task = proc_task(inode);
+	task = get_proc_task(inode);
 	if (!task)
 		goto out;
 	files = get_files_struct(current);
@@ -1006,6 +1091,7 @@ static int proc_check_dentry_visible(struct inode *inode,
 		put_files_struct(task_files);
 	if (files)
 		put_files_struct(files);
+	put_task_struct(task);
 	if (!error)
 		goto out;
 
@@ -1106,7 +1192,7 @@ static int proc_readfd(struct file * filp, void * dirent, filldir_t filldir)
 {
 	struct dentry *dentry = filp->f_dentry;
 	struct inode *inode = dentry->d_inode;
-	struct task_struct *p = proc_task(inode);
+	struct task_struct *p = get_proc_task(inode);
 	unsigned int fd, tid, ino;
 	int retval;
 	char buf[PROC_NUMBUF];
@@ -1114,8 +1200,8 @@ static int proc_readfd(struct file * filp, void * dirent, filldir_t filldir)
 	struct fdtable *fdt;
 
 	retval = -ENOENT;
-	if (!pid_alive(p))
-		goto out;
+	if (!p)
+		goto out_no_task;
 	retval = 0;
 	tid = p->pid;
 
@@ -1164,6 +1250,8 @@ static int proc_readfd(struct file * filp, void * dirent, filldir_t filldir)
 			put_files_struct(files);
 	}
 out:
+	put_task_struct(p);
+out_no_task:
 	return retval;
 }
 
@@ -1175,16 +1263,18 @@ static int proc_pident_readdir(struct file *filp,
 	int pid;
 	struct dentry *dentry = filp->f_dentry;
 	struct inode *inode = dentry->d_inode;
+	struct task_struct *task = get_proc_task(inode);
 	struct pid_entry *p;
 	ino_t ino;
 	int ret;
 
 	ret = -ENOENT;
-	if (!pid_alive(proc_task(inode)))
+	if (!task)
 		goto out;
 
 	ret = 0;
-	pid = proc_task(inode)->pid;
+	pid = task->pid;
+	put_task_struct(task);
 	i = filp->f_pos;
 	switch (i) {
 	case 0:
@@ -1270,14 +1360,13 @@ static struct inode *proc_pid_make_inode(struct super_block * sb, struct task_st
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
 	inode->i_ino = fake_ino(task->pid, ino);
 
-	if (!pid_alive(task))
-		goto out_unlock;
-
 	/*
 	 * grab the reference to task.
 	 */
-	get_task_struct(task);
-	ei->task = task;
+	ei->tref = tref_get_by_task(task);
+	if (!tref_task(ei->tref))
+		goto out_unlock;
+
 	inode->i_uid = 0;
 	inode->i_gid = 0;
 	if (task_dumpable(task)) {
@@ -1303,13 +1392,21 @@ out_unlock:
  *
  * Rewrite the inode's ownerships here because the owning task may have
  * performed a setuid(), etc.
+ *
+ * Before the /proc/pid/status file was created the only way to read
+ * the effective uid of a /process was to stat /proc/pid.  Reading
+ * /proc/pid/status is slow enough that procps and other packages
+ * kept stating /proc/pid.  To keep the rules in /proc simple I have
+ * made this apply to all per process world readable and executable
+ * directories.
  */
 static int pid_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
 	struct inode *inode = dentry->d_inode;
-	struct task_struct *task = proc_task(inode);
-	if (pid_alive(task)) {
-		if (task_dumpable(task)) {
+	struct task_struct *task = get_proc_task(inode);
+	if (task) {
+		if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) ||
+		    task_dumpable(task)) {
 			inode->i_uid = task->euid;
 			inode->i_gid = task->egid;
 		} else {
@@ -1317,37 +1414,63 @@ static int pid_revalidate(struct dentry *dentry, struct nameidata *nd)
 			inode->i_gid = 0;
 		}
 		security_task_to_inode(task, inode);
+		put_task_struct(task);
 		return 1;
 	}
 	d_drop(dentry);
 	return 0;
 }
 
+static int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
+{
+	struct inode *inode = dentry->d_inode;
+	struct task_struct *task;
+	generic_fillattr(inode, stat);
+
+	rcu_read_lock();
+	stat->uid = 0;
+	stat->gid = 0;
+	task = pid_task(proc_pid(inode), PIDTYPE_PID);
+	if (task) {
+		if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) ||
+		    task_dumpable(task)) {
+			stat->uid = task->euid;
+			stat->gid = task->egid;
+		}
+	}
+	rcu_read_unlock();
+	return 0;
+}
+
 static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
 	struct inode *inode = dentry->d_inode;
-	struct task_struct *task = proc_task(inode);
+	struct task_struct *task = get_proc_task(inode);
 	int fd = proc_fd(inode);
 	struct files_struct *files;
 
-	files = get_files_struct(task);
-	if (files) {
-		rcu_read_lock();
-		if (fcheck_files(files, fd)) {
+	if (task) {
+		files = get_files_struct(task);
+		if (files) {
+			rcu_read_lock();
+			if (fcheck_files(files, fd)) {
+				rcu_read_unlock();
+				put_files_struct(files);
+				if (task_dumpable(task)) {
+					inode->i_uid = task->euid;
+					inode->i_gid = task->egid;
+				} else {
+					inode->i_uid = 0;
+					inode->i_gid = 0;
+				}
+				security_task_to_inode(task, inode);
+				put_task_struct(task);
+				return 1;
+			}
 			rcu_read_unlock();
 			put_files_struct(files);
-			if (task_dumpable(task)) {
-				inode->i_uid = task->euid;
-				inode->i_gid = task->egid;
-			} else {
-				inode->i_uid = 0;
-				inode->i_gid = 0;
-			}
-			security_task_to_inode(task, inode);
-			return 1;
 		}
-		rcu_read_unlock();
-		put_files_struct(files);
+		put_task_struct(task);
 	}
 	d_drop(dentry);
 	return 0;
@@ -1359,7 +1482,7 @@ static int pid_delete_dentry(struct dentry * dentry)
 	 * If so, then don't put the dentry on the lru list,
 	 * kill it immediately.
 	 */
-	return !pid_alive(proc_task(dentry->d_inode));
+	return !proc_tref(dentry->d_inode)->task;
 }
 
 static struct dentry_operations tid_fd_dentry_operations =
@@ -1401,7 +1524,7 @@ out:
 /* SMP-safe */
 static struct dentry *proc_lookupfd(struct inode * dir, struct dentry * dentry, struct nameidata *nd)
 {
-	struct task_struct *task = proc_task(dir);
+	struct task_struct *task = get_proc_task(dir);
 	unsigned fd = name_to_int(dentry);
 	struct dentry *result = ERR_PTR(-ENOENT);
 	struct file * file;
@@ -1409,10 +1532,10 @@ static struct dentry *proc_lookupfd(struct inode * dir, struct dentry * dentry,
 	struct inode *inode;
 	struct proc_inode *ei;
 
+	if (!task)
+		goto out_no_task;
 	if (fd == ~0U)
 		goto out;
-	if (!pid_alive(task))
-		goto out;
 
 	inode = proc_pid_make_inode(dir->i_sb, task, PROC_TID_FD_DIR+fd);
 	if (!inode)
@@ -1447,6 +1570,8 @@ static struct dentry *proc_lookupfd(struct inode * dir, struct dentry * dentry,
 	if (tid_fd_revalidate(dentry, NULL))
 		result = NULL;
 out:
+	put_task_struct(task);
+out_no_task:
 	return result;
 
 out_unlock2:
@@ -1490,12 +1615,17 @@ static ssize_t proc_pid_attr_read(struct file * file, char __user * buf,
 	struct inode * inode = file->f_dentry->d_inode;
 	unsigned long page;
 	ssize_t length;
-	struct task_struct *task = proc_task(inode);
+	struct task_struct *task = get_proc_task(inode);
+
+	length = -ESRCH;
+	if (!task)
+		goto out_no_task;
 
 	if (count > PAGE_SIZE)
 		count = PAGE_SIZE;
+	length = -ENOMEM;
 	if (!(page = __get_free_page(GFP_KERNEL)))
-		return -ENOMEM;
+		goto out;
 
 	length = security_getprocattr(task, 
 				      (char*)file->f_dentry->d_name.name, 
@@ -1503,6 +1633,9 @@ static ssize_t proc_pid_attr_read(struct file * file, char __user * buf,
 	if (length >= 0)
 		length = simple_read_from_buffer(buf, count, ppos, (char *)page, length);
 	free_page(page);
+out:
+	put_task_struct(task);
+out_no_task:
 	return length;
 }
 
@@ -1512,26 +1645,36 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf,
 	struct inode * inode = file->f_dentry->d_inode;
 	char *page; 
 	ssize_t length; 
-	struct task_struct *task = proc_task(inode); 
+	struct task_struct *task = get_proc_task(inode);
 
+	length = -ESRCH;
+	if (!task)
+		goto out_no_task;
 	if (count > PAGE_SIZE) 
 		count = PAGE_SIZE; 
-	if (*ppos != 0) {
-		/* No partial writes. */
-		return -EINVAL;
-	}
+
+	/* No partial writes. */
+	length = -EINVAL;
+	if (*ppos != 0)
+		goto out;
+
+	length = -ENOMEM;
 	page = (char*)__get_free_page(GFP_USER); 
 	if (!page) 
-		return -ENOMEM;
+		goto out;
+
 	length = -EFAULT; 
 	if (copy_from_user(page, buf, count)) 
-		goto out;
+		goto out_free;
 
 	length = security_setprocattr(task, 
 				      (char*)file->f_dentry->d_name.name, 
 				      (void*)page, count);
-out:
+out_free:
 	free_page((unsigned long) page);
+out:
+	put_task_struct(task);
+out_no_task:
 	return length;
 } 
 
@@ -1553,15 +1696,15 @@ static struct dentry *proc_pident_lookup(struct inode *dir,
 {
 	struct inode *inode;
 	struct dentry *error;
-	struct task_struct *task = proc_task(dir);
+	struct task_struct *task = get_proc_task(dir);
 	struct pid_entry *p;
 	struct proc_inode *ei;
 
 	error = ERR_PTR(-ENOENT);
 	inode = NULL;
 
-	if (!pid_alive(task))
-		goto out;
+	if (!task)
+		goto out_no_task;
 
 	for (p = ents; p->name; p++) {
 		if (p->len != dentry->d_name.len)
@@ -1748,6 +1891,8 @@ static struct dentry *proc_pident_lookup(struct inode *dir,
 	if (pid_revalidate(dentry, NULL))
 		error = NULL;
 out:
+	put_task_struct(task);
+out_no_task:
 	return error;
 }
 
@@ -1771,10 +1916,12 @@ static struct file_operations proc_tid_base_operations = {
 
 static struct inode_operations proc_tgid_base_inode_operations = {
 	.lookup		= proc_tgid_base_lookup,
+	.getattr	= pid_getattr,
 };
 
 static struct inode_operations proc_tid_base_inode_operations = {
 	.lookup		= proc_tid_base_lookup,
+	.getattr	= pid_getattr,
 };
 
 #ifdef CONFIG_SECURITY
@@ -1816,10 +1963,12 @@ static struct dentry *proc_tid_attr_lookup(struct inode *dir,
 
 static struct inode_operations proc_tgid_attr_inode_operations = {
 	.lookup		= proc_tgid_attr_lookup,
+	.getattr	= pid_getattr,
 };
 
 static struct inode_operations proc_tid_attr_inode_operations = {
 	.lookup		= proc_tid_attr_lookup,
+	.getattr	= pid_getattr,
 };
 #endif
 
@@ -1981,10 +2130,13 @@ static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry
 {
 	struct dentry *result = ERR_PTR(-ENOENT);
 	struct task_struct *task;
-	struct task_struct *leader = proc_task(dir);
+	struct task_struct *leader = get_proc_task(dir);
 	struct inode *inode;
 	unsigned tid;
 
+	if (!leader)
+		goto out_no_task;
+
 	tid = name_to_int(dentry);
 	if (tid == ~0U)
 		goto out;
@@ -2024,6 +2176,8 @@ static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry
 out_drop_task:
 	put_task_struct(task);
 out:
+	put_task_struct(leader);
+out_no_task:
 	return result;
 }
 
@@ -2163,12 +2317,7 @@ static struct task_struct *first_tid(struct task_struct *leader, int tid, int nr
 
 	/* If nr exceeds the number of threads there is nothing todo */
 	if (nr) {
-		int threads = 0;
-		task_lock(leader);
-		if (leader->signal)
-			threads = atomic_read(&leader->signal->count);
-		task_unlock(leader);
-		if (nr >= threads)
+		if (nr >= get_nr_threads(leader))
 			goto done;
 	}
 
@@ -2218,15 +2367,15 @@ static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldi
 	char buf[PROC_NUMBUF];
 	struct dentry *dentry = filp->f_dentry;
 	struct inode *inode = dentry->d_inode;
-	struct task_struct *leader = proc_task(inode);
+	struct task_struct *leader = get_proc_task(inode);
 	struct task_struct *task;
 	int retval = -ENOENT;
 	ino_t ino;
 	int tid;
 	unsigned long pos = filp->f_pos;  /* avoiding "long long" filp->f_pos */
 
-	if (!pid_alive(leader))
-		goto out;
+	if (!leader)
+		goto out_no_task;
 	retval = 0;
 
 	switch (pos) {
@@ -2266,20 +2415,22 @@ static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldi
 	}
 out:
 	filp->f_pos = pos;
+	put_task_struct(leader);
+out_no_task:
 	return retval;
 }
 
 static int proc_task_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
 {
 	struct inode *inode = dentry->d_inode;
-	struct task_struct *p = proc_task(inode);
+	struct task_struct *p = get_proc_task(inode);
 	generic_fillattr(inode, stat);
 
-	if (pid_alive(p)) {
-		task_lock(p);
-		if (p->signal)
-			stat->nlink += atomic_read(&p->signal->count);
-		task_unlock(p);
+	if (p) {
+		rcu_read_lock();
+		stat->nlink += get_nr_threads(p);
+		rcu_read_unlock();
+		put_task_struct(p);
 	}
 
 	return 0;
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index fbc94df138a7..31e0475c6cb9 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -58,14 +58,11 @@ static void de_put(struct proc_dir_entry *de)
 static void proc_delete_inode(struct inode *inode)
 {
 	struct proc_dir_entry *de;
-	struct task_struct *tsk;
 
 	truncate_inode_pages(&inode->i_data, 0);
 
-	/* Let go of any associated process */
-	tsk = PROC_I(inode)->task;
-	if (tsk)
-		put_task_struct(tsk);
+	/* Stop tracking associated processes */
+	tref_put(PROC_I(inode)->tref);
 
 	/* Let go of any associated proc directory entry */
 	de = PROC_I(inode)->pde;
@@ -94,7 +91,7 @@ static struct inode *proc_alloc_inode(struct super_block *sb)
 	ei = (struct proc_inode *)kmem_cache_alloc(proc_inode_cachep, SLAB_KERNEL);
 	if (!ei)
 		return NULL;
-	ei->task = NULL;
+	ei->tref = NULL;
 	ei->fd = 0;
 	ei->op.proc_get_link = NULL;
 	ei->pde = NULL;
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 548e7447ea47..37f1648adc23 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -10,6 +10,7 @@
  */
 
 #include <linux/proc_fs.h>
+#include <linux/task_ref.h>
 
 struct vmalloc_info {
 	unsigned long	used;
@@ -41,13 +42,23 @@ extern struct file_operations proc_maps_operations;
 extern struct file_operations proc_numa_maps_operations;
 extern struct file_operations proc_smaps_operations;
 
+extern struct file_operations proc_maps_operations;
+extern struct file_operations proc_numa_maps_operations;
+extern struct file_operations proc_smaps_operations;
+
+
 void free_proc_entry(struct proc_dir_entry *de);
 
 int proc_init_inodecache(void);
 
-static inline struct task_struct *proc_task(struct inode *inode)
+static inline struct task_ref *proc_tref(struct inode *inode)
+{
+	return PROC_I(inode)->tref;
+}
+
+static inline struct task_struct *get_proc_task(struct inode *inode)
 {
-	return PROC_I(inode)->task;
+	return get_tref_task(proc_tref(inode));
 }
 
 static inline int proc_fd(struct inode *inode)
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 4187b4e9cdb3..abf3208c3f60 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -75,9 +75,13 @@ int proc_exe_link(struct inode *inode, struct dentry **dentry, struct vfsmount *
 {
 	struct vm_area_struct * vma;
 	int result = -ENOENT;
-	struct task_struct *task = proc_task(inode);
-	struct mm_struct * mm = get_task_mm(task);
+	struct task_struct *task = get_proc_task(inode);
+	struct mm_struct * mm = NULL;
 
+	if (task) {
+		mm = get_task_mm(task);
+		put_task_struct(task);
+	}
 	if (!mm)
 		goto out;
 	down_read(&mm->mmap_sem);
@@ -120,7 +124,8 @@ struct mem_size_stats
 
 static int show_map_internal(struct seq_file *m, void *v, struct mem_size_stats *mss)
 {
-	struct task_struct *task = m->private;
+	struct proc_maps_private *priv = m->private;
+	struct task_struct *task = priv->task;
 	struct vm_area_struct *vma = v;
 	struct mm_struct *mm = vma->vm_mm;
 	struct file *file = vma->vm_file;
@@ -295,12 +300,16 @@ static int show_smap(struct seq_file *m, void *v)
 
 static void *m_start(struct seq_file *m, loff_t *pos)
 {
-	struct task_struct *task = m->private;
+	struct proc_maps_private *priv = m->private;
 	unsigned long last_addr = m->version;
 	struct mm_struct *mm;
-	struct vm_area_struct *vma, *tail_vma;
+	struct vm_area_struct *vma, *tail_vma = NULL;
 	loff_t l = *pos;
 
+	/* Clear the per syscall fields in priv */
+	priv->task = NULL;
+	priv->tail_vma = NULL;
+
 	/*
 	 * We remember last_addr rather than next_addr to hit with
 	 * mmap_cache most of the time. We have zero last_addr at
@@ -311,11 +320,15 @@ static void *m_start(struct seq_file *m, loff_t *pos)
 	if (last_addr == -1UL)
 		return NULL;
 
-	mm = get_task_mm(task);
+	priv->task = get_tref_task(priv->tref);
+	if (!priv->task)
+		return NULL;
+
+	mm = get_task_mm(priv->task);
 	if (!mm)
 		return NULL;
 
-	tail_vma = get_gate_vma(task);
+	priv->tail_vma = tail_vma = get_gate_vma(priv->task);
 	down_read(&mm->mmap_sem);
 
 	/* Start with last addr hint */
@@ -350,11 +363,9 @@ out:
 	return tail_vma;
 }
 
-static void m_stop(struct seq_file *m, void *v)
+static void vma_stop(struct proc_maps_private *priv, struct vm_area_struct *vma)
 {
-	struct task_struct *task = m->private;
-	struct vm_area_struct *vma = v;
-	if (vma && vma != get_gate_vma(task)) {
+	if (vma && vma != priv->tail_vma) {
 		struct mm_struct *mm = vma->vm_mm;
 		up_read(&mm->mmap_sem);
 		mmput(mm);
@@ -363,17 +374,27 @@ static void m_stop(struct seq_file *m, void *v)
 
 static void *m_next(struct seq_file *m, void *v, loff_t *pos)
 {
-	struct task_struct *task = m->private;
+	struct proc_maps_private *priv = m->private;
 	struct vm_area_struct *vma = v;
-	struct vm_area_struct *tail_vma = get_gate_vma(task);
+	struct vm_area_struct *tail_vma = priv->tail_vma;
 
 	(*pos)++;
 	if (vma && (vma != tail_vma) && vma->vm_next)
 		return vma->vm_next;
-	m_stop(m, v);
+	vma_stop(priv, vma);
 	return (vma != tail_vma)? tail_vma: NULL;
 }
 
+static void m_stop(struct seq_file *m, void *v)
+{
+	struct proc_maps_private *priv = m->private;
+	struct vm_area_struct *vma = v;
+
+	vma_stop(priv, vma);
+	if (priv->task)
+		put_task_struct(priv->task);
+}
+
 static struct seq_operations proc_pid_maps_op = {
 	.start	= m_start,
 	.next	= m_next,
@@ -391,11 +412,18 @@ static struct seq_operations proc_pid_smaps_op = {
 static int do_maps_open(struct inode *inode, struct file *file,
 			struct seq_operations *ops)
 {
-	struct task_struct *task = proc_task(inode);
-	int ret = seq_open(file, ops);
-	if (!ret) {
-		struct seq_file *m = file->private_data;
-		m->private = task;
+	struct proc_maps_private *priv;
+	int ret = -ENOMEM;
+	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+	if (priv) {
+		priv->tref = proc_tref(inode);
+		ret = seq_open(file, ops);
+		if (!ret) {
+			struct seq_file *m = file->private_data;
+			m->private = priv;
+		} else {
+			kfree(priv);
+		}
 	}
 	return ret;
 }
@@ -409,7 +437,7 @@ struct file_operations proc_maps_operations = {
 	.open		= maps_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
-	.release	= seq_release,
+	.release	= seq_release_private,
 };
 
 #ifdef CONFIG_NUMA
@@ -431,7 +459,7 @@ struct file_operations proc_numa_maps_operations = {
 	.open		= numa_maps_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
-	.release	= seq_release,
+	.release	= seq_release_private,
 };
 #endif
 
@@ -444,5 +472,5 @@ struct file_operations proc_smaps_operations = {
 	.open		= smaps_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
-	.release	= seq_release,
+	.release	= seq_release_private,
 };
diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
index d4d2081dbaf7..4c7271f04697 100644
--- a/include/linux/proc_fs.h
+++ b/include/linux/proc_fs.h
@@ -246,7 +246,7 @@ extern void kclist_add(struct kcore_list *, void *, size_t);
 #endif
 
 struct proc_inode {
-	struct task_struct *task;
+	struct task_ref *tref;
 	int fd;
 	union {
 		int (*proc_get_link)(struct inode *, struct dentry **, struct vfsmount **);
@@ -266,4 +266,10 @@ static inline struct proc_dir_entry *PDE(const struct inode *inode)
 	return PROC_I(inode)->pde;
 }
 
+struct proc_maps_private {
+	struct task_ref *tref;
+	struct task_struct *task;
+	struct vm_area_struct *tail_vma;
+};
+
 #endif /* _LINUX_PROC_FS_H */
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index b602f73fb38d..3e991c0c02e2 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -50,6 +50,7 @@
 #include <linux/time.h>
 #include <linux/backing-dev.h>
 #include <linux/sort.h>
+#include <linux/task_ref.h>
 
 #include <asm/uaccess.h>
 #include <asm/atomic.h>
@@ -2442,31 +2443,43 @@ void __cpuset_memory_pressure_bump(void)
  */
 static int proc_cpuset_show(struct seq_file *m, void *v)
 {
+	struct task_ref *tref;
 	struct task_struct *tsk;
 	char *buf;
-	int retval = 0;
+	int retval;
 
+	retval = -ENOMEM;
 	buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
 	if (!buf)
-		return -ENOMEM;
+		goto out;
+
+	retval = -ESRCH;
+	tref = m->private;
+	tsk = get_tref_task(tref);
+	if (!tsk)
+		goto out_free;
 
-	tsk = m->private;
+	retval = -EINVAL;
 	mutex_lock(&manage_mutex);
+
 	retval = cpuset_path(tsk->cpuset, buf, PAGE_SIZE);
 	if (retval < 0)
-		goto out;
+		goto out_unlock;
 	seq_puts(m, buf);
 	seq_putc(m, '\n');
-out:
+out_unlock:
 	mutex_unlock(&manage_mutex);
+	put_task_struct(tsk);
+out_free:
 	kfree(buf);
+out:
 	return retval;
 }
 
 static int cpuset_open(struct inode *inode, struct file *file)
 {
-	struct task_struct *tsk = PROC_I(inode)->task;
-	return single_open(file, proc_cpuset_show, tsk);
+	struct task_ref *tref = PROC_I(inode)->tref;
+	return single_open(file, proc_cpuset_show, tref);
 }
 
 struct file_operations proc_cpuset_operations = {
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 73e0f23b7f51..6b9740bbf4c0 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1821,7 +1821,7 @@ static inline void check_huge_range(struct vm_area_struct *vma,
 
 int show_numa_map(struct seq_file *m, void *v)
 {
-	struct task_struct *task = m->private;
+	struct proc_maps_private *priv = m->private;
 	struct vm_area_struct *vma = v;
 	struct numa_maps *md;
 	struct file *file = vma->vm_file;
@@ -1837,7 +1837,7 @@ int show_numa_map(struct seq_file *m, void *v)
 		return 0;
 
 	mpol_to_str(buffer, sizeof(buffer),
-			get_vma_policy(task, vma, vma->vm_start));
+			    get_vma_policy(priv->task, vma, vma->vm_start));
 
 	seq_printf(m, "%08lx %s", vma->vm_start, buffer);
 
@@ -1891,7 +1891,7 @@ out:
 	kfree(md);
 
 	if (m->count < m->size)
-		m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0;
+		m->version = (vma != priv->tail_vma) ? vma->vm_start : 0;
 	return 0;
 }
 
-- 
cgit v1.2.3


From 13b41b09491e5d75e8027dca1ee78f5e073bc4c0 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Mon, 26 Jun 2006 00:25:56 -0700
Subject: [PATCH] proc: Use struct pid not struct task_ref

Incrementally update my proc-dont-lock-task_structs-indefinitely patches so
that they work with struct pid instead of struct task_ref.

Mostly this is a straight 1-1 substitution.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/proc/base.c          |  8 ++++----
 fs/proc/inode.c         |  4 ++--
 fs/proc/internal.h      |  7 +++----
 fs/proc/task_mmu.c      |  4 ++--
 include/linux/proc_fs.h |  4 ++--
 kernel/cpuset.c         | 11 +++++------
 6 files changed, 18 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 489810abc72d..c7f855441573 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -951,7 +951,7 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
 	if (!capable(CAP_AUDIT_CONTROL))
 		return -EPERM;
 
-	if (current != proc_tref(inode)->task)
+	if (current != pid_task(proc_pid(inode), PIDTYPE_PID))
 		return -EPERM;
 
 	if (count >= PAGE_SIZE)
@@ -1363,8 +1363,8 @@ static struct inode *proc_pid_make_inode(struct super_block * sb, struct task_st
 	/*
 	 * grab the reference to task.
 	 */
-	ei->tref = tref_get_by_task(task);
-	if (!tref_task(ei->tref))
+	ei->pid = get_pid(task->pids[PIDTYPE_PID].pid);
+	if (!ei->pid)
 		goto out_unlock;
 
 	inode->i_uid = 0;
@@ -1482,7 +1482,7 @@ static int pid_delete_dentry(struct dentry * dentry)
 	 * If so, then don't put the dentry on the lru list,
 	 * kill it immediately.
 	 */
-	return !proc_tref(dentry->d_inode)->task;
+	return !proc_pid(dentry->d_inode)->tasks[PIDTYPE_PID].first;
 }
 
 static struct dentry_operations tid_fd_dentry_operations =
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 31e0475c6cb9..6dcef089e18e 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -62,7 +62,7 @@ static void proc_delete_inode(struct inode *inode)
 	truncate_inode_pages(&inode->i_data, 0);
 
 	/* Stop tracking associated processes */
-	tref_put(PROC_I(inode)->tref);
+	put_pid(PROC_I(inode)->pid);
 
 	/* Let go of any associated proc directory entry */
 	de = PROC_I(inode)->pde;
@@ -91,7 +91,7 @@ static struct inode *proc_alloc_inode(struct super_block *sb)
 	ei = (struct proc_inode *)kmem_cache_alloc(proc_inode_cachep, SLAB_KERNEL);
 	if (!ei)
 		return NULL;
-	ei->tref = NULL;
+	ei->pid = NULL;
 	ei->fd = 0;
 	ei->op.proc_get_link = NULL;
 	ei->pde = NULL;
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 37f1648adc23..146a434ba944 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -10,7 +10,6 @@
  */
 
 #include <linux/proc_fs.h>
-#include <linux/task_ref.h>
 
 struct vmalloc_info {
 	unsigned long	used;
@@ -51,14 +50,14 @@ void free_proc_entry(struct proc_dir_entry *de);
 
 int proc_init_inodecache(void);
 
-static inline struct task_ref *proc_tref(struct inode *inode)
+static inline struct pid *proc_pid(struct inode *inode)
 {
-	return PROC_I(inode)->tref;
+	return PROC_I(inode)->pid;
 }
 
 static inline struct task_struct *get_proc_task(struct inode *inode)
 {
-	return get_tref_task(proc_tref(inode));
+	return get_pid_task(proc_pid(inode), PIDTYPE_PID);
 }
 
 static inline int proc_fd(struct inode *inode)
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index abf3208c3f60..0137ec4c1368 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -320,7 +320,7 @@ static void *m_start(struct seq_file *m, loff_t *pos)
 	if (last_addr == -1UL)
 		return NULL;
 
-	priv->task = get_tref_task(priv->tref);
+	priv->task = get_pid_task(priv->pid, PIDTYPE_PID);
 	if (!priv->task)
 		return NULL;
 
@@ -416,7 +416,7 @@ static int do_maps_open(struct inode *inode, struct file *file,
 	int ret = -ENOMEM;
 	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
 	if (priv) {
-		priv->tref = proc_tref(inode);
+		priv->pid = proc_pid(inode);
 		ret = seq_open(file, ops);
 		if (!ret) {
 			struct seq_file *m = file->private_data;
diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
index 4c7271f04697..17e75783e3a5 100644
--- a/include/linux/proc_fs.h
+++ b/include/linux/proc_fs.h
@@ -246,7 +246,7 @@ extern void kclist_add(struct kcore_list *, void *, size_t);
 #endif
 
 struct proc_inode {
-	struct task_ref *tref;
+	struct pid *pid;
 	int fd;
 	union {
 		int (*proc_get_link)(struct inode *, struct dentry **, struct vfsmount **);
@@ -267,7 +267,7 @@ static inline struct proc_dir_entry *PDE(const struct inode *inode)
 }
 
 struct proc_maps_private {
-	struct task_ref *tref;
+	struct pid *pid;
 	struct task_struct *task;
 	struct vm_area_struct *tail_vma;
 };
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 3e991c0c02e2..1535af3a912d 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -50,7 +50,6 @@
 #include <linux/time.h>
 #include <linux/backing-dev.h>
 #include <linux/sort.h>
-#include <linux/task_ref.h>
 
 #include <asm/uaccess.h>
 #include <asm/atomic.h>
@@ -2443,7 +2442,7 @@ void __cpuset_memory_pressure_bump(void)
  */
 static int proc_cpuset_show(struct seq_file *m, void *v)
 {
-	struct task_ref *tref;
+	struct pid *pid;
 	struct task_struct *tsk;
 	char *buf;
 	int retval;
@@ -2454,8 +2453,8 @@ static int proc_cpuset_show(struct seq_file *m, void *v)
 		goto out;
 
 	retval = -ESRCH;
-	tref = m->private;
-	tsk = get_tref_task(tref);
+	pid = m->private;
+	tsk = get_pid_task(pid, PIDTYPE_PID);
 	if (!tsk)
 		goto out_free;
 
@@ -2478,8 +2477,8 @@ out:
 
 static int cpuset_open(struct inode *inode, struct file *file)
 {
-	struct task_ref *tref = PROC_I(inode)->tref;
-	return single_open(file, proc_cpuset_show, tref);
+	struct pid *pid = PROC_I(inode)->pid;
+	return single_open(file, proc_cpuset_show, pid);
 }
 
 struct file_operations proc_cpuset_operations = {
-- 
cgit v1.2.3


From d5f70c00ad24cd1158d3678b44ff969b4c971d49 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Mon, 26 Jun 2006 00:26:07 -0700
Subject: [PATCH] coredump: kill ptrace related stuff

With this patch zap_process() sets SIGNAL_GROUP_EXIT while sending SIGKILL to
the thread group.  This means that a TASK_TRACED task

	1. Will be awakened by signal_wake_up(1)

	2. Can't sleep again via ptrace_notify()

	3. Can't go to do_signal_stop() after return
	   from ptrace_stop() in get_signal_to_deliver()

So we can remove all ptrace related stuff from coredump path.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/exec.c              | 30 +++++-------------------------
 include/linux/ptrace.h |  1 -
 kernel/ptrace.c        |  3 ++-
 kernel/signal.c        | 35 ++++++++++++++++++++++++++++++-----
 4 files changed, 37 insertions(+), 32 deletions(-)

(limited to 'include/linux')

diff --git a/fs/exec.c b/fs/exec.c
index a5c51646d1ad..b58ba7d127e0 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1368,12 +1368,14 @@ static void format_corename(char *corename, const char *pattern, long signr)
 	*out_ptr = 0;
 }
 
-static void zap_process(struct task_struct *start, int *ptraced)
+static void zap_process(struct task_struct *start)
 {
 	struct task_struct *t;
 	unsigned long flags;
 
 	spin_lock_irqsave(&start->sighand->siglock, flags);
+	start->signal->flags = SIGNAL_GROUP_EXIT;
+	start->signal->group_stop_count = 0;
 
 	t = start;
 	do {
@@ -1381,22 +1383,17 @@ static void zap_process(struct task_struct *start, int *ptraced)
 			t->mm->core_waiters++;
 			sigaddset(&t->pending.signal, SIGKILL);
 			signal_wake_up(t, 1);
-
-			if (unlikely(t->ptrace) &&
-			    unlikely(t->parent->mm == t->mm))
-				*ptraced = 1;
 		}
 	} while ((t = next_thread(t)) != start);
 
 	spin_unlock_irqrestore(&start->sighand->siglock, flags);
 }
 
-static void zap_threads (struct mm_struct *mm)
+static void zap_threads(struct mm_struct *mm)
 {
 	struct task_struct *g, *p;
 	struct task_struct *tsk = current;
 	struct completion *vfork_done = tsk->vfork_done;
-	int traced = 0;
 
 	/*
 	 * Make sure nobody is waiting for us to release the VM,
@@ -1413,29 +1410,12 @@ static void zap_threads (struct mm_struct *mm)
 		do {
 			if (p->mm) {
 				if (p->mm == mm)
-					zap_process(p, &traced);
+					zap_process(p);
 				break;
 			}
 		} while ((p = next_thread(p)) != g);
 	}
 	read_unlock(&tasklist_lock);
-
-	if (unlikely(traced)) {
-		/*
-		 * We are zapping a thread and the thread it ptraces.
-		 * If the tracee went into a ptrace stop for exit tracing,
-		 * we could deadlock since the tracer is waiting for this
-		 * coredump to finish.  Detach them so they can both die.
-		 */
-		write_lock_irq(&tasklist_lock);
-		do_each_thread(g,p) {
-			if (mm == p->mm && p != tsk &&
-			    p->ptrace && p->parent->mm == mm) {
-				__ptrace_detach(p, 0);
-			}
-		} while_each_thread(g,p);
-		write_unlock_irq(&tasklist_lock);
-	}
 }
 
 static void coredump_wait(struct mm_struct *mm)
diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h
index ee918bc6e18c..8b2749a259dc 100644
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -88,7 +88,6 @@ extern int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __us
 extern int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long dst, int len);
 extern int ptrace_attach(struct task_struct *tsk);
 extern int ptrace_detach(struct task_struct *, unsigned int);
-extern void __ptrace_detach(struct task_struct *, unsigned int);
 extern void ptrace_disable(struct task_struct *);
 extern int ptrace_check_attach(struct task_struct *task, int kill);
 extern int ptrace_request(struct task_struct *child, long request, long addr, long data);
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 6252d2fa2bf3..335c5b932e14 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -214,7 +214,7 @@ out:
 	return retval;
 }
 
-void __ptrace_detach(struct task_struct *child, unsigned int data)
+static inline void __ptrace_detach(struct task_struct *child, unsigned int data)
 {
 	child->exit_code = data;
 	/* .. re-parent .. */
@@ -233,6 +233,7 @@ int ptrace_detach(struct task_struct *child, unsigned int data)
 	ptrace_disable(child);
 
 	write_lock_irq(&tasklist_lock);
+	/* protect against de_thread()->release_task() */
 	if (child->ptrace)
 		__ptrace_detach(child, data);
 	write_unlock_irq(&tasklist_lock);
diff --git a/kernel/signal.c b/kernel/signal.c
index 1b3c921737e2..52adf53929f6 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1531,6 +1531,35 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, int why)
 	spin_unlock_irqrestore(&sighand->siglock, flags);
 }
 
+static inline int may_ptrace_stop(void)
+{
+	if (!likely(current->ptrace & PT_PTRACED))
+		return 0;
+
+	if (unlikely(current->parent == current->real_parent &&
+		    (current->ptrace & PT_ATTACHED)))
+		return 0;
+
+	if (unlikely(current->signal == current->parent->signal) &&
+	    unlikely(current->signal->flags & SIGNAL_GROUP_EXIT))
+		return 0;
+
+	/*
+	 * Are we in the middle of do_coredump?
+	 * If so and our tracer is also part of the coredump stopping
+	 * is a deadlock situation, and pointless because our tracer
+	 * is dead so don't allow us to stop.
+	 * If SIGKILL was already sent before the caller unlocked
+	 * ->siglock we must see ->core_waiters != 0. Otherwise it
+	 * is safe to enter schedule().
+	 */
+	if (unlikely(current->mm->core_waiters) &&
+	    unlikely(current->mm == current->parent->mm))
+		return 0;
+
+	return 1;
+}
+
 /*
  * This must be called with current->sighand->siglock held.
  *
@@ -1559,11 +1588,7 @@ static void ptrace_stop(int exit_code, int nostop_code, siginfo_t *info)
 	spin_unlock_irq(&current->sighand->siglock);
 	try_to_freeze();
 	read_lock(&tasklist_lock);
-	if (likely(current->ptrace & PT_PTRACED) &&
-	    likely(current->parent != current->real_parent ||
-		   !(current->ptrace & PT_ATTACHED)) &&
-	    (likely(current->parent->signal != current->signal) ||
-	     !unlikely(current->signal->flags & SIGNAL_GROUP_EXIT))) {
+	if (may_ptrace_stop()) {
 		do_notify_parent_cldstop(current, CLD_TRAPPED);
 		read_unlock(&tasklist_lock);
 		schedule();
-- 
cgit v1.2.3


From 9a17917671d407d37bf23a527aa55acca3cb4735 Mon Sep 17 00:00:00 2001
From: "Antonino A. Daplas" <adaplas@gmail.com>
Date: Mon, 26 Jun 2006 00:27:05 -0700
Subject: [PATCH] Detaching fbcon: sdd sysfs class device entry for fbcon

In order for this feature to work, an interface will be needed.  The most
appropriate is sysfs.  However, the framebuffer console has no sysfs entry
yet.  This will create a sysfs class device entry for fbcon under
/sys/class/graphics.

Add a class_device entry 'fbcon' under class 'graphics'.  Console-specific
attributes which where previously under class/graphics/fb[x] are moved to
class/graphics/fbcon.  These attributes, 'con_rotate' and 'con_rotate_all',
are also renamed to 'rotate' and 'rotate_all' respectively.

Signed-off-by: Antonino Daplas <adaplas@pol.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/video/Makefile        |   8 +--
 drivers/video/console/fbcon.c | 113 +++++++++++++++++++++++++++++++++++++++---
 drivers/video/console/fbcon.h |   1 +
 drivers/video/fbmem.c         |  25 +---------
 drivers/video/fbsysfs.c       |  41 ---------------
 include/linux/fb.h            |   7 ---
 6 files changed, 112 insertions(+), 83 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/video/Makefile b/drivers/video/Makefile
index b7dfee209cd0..c335e9bc3b20 100644
--- a/drivers/video/Makefile
+++ b/drivers/video/Makefile
@@ -4,15 +4,15 @@
 
 # Each configuration option enables a list of files.
 
-obj-$(CONFIG_VT)		  += console/
-obj-$(CONFIG_LOGO)		  += logo/
-obj-$(CONFIG_SYSFS)		  += backlight/
-
 obj-$(CONFIG_FB)                  += fb.o
 fb-y                              := fbmem.o fbmon.o fbcmap.o fbsysfs.o \
                                      modedb.o fbcvt.o
 fb-objs                           := $(fb-y)
 
+obj-$(CONFIG_VT)		  += console/
+obj-$(CONFIG_LOGO)		  += logo/
+obj-$(CONFIG_SYSFS)		  += backlight/
+
 obj-$(CONFIG_FB_CFB_FILLRECT)  += cfbfillrect.o
 obj-$(CONFIG_FB_CFB_COPYAREA)  += cfbcopyarea.o
 obj-$(CONFIG_FB_CFB_IMAGEBLIT) += cfbimgblt.o
diff --git a/drivers/video/console/fbcon.c b/drivers/video/console/fbcon.c
index 47ba1a79adcd..746225bf8c44 100644
--- a/drivers/video/console/fbcon.c
+++ b/drivers/video/console/fbcon.c
@@ -195,6 +195,8 @@ static void fbcon_redraw_move(struct vc_data *vc, struct display *p,
 static void fbcon_modechanged(struct fb_info *info);
 static void fbcon_set_all_vcs(struct fb_info *info);
 
+static struct class_device *fbcon_class_device;
+
 #ifdef CONFIG_MAC
 /*
  * On the Macintoy, there may or may not be a working VBL int. We need to probe
@@ -2945,14 +2947,6 @@ static int fbcon_event_notify(struct notifier_block *self,
 	case FB_EVENT_NEW_MODELIST:
 		fbcon_new_modelist(info);
 		break;
-	case FB_EVENT_SET_CON_ROTATE:
-		fbcon_rotate(info, *(int *)event->data);
-		break;
-	case FB_EVENT_GET_CON_ROTATE:
-		ret = fbcon_get_rotate(info);
-		break;
-	case FB_EVENT_SET_CON_ROTATE_ALL:
-		fbcon_rotate_all(info, *(int *)event->data);
 	}
 
 	return ret;
@@ -2992,6 +2986,81 @@ static struct notifier_block fbcon_event_notifier = {
 	.notifier_call	= fbcon_event_notify,
 };
 
+static ssize_t store_rotate(struct class_device *class_device,
+			    const char *buf, size_t count)
+{
+	struct fb_info *info;
+	int rotate, idx;
+	char **last = NULL;
+
+	acquire_console_sem();
+	idx = con2fb_map[fg_console];
+
+	if (idx == -1 || registered_fb[idx] == NULL)
+		goto err;
+
+	info = registered_fb[idx];
+	rotate = simple_strtoul(buf, last, 0);
+	fbcon_rotate(info, rotate);
+err:
+	release_console_sem();
+	return count;
+}
+
+static ssize_t store_rotate_all(struct class_device *class_device,
+				const char *buf, size_t count)
+{
+	struct fb_info *info;
+	int rotate, idx;
+	char **last = NULL;
+
+	acquire_console_sem();
+	idx = con2fb_map[fg_console];
+
+	if (idx == -1 || registered_fb[idx] == NULL)
+		goto err;
+
+	info = registered_fb[idx];
+	rotate = simple_strtoul(buf, last, 0);
+	fbcon_rotate_all(info, rotate);
+err:
+	release_console_sem();
+	return count;
+}
+
+static ssize_t show_rotate(struct class_device *class_device, char *buf)
+{
+	struct fb_info *info;
+	int rotate = 0, idx;
+
+	acquire_console_sem();
+	idx = con2fb_map[fg_console];
+
+	if (idx == -1 || registered_fb[idx] == NULL)
+		goto err;
+
+	info = registered_fb[idx];
+	rotate = fbcon_get_rotate(info);
+err:
+	release_console_sem();
+	return snprintf(buf, PAGE_SIZE, "%d\n", rotate);
+}
+
+static struct class_device_attribute class_device_attrs[] = {
+	__ATTR(rotate, S_IRUGO|S_IWUSR, show_rotate, store_rotate),
+	__ATTR(rotate_all, S_IWUSR, NULL, store_rotate_all),
+};
+
+static int fbcon_init_class_device(void)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(class_device_attrs); i++)
+		class_device_create_file(fbcon_class_device,
+					 &class_device_attrs[i]);
+	return 0;
+}
+
 static int __init fb_console_init(void)
 {
 	int i;
@@ -3000,6 +3069,18 @@ static int __init fb_console_init(void)
 	fb_register_client(&fbcon_event_notifier);
 	release_console_sem();
 
+	fbcon_class_device =
+	    class_device_create(fb_class, NULL,
+				MKDEV(FB_MAJOR, FB_MAX), NULL,
+				"fbcon");
+	if (IS_ERR(fbcon_class_device)) {
+	    printk(KERN_WARNING "Unable to create class_device "
+		   "for fbcon; errno = %ld\n",
+		   PTR_ERR(fbcon_class_device));
+	    fbcon_class_device = NULL;
+	} else
+	    fbcon_init_class_device();
+
 	for (i = 0; i < MAX_NR_CONSOLES; i++)
 		con2fb_map[i] = -1;
 
@@ -3020,10 +3101,26 @@ module_init(fb_console_init);
 
 #ifdef MODULE
 
+static void __exit fbcon_deinit_class_device(void)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(class_device_attrs); i++)
+		class_device_remove_file(fbcon_class_device,
+					 &class_device_attrs[i]);
+}
+
+static void __exit fbcon_exit(void)
+{
+	fbcon_deinit_class_device();
+	class_device_destroy(fb_class, MKDEV(FB_MAJOR, FB_MAX));
+}
+
 static void __exit fb_console_exit(void)
 {
 	acquire_console_sem();
 	fb_unregister_client(&fbcon_event_notifier);
+	fbcon_exit();
 	release_console_sem();
 	give_up_console(&fb_con);
 }	
diff --git a/drivers/video/console/fbcon.h b/drivers/video/console/fbcon.h
index c38c3d8e7a74..3487a636370a 100644
--- a/drivers/video/console/fbcon.h
+++ b/drivers/video/console/fbcon.h
@@ -175,6 +175,7 @@ extern void fbcon_set_tileops(struct vc_data *vc, struct fb_info *info);
 #endif
 extern void fbcon_set_bitops(struct fbcon_ops *ops);
 extern int  soft_cursor(struct fb_info *info, struct fb_cursor *cursor);
+extern struct class *fb_class;
 
 #define FBCON_ATTRIBUTE_UNDERLINE 1
 #define FBCON_ATTRIBUTE_REVERSE   2
diff --git a/drivers/video/fbmem.c b/drivers/video/fbmem.c
index e9af5e61018d..a2102a543ee7 100644
--- a/drivers/video/fbmem.c
+++ b/drivers/video/fbmem.c
@@ -1276,8 +1276,8 @@ static struct file_operations fb_fops = {
 #endif
 };
 
-static struct class *fb_class;
-
+struct class *fb_class;
+EXPORT_SYMBOL(fb_class);
 /**
  *	register_framebuffer - registers a frame buffer device
  *	@fb_info: frame buffer info structure
@@ -1489,27 +1489,6 @@ int fb_new_modelist(struct fb_info *info)
 	return err;
 }
 
-/**
- * fb_con_duit - user<->fbcon passthrough
- * @info: struct fb_info
- * @event: notification event to be passed to fbcon
- * @data: private data
- *
- * DESCRIPTION
- * This function is an fbcon-user event passing channel
- * which bypasses fbdev.  This is hopefully temporary
- * until a user interface for fbcon is created
- */
-int fb_con_duit(struct fb_info *info, int event, void *data)
-{
-	struct fb_event evnt;
-
-	evnt.info = info;
-	evnt.data = data;
-
-	return blocking_notifier_call_chain(&fb_notifier_list, event, &evnt);
-}
-
 static char *video_options[FB_MAX];
 static int ofonly;
 
diff --git a/drivers/video/fbsysfs.c b/drivers/video/fbsysfs.c
index 6de02189abbe..4f78f234473d 100644
--- a/drivers/video/fbsysfs.c
+++ b/drivers/video/fbsysfs.c
@@ -247,45 +247,6 @@ static ssize_t show_rotate(struct class_device *class_device, char *buf)
 	return snprintf(buf, PAGE_SIZE, "%d\n", fb_info->var.rotate);
 }
 
-static ssize_t store_con_rotate(struct class_device *class_device,
-				const char *buf, size_t count)
-{
-	struct fb_info *fb_info = class_get_devdata(class_device);
-	int rotate;
-	char **last = NULL;
-
-	acquire_console_sem();
-	rotate = simple_strtoul(buf, last, 0);
-	fb_con_duit(fb_info, FB_EVENT_SET_CON_ROTATE, &rotate);
-	release_console_sem();
-	return count;
-}
-
-static ssize_t store_con_rotate_all(struct class_device *class_device,
-				const char *buf, size_t count)
-{
-	struct fb_info *fb_info = class_get_devdata(class_device);
-	int rotate;
-	char **last = NULL;
-
-	acquire_console_sem();
-	rotate = simple_strtoul(buf, last, 0);
-	fb_con_duit(fb_info, FB_EVENT_SET_CON_ROTATE_ALL, &rotate);
-	release_console_sem();
-	return count;
-}
-
-static ssize_t show_con_rotate(struct class_device *class_device, char *buf)
-{
-	struct fb_info *fb_info = class_get_devdata(class_device);
-	int rotate;
-
-	acquire_console_sem();
-	rotate = fb_con_duit(fb_info, FB_EVENT_GET_CON_ROTATE, NULL);
-	release_console_sem();
-	return snprintf(buf, PAGE_SIZE, "%d\n", rotate);
-}
-
 static ssize_t store_virtual(struct class_device *class_device,
 			     const char * buf, size_t count)
 {
@@ -502,8 +463,6 @@ static struct class_device_attribute class_device_attrs[] = {
 	__ATTR(name, S_IRUGO, show_name, NULL),
 	__ATTR(stride, S_IRUGO, show_stride, NULL),
 	__ATTR(rotate, S_IRUGO|S_IWUSR, show_rotate, store_rotate),
-	__ATTR(con_rotate, S_IRUGO|S_IWUSR, show_con_rotate, store_con_rotate),
-	__ATTR(con_rotate_all, S_IWUSR, NULL, store_con_rotate_all),
 	__ATTR(state, S_IRUGO|S_IWUSR, show_fbstate, store_fbstate),
 #ifdef CONFIG_FB_BACKLIGHT
 	__ATTR(bl_curve, S_IRUGO|S_IWUSR, show_bl_curve, store_bl_curve),
diff --git a/include/linux/fb.h b/include/linux/fb.h
index f1281687e549..c64f25255286 100644
--- a/include/linux/fb.h
+++ b/include/linux/fb.h
@@ -515,12 +515,6 @@ struct fb_cursor_user {
 /*	The resolution of the passed in fb_info about to change and
         all vc's should be changed         */
 #define FB_EVENT_MODE_CHANGE_ALL	0x0A
-/*      CONSOLE-SPECIFIC: set console rotation */
-#define FB_EVENT_SET_CON_ROTATE         0x0B
-/*      CONSOLE-SPECIFIC: get console rotation */
-#define FB_EVENT_GET_CON_ROTATE         0x0C
-/*      CONSOLE-SPECIFIC: rotate all consoles */
-#define FB_EVENT_SET_CON_ROTATE_ALL     0x0D
 
 struct fb_event {
 	struct fb_info *info;
@@ -892,7 +886,6 @@ extern int fb_get_color_depth(struct fb_var_screeninfo *var,
 			      struct fb_fix_screeninfo *fix);
 extern int fb_get_options(char *name, char **option);
 extern int fb_new_modelist(struct fb_info *info);
-extern int fb_con_duit(struct fb_info *info, int event, void *data);
 
 extern struct fb_info *registered_fb[FB_MAX];
 extern int num_registered_fb;
-- 
cgit v1.2.3


From 3e795de7631b2366d7301182c8d91f6d2911467b Mon Sep 17 00:00:00 2001
From: "Antonino A. Daplas" <adaplas@gmail.com>
Date: Mon, 26 Jun 2006 00:27:08 -0700
Subject: [PATCH] VT binding: Add binding/unbinding support for the VT console

The framebuffer console is now able to dynamically bind and unbind from the VT
console layer.  Due to the way the VT console layer works, the drivers
themselves decide when to bind or unbind.  However, it was decided that
binding must be controlled, not by the drivers themselves, but by the VT
console layer.  With this, dynamic binding is possible for all VT console
drivers, not just fbcon.

Thus, the VT console layer will impose the following to all VT console
drivers:

- all registered VT console drivers will be entered in a private list
- drivers can register themselves to the VT console layer, but they cannot
  decide when to bind or unbind. (Exception: To maintain backwards
  compatibility, take_over_console() will automatically bind the driver after
  registration.)
- drivers can remove themselves from the list by unregistering from the VT
  console layer. A prerequisite for unregistration is that the driver must not
  be bound.

The following functions are new in the vt.c:

register_con_driver() - public function, this function adds the VT console
driver to an internal list maintained by the VT console

bind_con_driver() - private function, it binds the driver to the console

take_over_console() is changed to call register_con_driver() followed by a
bind_con_driver().  This is the only time drivers can decide when to bind to
the VT layer.  This is to maintain backwards compatibility.

unbind_con_driver() - private function, it unbinds the driver from its
console.  The vacated consoles will be taken over by the default boot console
driver.

unregister_con_driver() - public function, removes the driver from the
internal list maintained by the VT console.  It will only succeed if the
driver is currently unbound.

con_is_bound() checks if the driver is currently bound or not

give_up_console() is just a wrapper to unregister_con_driver().

There are also 3 additional functions meant to be called only by the tty layer
for sysfs control:

	vt_bind() - calls bind_con_driver()
	vt_unbind() - calls unbind_con_driver()
	vt_show_drivers() - shows the list of registered drivers

Most VT console drivers will continue to work as is, but might have problems
when unbinding or binding which should be fixable with minimal changes.

Signed-off-by: Antonino Daplas <adaplas@pol.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/char/vt.c       | 467 ++++++++++++++++++++++++++++++++++++++++++++----
 include/linux/console.h |   4 +-
 2 files changed, 437 insertions(+), 34 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/char/vt.c b/drivers/char/vt.c
index d7ff7fdb6fe3..d788a0eaf26a 100644
--- a/drivers/char/vt.c
+++ b/drivers/char/vt.c
@@ -98,9 +98,21 @@
 #include <asm/system.h>
 #include <asm/uaccess.h>
 
+#define MAX_NR_CON_DRIVER 16
 
+#define CON_DRIVER_FLAG_BIND 1
+#define CON_DRIVER_FLAG_INIT 2
+
+struct con_driver {
+	const struct consw *con;
+	const char *desc;
+	int first;
+	int last;
+	int flag;
+};
+
+static struct con_driver registered_con_driver[MAX_NR_CON_DRIVER];
 const struct consw *conswitchp;
-static struct consw *defcsw; /* default console */
 
 /* A bitmap for codes <32. A bit of 1 indicates that the code
  * corresponding to that bit number invokes some special action
@@ -2558,7 +2570,7 @@ static int __init con_init(void)
 {
 	const char *display_desc = NULL;
 	struct vc_data *vc;
-	unsigned int currcons = 0;
+	unsigned int currcons = 0, i;
 
 	acquire_console_sem();
 
@@ -2570,6 +2582,22 @@ static int __init con_init(void)
 		return 0;
 	}
 
+	for (i = 0; i < MAX_NR_CON_DRIVER; i++) {
+		struct con_driver *con_driver = &registered_con_driver[i];
+
+		if (con_driver->con == NULL) {
+			con_driver->con = conswitchp;
+			con_driver->desc = display_desc;
+			con_driver->flag = CON_DRIVER_FLAG_INIT;
+			con_driver->first = 0;
+			con_driver->last = MAX_NR_CONSOLES - 1;
+			break;
+		}
+	}
+
+	for (i = 0; i < MAX_NR_CONSOLES; i++)
+		con_driver_map[i] = conswitchp;
+
 	init_timer(&console_timer);
 	console_timer.function = blank_screen_t;
 	if (blankinterval) {
@@ -2658,33 +2686,36 @@ int __init vty_init(void)
 
 #ifndef VT_SINGLE_DRIVER
 
-/*
- *	If we support more console drivers, this function is used
- *	when a driver wants to take over some existing consoles
- *	and become default driver for newly opened ones.
- */
-
-int take_over_console(const struct consw *csw, int first, int last, int deflt)
+static int bind_con_driver(const struct consw *csw, int first, int last,
+			   int deflt)
 {
-	int i, j = -1, k = -1;
-	const char *desc;
-	struct module *owner;
+	struct module *owner = csw->owner;
+	const char *desc = NULL;
+	struct con_driver *con_driver;
+	int i, j = -1, k = -1, retval = -ENODEV;
 
-	owner = csw->owner;
 	if (!try_module_get(owner))
 		return -ENODEV;
 
-	/* save default console, for possible recovery later on */
-	if (!defcsw)
-		defcsw = (struct consw *) conswitchp;
-
 	acquire_console_sem();
-	desc = csw->con_startup();
 
-	if (!desc) {
-		release_console_sem();
-		module_put(owner);
-		return -ENODEV;
+	/* check if driver is registered */
+	for (i = 0; i < MAX_NR_CON_DRIVER; i++) {
+		con_driver = &registered_con_driver[i];
+
+		if (con_driver->con == csw) {
+			desc = con_driver->desc;
+			retval = 0;
+			break;
+		}
+	}
+
+	if (retval)
+		goto err;
+
+	if (!(con_driver->flag & CON_DRIVER_FLAG_INIT)) {
+		csw->con_startup();
+		con_driver->flag |= CON_DRIVER_FLAG_INIT;
 	}
 
 	if (deflt) {
@@ -2695,6 +2726,9 @@ int take_over_console(const struct consw *csw, int first, int last, int deflt)
 		conswitchp = csw;
 	}
 
+	first = max(first, con_driver->first);
+	last = min(last, con_driver->last);
+
 	for (i = first; i <= last; i++) {
 		int old_was_color;
 		struct vc_data *vc = vc_cons[i].d;
@@ -2746,32 +2780,399 @@ int take_over_console(const struct consw *csw, int first, int last, int deflt)
 	} else
 		printk("to %s\n", desc);
 
+	retval = 0;
+err:
 	release_console_sem();
 	module_put(owner);
-	return 0;
-}
+	return retval;
+};
 
-void give_up_console(const struct consw *csw)
+static int unbind_con_driver(const struct consw *csw, int first, int last,
+			     int deflt)
 {
-	int i, first = -1, last = -1, deflt = 0;
+	struct module *owner = csw->owner;
+	const struct consw *defcsw = NULL;
+	struct con_driver *con_driver = NULL, *con_back = NULL;
+	int i, retval = -ENODEV;
 
-	for (i = 0; i < MAX_NR_CONSOLES; i++)
+	if (!try_module_get(owner))
+		return -ENODEV;
+
+	acquire_console_sem();
+
+	/* check if driver is registered and if it is unbindable */
+	for (i = 0; i < MAX_NR_CON_DRIVER; i++) {
+		con_driver = &registered_con_driver[i];
+
+		if (con_driver->con == csw &&
+		    con_driver->flag & CON_DRIVER_FLAG_BIND) {
+			retval = 0;
+			break;
+		}
+	}
+
+	if (retval) {
+		release_console_sem();
+		goto err;
+	}
+
+	/* check if backup driver exists */
+	for (i = 0; i < MAX_NR_CON_DRIVER; i++) {
+		con_back = &registered_con_driver[i];
+
+		if (con_back->con &&
+		    !(con_back->flag & CON_DRIVER_FLAG_BIND)) {
+			defcsw = con_back->con;
+			retval = 0;
+			break;
+		}
+	}
+
+	if (retval) {
+		release_console_sem();
+		goto err;
+	}
+
+	if (!con_is_bound(csw)) {
+		release_console_sem();
+		goto err;
+	}
+
+	first = max(first, con_driver->first);
+	last = min(last, con_driver->last);
+
+	for (i = first; i <= last; i++) {
 		if (con_driver_map[i] == csw) {
-			if (first == -1)
-				first = i;
-			last = i;
 			module_put(csw->owner);
 			con_driver_map[i] = NULL;
 		}
+	}
+
+	if (!con_is_bound(defcsw)) {
+		defcsw->con_startup();
+		con_back->flag |= CON_DRIVER_FLAG_INIT;
+	}
+
+	if (!con_is_bound(csw))
+		con_driver->flag &= ~CON_DRIVER_FLAG_INIT;
+
+	release_console_sem();
+	retval = bind_con_driver(defcsw, first, last, deflt);
+err:
+	module_put(owner);
+	return retval;
+
+}
+
+/**
+ * con_is_bound - checks if driver is bound to the console
+ * @csw: console driver
+ *
+ * RETURNS: zero if unbound, nonzero if bound
+ *
+ * Drivers can call this and if zero, they should release
+ * all resources allocated on con_startup()
+ */
+int con_is_bound(const struct consw *csw)
+{
+	int i, bound = 0;
+
+	for (i = 0; i < MAX_NR_CONSOLES; i++) {
+		if (con_driver_map[i] == csw) {
+			bound = 1;
+			break;
+		}
+	}
+
+	return bound;
+}
+EXPORT_SYMBOL(con_is_bound);
+
+/**
+ * register_con_driver - register console driver to console layer
+ * @csw: console driver
+ * @first: the first console to take over, minimum value is 0
+ * @last: the last console to take over, maximum value is MAX_NR_CONSOLES -1
+ *
+ * DESCRIPTION: This function registers a console driver which can later
+ * bind to a range of consoles specified by @first and @last. It will
+ * also initialize the console driver by calling con_startup().
+ */
+int register_con_driver(const struct consw *csw, int first, int last)
+{
+	struct module *owner = csw->owner;
+	struct con_driver *con_driver;
+	const char *desc;
+	int i, retval = 0;
+
+	if (!try_module_get(owner))
+		return -ENODEV;
+
+	acquire_console_sem();
+
+	for (i = 0; i < MAX_NR_CON_DRIVER; i++) {
+		con_driver = &registered_con_driver[i];
+
+		/* already registered */
+		if (con_driver->con == csw)
+			retval = -EINVAL;
+	}
+
+	if (retval)
+		goto err;
+
+	desc = csw->con_startup();
+
+	if (!desc)
+		goto err;
+
+	retval = -EINVAL;
+
+	for (i = 0; i < MAX_NR_CON_DRIVER; i++) {
+		con_driver = &registered_con_driver[i];
+
+		if (con_driver->con == NULL) {
+			con_driver->con = csw;
+			con_driver->desc = desc;
+			con_driver->flag = CON_DRIVER_FLAG_BIND |
+			                   CON_DRIVER_FLAG_INIT;
+			con_driver->first = first;
+			con_driver->last = last;
+			retval = 0;
+			break;
+		}
+	}
+
+err:
+	release_console_sem();
+	module_put(owner);
+	return retval;
+}
+EXPORT_SYMBOL(register_con_driver);
+
+/**
+ * unregister_con_driver - unregister console driver from console layer
+ * @csw: console driver
+ *
+ * DESCRIPTION: All drivers that registers to the console layer must
+ * call this function upon exit, or if the console driver is in a state
+ * where it won't be able to handle console services, such as the
+ * framebuffer console without loaded framebuffer drivers.
+ *
+ * The driver must unbind first prior to unregistration.
+ */
+int unregister_con_driver(const struct consw *csw)
+{
+	int i, retval = -ENODEV;
+
+	acquire_console_sem();
+
+	/* cannot unregister a bound driver */
+	if (con_is_bound(csw))
+		goto err;
+
+	for (i = 0; i < MAX_NR_CON_DRIVER; i++) {
+		struct con_driver *con_driver = &registered_con_driver[i];
+
+		if (con_driver->con == csw &&
+		    con_driver->flag & CON_DRIVER_FLAG_BIND) {
+			con_driver->con = NULL;
+			con_driver->desc = NULL;
+			con_driver->flag = 0;
+			con_driver->first = 0;
+			con_driver->last = 0;
+			retval = 0;
+			break;
+		}
+	}
+
+err:
+	release_console_sem();
+	return retval;
+}
+EXPORT_SYMBOL(unregister_con_driver);
+
+/*
+ *	If we support more console drivers, this function is used
+ *	when a driver wants to take over some existing consoles
+ *	and become default driver for newly opened ones.
+ *
+ *      take_over_console is basically a register followed by unbind
+ */
+int take_over_console(const struct consw *csw, int first, int last, int deflt)
+{
+	int err;
+
+	err = register_con_driver(csw, first, last);
+
+	if (!err)
+		err = bind_con_driver(csw, first, last, deflt);
+
+	return err;
+}
+
+/*
+ * give_up_console is a wrapper to unregister_con_driver. It will only
+ * work if driver is fully unbound.
+ */
+void give_up_console(const struct consw *csw)
+{
+	unregister_con_driver(csw);
+}
+
+/*
+ * this function is intended to be called by the tty layer only
+ */
+int vt_bind(int index)
+{
+	const struct consw *defcsw = NULL, *csw = NULL;
+	struct con_driver *con;
+	int i, more = 1, first = -1, last = -1, deflt = 0;
+
+	if (index >= MAX_NR_CON_DRIVER)
+		goto err;
+
+	con = &registered_con_driver[index];
+
+ 	if (!con->con || !(con->flag & CON_DRIVER_FLAG_BIND))
+		goto err;
+
+	csw = con->con;
+
+	for (i = 0; i < MAX_NR_CON_DRIVER; i++) {
+		struct con_driver *con = &registered_con_driver[i];
+
+		if (con->con && !(con->flag & CON_DRIVER_FLAG_BIND)) {
+			defcsw = con->con;
+			break;
+		}
+	}
 
-	if (first != -1 && defcsw) {
-		if (first == 0 && last == MAX_NR_CONSOLES - 1)
+	if (!defcsw)
+		goto err;
+
+	while (more) {
+		more = 0;
+
+		for (i = con->first; i <= con->last; i++) {
+			if (con_driver_map[i] == defcsw) {
+				if (first == -1)
+					first = i;
+				last = i;
+				more = 1;
+			} else if (first != -1)
+				break;
+		}
+
+		if (first == 0 && last == MAX_NR_CONSOLES -1)
 			deflt = 1;
-		take_over_console(defcsw, first, last, deflt);
+
+		if (first != -1)
+			bind_con_driver(csw, first, last, deflt);
+
+		first = -1;
+		last = -1;
+		deflt = 0;
 	}
+
+err:
+	return 0;
+}
+
+/*
+ * this function is intended to be called by the tty layer only
+ */
+int vt_unbind(int index)
+{
+	const struct consw *csw = NULL;
+	struct con_driver *con;
+	int i, more = 1, first = -1, last = -1, deflt = 0;
+
+	if (index >= MAX_NR_CON_DRIVER)
+		goto err;
+
+	con = &registered_con_driver[index];
+
+ 	if (!con->con || !(con->flag & CON_DRIVER_FLAG_BIND))
+		goto err;
+
+	csw = con->con;
+
+	while (more) {
+		more = 0;
+
+		for (i = con->first; i <= con->last; i++) {
+			if (con_driver_map[i] == csw) {
+				if (first == -1)
+					first = i;
+				last = i;
+				more = 1;
+			} else if (first != -1)
+				break;
+		}
+
+		if (first == 0 && last == MAX_NR_CONSOLES -1)
+			deflt = 1;
+
+		if (first != -1)
+			unbind_con_driver(csw, first, last, deflt);
+
+		first = -1;
+		last = -1;
+		deflt = 0;
+	}
+
+err:
+	return 0;
+}
+#else
+int vt_bind(int index)
+{
+	return 0;
+}
+
+int vt_unbind(int index)
+{
+	return 0;
 }
 #endif
 
+/*
+ * this function is intended to be called by the tty layer only
+ */
+int vt_show_drivers(char *buf)
+{
+	int i, j, read, offset = 0, cnt = PAGE_SIZE;
+
+	for (i = 0; i < MAX_NR_CON_DRIVER; i++) {
+		struct con_driver *con_driver = &registered_con_driver[i];
+
+		if (con_driver->con != NULL) {
+			int sys = 0;
+
+			if (con_driver->flag & CON_DRIVER_FLAG_BIND) {
+				sys = 2;
+
+				for (j = 0; j < MAX_NR_CONSOLES; j++) {
+					if (con_driver_map[j] ==
+					    con_driver->con) {
+						sys = 1;
+						break;
+					}
+				}
+			}
+
+			read = snprintf(buf + offset, cnt, "%i %s: %s\n",
+					i, (sys) ? ((sys == 1) ? "B" : "U") :
+					"S", con_driver->desc);
+			offset += read;
+			cnt -= read;
+		}
+	}
+
+	return offset;
+}
+
 /*
  *	Screen blanking
  */
diff --git a/include/linux/console.h b/include/linux/console.h
index d0f8a8009490..3bdf2155e565 100644
--- a/include/linux/console.h
+++ b/include/linux/console.h
@@ -63,9 +63,11 @@ extern const struct consw vga_con;	/* VGA text console */
 extern const struct consw newport_con;	/* SGI Newport console  */
 extern const struct consw prom_con;	/* SPARC PROM console */
 
+int con_is_bound(const struct consw *csw);
+int register_con_driver(const struct consw *csw, int first, int last);
+int unregister_con_driver(const struct consw *csw);
 int take_over_console(const struct consw *sw, int first, int last, int deflt);
 void give_up_console(const struct consw *sw);
-
 /* scroll */
 #define SM_UP       (1)
 #define SM_DOWN     (2)
-- 
cgit v1.2.3


From e614b18dcedb247ce6f848e623cdf2336df2b476 Mon Sep 17 00:00:00 2001
From: "Antonino A. Daplas" <adaplas@gmail.com>
Date: Mon, 26 Jun 2006 00:27:09 -0700
Subject: [PATCH] VT binding: Update fbcon to support binding

The control for binding/unbinding is moved from fbcon to the console layer.
Thus the fbcon sysfs attributes, attach and detach, are also gone.

    1. Add a notifier event that tells fbcon if a framebuffer driver has been
       unregistered.  If no registered driver remains, fbcon will unregister
       itself from the console layer.

    2. Replaced calls to give_up_console() with unregister_con_driver().

    3. Still use take_over_console() instead of register_con_driver() to
       maintain compatibility

    4. Respect the parameter first_fb_vc and last_fb_vc instead of using 0 and
       MAX_NR_CONSOLES - 1. These parameters are settable by the user.

    5. When fbcon is completely unbound from the console layer, fbcon will
       also release (iow, decrement module reference counts to zero) all fbdev
       drivers. In other words, a bind or unbind request from the console layer
       will propagate down to the framebuffer drivers.

    6. If fbcon is not bound to the console, it will ignore all notifier
       events (except driver registration and unregistration) and all sysfs
       requests.

Signed-off-by: Antonino Daplas <adaplas@pol.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/video/console/fbcon.c | 204 +++++++++++++++++++++++++++++-------------
 drivers/video/fbmem.c         |   7 +-
 include/linux/fb.h            |  12 +--
 3 files changed, 157 insertions(+), 66 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/video/console/fbcon.c b/drivers/video/console/fbcon.c
index 4b7be685c160..26effab5a13e 100644
--- a/drivers/video/console/fbcon.c
+++ b/drivers/video/console/fbcon.c
@@ -125,6 +125,8 @@ static int softback_lines;
 static int first_fb_vc;
 static int last_fb_vc = MAX_NR_CONSOLES - 1;
 static int fbcon_is_default = 1; 
+static int fbcon_has_exited;
+
 /* font data */
 static char fontname[40];
 
@@ -140,7 +142,6 @@ static const struct consw fb_con;
 
 #define advance_row(p, delta) (unsigned short *)((unsigned long)(p) + (delta) * vc->vc_size_row)
 
-static void fbcon_free_font(struct display *);
 static int fbcon_set_origin(struct vc_data *);
 
 #define CURSOR_DRAW_DELAY		(1)
@@ -255,7 +256,7 @@ static void fbcon_rotate_all(struct fb_info *info, u32 rotate)
 	if (!ops || ops->currcon < 0 || rotate > 3)
 		return;
 
-	for (i = 0; i < MAX_NR_CONSOLES; i++) {
+	for (i = first_fb_vc; i <= last_fb_vc; i++) {
 		vc = vc_cons[i].d;
 		if (!vc || vc->vc_mode != KD_TEXT ||
 		    registered_fb[con2fb_map[i]] != info)
@@ -534,7 +535,7 @@ static int search_fb_in_map(int idx)
 {
 	int i, retval = 0;
 
-	for (i = 0; i < MAX_NR_CONSOLES; i++) {
+	for (i = first_fb_vc; i <= last_fb_vc; i++) {
 		if (con2fb_map[i] == idx)
 			retval = 1;
 	}
@@ -545,7 +546,7 @@ static int search_for_mapped_con(void)
 {
 	int i, retval = 0;
 
-	for (i = 0; i < MAX_NR_CONSOLES; i++) {
+	for (i = first_fb_vc; i <= last_fb_vc; i++) {
 		if (con2fb_map[i] != -1)
 			retval = 1;
 	}
@@ -567,6 +568,7 @@ static int fbcon_takeover(int show_logo)
 
 	err = take_over_console(&fb_con, first_fb_vc, last_fb_vc,
 				fbcon_is_default);
+
 	if (err) {
 		for (i = first_fb_vc; i <= last_fb_vc; i++) {
 			con2fb_map[i] = -1;
@@ -801,8 +803,8 @@ static int set_con2fb_map(int unit, int newidx, int user)
 	if (oldidx == newidx)
 		return 0;
 
-	if (!info)
- 		err =  -EINVAL;
+	if (!info || fbcon_has_exited)
+		return -EINVAL;
 
  	if (!err && !search_for_mapped_con()) {
 		info_idx = newidx;
@@ -838,6 +840,9 @@ static int set_con2fb_map(int unit, int newidx, int user)
  		con2fb_init_display(vc, info, unit, show_logo);
 	}
 
+	if (!search_fb_in_map(info_idx))
+		info_idx = newidx;
+
 	release_console_sem();
  	return err;
 }
@@ -1040,6 +1045,7 @@ static const char *fbcon_startup(void)
 #endif				/* CONFIG_MAC */
 
 	fbcon_add_cursor_timer(info);
+	fbcon_has_exited = 0;
 	return display_desc;
 }
 
@@ -1067,17 +1073,36 @@ static void fbcon_init(struct vc_data *vc, int init)
 
 	/* If we are not the first console on this
 	   fb, copy the font from that console */
-	t = &fb_display[svc->vc_num];
-	if (!vc->vc_font.data) {
-		vc->vc_font.data = (void *)(p->fontdata = t->fontdata);
-		vc->vc_font.width = (*default_mode)->vc_font.width;
-		vc->vc_font.height = (*default_mode)->vc_font.height;
-		p->userfont = t->userfont;
-		if (p->userfont)
-			REFCOUNT(p->fontdata)++;
+	t = &fb_display[fg_console];
+	if (!p->fontdata) {
+		if (t->fontdata) {
+			struct vc_data *fvc = vc_cons[fg_console].d;
+
+			vc->vc_font.data = (void *)(p->fontdata =
+						    fvc->vc_font.data);
+			vc->vc_font.width = fvc->vc_font.width;
+			vc->vc_font.height = fvc->vc_font.height;
+			p->userfont = t->userfont;
+
+			if (p->userfont)
+				REFCOUNT(p->fontdata)++;
+		} else {
+			const struct font_desc *font = NULL;
+
+			if (!fontname[0] || !(font = find_font(fontname)))
+				font = get_default_font(info->var.xres,
+							info->var.yres);
+			vc->vc_font.width = font->width;
+			vc->vc_font.height = font->height;
+			vc->vc_font.data = (void *)(p->fontdata = font->data);
+			vc->vc_font.charcount = 256; /* FIXME  Need to
+							support more fonts */
+		}
 	}
+
 	if (p->userfont)
 		charcnt = FNTCHARCNT(p->fontdata);
+
 	vc->vc_can_do_color = (fb_get_color_depth(&info->var, &info->fix)!=1);
 	vc->vc_complement_mask = vc->vc_can_do_color ? 0x7700 : 0x0800;
 	if (charcnt == 256) {
@@ -1151,13 +1176,47 @@ static void fbcon_init(struct vc_data *vc, int init)
 	ops->p = &fb_display[fg_console];
 }
 
+static void fbcon_free_font(struct display *p)
+{
+	if (p->userfont && p->fontdata && (--REFCOUNT(p->fontdata) == 0))
+		kfree(p->fontdata - FONT_EXTRA_WORDS * sizeof(int));
+	p->fontdata = NULL;
+	p->userfont = 0;
+}
+
 static void fbcon_deinit(struct vc_data *vc)
 {
 	struct display *p = &fb_display[vc->vc_num];
+	struct fb_info *info;
+	struct fbcon_ops *ops;
+	int idx;
 
-	if (info_idx != -1)
-	    return;
 	fbcon_free_font(p);
+	idx = con2fb_map[vc->vc_num];
+
+	if (idx == -1)
+		goto finished;
+
+	info = registered_fb[idx];
+
+	if (!info)
+		goto finished;
+
+	ops = info->fbcon_par;
+
+	if (!ops)
+		goto finished;
+
+	if (CON_IS_VISIBLE(vc))
+		fbcon_del_cursor_timer(info);
+
+	ops->flags &= ~FBCON_FLAGS_INIT;
+finished:
+
+	if (!con_is_bound(&fb_con))
+		fbcon_exit();
+
+	return;
 }
 
 /* ====================================================================== */
@@ -2227,14 +2286,6 @@ static int fbcon_blank(struct vc_data *vc, int blank, int mode_switch)
 	return 0;
 }
 
-static void fbcon_free_font(struct display *p)
-{
-	if (p->userfont && p->fontdata && (--REFCOUNT(p->fontdata) == 0))
-		kfree(p->fontdata - FONT_EXTRA_WORDS * sizeof(int));
-	p->fontdata = NULL;
-	p->userfont = 0;
-}
-
 static int fbcon_get_font(struct vc_data *vc, struct console_font *font)
 {
 	u8 *fontdata = vc->vc_font.data;
@@ -2448,7 +2499,7 @@ static int fbcon_set_font(struct vc_data *vc, struct console_font *font, unsigne
 
 	FNTSUM(new_data) = csum;
 	/* Check if the same font is on some other console already */
-	for (i = 0; i < MAX_NR_CONSOLES; i++) {
+	for (i = first_fb_vc; i <= last_fb_vc; i++) {
 		struct vc_data *tmp = vc_cons[i].d;
 		
 		if (fb_display[i].userfont &&
@@ -2773,7 +2824,7 @@ static void fbcon_set_all_vcs(struct fb_info *info)
 	if (!ops || ops->currcon < 0)
 		return;
 
-	for (i = 0; i < MAX_NR_CONSOLES; i++) {
+	for (i = first_fb_vc; i <= last_fb_vc; i++) {
 		vc = vc_cons[i].d;
 		if (!vc || vc->vc_mode != KD_TEXT ||
 		    registered_fb[con2fb_map[i]] != info)
@@ -2835,21 +2886,55 @@ static int fbcon_mode_deleted(struct fb_info *info,
 	return found;
 }
 
+static int fbcon_fb_unregistered(int idx)
+{
+	int i;
+
+	for (i = first_fb_vc; i <= last_fb_vc; i++) {
+		if (con2fb_map[i] == idx)
+			con2fb_map[i] = -1;
+	}
+
+	if (idx == info_idx) {
+		info_idx = -1;
+
+		for (i = 0; i < FB_MAX; i++) {
+			if (registered_fb[i] != NULL) {
+				info_idx = i;
+				break;
+			}
+		}
+	}
+
+	if (info_idx != -1) {
+		for (i = first_fb_vc; i <= last_fb_vc; i++) {
+			if (con2fb_map[i] == -1)
+				con2fb_map[i] = info_idx;
+		}
+	}
+
+	if (!num_registered_fb)
+		unregister_con_driver(&fb_con);
+
+	return 0;
+}
+
 static int fbcon_fb_registered(int idx)
 {
 	int ret = 0, i;
 
 	if (info_idx == -1) {
-		for (i = 0; i < MAX_NR_CONSOLES; i++) {
+		for (i = first_fb_vc; i <= last_fb_vc; i++) {
 			if (con2fb_map_boot[i] == idx) {
 				info_idx = idx;
 				break;
 			}
 		}
+
 		if (info_idx != -1)
 			ret = fbcon_takeover(1);
 	} else {
-		for (i = 0; i < MAX_NR_CONSOLES; i++) {
+		for (i = first_fb_vc; i <= last_fb_vc; i++) {
 			if (con2fb_map_boot[i] == idx &&
 			    con2fb_map[i] == -1)
 				set_con2fb_map(i, idx, 0);
@@ -2888,7 +2973,7 @@ static void fbcon_new_modelist(struct fb_info *info)
 	struct fb_var_screeninfo var;
 	struct fb_videomode *mode;
 
-	for (i = 0; i < MAX_NR_CONSOLES; i++) {
+	for (i = first_fb_vc; i <= last_fb_vc; i++) {
 		if (registered_fb[con2fb_map[i]] != info)
 			continue;
 		if (!fb_display[i].mode)
@@ -2916,6 +3001,14 @@ static int fbcon_event_notify(struct notifier_block *self,
 	struct fb_con2fbmap *con2fb;
 	int ret = 0;
 
+	/*
+	 * ignore all events except driver registration and deregistration
+	 * if fbcon is not active
+	 */
+	if (fbcon_has_exited && !(action == FB_EVENT_FB_REGISTERED ||
+				  action == FB_EVENT_FB_UNREGISTERED))
+		goto done;
+
 	switch(action) {
 	case FB_EVENT_SUSPEND:
 		fbcon_suspended(info);
@@ -2936,6 +3029,9 @@ static int fbcon_event_notify(struct notifier_block *self,
 	case FB_EVENT_FB_REGISTERED:
 		ret = fbcon_fb_registered(info->node);
 		break;
+	case FB_EVENT_FB_UNREGISTERED:
+		ret = fbcon_fb_unregistered(info->node);
+		break;
 	case FB_EVENT_SET_CONSOLE_MAP:
 		con2fb = event->data;
 		ret = set_con2fb_map(con2fb->console - 1,
@@ -2953,6 +3049,7 @@ static int fbcon_event_notify(struct notifier_block *self,
 		break;
 	}
 
+done:
 	return ret;
 }
 
@@ -2997,6 +3094,9 @@ static ssize_t store_rotate(struct class_device *class_device,
 	int rotate, idx;
 	char **last = NULL;
 
+	if (fbcon_has_exited)
+		return count;
+
 	acquire_console_sem();
 	idx = con2fb_map[fg_console];
 
@@ -3018,6 +3118,9 @@ static ssize_t store_rotate_all(struct class_device *class_device,
 	int rotate, idx;
 	char **last = NULL;
 
+	if (fbcon_has_exited)
+		return count;
+
 	acquire_console_sem();
 	idx = con2fb_map[fg_console];
 
@@ -3037,6 +3140,9 @@ static ssize_t show_rotate(struct class_device *class_device, char *buf)
 	struct fb_info *info;
 	int rotate = 0, idx;
 
+	if (fbcon_has_exited)
+		return 0;
+
 	acquire_console_sem();
 	idx = con2fb_map[fg_console];
 
@@ -3050,32 +3156,9 @@ err:
 	return snprintf(buf, PAGE_SIZE, "%d\n", rotate);
 }
 
-static ssize_t store_attach(struct class_device *class_device,
-			    const char *buf, size_t count)
-{
-	if (info_idx == -1)
-		fbcon_start();
-
-	return count;
-}
-
-static ssize_t store_detach(struct class_device *class_device,
-			    const char *buf, size_t count)
-{
-	if (info_idx != -1) {
-		fbcon_exit();
-		give_up_console(&fb_con);
-	}
-
-	info_idx = -1;
-	return count;
-}
-
 static struct class_device_attribute class_device_attrs[] = {
 	__ATTR(rotate, S_IRUGO|S_IWUSR, show_rotate, store_rotate),
 	__ATTR(rotate_all, S_IWUSR, NULL, store_rotate_all),
-	__ATTR(attach, S_IWUSR, NULL, store_attach),
-	__ATTR(detach, S_IWUSR, NULL, store_detach),
 };
 
 static int fbcon_init_class_device(void)
@@ -3112,7 +3195,9 @@ static void fbcon_exit(void)
 	struct fb_info *info;
 	int i, j, mapped;
 
-	acquire_console_sem();
+	if (fbcon_has_exited)
+		return;
+
 #ifdef CONFIG_ATARI
 	free_irq(IRQ_AUTO_4, fbcon_vbl_handler);
 #endif
@@ -3131,11 +3216,9 @@ static void fbcon_exit(void)
 		if (info == NULL)
 			continue;
 
-		for (j = 0; j < MAX_NR_CONSOLES; j++) {
-			if (con2fb_map[j] == i) {
-				con2fb_map[j] = -1;
+		for (j = first_fb_vc; j <= last_fb_vc; j++) {
+			if (con2fb_map[j] == i)
 				mapped = 1;
-			}
 		}
 
 		if (mapped) {
@@ -3151,11 +3234,10 @@ static void fbcon_exit(void)
 
 			if (info->queue.func == fb_flashcursor)
 				info->queue.func = NULL;
-
 		}
 	}
 
-	release_console_sem();
+	fbcon_has_exited = 1;
 }
 
 static int __init fb_console_init(void)
@@ -3189,7 +3271,7 @@ module_init(fb_console_init);
 
 #ifdef MODULE
 
-static void fbcon_deinit_class_device(void)
+static void __exit fbcon_deinit_class_device(void)
 {
 	int i;
 
@@ -3204,7 +3286,9 @@ static void __exit fb_console_exit(void)
 	fb_unregister_client(&fbcon_event_notifier);
 	fbcon_deinit_class_device();
 	class_device_destroy(fb_class, MKDEV(FB_MAJOR, FB_MAX));
+	fbcon_exit();
 	release_console_sem();
+	unregister_con_driver(&fb_con);
 }	
 
 module_exit(fb_console_exit);
diff --git a/drivers/video/fbmem.c b/drivers/video/fbmem.c
index a2102a543ee7..31143afe7c95 100644
--- a/drivers/video/fbmem.c
+++ b/drivers/video/fbmem.c
@@ -1353,6 +1353,7 @@ register_framebuffer(struct fb_info *fb_info)
 int
 unregister_framebuffer(struct fb_info *fb_info)
 {
+	struct fb_event event;
 	int i;
 
 	i = fb_info->node;
@@ -1360,13 +1361,17 @@ unregister_framebuffer(struct fb_info *fb_info)
 		return -EINVAL;
 	devfs_remove("fb/%d", i);
 
-	if (fb_info->pixmap.addr && (fb_info->pixmap.flags & FB_PIXMAP_DEFAULT))
+	if (fb_info->pixmap.addr &&
+	    (fb_info->pixmap.flags & FB_PIXMAP_DEFAULT))
 		kfree(fb_info->pixmap.addr);
 	fb_destroy_modelist(&fb_info->modelist);
 	registered_fb[i]=NULL;
 	num_registered_fb--;
 	fb_cleanup_class_device(fb_info);
 	class_device_destroy(fb_class, MKDEV(FB_MAJOR, i));
+	event.info = fb_info;
+	blocking_notifier_call_chain(&fb_notifier_list,
+				     FB_EVENT_FB_UNREGISTERED, &event);
 	return 0;
 }
 
diff --git a/include/linux/fb.h b/include/linux/fb.h
index c64f25255286..07a08e92bc73 100644
--- a/include/linux/fb.h
+++ b/include/linux/fb.h
@@ -504,17 +504,19 @@ struct fb_cursor_user {
 #define FB_EVENT_MODE_DELETE            0x04
 /*      A driver registered itself */
 #define FB_EVENT_FB_REGISTERED          0x05
+/*      A driver unregistered itself */
+#define FB_EVENT_FB_UNREGISTERED        0x06
 /*      CONSOLE-SPECIFIC: get console to framebuffer mapping */
-#define FB_EVENT_GET_CONSOLE_MAP        0x06
+#define FB_EVENT_GET_CONSOLE_MAP        0x07
 /*      CONSOLE-SPECIFIC: set console to framebuffer mapping */
-#define FB_EVENT_SET_CONSOLE_MAP        0x07
+#define FB_EVENT_SET_CONSOLE_MAP        0x08
 /*      A display blank is requested       */
-#define FB_EVENT_BLANK                  0x08
+#define FB_EVENT_BLANK                  0x09
 /*      Private modelist is to be replaced */
-#define FB_EVENT_NEW_MODELIST           0x09
+#define FB_EVENT_NEW_MODELIST           0x0A
 /*	The resolution of the passed in fb_info about to change and
         all vc's should be changed         */
-#define FB_EVENT_MODE_CHANGE_ALL	0x0A
+#define FB_EVENT_MODE_CHANGE_ALL	0x0B
 
 struct fb_event {
 	struct fb_info *info;
-- 
cgit v1.2.3


From 5806f07cd2c32920d5105e0f9ff3117338f34eec Mon Sep 17 00:00:00 2001
From: Jeff Mahoney <jeffm@suse.com>
Date: Mon, 26 Jun 2006 00:27:19 -0700
Subject: [PATCH] lib: add idr_replace

This patch adds idr_replace() to replace an existing pointer in a single
operation.

Device-mapper will use this to update the pointer it stored against a given
id.

Signed-off-by: Jeff Mahoney <jeffm@suse.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/idr.h |  1 +
 lib/idr.c           | 43 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 44 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/idr.h b/include/linux/idr.h
index d37c8d808b0f..f559a719dbe8 100644
--- a/include/linux/idr.h
+++ b/include/linux/idr.h
@@ -78,6 +78,7 @@ void *idr_find(struct idr *idp, int id);
 int idr_pre_get(struct idr *idp, gfp_t gfp_mask);
 int idr_get_new(struct idr *idp, void *ptr, int *id);
 int idr_get_new_above(struct idr *idp, void *ptr, int starting_id, int *id);
+void *idr_replace(struct idr *idp, void *ptr, int id);
 void idr_remove(struct idr *idp, int id);
 void idr_destroy(struct idr *idp);
 void idr_init(struct idr *idp);
diff --git a/lib/idr.c b/lib/idr.c
index de19030a999b..4d096819511a 100644
--- a/lib/idr.c
+++ b/lib/idr.c
@@ -29,6 +29,7 @@
 #include <linux/init.h>
 #include <linux/module.h>
 #endif
+#include <linux/err.h>
 #include <linux/string.h>
 #include <linux/idr.h>
 
@@ -398,6 +399,48 @@ void *idr_find(struct idr *idp, int id)
 }
 EXPORT_SYMBOL(idr_find);
 
+/**
+ * idr_replace - replace pointer for given id
+ * @idp: idr handle
+ * @ptr: pointer you want associated with the id
+ * @id: lookup key
+ *
+ * Replace the pointer registered with an id and return the old value.
+ * A -ENOENT return indicates that @id was not found.
+ * A -EINVAL return indicates that @id was not within valid constraints.
+ *
+ * The caller must serialize vs idr_find(), idr_get_new(), and idr_remove().
+ */
+void *idr_replace(struct idr *idp, void *ptr, int id)
+{
+	int n;
+	struct idr_layer *p, *old_p;
+
+	n = idp->layers * IDR_BITS;
+	p = idp->top;
+
+	id &= MAX_ID_MASK;
+
+	if (id >= (1 << n))
+		return ERR_PTR(-EINVAL);
+
+	n -= IDR_BITS;
+	while ((n > 0) && p) {
+		p = p->ary[(id >> n) & IDR_MASK];
+		n -= IDR_BITS;
+	}
+
+	n = id & IDR_MASK;
+	if (unlikely(p == NULL || !test_bit(n, &p->bitmap)))
+		return ERR_PTR(-ENOENT);
+
+	old_p = p->ary[n];
+	p->ary[n] = ptr;
+
+	return old_p;
+}
+EXPORT_SYMBOL(idr_replace);
+
 static void idr_cache_ctor(void * idr_layer, kmem_cache_t *idr_layer_cache,
 		unsigned long flags)
 {
-- 
cgit v1.2.3


From 17b2f66f2a39a4e4d1ed456f35ee3bb598e41d35 Mon Sep 17 00:00:00 2001
From: Alasdair G Kergon <agk@redhat.com>
Date: Mon, 26 Jun 2006 00:27:33 -0700
Subject: [PATCH] dm: add exports

Move definitions of core device-mapper functions for manipulating mapped
devices and their tables to <linux/device-mapper.h> advertising their
availability for use elsewhere in the kernel.

Protect the contents of device-mapper.h with ifdef __KERNEL__.  And throw
in a few formatting clean-ups and extra comments.

Signed-off-by: Alasdair G Kergon <agk@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/md/dm.c               |  12 ++---
 drivers/md/dm.h               |  75 +-----------------------------
 include/linux/device-mapper.h | 104 +++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 109 insertions(+), 82 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 59bf797de9a5..952c49c3b9aa 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -50,9 +50,9 @@ struct target_io {
 
 union map_info *dm_get_mapinfo(struct bio *bio)
 {
-        if (bio && bio->bi_private)
-                return &((struct target_io *)bio->bi_private)->info;
-        return NULL;
+	if (bio && bio->bi_private)
+		return &((struct target_io *)bio->bi_private)->info;
+	return NULL;
 }
 
 #define MINOR_ALLOCED ((void *)-1)
@@ -474,8 +474,8 @@ static void __map_bio(struct dm_target *ti, struct bio *clone,
 	if (r > 0) {
 		/* the bio has been remapped so dispatch it */
 
-		blk_add_trace_remap(bdev_get_queue(clone->bi_bdev), clone, 
-				    tio->io->bio->bi_bdev->bd_dev, sector, 
+		blk_add_trace_remap(bdev_get_queue(clone->bi_bdev), clone,
+				    tio->io->bio->bi_bdev->bd_dev, sector,
 				    clone->bi_sector);
 
 		generic_make_request(clone);
@@ -1042,7 +1042,7 @@ static struct mapped_device *dm_find_md(dev_t dev)
 	md = idr_find(&_minor_idr, minor);
 	if (md && (md == MINOR_ALLOCED ||
 		   (dm_disk(md)->first_minor != minor) ||
-	           test_bit(DMF_FREEING, &md->flags))) {
+		   test_bit(DMF_FREEING, &md->flags))) {
 		md = NULL;
 		goto out;
 	}
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 2901ab943191..71ddd1e23007 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -39,88 +39,16 @@ struct dm_dev {
 };
 
 struct dm_table;
-struct mapped_device;
 
 /*-----------------------------------------------------------------
- * Functions for manipulating a struct mapped_device.
- * Drop the reference with dm_put when you finish with the object.
+ * Internal table functions.
  *---------------------------------------------------------------*/
-
-/*
- * DM_ANY_MINOR allocates any available minor number.
- */
-#define DM_ANY_MINOR (-1)
-int dm_create(int minor, struct mapped_device **md);
-
-void dm_set_mdptr(struct mapped_device *md, void *ptr);
-void *dm_get_mdptr(struct mapped_device *md);
-
-/*
- * Reference counting for md.
- */
-void dm_get(struct mapped_device *md);
-struct mapped_device *dm_get_md(dev_t dev);
-void dm_put(struct mapped_device *md);
-
-/*
- * A device can still be used while suspended, but I/O is deferred.
- */
-int dm_suspend(struct mapped_device *md, int with_lockfs);
-int dm_resume(struct mapped_device *md);
-
-/*
- * The device must be suspended before calling this method.
- */
-int dm_swap_table(struct mapped_device *md, struct dm_table *t);
-
-/*
- * Drop a reference on the table when you've finished with the
- * result.
- */
-struct dm_table *dm_get_table(struct mapped_device *md);
-
-/*
- * Event functions.
- */
-uint32_t dm_get_event_nr(struct mapped_device *md);
-int dm_wait_event(struct mapped_device *md, int event_nr);
-
-/*
- * Info functions.
- */
-struct gendisk *dm_disk(struct mapped_device *md);
-int dm_suspended(struct mapped_device *md);
-
-/*
- * Geometry functions.
- */
-int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo);
-int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo);
-
-/*-----------------------------------------------------------------
- * Functions for manipulating a table.  Tables are also reference
- * counted.
- *---------------------------------------------------------------*/
-int dm_table_create(struct dm_table **result, int mode,
-		    unsigned num_targets, struct mapped_device *md);
-
-void dm_table_get(struct dm_table *t);
-void dm_table_put(struct dm_table *t);
-
-int dm_table_add_target(struct dm_table *t, const char *type,
-			sector_t start,	sector_t len, char *params);
-int dm_table_complete(struct dm_table *t);
 void dm_table_event_callback(struct dm_table *t,
 			     void (*fn)(void *), void *context);
-void dm_table_event(struct dm_table *t);
-sector_t dm_table_get_size(struct dm_table *t);
 struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index);
 struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector);
 void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q);
-unsigned int dm_table_get_num_targets(struct dm_table *t);
 struct list_head *dm_table_get_devices(struct dm_table *t);
-int dm_table_get_mode(struct dm_table *t);
-struct mapped_device *dm_table_get_md(struct dm_table *t);
 void dm_table_presuspend_targets(struct dm_table *t);
 void dm_table_postsuspend_targets(struct dm_table *t);
 void dm_table_resume_targets(struct dm_table *t);
@@ -138,7 +66,6 @@ void dm_put_target_type(struct target_type *t);
 int dm_target_iterate(void (*iter_func)(struct target_type *tt,
 					void *param), void *param);
 
-
 /*-----------------------------------------------------------------
  * Useful inlines.
  *---------------------------------------------------------------*/
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index aee10b2ea4c6..010c8c5eeb37 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -8,9 +8,12 @@
 #ifndef _LINUX_DEVICE_MAPPER_H
 #define _LINUX_DEVICE_MAPPER_H
 
+#ifdef __KERNEL__
+
 struct dm_target;
 struct dm_table;
 struct dm_dev;
+struct mapped_device;
 
 typedef enum { STATUSTYPE_INFO, STATUSTYPE_TABLE } status_type_t;
 
@@ -78,7 +81,7 @@ void dm_put_device(struct dm_target *ti, struct dm_dev *d);
 struct target_type {
 	const char *name;
 	struct module *module;
-        unsigned version[3];
+	unsigned version[3];
 	dm_ctr_fn ctr;
 	dm_dtr_fn dtr;
 	dm_map_fn map;
@@ -128,4 +131,101 @@ struct dm_target {
 int dm_register_target(struct target_type *t);
 int dm_unregister_target(struct target_type *t);
 
-#endif				/* _LINUX_DEVICE_MAPPER_H */
+
+/*-----------------------------------------------------------------
+ * Functions for creating and manipulating mapped devices.
+ * Drop the reference with dm_put when you finish with the object.
+ *---------------------------------------------------------------*/
+
+/*
+ * DM_ANY_MINOR chooses the next available minor number.
+ */
+#define DM_ANY_MINOR (-1)
+int dm_create(int minor, struct mapped_device **md);
+
+/*
+ * Reference counting for md.
+ */
+struct mapped_device *dm_get_md(dev_t dev);
+void dm_get(struct mapped_device *md);
+void dm_put(struct mapped_device *md);
+
+/*
+ * An arbitrary pointer may be stored alongside a mapped device.
+ */
+void dm_set_mdptr(struct mapped_device *md, void *ptr);
+void *dm_get_mdptr(struct mapped_device *md);
+
+/*
+ * A device can still be used while suspended, but I/O is deferred.
+ */
+int dm_suspend(struct mapped_device *md, int with_lockfs);
+int dm_resume(struct mapped_device *md);
+
+/*
+ * Event functions.
+ */
+uint32_t dm_get_event_nr(struct mapped_device *md);
+int dm_wait_event(struct mapped_device *md, int event_nr);
+
+/*
+ * Info functions.
+ */
+struct gendisk *dm_disk(struct mapped_device *md);
+int dm_suspended(struct mapped_device *md);
+
+/*
+ * Geometry functions.
+ */
+int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo);
+int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo);
+
+
+/*-----------------------------------------------------------------
+ * Functions for manipulating device-mapper tables.
+ *---------------------------------------------------------------*/
+
+/*
+ * First create an empty table.
+ */
+int dm_table_create(struct dm_table **result, int mode,
+		    unsigned num_targets, struct mapped_device *md);
+
+/*
+ * Then call this once for each target.
+ */
+int dm_table_add_target(struct dm_table *t, const char *type,
+			sector_t start, sector_t len, char *params);
+
+/*
+ * Finally call this to make the table ready for use.
+ */
+int dm_table_complete(struct dm_table *t);
+
+/*
+ * Table reference counting.
+ */
+struct dm_table *dm_get_table(struct mapped_device *md);
+void dm_table_get(struct dm_table *t);
+void dm_table_put(struct dm_table *t);
+
+/*
+ * Queries
+ */
+sector_t dm_table_get_size(struct dm_table *t);
+unsigned int dm_table_get_num_targets(struct dm_table *t);
+int dm_table_get_mode(struct dm_table *t);
+struct mapped_device *dm_table_get_md(struct dm_table *t);
+
+/*
+ * Trigger an event.
+ */
+void dm_table_event(struct dm_table *t);
+
+/*
+ * The device must be suspended before calling this method.
+ */
+int dm_swap_table(struct mapped_device *md, struct dm_table *t);
+
+#endif	/* __KERNEL__ */
+#endif	/* _LINUX_DEVICE_MAPPER_H */
-- 
cgit v1.2.3


From c2ade42dd35466d90aa6fc7cc717f396e165492f Mon Sep 17 00:00:00 2001
From: David Teigland <teigland@redhat.com>
Date: Mon, 26 Jun 2006 00:27:33 -0700
Subject: [PATCH] dm: create error table

Add a library function dm_create_error_table() to create a table that rejects
any I/O sent to a device with EIO.

Signed-off-by: Alasdair G Kergon <agk@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/md/dm-table.c         | 38 ++++++++++++++++++++++++++++++++++++++
 include/linux/device-mapper.h |  6 ++++++
 2 files changed, 44 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 10c9439635ca..827b648fac50 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -237,6 +237,44 @@ int dm_table_create(struct dm_table **result, int mode,
 	return 0;
 }
 
+int dm_create_error_table(struct dm_table **result, struct mapped_device *md)
+{
+	struct dm_table *t;
+	sector_t dev_size = 1;
+	int r;
+
+	/*
+	 * Find current size of device.
+	 * Default to 1 sector if inactive.
+	 */
+	t = dm_get_table(md);
+	if (t) {
+		dev_size = dm_table_get_size(t);
+		dm_table_put(t);
+	}
+
+	r = dm_table_create(&t, FMODE_READ, 1, md);
+	if (r)
+		return r;
+
+	r = dm_table_add_target(t, "error", 0, dev_size, NULL);
+	if (r)
+		goto out;
+
+	r = dm_table_complete(t);
+	if (r)
+		goto out;
+
+	*result = t;
+
+out:
+	if (r)
+		dm_table_put(t);
+
+	return r;
+}
+EXPORT_SYMBOL_GPL(dm_create_error_table);
+
 static void free_devices(struct list_head *devices)
 {
 	struct list_head *tmp, *next;
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index 010c8c5eeb37..61103d2bc244 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -227,5 +227,11 @@ void dm_table_event(struct dm_table *t);
  */
 int dm_swap_table(struct mapped_device *md, struct dm_table *t);
 
+/*
+ * Prepare a table for a device that will error all I/O.
+ * To make it active, call dm_suspend(), dm_swap_table() then dm_resume().
+ */
+int dm_create_error_table(struct dm_table **result, struct mapped_device *md);
+
 #endif	/* __KERNEL__ */
 #endif	/* _LINUX_DEVICE_MAPPER_H */
-- 
cgit v1.2.3


From 5c6bd75d06db512515a3781aa97e42df2faf0815 Mon Sep 17 00:00:00 2001
From: Alasdair G Kergon <agk@redhat.com>
Date: Mon, 26 Jun 2006 00:27:34 -0700
Subject: [PATCH] dm: prevent removal if open

If you misuse the device-mapper interface (or there's a bug in your userspace
tools) it's possible to end up with 'unlinked' mapped devices that cannot be
removed until you reboot (along with uninterruptible processes).

This patch prevents you from removing a device that is still open.

It introduces dm_lock_for_deletion() which is called when a device is about to
be removed to ensure that nothing has it open and nothing further can open it.
 It uses a private open_count for this which also lets us remove one of the
problematic bdget_disk() calls elsewhere.

Signed-off-by: Alasdair G Kergon <agk@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/md/dm-ioctl.c    | 65 +++++++++++++++++++++++++++++++++---------------
 drivers/md/dm.c          | 32 +++++++++++++++++++++++-
 drivers/md/dm.h          |  2 ++
 include/linux/dm-ioctl.h |  6 ++---
 4 files changed, 81 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index c826b3e1799a..c6ce13b18747 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -48,7 +48,7 @@ struct vers_iter {
 static struct list_head _name_buckets[NUM_BUCKETS];
 static struct list_head _uuid_buckets[NUM_BUCKETS];
 
-static void dm_hash_remove_all(void);
+static void dm_hash_remove_all(int keep_open_devices);
 
 /*
  * Guards access to both hash tables.
@@ -73,7 +73,7 @@ static int dm_hash_init(void)
 
 static void dm_hash_exit(void)
 {
-	dm_hash_remove_all();
+	dm_hash_remove_all(0);
 	devfs_remove(DM_DIR);
 }
 
@@ -260,19 +260,41 @@ static void __hash_remove(struct hash_cell *hc)
 	free_cell(hc);
 }
 
-static void dm_hash_remove_all(void)
+static void dm_hash_remove_all(int keep_open_devices)
 {
-	int i;
+	int i, dev_skipped, dev_removed;
 	struct hash_cell *hc;
 	struct list_head *tmp, *n;
 
 	down_write(&_hash_lock);
+
+retry:
+	dev_skipped = dev_removed = 0;
 	for (i = 0; i < NUM_BUCKETS; i++) {
 		list_for_each_safe (tmp, n, _name_buckets + i) {
 			hc = list_entry(tmp, struct hash_cell, name_list);
+
+			if (keep_open_devices &&
+			    dm_lock_for_deletion(hc->md)) {
+				dev_skipped++;
+				continue;
+			}
 			__hash_remove(hc);
+			dev_removed = 1;
 		}
 	}
+
+	/*
+	 * Some mapped devices may be using other mapped devices, so if any
+	 * still exist, repeat until we make no further progress.
+	 */
+	if (dev_skipped) {
+		if (dev_removed)
+			goto retry;
+
+		DMWARN("remove_all left %d open device(s)", dev_skipped);
+	}
+
 	up_write(&_hash_lock);
 }
 
@@ -355,7 +377,7 @@ typedef int (*ioctl_fn)(struct dm_ioctl *param, size_t param_size);
 
 static int remove_all(struct dm_ioctl *param, size_t param_size)
 {
-	dm_hash_remove_all();
+	dm_hash_remove_all(1);
 	param->data_size = 0;
 	return 0;
 }
@@ -535,7 +557,6 @@ static int __dev_status(struct mapped_device *md, struct dm_ioctl *param)
 {
 	struct gendisk *disk = dm_disk(md);
 	struct dm_table *table;
-	struct block_device *bdev;
 
 	param->flags &= ~(DM_SUSPEND_FLAG | DM_READONLY_FLAG |
 			  DM_ACTIVE_PRESENT_FLAG);
@@ -545,20 +566,12 @@ static int __dev_status(struct mapped_device *md, struct dm_ioctl *param)
 
 	param->dev = huge_encode_dev(MKDEV(disk->major, disk->first_minor));
 
-	if (!(param->flags & DM_SKIP_BDGET_FLAG)) {
-		bdev = bdget_disk(disk, 0);
-		if (!bdev)
-			return -ENXIO;
-
-		/*
-		 * Yes, this will be out of date by the time it gets back
-		 * to userland, but it is still very useful for
-		 * debugging.
-		 */
-		param->open_count = bdev->bd_openers;
-		bdput(bdev);
-	} else
-		param->open_count = -1;
+	/*
+	 * Yes, this will be out of date by the time it gets back
+	 * to userland, but it is still very useful for
+	 * debugging.
+	 */
+	param->open_count = dm_open_count(md);
 
 	if (disk->policy)
 		param->flags |= DM_READONLY_FLAG;
@@ -661,6 +674,7 @@ static int dev_remove(struct dm_ioctl *param, size_t param_size)
 {
 	struct hash_cell *hc;
 	struct mapped_device *md;
+	int r;
 
 	down_write(&_hash_lock);
 	hc = __find_device_hash_cell(param);
@@ -673,6 +687,17 @@ static int dev_remove(struct dm_ioctl *param, size_t param_size)
 
 	md = hc->md;
 
+	/*
+	 * Ensure the device is not open and nothing further can open it.
+	 */
+	r = dm_lock_for_deletion(md);
+	if (r) {
+		DMWARN("unable to remove open device %s", hc->name);
+		up_write(&_hash_lock);
+		dm_put(md);
+		return r;
+	}
+
 	__hash_remove(hc);
 	up_write(&_hash_lock);
 	dm_put(md);
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 952c49c3b9aa..6982f86db6dc 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -64,12 +64,14 @@ union map_info *dm_get_mapinfo(struct bio *bio)
 #define DMF_SUSPENDED 1
 #define DMF_FROZEN 2
 #define DMF_FREEING 3
+#define DMF_DELETING 4
 
 struct mapped_device {
 	struct rw_semaphore io_lock;
 	struct semaphore suspend_lock;
 	rwlock_t map_lock;
 	atomic_t holders;
+	atomic_t open_count;
 
 	unsigned long flags;
 
@@ -228,12 +230,14 @@ static int dm_blk_open(struct inode *inode, struct file *file)
 	if (!md)
 		goto out;
 
-	if (test_bit(DMF_FREEING, &md->flags)) {
+	if (test_bit(DMF_FREEING, &md->flags) ||
+	    test_bit(DMF_DELETING, &md->flags)) {
 		md = NULL;
 		goto out;
 	}
 
 	dm_get(md);
+	atomic_inc(&md->open_count);
 
 out:
 	spin_unlock(&_minor_lock);
@@ -246,10 +250,35 @@ static int dm_blk_close(struct inode *inode, struct file *file)
 	struct mapped_device *md;
 
 	md = inode->i_bdev->bd_disk->private_data;
+	atomic_dec(&md->open_count);
 	dm_put(md);
 	return 0;
 }
 
+int dm_open_count(struct mapped_device *md)
+{
+	return atomic_read(&md->open_count);
+}
+
+/*
+ * Guarantees nothing is using the device before it's deleted.
+ */
+int dm_lock_for_deletion(struct mapped_device *md)
+{
+	int r = 0;
+
+	spin_lock(&_minor_lock);
+
+	if (dm_open_count(md))
+		r = -EBUSY;
+	else
+		set_bit(DMF_DELETING, &md->flags);
+
+	spin_unlock(&_minor_lock);
+
+	return r;
+}
+
 static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 {
 	struct mapped_device *md = bdev->bd_disk->private_data;
@@ -867,6 +896,7 @@ static struct mapped_device *alloc_dev(int minor)
 	init_MUTEX(&md->suspend_lock);
 	rwlock_init(&md->map_lock);
 	atomic_set(&md->holders, 1);
+	atomic_set(&md->open_count, 0);
 	atomic_set(&md->event_nr, 0);
 
 	md->queue = blk_alloc_queue(GFP_KERNEL);
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 71ddd1e23007..9ebb24b037bc 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -123,5 +123,7 @@ void dm_stripe_exit(void);
 
 void *dm_vcalloc(unsigned long nmemb, unsigned long elem_size);
 union map_info *dm_get_mapinfo(struct bio *bio);
+int dm_open_count(struct mapped_device *md);
+int dm_lock_for_deletion(struct mapped_device *md);
 
 #endif
diff --git a/include/linux/dm-ioctl.h b/include/linux/dm-ioctl.h
index c67c6786612a..9623bb625090 100644
--- a/include/linux/dm-ioctl.h
+++ b/include/linux/dm-ioctl.h
@@ -285,9 +285,9 @@ typedef char ioctl_struct[308];
 #define DM_DEV_SET_GEOMETRY	_IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl)
 
 #define DM_VERSION_MAJOR	4
-#define DM_VERSION_MINOR	6
+#define DM_VERSION_MINOR	7
 #define DM_VERSION_PATCHLEVEL	0
-#define DM_VERSION_EXTRA	"-ioctl (2006-02-17)"
+#define DM_VERSION_EXTRA	"-ioctl (2006-06-24)"
 
 /* Status bits */
 #define DM_READONLY_FLAG	(1 << 0) /* In/Out */
@@ -314,7 +314,7 @@ typedef char ioctl_struct[308];
 #define DM_BUFFER_FULL_FLAG	(1 << 8) /* Out */
 
 /*
- * Set this to improve performance when you aren't going to use open_count.
+ * This flag is now ignored.
  */
 #define DM_SKIP_BDGET_FLAG	(1 << 9) /* In */
 
-- 
cgit v1.2.3


From 72d9486169a2a8353e022813185ba2f32d7dde69 Mon Sep 17 00:00:00 2001
From: Alasdair G Kergon <agk@redhat.com>
Date: Mon, 26 Jun 2006 00:27:35 -0700
Subject: [PATCH] dm: improve error message consistency

Tidy device-mapper error messages to include context information
automatically.

Signed-off-by: Alasdair G Kergon <agk@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/md/dm-crypt.c           | 56 ++++++++++++++++++++---------------------
 drivers/md/dm-emc.c             | 40 +++++++++++++++--------------
 drivers/md/dm-exception-store.c |  2 ++
 drivers/md/dm-ioctl.c           |  1 +
 drivers/md/dm-linear.c          |  8 +++---
 drivers/md/dm-log.c             |  2 ++
 drivers/md/dm-mpath.c           | 43 ++++++++++++++++---------------
 drivers/md/dm-raid1.c           | 24 ++++++++++--------
 drivers/md/dm-round-robin.c     |  6 +++--
 drivers/md/dm-snap.c            | 10 +++++---
 drivers/md/dm-stripe.c          | 25 +++++++++---------
 drivers/md/dm-table.c           | 11 ++++----
 drivers/md/dm-target.c          |  2 ++
 drivers/md/dm-zero.c            |  8 +++---
 drivers/md/dm.c                 |  8 ++++++
 drivers/md/dm.h                 |  7 +++---
 include/linux/device-mapper.h   |  1 +
 17 files changed, 142 insertions(+), 112 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 61a590bb6241..6022ed12a795 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -20,7 +20,7 @@
 
 #include "dm.h"
 
-#define PFX	"crypt: "
+#define DM_MSG_PREFIX "crypt"
 
 /*
  * per bio private data
@@ -125,19 +125,19 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
 	u8 *salt;
 
 	if (opts == NULL) {
-		ti->error = PFX "Digest algorithm missing for ESSIV mode";
+		ti->error = "Digest algorithm missing for ESSIV mode";
 		return -EINVAL;
 	}
 
 	/* Hash the cipher key with the given hash algorithm */
 	hash_tfm = crypto_alloc_tfm(opts, CRYPTO_TFM_REQ_MAY_SLEEP);
 	if (hash_tfm == NULL) {
-		ti->error = PFX "Error initializing ESSIV hash";
+		ti->error = "Error initializing ESSIV hash";
 		return -EINVAL;
 	}
 
 	if (crypto_tfm_alg_type(hash_tfm) != CRYPTO_ALG_TYPE_DIGEST) {
-		ti->error = PFX "Expected digest algorithm for ESSIV hash";
+		ti->error = "Expected digest algorithm for ESSIV hash";
 		crypto_free_tfm(hash_tfm);
 		return -EINVAL;
 	}
@@ -145,7 +145,7 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
 	saltsize = crypto_tfm_alg_digestsize(hash_tfm);
 	salt = kmalloc(saltsize, GFP_KERNEL);
 	if (salt == NULL) {
-		ti->error = PFX "Error kmallocing salt storage in ESSIV";
+		ti->error = "Error kmallocing salt storage in ESSIV";
 		crypto_free_tfm(hash_tfm);
 		return -ENOMEM;
 	}
@@ -159,20 +159,20 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
 	                             CRYPTO_TFM_MODE_ECB |
 	                             CRYPTO_TFM_REQ_MAY_SLEEP);
 	if (essiv_tfm == NULL) {
-		ti->error = PFX "Error allocating crypto tfm for ESSIV";
+		ti->error = "Error allocating crypto tfm for ESSIV";
 		kfree(salt);
 		return -EINVAL;
 	}
 	if (crypto_tfm_alg_blocksize(essiv_tfm)
 	    != crypto_tfm_alg_ivsize(cc->tfm)) {
-		ti->error = PFX "Block size of ESSIV cipher does "
+		ti->error = "Block size of ESSIV cipher does "
 			        "not match IV size of block cipher";
 		crypto_free_tfm(essiv_tfm);
 		kfree(salt);
 		return -EINVAL;
 	}
 	if (crypto_cipher_setkey(essiv_tfm, salt, saltsize) < 0) {
-		ti->error = PFX "Failed to set key for ESSIV cipher";
+		ti->error = "Failed to set key for ESSIV cipher";
 		crypto_free_tfm(essiv_tfm);
 		kfree(salt);
 		return -EINVAL;
@@ -521,7 +521,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	unsigned long long tmpll;
 
 	if (argc != 5) {
-		ti->error = PFX "Not enough arguments";
+		ti->error = "Not enough arguments";
 		return -EINVAL;
 	}
 
@@ -532,21 +532,21 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	ivmode = strsep(&ivopts, ":");
 
 	if (tmp)
-		DMWARN(PFX "Unexpected additional cipher options");
+		DMWARN("Unexpected additional cipher options");
 
 	key_size = strlen(argv[1]) >> 1;
 
 	cc = kmalloc(sizeof(*cc) + key_size * sizeof(u8), GFP_KERNEL);
 	if (cc == NULL) {
 		ti->error =
-			PFX "Cannot allocate transparent encryption context";
+			"Cannot allocate transparent encryption context";
 		return -ENOMEM;
 	}
 
 	cc->key_size = key_size;
 	if ((!key_size && strcmp(argv[1], "-") != 0) ||
 	    (key_size && crypt_decode_key(cc->key, argv[1], key_size) < 0)) {
-		ti->error = PFX "Error decoding key";
+		ti->error = "Error decoding key";
 		goto bad1;
 	}
 
@@ -562,22 +562,22 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	else if (strcmp(chainmode, "ecb") == 0)
 		crypto_flags = CRYPTO_TFM_MODE_ECB;
 	else {
-		ti->error = PFX "Unknown chaining mode";
+		ti->error = "Unknown chaining mode";
 		goto bad1;
 	}
 
 	if (crypto_flags != CRYPTO_TFM_MODE_ECB && !ivmode) {
-		ti->error = PFX "This chaining mode requires an IV mechanism";
+		ti->error = "This chaining mode requires an IV mechanism";
 		goto bad1;
 	}
 
 	tfm = crypto_alloc_tfm(cipher, crypto_flags | CRYPTO_TFM_REQ_MAY_SLEEP);
 	if (!tfm) {
-		ti->error = PFX "Error allocating crypto tfm";
+		ti->error = "Error allocating crypto tfm";
 		goto bad1;
 	}
 	if (crypto_tfm_alg_type(tfm) != CRYPTO_ALG_TYPE_CIPHER) {
-		ti->error = PFX "Expected cipher algorithm";
+		ti->error = "Expected cipher algorithm";
 		goto bad2;
 	}
 
@@ -595,7 +595,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	else if (strcmp(ivmode, "essiv") == 0)
 		cc->iv_gen_ops = &crypt_iv_essiv_ops;
 	else {
-		ti->error = PFX "Invalid IV mode";
+		ti->error = "Invalid IV mode";
 		goto bad2;
 	}
 
@@ -610,7 +610,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	else {
 		cc->iv_size = 0;
 		if (cc->iv_gen_ops) {
-			DMWARN(PFX "Selected cipher does not support IVs");
+			DMWARN("Selected cipher does not support IVs");
 			if (cc->iv_gen_ops->dtr)
 				cc->iv_gen_ops->dtr(cc);
 			cc->iv_gen_ops = NULL;
@@ -619,36 +619,36 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 
 	cc->io_pool = mempool_create_slab_pool(MIN_IOS, _crypt_io_pool);
 	if (!cc->io_pool) {
-		ti->error = PFX "Cannot allocate crypt io mempool";
+		ti->error = "Cannot allocate crypt io mempool";
 		goto bad3;
 	}
 
 	cc->page_pool = mempool_create_page_pool(MIN_POOL_PAGES, 0);
 	if (!cc->page_pool) {
-		ti->error = PFX "Cannot allocate page mempool";
+		ti->error = "Cannot allocate page mempool";
 		goto bad4;
 	}
 
 	if (tfm->crt_cipher.cit_setkey(tfm, cc->key, key_size) < 0) {
-		ti->error = PFX "Error setting key";
+		ti->error = "Error setting key";
 		goto bad5;
 	}
 
 	if (sscanf(argv[2], "%llu", &tmpll) != 1) {
-		ti->error = PFX "Invalid iv_offset sector";
+		ti->error = "Invalid iv_offset sector";
 		goto bad5;
 	}
 	cc->iv_offset = tmpll;
 
 	if (sscanf(argv[4], "%llu", &tmpll) != 1) {
-		ti->error = PFX "Invalid device sector";
+		ti->error = "Invalid device sector";
 		goto bad5;
 	}
 	cc->start = tmpll;
 
 	if (dm_get_device(ti, argv[3], cc->start, ti->len,
 	                  dm_table_get_mode(ti->table), &cc->dev)) {
-		ti->error = PFX "Device lookup failed";
+		ti->error = "Device lookup failed";
 		goto bad5;
 	}
 
@@ -657,7 +657,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 			*(ivopts - 1) = ':';
 		cc->iv_mode = kmalloc(strlen(ivmode) + 1, GFP_KERNEL);
 		if (!cc->iv_mode) {
-			ti->error = PFX "Error kmallocing iv_mode string";
+			ti->error = "Error kmallocing iv_mode string";
 			goto bad5;
 		}
 		strcpy(cc->iv_mode, ivmode);
@@ -918,13 +918,13 @@ static int __init dm_crypt_init(void)
 	_kcryptd_workqueue = create_workqueue("kcryptd");
 	if (!_kcryptd_workqueue) {
 		r = -ENOMEM;
-		DMERR(PFX "couldn't create kcryptd");
+		DMERR("couldn't create kcryptd");
 		goto bad1;
 	}
 
 	r = dm_register_target(&crypt_target);
 	if (r < 0) {
-		DMERR(PFX "register failed %d", r);
+		DMERR("register failed %d", r);
 		goto bad2;
 	}
 
@@ -942,7 +942,7 @@ static void __exit dm_crypt_exit(void)
 	int r = dm_unregister_target(&crypt_target);
 
 	if (r < 0)
-		DMERR(PFX "unregister failed %d", r);
+		DMERR("unregister failed %d", r);
 
 	destroy_workqueue(_kcryptd_workqueue);
 	kmem_cache_destroy(_crypt_io_pool);
diff --git a/drivers/md/dm-emc.c b/drivers/md/dm-emc.c
index c7067674dcb7..2a374ccb30dd 100644
--- a/drivers/md/dm-emc.c
+++ b/drivers/md/dm-emc.c
@@ -12,6 +12,8 @@
 #include <scsi/scsi.h>
 #include <scsi/scsi_cmnd.h>
 
+#define DM_MSG_PREFIX "multipath emc"
+
 struct emc_handler {
 	spinlock_t lock;
 
@@ -66,7 +68,7 @@ static struct bio *get_failover_bio(struct path *path, unsigned data_size)
 
 	bio = bio_alloc(GFP_ATOMIC, 1);
 	if (!bio) {
-		DMERR("dm-emc: get_failover_bio: bio_alloc() failed.");
+		DMERR("get_failover_bio: bio_alloc() failed.");
 		return NULL;
 	}
 
@@ -78,13 +80,13 @@ static struct bio *get_failover_bio(struct path *path, unsigned data_size)
 
 	page = alloc_page(GFP_ATOMIC);
 	if (!page) {
-		DMERR("dm-emc: get_failover_bio: alloc_page() failed.");
+		DMERR("get_failover_bio: alloc_page() failed.");
 		bio_put(bio);
 		return NULL;
 	}
 
 	if (bio_add_page(bio, page, data_size, 0) != data_size) {
-		DMERR("dm-emc: get_failover_bio: alloc_page() failed.");
+		DMERR("get_failover_bio: alloc_page() failed.");
 		__free_page(page);
 		bio_put(bio);
 		return NULL;
@@ -103,7 +105,7 @@ static struct request *get_failover_req(struct emc_handler *h,
 	/* FIXME: Figure out why it fails with GFP_ATOMIC. */
 	rq = blk_get_request(q, WRITE, __GFP_WAIT);
 	if (!rq) {
-		DMERR("dm-emc: get_failover_req: blk_get_request failed");
+		DMERR("get_failover_req: blk_get_request failed");
 		return NULL;
 	}
 
@@ -160,7 +162,7 @@ static struct request *emc_trespass_get(struct emc_handler *h,
 
 	bio = get_failover_bio(path, data_size);
 	if (!bio) {
-		DMERR("dm-emc: emc_trespass_get: no bio");
+		DMERR("emc_trespass_get: no bio");
 		return NULL;
 	}
 
@@ -173,7 +175,7 @@ static struct request *emc_trespass_get(struct emc_handler *h,
 	/* get request for block layer packet command */
 	rq = get_failover_req(h, bio, path);
 	if (!rq) {
-		DMERR("dm-emc: emc_trespass_get: no rq");
+		DMERR("emc_trespass_get: no rq");
 		free_bio(bio);
 		return NULL;
 	}
@@ -200,18 +202,18 @@ static void emc_pg_init(struct hw_handler *hwh, unsigned bypassed,
 	 * initial state passed into us and then get an update here.
 	 */
 	if (!q) {
-		DMINFO("dm-emc: emc_pg_init: no queue");
+		DMINFO("emc_pg_init: no queue");
 		goto fail_path;
 	}
 
 	/* FIXME: The request should be pre-allocated. */
 	rq = emc_trespass_get(hwh->context, path);
 	if (!rq) {
-		DMERR("dm-emc: emc_pg_init: no rq");
+		DMERR("emc_pg_init: no rq");
 		goto fail_path;
 	}
 
-	DMINFO("dm-emc: emc_pg_init: sending switch-over command");
+	DMINFO("emc_pg_init: sending switch-over command");
 	elv_add_request(q, rq, ELEVATOR_INSERT_FRONT, 1);
 	return;
 
@@ -241,18 +243,18 @@ static int emc_create(struct hw_handler *hwh, unsigned argc, char **argv)
 		hr = 0;
 		short_trespass = 0;
 	} else if (argc != 2) {
-		DMWARN("dm-emc hwhandler: incorrect number of arguments");
+		DMWARN("incorrect number of arguments");
 		return -EINVAL;
 	} else {
 		if ((sscanf(argv[0], "%u", &short_trespass) != 1)
 			|| (short_trespass > 1)) {
-			DMWARN("dm-emc: invalid trespass mode selected");
+			DMWARN("invalid trespass mode selected");
 			return -EINVAL;
 		}
 
 		if ((sscanf(argv[1], "%u", &hr) != 1)
 			|| (hr > 1)) {
-			DMWARN("dm-emc: invalid honor reservation flag selected");
+			DMWARN("invalid honor reservation flag selected");
 			return -EINVAL;
 		}
 	}
@@ -264,14 +266,14 @@ static int emc_create(struct hw_handler *hwh, unsigned argc, char **argv)
 	hwh->context = h;
 
 	if ((h->short_trespass = short_trespass))
-		DMWARN("dm-emc: short trespass command will be send");
+		DMWARN("short trespass command will be send");
 	else
-		DMWARN("dm-emc: long trespass command will be send");
+		DMWARN("long trespass command will be send");
 
 	if ((h->hr = hr))
-		DMWARN("dm-emc: honor reservation bit will be set");
+		DMWARN("honor reservation bit will be set");
 	else
-		DMWARN("dm-emc: honor reservation bit will not be set (default)");
+		DMWARN("honor reservation bit will not be set (default)");
 
 	return 0;
 }
@@ -336,9 +338,9 @@ static int __init dm_emc_init(void)
 	int r = dm_register_hw_handler(&emc_hwh);
 
 	if (r < 0)
-		DMERR("emc: register failed %d", r);
+		DMERR("register failed %d", r);
 
-	DMINFO("dm-emc version 0.0.3 loaded");
+	DMINFO("version 0.0.3 loaded");
 
 	return r;
 }
@@ -348,7 +350,7 @@ static void __exit dm_emc_exit(void)
 	int r = dm_unregister_hw_handler(&emc_hwh);
 
 	if (r < 0)
-		DMERR("emc: unregister failed %d", r);
+		DMERR("unregister failed %d", r);
 }
 
 module_init(dm_emc_init);
diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c
index 34a75939a0ca..d12379b5cdb5 100644
--- a/drivers/md/dm-exception-store.c
+++ b/drivers/md/dm-exception-store.c
@@ -16,6 +16,8 @@
 #include <linux/vmalloc.h>
 #include <linux/slab.h>
 
+#define DM_MSG_PREFIX "snapshots"
+
 /*-----------------------------------------------------------------
  * Persistent snapshots, by persistent we mean that the snapshot
  * will survive a reboot.
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index c6ce13b18747..3edb3477f987 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -19,6 +19,7 @@
 
 #include <asm/uaccess.h>
 
+#define DM_MSG_PREFIX "ioctl"
 #define DM_DRIVER_EMAIL "dm-devel@redhat.com"
 
 /*-----------------------------------------------------------------
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index daf586c0898d..47b3c62bbdb8 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -12,6 +12,8 @@
 #include <linux/bio.h>
 #include <linux/slab.h>
 
+#define DM_MSG_PREFIX "linear"
+
 /*
  * Linear: maps a linear range of a device.
  */
@@ -29,7 +31,7 @@ static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	unsigned long long tmp;
 
 	if (argc != 2) {
-		ti->error = "dm-linear: Invalid argument count";
+		ti->error = "Invalid argument count";
 		return -EINVAL;
 	}
 
@@ -111,7 +113,7 @@ int __init dm_linear_init(void)
 	int r = dm_register_target(&linear_target);
 
 	if (r < 0)
-		DMERR("linear: register failed %d", r);
+		DMERR("register failed %d", r);
 
 	return r;
 }
@@ -121,5 +123,5 @@ void dm_linear_exit(void)
 	int r = dm_unregister_target(&linear_target);
 
 	if (r < 0)
-		DMERR("linear: unregister failed %d", r);
+		DMERR("unregister failed %d", r);
 }
diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c
index ba98bf57c7c0..64b764bd02cc 100644
--- a/drivers/md/dm-log.c
+++ b/drivers/md/dm-log.c
@@ -12,6 +12,8 @@
 #include "dm-log.h"
 #include "dm-io.h"
 
+#define DM_MSG_PREFIX "mirror log"
+
 static LIST_HEAD(_log_types);
 static DEFINE_SPINLOCK(_lock);
 
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 1816f30678ed..217615b33223 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -21,6 +21,7 @@
 #include <linux/workqueue.h>
 #include <asm/atomic.h>
 
+#define DM_MSG_PREFIX "multipath"
 #define MESG_STR(x) x, sizeof(x)
 
 /* Path properties */
@@ -446,8 +447,6 @@ struct param {
 	char *error;
 };
 
-#define ESTR(s) ("dm-multipath: " s)
-
 static int read_param(struct param *param, char *str, unsigned *v, char **error)
 {
 	if (!str ||
@@ -495,12 +494,12 @@ static int parse_path_selector(struct arg_set *as, struct priority_group *pg,
 	unsigned ps_argc;
 
 	static struct param _params[] = {
-		{0, 1024, ESTR("invalid number of path selector args")},
+		{0, 1024, "invalid number of path selector args"},
 	};
 
 	pst = dm_get_path_selector(shift(as));
 	if (!pst) {
-		ti->error = ESTR("unknown path selector type");
+		ti->error = "unknown path selector type";
 		return -EINVAL;
 	}
 
@@ -511,7 +510,7 @@ static int parse_path_selector(struct arg_set *as, struct priority_group *pg,
 	r = pst->create(&pg->ps, ps_argc, as->argv);
 	if (r) {
 		dm_put_path_selector(pst);
-		ti->error = ESTR("path selector constructor failed");
+		ti->error = "path selector constructor failed";
 		return r;
 	}
 
@@ -529,7 +528,7 @@ static struct pgpath *parse_path(struct arg_set *as, struct path_selector *ps,
 
 	/* we need at least a path arg */
 	if (as->argc < 1) {
-		ti->error = ESTR("no device given");
+		ti->error = "no device given";
 		return NULL;
 	}
 
@@ -540,7 +539,7 @@ static struct pgpath *parse_path(struct arg_set *as, struct path_selector *ps,
 	r = dm_get_device(ti, shift(as), ti->begin, ti->len,
 			  dm_table_get_mode(ti->table), &p->path.dev);
 	if (r) {
-		ti->error = ESTR("error getting device");
+		ti->error = "error getting device";
 		goto bad;
 	}
 
@@ -562,8 +561,8 @@ static struct priority_group *parse_priority_group(struct arg_set *as,
 						   struct dm_target *ti)
 {
 	static struct param _params[] = {
-		{1, 1024, ESTR("invalid number of paths")},
-		{0, 1024, ESTR("invalid number of selector args")}
+		{1, 1024, "invalid number of paths"},
+		{0, 1024, "invalid number of selector args"}
 	};
 
 	int r;
@@ -572,13 +571,13 @@ static struct priority_group *parse_priority_group(struct arg_set *as,
 
 	if (as->argc < 2) {
 		as->argc = 0;
-		ti->error = ESTR("not enough priority group aruments");
+		ti->error = "not enough priority group aruments";
 		return NULL;
 	}
 
 	pg = alloc_priority_group();
 	if (!pg) {
-		ti->error = ESTR("couldn't allocate priority group");
+		ti->error = "couldn't allocate priority group";
 		return NULL;
 	}
 	pg->m = m;
@@ -633,7 +632,7 @@ static int parse_hw_handler(struct arg_set *as, struct multipath *m,
 	unsigned hw_argc;
 
 	static struct param _params[] = {
-		{0, 1024, ESTR("invalid number of hardware handler args")},
+		{0, 1024, "invalid number of hardware handler args"},
 	};
 
 	r = read_param(_params, shift(as), &hw_argc, &ti->error);
@@ -645,14 +644,14 @@ static int parse_hw_handler(struct arg_set *as, struct multipath *m,
 
 	hwht = dm_get_hw_handler(shift(as));
 	if (!hwht) {
-		ti->error = ESTR("unknown hardware handler type");
+		ti->error = "unknown hardware handler type";
 		return -EINVAL;
 	}
 
 	r = hwht->create(&m->hw_handler, hw_argc - 1, as->argv);
 	if (r) {
 		dm_put_hw_handler(hwht);
-		ti->error = ESTR("hardware handler constructor failed");
+		ti->error = "hardware handler constructor failed";
 		return r;
 	}
 
@@ -669,7 +668,7 @@ static int parse_features(struct arg_set *as, struct multipath *m,
 	unsigned argc;
 
 	static struct param _params[] = {
-		{0, 1, ESTR("invalid number of feature args")},
+		{0, 1, "invalid number of feature args"},
 	};
 
 	r = read_param(_params, shift(as), &argc, &ti->error);
@@ -692,8 +691,8 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc,
 {
 	/* target parameters */
 	static struct param _params[] = {
-		{1, 1024, ESTR("invalid number of priority groups")},
-		{1, 1024, ESTR("invalid initial priority group number")},
+		{1, 1024, "invalid number of priority groups"},
+		{1, 1024, "invalid initial priority group number"},
 	};
 
 	int r;
@@ -707,7 +706,7 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc,
 
 	m = alloc_multipath();
 	if (!m) {
-		ti->error = ESTR("can't allocate multipath");
+		ti->error = "can't allocate multipath";
 		return -EINVAL;
 	}
 
@@ -746,7 +745,7 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc,
 	}
 
 	if (pg_count != m->nr_priority_groups) {
-		ti->error = ESTR("priority group count mismatch");
+		ti->error = "priority group count mismatch";
 		r = -EINVAL;
 		goto bad;
 	}
@@ -807,7 +806,7 @@ static int fail_path(struct pgpath *pgpath)
 	if (!pgpath->path.is_active)
 		goto out;
 
-	DMWARN("dm-multipath: Failing path %s.", pgpath->path.dev->name);
+	DMWARN("Failing path %s.", pgpath->path.dev->name);
 
 	pgpath->pg->ps.type->fail_path(&pgpath->pg->ps, &pgpath->path);
 	pgpath->path.is_active = 0;
@@ -1250,7 +1249,7 @@ static int multipath_message(struct dm_target *ti, unsigned argc, char **argv)
 	r = dm_get_device(ti, argv[1], ti->begin, ti->len,
 			  dm_table_get_mode(ti->table), &dev);
 	if (r) {
-		DMWARN("dm-multipath message: error getting device %s",
+		DMWARN("message: error getting device %s",
 		       argv[1]);
 		return -EINVAL;
 	}
@@ -1309,7 +1308,7 @@ static int __init dm_multipath_init(void)
 		return -ENOMEM;
 	}
 
-	DMINFO("dm-multipath version %u.%u.%u loaded",
+	DMINFO("version %u.%u.%u loaded",
 	       multipath_target.version[0], multipath_target.version[1],
 	       multipath_target.version[2]);
 
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 6ae42f1d76ae..be48cedf986b 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -20,6 +20,8 @@
 #include <linux/vmalloc.h>
 #include <linux/workqueue.h>
 
+#define DM_MSG_PREFIX "raid1"
+
 static struct workqueue_struct *_kmirrord_wq;
 static struct work_struct _kmirrord_work;
 
@@ -892,7 +894,7 @@ static struct mirror_set *alloc_context(unsigned int nr_mirrors,
 
 	ms = kmalloc(len, GFP_KERNEL);
 	if (!ms) {
-		ti->error = "dm-mirror: Cannot allocate mirror context";
+		ti->error = "Cannot allocate mirror context";
 		return NULL;
 	}
 
@@ -906,7 +908,7 @@ static struct mirror_set *alloc_context(unsigned int nr_mirrors,
 	ms->default_mirror = &ms->mirror[DEFAULT_MIRROR];
 
 	if (rh_init(&ms->rh, ms, dl, region_size, ms->nr_regions)) {
-		ti->error = "dm-mirror: Error creating dirty region hash";
+		ti->error = "Error creating dirty region hash";
 		kfree(ms);
 		return NULL;
 	}
@@ -936,14 +938,14 @@ static int get_mirror(struct mirror_set *ms, struct dm_target *ti,
 	unsigned long long offset;
 
 	if (sscanf(argv[1], "%llu", &offset) != 1) {
-		ti->error = "dm-mirror: Invalid offset";
+		ti->error = "Invalid offset";
 		return -EINVAL;
 	}
 
 	if (dm_get_device(ti, argv[0], offset, ti->len,
 			  dm_table_get_mode(ti->table),
 			  &ms->mirror[mirror].dev)) {
-		ti->error = "dm-mirror: Device lookup failure";
+		ti->error = "Device lookup failure";
 		return -ENXIO;
 	}
 
@@ -980,30 +982,30 @@ static struct dirty_log *create_dirty_log(struct dm_target *ti,
 	struct dirty_log *dl;
 
 	if (argc < 2) {
-		ti->error = "dm-mirror: Insufficient mirror log arguments";
+		ti->error = "Insufficient mirror log arguments";
 		return NULL;
 	}
 
 	if (sscanf(argv[1], "%u", &param_count) != 1) {
-		ti->error = "dm-mirror: Invalid mirror log argument count";
+		ti->error = "Invalid mirror log argument count";
 		return NULL;
 	}
 
 	*args_used = 2 + param_count;
 
 	if (argc < *args_used) {
-		ti->error = "dm-mirror: Insufficient mirror log arguments";
+		ti->error = "Insufficient mirror log arguments";
 		return NULL;
 	}
 
 	dl = dm_create_dirty_log(argv[0], ti, param_count, argv + 2);
 	if (!dl) {
-		ti->error = "dm-mirror: Error creating mirror dirty log";
+		ti->error = "Error creating mirror dirty log";
 		return NULL;
 	}
 
 	if (!_check_region_size(ti, dl->type->get_region_size(dl))) {
-		ti->error = "dm-mirror: Invalid region size";
+		ti->error = "Invalid region size";
 		dm_destroy_dirty_log(dl);
 		return NULL;
 	}
@@ -1037,7 +1039,7 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 
 	if (!argc || sscanf(argv[0], "%u", &nr_mirrors) != 1 ||
 	    nr_mirrors < 2 || nr_mirrors > KCOPYD_MAX_REGIONS + 1) {
-		ti->error = "dm-mirror: Invalid number of mirrors";
+		ti->error = "Invalid number of mirrors";
 		dm_destroy_dirty_log(dl);
 		return -EINVAL;
 	}
@@ -1045,7 +1047,7 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	argv++, argc--;
 
 	if (argc != nr_mirrors * 2) {
-		ti->error = "dm-mirror: Wrong number of mirror arguments";
+		ti->error = "Wrong number of mirror arguments";
 		dm_destroy_dirty_log(dl);
 		return -EINVAL;
 	}
diff --git a/drivers/md/dm-round-robin.c b/drivers/md/dm-round-robin.c
index d0024865a789..c5a16c550122 100644
--- a/drivers/md/dm-round-robin.c
+++ b/drivers/md/dm-round-robin.c
@@ -14,6 +14,8 @@
 
 #include <linux/slab.h>
 
+#define DM_MSG_PREFIX "multipath round-robin"
+
 /*-----------------------------------------------------------------
  * Path-handling code, paths are held in lists
  *---------------------------------------------------------------*/
@@ -191,9 +193,9 @@ static int __init dm_rr_init(void)
 	int r = dm_register_path_selector(&rr_ps);
 
 	if (r < 0)
-		DMERR("round-robin: register failed %d", r);
+		DMERR("register failed %d", r);
 
-	DMINFO("dm-round-robin version 1.0.0 loaded");
+	DMINFO("version 1.0.0 loaded");
 
 	return r;
 }
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index b84bc1aae353..8eea0ddbf5ec 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -23,6 +23,8 @@
 #include "dm-bio-list.h"
 #include "kcopyd.h"
 
+#define DM_MSG_PREFIX "snapshots"
+
 /*
  * The percentage increment we will wake up users at
  */
@@ -117,7 +119,7 @@ static int init_origin_hash(void)
 	_origins = kmalloc(ORIGIN_HASH_SIZE * sizeof(struct list_head),
 			   GFP_KERNEL);
 	if (!_origins) {
-		DMERR("Device mapper: Snapshot: unable to allocate memory");
+		DMERR("unable to allocate memory");
 		return -ENOMEM;
 	}
 
@@ -412,7 +414,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	int blocksize;
 
 	if (argc < 4) {
-		ti->error = "dm-snapshot: requires exactly 4 arguments";
+		ti->error = "requires exactly 4 arguments";
 		r = -EINVAL;
 		goto bad1;
 	}
@@ -1127,7 +1129,7 @@ static int origin_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	struct dm_dev *dev;
 
 	if (argc != 1) {
-		ti->error = "dm-origin: incorrect number of arguments";
+		ti->error = "origin: incorrect number of arguments";
 		return -EINVAL;
 	}
 
@@ -1236,7 +1238,7 @@ static int __init dm_snapshot_init(void)
 
 	r = dm_register_target(&origin_target);
 	if (r < 0) {
-		DMERR("Device mapper: Origin: register failed %d\n", r);
+		DMERR("Origin target register failed %d", r);
 		goto bad1;
 	}
 
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index 08328a8f5a3c..6c29fcecd892 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -12,6 +12,8 @@
 #include <linux/bio.h>
 #include <linux/slab.h>
 
+#define DM_MSG_PREFIX "striped"
+
 struct stripe {
 	struct dm_dev *dev;
 	sector_t physical_start;
@@ -78,19 +80,19 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	unsigned int i;
 
 	if (argc < 2) {
-		ti->error = "dm-stripe: Not enough arguments";
+		ti->error = "Not enough arguments";
 		return -EINVAL;
 	}
 
 	stripes = simple_strtoul(argv[0], &end, 10);
 	if (*end) {
-		ti->error = "dm-stripe: Invalid stripe count";
+		ti->error = "Invalid stripe count";
 		return -EINVAL;
 	}
 
 	chunk_size = simple_strtoul(argv[1], &end, 10);
 	if (*end) {
-		ti->error = "dm-stripe: Invalid chunk_size";
+		ti->error = "Invalid chunk_size";
 		return -EINVAL;
 	}
 
@@ -99,19 +101,19 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	 */
 	if (!chunk_size || (chunk_size & (chunk_size - 1)) ||
 	    (chunk_size < (PAGE_SIZE >> SECTOR_SHIFT))) {
-		ti->error = "dm-stripe: Invalid chunk size";
+		ti->error = "Invalid chunk size";
 		return -EINVAL;
 	}
 
 	if (ti->len & (chunk_size - 1)) {
-		ti->error = "dm-stripe: Target length not divisible by "
+		ti->error = "Target length not divisible by "
 		    "chunk size";
 		return -EINVAL;
 	}
 
 	width = ti->len;
 	if (sector_div(width, stripes)) {
-		ti->error = "dm-stripe: Target length not divisible by "
+		ti->error = "Target length not divisible by "
 		    "number of stripes";
 		return -EINVAL;
 	}
@@ -120,14 +122,14 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	 * Do we have enough arguments for that many stripes ?
 	 */
 	if (argc != (2 + 2 * stripes)) {
-		ti->error = "dm-stripe: Not enough destinations "
+		ti->error = "Not enough destinations "
 			"specified";
 		return -EINVAL;
 	}
 
 	sc = alloc_context(stripes);
 	if (!sc) {
-		ti->error = "dm-stripe: Memory allocation for striped context "
+		ti->error = "Memory allocation for striped context "
 		    "failed";
 		return -ENOMEM;
 	}
@@ -149,8 +151,7 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 
 		r = get_stripe(ti, sc, i, argv);
 		if (r < 0) {
-			ti->error = "dm-stripe: Couldn't parse stripe "
-				"destination";
+			ti->error = "Couldn't parse stripe destination";
 			while (i--)
 				dm_put_device(ti, sc->stripe[i].dev);
 			kfree(sc);
@@ -227,7 +228,7 @@ int __init dm_stripe_init(void)
 
 	r = dm_register_target(&stripe_target);
 	if (r < 0)
-		DMWARN("striped target registration failed");
+		DMWARN("target registration failed");
 
 	return r;
 }
@@ -235,7 +236,7 @@ int __init dm_stripe_init(void)
 void dm_stripe_exit(void)
 {
 	if (dm_unregister_target(&stripe_target))
-		DMWARN("striped target unregistration failed");
+		DMWARN("target unregistration failed");
 
 	return;
 }
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 827b648fac50..75fe9493e6af 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -17,6 +17,8 @@
 #include <linux/mutex.h>
 #include <asm/atomic.h>
 
+#define DM_MSG_PREFIX "table"
+
 #define MAX_DEPTH 16
 #define NODE_SIZE L1_CACHE_BYTES
 #define KEYS_PER_NODE (NODE_SIZE / sizeof(sector_t))
@@ -715,15 +717,14 @@ int dm_table_add_target(struct dm_table *t, const char *type,
 	memset(tgt, 0, sizeof(*tgt));
 
 	if (!len) {
-		tgt->error = "zero-length target";
-		DMERR("%s", tgt->error);
+		DMERR("%s: zero-length target", dm_device_name(t->md));
 		return -EINVAL;
 	}
 
 	tgt->type = dm_get_target_type(type);
 	if (!tgt->type) {
-		tgt->error = "unknown target type";
-		DMERR("%s", tgt->error);
+		DMERR("%s: %s: unknown target type", dm_device_name(t->md),
+		      type);
 		return -EINVAL;
 	}
 
@@ -760,7 +761,7 @@ int dm_table_add_target(struct dm_table *t, const char *type,
 	return 0;
 
  bad:
-	DMERR("%s", tgt->error);
+	DMERR("%s: %s: %s", dm_device_name(t->md), type, tgt->error);
 	dm_put_target_type(tgt->type);
 	return r;
 }
diff --git a/drivers/md/dm-target.c b/drivers/md/dm-target.c
index 64fd8e79ea4c..477a041a41cf 100644
--- a/drivers/md/dm-target.c
+++ b/drivers/md/dm-target.c
@@ -12,6 +12,8 @@
 #include <linux/bio.h>
 #include <linux/slab.h>
 
+#define DM_MSG_PREFIX "target"
+
 struct tt_internal {
 	struct target_type tt;
 
diff --git a/drivers/md/dm-zero.c b/drivers/md/dm-zero.c
index 51c0639b2487..ea569f7348d2 100644
--- a/drivers/md/dm-zero.c
+++ b/drivers/md/dm-zero.c
@@ -10,13 +10,15 @@
 #include <linux/init.h>
 #include <linux/bio.h>
 
+#define DM_MSG_PREFIX "zero"
+
 /*
  * Construct a dummy mapping that only returns zeros
  */
 static int zero_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 {
 	if (argc != 0) {
-		ti->error = "dm-zero: No arguments required";
+		ti->error = "No arguments required";
 		return -EINVAL;
 	}
 
@@ -60,7 +62,7 @@ static int __init dm_zero_init(void)
 	int r = dm_register_target(&zero_target);
 
 	if (r < 0)
-		DMERR("zero: register failed %d", r);
+		DMERR("register failed %d", r);
 
 	return r;
 }
@@ -70,7 +72,7 @@ static void __exit dm_zero_exit(void)
 	int r = dm_unregister_target(&zero_target);
 
 	if (r < 0)
-		DMERR("zero: unregister failed %d", r);
+		DMERR("unregister failed %d", r);
 }
 
 module_init(dm_zero_init)
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 6982f86db6dc..3ed2e53b9eb6 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -21,6 +21,8 @@
 #include <linux/hdreg.h>
 #include <linux/blktrace_api.h>
 
+#define DM_MSG_PREFIX "core"
+
 static const char *_name = DM_NAME;
 
 static unsigned int major = 0;
@@ -1108,6 +1110,12 @@ void dm_get(struct mapped_device *md)
 	atomic_inc(&md->holders);
 }
 
+const char *dm_device_name(struct mapped_device *md)
+{
+	return md->name;
+}
+EXPORT_SYMBOL_GPL(dm_device_name);
+
 void dm_put(struct mapped_device *md)
 {
 	struct dm_table *map;
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 9ebb24b037bc..3c03c0ecab7e 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -17,9 +17,10 @@
 #include <linux/hdreg.h>
 
 #define DM_NAME "device-mapper"
-#define DMWARN(f, x...) printk(KERN_WARNING DM_NAME ": " f "\n" , ## x)
-#define DMERR(f, x...) printk(KERN_ERR DM_NAME ": " f "\n" , ## x)
-#define DMINFO(f, x...) printk(KERN_INFO DM_NAME ": " f "\n" , ## x)
+
+#define DMERR(f, arg...) printk(KERN_ERR DM_NAME ": " DM_MSG_PREFIX ": " f "\n", ## arg)
+#define DMWARN(f, arg...) printk(KERN_WARNING DM_NAME ": " DM_MSG_PREFIX ": " f "\n", ## arg)
+#define DMINFO(f, arg...) printk(KERN_INFO DM_NAME ": " DM_MSG_PREFIX ": " f "\n", ## arg)
 
 #define DMEMIT(x...) sz += ((sz >= maxlen) ? \
 			  0 : scnprintf(result + sz, maxlen - sz, x))
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index 61103d2bc244..e3d1c33d1558 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -171,6 +171,7 @@ int dm_wait_event(struct mapped_device *md, int event_nr);
 /*
  * Info functions.
  */
+const char *dm_device_name(struct mapped_device *md);
 struct gendisk *dm_disk(struct mapped_device *md);
 int dm_suspended(struct mapped_device *md);
 
-- 
cgit v1.2.3


From 8932c2e0dcae52e73430878fd8a7a7800176eada Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 26 Jun 2006 00:27:36 -0700
Subject: [PATCH] md: remove arbitrary limit on chunk size

The largest chunk size the code can support without substantial surgery is
2^30 bytes, so make that the limit instead of an arbitrary 4Meg.  Some day,
the 'chunksize' should change to a sector-shift instead of a byte-count.  Then
no limit would be needed.

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/md/raid10.c       | 2 +-
 drivers/md/raid5.c        | 4 ++--
 drivers/md/raid6main.c    | 4 ++--
 include/linux/raid/md_k.h | 3 ++-
 4 files changed, 7 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 1440935414e6..affeaefd4033 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -2050,7 +2050,7 @@ static int run(mddev_t *mddev)
 	 * maybe...
 	 */
 	{
-		int stripe = conf->raid_disks * mddev->chunk_size / PAGE_SIZE;
+		int stripe = conf->raid_disks * (mddev->chunk_size / PAGE_SIZE);
 		stripe /= conf->near_copies;
 		if (mddev->queue->backing_dev_info.ra_pages < 2* stripe)
 			mddev->queue->backing_dev_info.ra_pages = 2* stripe;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 31843604049c..122e64e557b1 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2382,8 +2382,8 @@ static int run(mddev_t *mddev)
 	 * 2 * (n-1) * chunksize where 'n' is the number of raid devices
 	 */
 	{
-		int stripe = (mddev->raid_disks-1) * mddev->chunk_size
-			/ PAGE_SIZE;
+		int stripe = (mddev->raid_disks-1) *
+			(mddev->chunk_size / PAGE_SIZE);
 		if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
 			mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
 	}
diff --git a/drivers/md/raid6main.c b/drivers/md/raid6main.c
index bc69355e0100..e53d2d96ea3a 100644
--- a/drivers/md/raid6main.c
+++ b/drivers/md/raid6main.c
@@ -2135,8 +2135,8 @@ static int run(mddev_t *mddev)
 	 * 2 * (n-2) * chunksize where 'n' is the number of raid devices
 	 */
 	{
-		int stripe = (mddev->raid_disks-2) * mddev->chunk_size
-			/ PAGE_SIZE;
+		int stripe = (mddev->raid_disks-2) *
+			(mddev->chunk_size / PAGE_SIZE);
 		if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
 			mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
 	}
diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h
index e2df61f5b09a..db2ca2d9066e 100644
--- a/include/linux/raid/md_k.h
+++ b/include/linux/raid/md_k.h
@@ -40,7 +40,8 @@ typedef struct mdk_rdev_s mdk_rdev_t;
  * options passed in raidrun:
  */
 
-#define MAX_CHUNK_SIZE (4096*1024)
+/* Currently this must fix in an 'int' */
+#define MAX_CHUNK_SIZE (1<<30)
 
 /*
  * MD's 'extended' device
-- 
cgit v1.2.3


From 16a53ecc35f2a80dc285be2e769768847d89ca37 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 26 Jun 2006 00:27:38 -0700
Subject: [PATCH] md: merge raid5 and raid6 code

There is a lot of commonality between raid5.c and raid6main.c.  This patches
merges both into one module called raid456.  This saves a lot of code, and
paves the way for online raid5->raid6 migrations.

There is still duplication, e.g.  between handle_stripe5 and handle_stripe6.
This will probably be cleaned up later.

Cc: "H. Peter Anvin" <hpa@zytor.com>
Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/md/Kconfig         |   38 +-
 drivers/md/Makefile        |    5 +-
 drivers/md/raid5.c         | 1161 ++++++++++++++++++---
 drivers/md/raid6main.c     | 2427 --------------------------------------------
 include/linux/raid/raid5.h |    1 +
 5 files changed, 1054 insertions(+), 2578 deletions(-)
 delete mode 100644 drivers/md/raid6main.c

(limited to 'include/linux')

diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index ac25a48362ac..f657aa7ec78c 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -104,8 +104,8 @@ config MD_RAID10
 
 	  If unsure, say Y.
 
-config MD_RAID5
-	tristate "RAID-4/RAID-5 mode"
+config MD_RAID456
+	tristate "RAID-4/RAID-5/RAID-6 mode"
 	depends on BLK_DEV_MD
 	---help---
 	  A RAID-5 set of N drives with a capacity of C MB per drive provides
@@ -116,14 +116,22 @@ config MD_RAID5
 	  while a RAID-5 set distributes the parity across the drives in one
 	  of the available parity distribution methods.
 
+	  A RAID-6 set of N drives with a capacity of C MB per drive
+	  provides the capacity of C * (N - 2) MB, and protects
+	  against a failure of any two drives. For a given sector
+	  (row) number, (N - 2) drives contain data sectors, and two
+	  drives contains two independent redundancy syndromes.  Like
+	  RAID-5, RAID-6 distributes the syndromes across the drives
+	  in one of the available parity distribution methods.
+
 	  Information about Software RAID on Linux is contained in the
 	  Software-RAID mini-HOWTO, available from
 	  <http://www.tldp.org/docs.html#howto>. There you will also
 	  learn where to get the supporting user space utilities raidtools.
 
-	  If you want to use such a RAID-4/RAID-5 set, say Y.  To
+	  If you want to use such a RAID-4/RAID-5/RAID-6 set, say Y.  To
 	  compile this code as a module, choose M here: the module
-	  will be called raid5.
+	  will be called raid456.
 
 	  If unsure, say Y.
 
@@ -154,28 +162,6 @@ config MD_RAID5_RESHAPE
 	  There should be enough spares already present to make the new
 	  array workable.
 
-config MD_RAID6
-	tristate "RAID-6 mode"
-	depends on BLK_DEV_MD
-	---help---
-	  A RAID-6 set of N drives with a capacity of C MB per drive
-	  provides the capacity of C * (N - 2) MB, and protects
-	  against a failure of any two drives. For a given sector
-	  (row) number, (N - 2) drives contain data sectors, and two
-	  drives contains two independent redundancy syndromes.  Like
-	  RAID-5, RAID-6 distributes the syndromes across the drives
-	  in one of the available parity distribution methods.
-
-	  RAID-6 requires mdadm-1.5.0 or later, available at:
-
-	  ftp://ftp.kernel.org/pub/linux/utils/raid/mdadm/
-
-	  If you want to use such a RAID-6 set, say Y.  To compile
-	  this code as a module, choose M here: the module will be
-	  called raid6.
-
-	  If unsure, say Y.
-
 config MD_MULTIPATH
 	tristate "Multipath I/O support"
 	depends on BLK_DEV_MD
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index d3efedf6a6ad..34957a68d921 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -8,7 +8,7 @@ dm-multipath-objs := dm-hw-handler.o dm-path-selector.o dm-mpath.o
 dm-snapshot-objs := dm-snap.o dm-exception-store.o
 dm-mirror-objs	:= dm-log.o dm-raid1.o
 md-mod-objs     := md.o bitmap.o
-raid6-objs	:= raid6main.o raid6algos.o raid6recov.o raid6tables.o \
+raid456-objs	:= raid5.o raid6algos.o raid6recov.o raid6tables.o \
 		   raid6int1.o raid6int2.o raid6int4.o \
 		   raid6int8.o raid6int16.o raid6int32.o \
 		   raid6altivec1.o raid6altivec2.o raid6altivec4.o \
@@ -25,8 +25,7 @@ obj-$(CONFIG_MD_LINEAR)		+= linear.o
 obj-$(CONFIG_MD_RAID0)		+= raid0.o
 obj-$(CONFIG_MD_RAID1)		+= raid1.o
 obj-$(CONFIG_MD_RAID10)		+= raid10.o
-obj-$(CONFIG_MD_RAID5)		+= raid5.o xor.o
-obj-$(CONFIG_MD_RAID6)		+= raid6.o xor.o
+obj-$(CONFIG_MD_RAID456)	+= raid456.o xor.o
 obj-$(CONFIG_MD_MULTIPATH)	+= multipath.o
 obj-$(CONFIG_MD_FAULTY)		+= faulty.o
 obj-$(CONFIG_BLK_DEV_MD)	+= md-mod.o
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 122e64e557b1..9ba73074df04 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2,8 +2,11 @@
  * raid5.c : Multiple Devices driver for Linux
  *	   Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
  *	   Copyright (C) 1999, 2000 Ingo Molnar
+ *	   Copyright (C) 2002, 2003 H. Peter Anvin
  *
- * RAID-5 management functions.
+ * RAID-4/5/6 management functions.
+ * Thanks to Penguin Computing for making the RAID-6 development possible
+ * by donating a test server!
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -19,11 +22,11 @@
 #include <linux/config.h>
 #include <linux/module.h>
 #include <linux/slab.h>
-#include <linux/raid/raid5.h>
 #include <linux/highmem.h>
 #include <linux/bitops.h>
 #include <linux/kthread.h>
 #include <asm/atomic.h>
+#include "raid6.h"
 
 #include <linux/raid/bitmap.h>
 
@@ -68,6 +71,16 @@
 #define __inline__
 #endif
 
+#if !RAID6_USE_EMPTY_ZERO_PAGE
+/* In .bss so it's zeroed */
+const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256)));
+#endif
+
+static inline int raid6_next_disk(int disk, int raid_disks)
+{
+	disk++;
+	return (disk < raid_disks) ? disk : 0;
+}
 static void print_raid5_conf (raid5_conf_t *conf);
 
 static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
@@ -104,7 +117,7 @@ static void release_stripe(struct stripe_head *sh)
 {
 	raid5_conf_t *conf = sh->raid_conf;
 	unsigned long flags;
-	
+
 	spin_lock_irqsave(&conf->device_lock, flags);
 	__release_stripe(conf, sh);
 	spin_unlock_irqrestore(&conf->device_lock, flags);
@@ -117,7 +130,7 @@ static inline void remove_hash(struct stripe_head *sh)
 	hlist_del_init(&sh->hash);
 }
 
-static void insert_hash(raid5_conf_t *conf, struct stripe_head *sh)
+static inline void insert_hash(raid5_conf_t *conf, struct stripe_head *sh)
 {
 	struct hlist_head *hp = stripe_hash(conf, sh->sector);
 
@@ -190,7 +203,7 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx, int
 		(unsigned long long)sh->sector);
 
 	remove_hash(sh);
-	
+
 	sh->sector = sector;
 	sh->pd_idx = pd_idx;
 	sh->state = 0;
@@ -269,8 +282,9 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector
 			} else {
 				if (!test_bit(STRIPE_HANDLE, &sh->state))
 					atomic_inc(&conf->active_stripes);
-				if (!list_empty(&sh->lru))
-					list_del_init(&sh->lru);
+				if (list_empty(&sh->lru))
+					BUG();
+				list_del_init(&sh->lru);
 			}
 		}
 	} while (sh == NULL);
@@ -321,10 +335,9 @@ static int grow_stripes(raid5_conf_t *conf, int num)
 		return 1;
 	conf->slab_cache = sc;
 	conf->pool_size = devs;
-	while (num--) {
+	while (num--)
 		if (!grow_one_stripe(conf))
 			return 1;
-	}
 	return 0;
 }
 
@@ -631,8 +644,7 @@ static void raid5_build_block (struct stripe_head *sh, int i)
 	dev->req.bi_private = sh;
 
 	dev->flags = 0;
-	if (i != sh->pd_idx)
-		dev->sector = compute_blocknr(sh, i);
+	dev->sector = compute_blocknr(sh, i);
 }
 
 static void error(mddev_t *mddev, mdk_rdev_t *rdev)
@@ -659,7 +671,7 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
 			" Operation continuing on %d devices\n",
 			bdevname(rdev->bdev,b), conf->working_disks);
 	}
-}	
+}
 
 /*
  * Input: a 'big' sector number,
@@ -697,9 +709,12 @@ static sector_t raid5_compute_sector(sector_t r_sector, unsigned int raid_disks,
 	/*
 	 * Select the parity disk based on the user selected algorithm.
 	 */
-	if (conf->level == 4)
+	switch(conf->level) {
+	case 4:
 		*pd_idx = data_disks;
-	else switch (conf->algorithm) {
+		break;
+	case 5:
+		switch (conf->algorithm) {
 		case ALGORITHM_LEFT_ASYMMETRIC:
 			*pd_idx = data_disks - stripe % raid_disks;
 			if (*dd_idx >= *pd_idx)
@@ -721,6 +736,39 @@ static sector_t raid5_compute_sector(sector_t r_sector, unsigned int raid_disks,
 		default:
 			printk(KERN_ERR "raid5: unsupported algorithm %d\n",
 				conf->algorithm);
+		}
+		break;
+	case 6:
+
+		/**** FIX THIS ****/
+		switch (conf->algorithm) {
+		case ALGORITHM_LEFT_ASYMMETRIC:
+			*pd_idx = raid_disks - 1 - (stripe % raid_disks);
+			if (*pd_idx == raid_disks-1)
+				(*dd_idx)++; 	/* Q D D D P */
+			else if (*dd_idx >= *pd_idx)
+				(*dd_idx) += 2; /* D D P Q D */
+			break;
+		case ALGORITHM_RIGHT_ASYMMETRIC:
+			*pd_idx = stripe % raid_disks;
+			if (*pd_idx == raid_disks-1)
+				(*dd_idx)++; 	/* Q D D D P */
+			else if (*dd_idx >= *pd_idx)
+				(*dd_idx) += 2; /* D D P Q D */
+			break;
+		case ALGORITHM_LEFT_SYMMETRIC:
+			*pd_idx = raid_disks - 1 - (stripe % raid_disks);
+			*dd_idx = (*pd_idx + 2 + *dd_idx) % raid_disks;
+			break;
+		case ALGORITHM_RIGHT_SYMMETRIC:
+			*pd_idx = stripe % raid_disks;
+			*dd_idx = (*pd_idx + 2 + *dd_idx) % raid_disks;
+			break;
+		default:
+			printk (KERN_CRIT "raid6: unsupported algorithm %d\n",
+				conf->algorithm);
+		}
+		break;
 	}
 
 	/*
@@ -742,12 +790,17 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i)
 	int chunk_number, dummy1, dummy2, dd_idx = i;
 	sector_t r_sector;
 
+
 	chunk_offset = sector_div(new_sector, sectors_per_chunk);
 	stripe = new_sector;
 	BUG_ON(new_sector != stripe);
 
-	
-	switch (conf->algorithm) {
+	if (i == sh->pd_idx)
+		return 0;
+	switch(conf->level) {
+	case 4: break;
+	case 5:
+		switch (conf->algorithm) {
 		case ALGORITHM_LEFT_ASYMMETRIC:
 		case ALGORITHM_RIGHT_ASYMMETRIC:
 			if (i > sh->pd_idx)
@@ -761,7 +814,37 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i)
 			break;
 		default:
 			printk(KERN_ERR "raid5: unsupported algorithm %d\n",
+			       conf->algorithm);
+		}
+		break;
+	case 6:
+		data_disks = raid_disks - 2;
+		if (i == raid6_next_disk(sh->pd_idx, raid_disks))
+			return 0; /* It is the Q disk */
+		switch (conf->algorithm) {
+		case ALGORITHM_LEFT_ASYMMETRIC:
+		case ALGORITHM_RIGHT_ASYMMETRIC:
+		  	if (sh->pd_idx == raid_disks-1)
+				i--; 	/* Q D D D P */
+			else if (i > sh->pd_idx)
+				i -= 2; /* D D P Q D */
+			break;
+		case ALGORITHM_LEFT_SYMMETRIC:
+		case ALGORITHM_RIGHT_SYMMETRIC:
+			if (sh->pd_idx == raid_disks-1)
+				i--; /* Q D D D P */
+			else {
+				/* D D P Q D */
+				if (i < sh->pd_idx)
+					i += raid_disks;
+				i -= (sh->pd_idx + 2);
+			}
+			break;
+		default:
+			printk (KERN_CRIT "raid6: unsupported algorithm %d\n",
 				conf->algorithm);
+		}
+		break;
 	}
 
 	chunk_number = stripe * data_disks + i;
@@ -778,10 +861,11 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i)
 
 
 /*
- * Copy data between a page in the stripe cache, and a bio.
- * There are no alignment or size guarantees between the page or the
- * bio except that there is some overlap.
- * All iovecs in the bio must be considered.
+ * Copy data between a page in the stripe cache, and one or more bion
+ * The page could align with the middle of the bio, or there could be
+ * several bion, each with several bio_vecs, which cover part of the page
+ * Multiple bion are linked together on bi_next.  There may be extras
+ * at the end of this list.  We ignore them.
  */
 static void copy_data(int frombio, struct bio *bio,
 		     struct page *page,
@@ -810,7 +894,7 @@ static void copy_data(int frombio, struct bio *bio,
 		if (len > 0 && page_offset + len > STRIPE_SIZE)
 			clen = STRIPE_SIZE - page_offset;
 		else clen = len;
-			
+
 		if (clen > 0) {
 			char *ba = __bio_kmap_atomic(bio, i, KM_USER0);
 			if (frombio)
@@ -862,14 +946,14 @@ static void compute_block(struct stripe_head *sh, int dd_idx)
 	set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags);
 }
 
-static void compute_parity(struct stripe_head *sh, int method)
+static void compute_parity5(struct stripe_head *sh, int method)
 {
 	raid5_conf_t *conf = sh->raid_conf;
 	int i, pd_idx = sh->pd_idx, disks = sh->disks, count;
 	void *ptr[MAX_XOR_BLOCKS];
 	struct bio *chosen;
 
-	PRINTK("compute_parity, stripe %llu, method %d\n",
+	PRINTK("compute_parity5, stripe %llu, method %d\n",
 		(unsigned long long)sh->sector, method);
 
 	count = 1;
@@ -956,9 +1040,195 @@ static void compute_parity(struct stripe_head *sh, int method)
 		clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
 }
 
+static void compute_parity6(struct stripe_head *sh, int method)
+{
+	raid6_conf_t *conf = sh->raid_conf;
+	int i, pd_idx = sh->pd_idx, qd_idx, d0_idx, disks = conf->raid_disks, count;
+	struct bio *chosen;
+	/**** FIX THIS: This could be very bad if disks is close to 256 ****/
+	void *ptrs[disks];
+
+	qd_idx = raid6_next_disk(pd_idx, disks);
+	d0_idx = raid6_next_disk(qd_idx, disks);
+
+	PRINTK("compute_parity, stripe %llu, method %d\n",
+		(unsigned long long)sh->sector, method);
+
+	switch(method) {
+	case READ_MODIFY_WRITE:
+		BUG();		/* READ_MODIFY_WRITE N/A for RAID-6 */
+	case RECONSTRUCT_WRITE:
+		for (i= disks; i-- ;)
+			if ( i != pd_idx && i != qd_idx && sh->dev[i].towrite ) {
+				chosen = sh->dev[i].towrite;
+				sh->dev[i].towrite = NULL;
+
+				if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
+					wake_up(&conf->wait_for_overlap);
+
+				if (sh->dev[i].written) BUG();
+				sh->dev[i].written = chosen;
+			}
+		break;
+	case CHECK_PARITY:
+		BUG();		/* Not implemented yet */
+	}
+
+	for (i = disks; i--;)
+		if (sh->dev[i].written) {
+			sector_t sector = sh->dev[i].sector;
+			struct bio *wbi = sh->dev[i].written;
+			while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
+				copy_data(1, wbi, sh->dev[i].page, sector);
+				wbi = r5_next_bio(wbi, sector);
+			}
+
+			set_bit(R5_LOCKED, &sh->dev[i].flags);
+			set_bit(R5_UPTODATE, &sh->dev[i].flags);
+		}
+
+//	switch(method) {
+//	case RECONSTRUCT_WRITE:
+//	case CHECK_PARITY:
+//	case UPDATE_PARITY:
+		/* Note that unlike RAID-5, the ordering of the disks matters greatly. */
+		/* FIX: Is this ordering of drives even remotely optimal? */
+		count = 0;
+		i = d0_idx;
+		do {
+			ptrs[count++] = page_address(sh->dev[i].page);
+			if (count <= disks-2 && !test_bit(R5_UPTODATE, &sh->dev[i].flags))
+				printk("block %d/%d not uptodate on parity calc\n", i,count);
+			i = raid6_next_disk(i, disks);
+		} while ( i != d0_idx );
+//		break;
+//	}
+
+	raid6_call.gen_syndrome(disks, STRIPE_SIZE, ptrs);
+
+	switch(method) {
+	case RECONSTRUCT_WRITE:
+		set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
+		set_bit(R5_UPTODATE, &sh->dev[qd_idx].flags);
+		set_bit(R5_LOCKED,   &sh->dev[pd_idx].flags);
+		set_bit(R5_LOCKED,   &sh->dev[qd_idx].flags);
+		break;
+	case UPDATE_PARITY:
+		set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
+		set_bit(R5_UPTODATE, &sh->dev[qd_idx].flags);
+		break;
+	}
+}
+
+
+/* Compute one missing block */
+static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero)
+{
+	raid6_conf_t *conf = sh->raid_conf;
+	int i, count, disks = conf->raid_disks;
+	void *ptr[MAX_XOR_BLOCKS], *p;
+	int pd_idx = sh->pd_idx;
+	int qd_idx = raid6_next_disk(pd_idx, disks);
+
+	PRINTK("compute_block_1, stripe %llu, idx %d\n",
+		(unsigned long long)sh->sector, dd_idx);
+
+	if ( dd_idx == qd_idx ) {
+		/* We're actually computing the Q drive */
+		compute_parity6(sh, UPDATE_PARITY);
+	} else {
+		ptr[0] = page_address(sh->dev[dd_idx].page);
+		if (!nozero) memset(ptr[0], 0, STRIPE_SIZE);
+		count = 1;
+		for (i = disks ; i--; ) {
+			if (i == dd_idx || i == qd_idx)
+				continue;
+			p = page_address(sh->dev[i].page);
+			if (test_bit(R5_UPTODATE, &sh->dev[i].flags))
+				ptr[count++] = p;
+			else
+				printk("compute_block() %d, stripe %llu, %d"
+				       " not present\n", dd_idx,
+				       (unsigned long long)sh->sector, i);
+
+			check_xor();
+		}
+		if (count != 1)
+			xor_block(count, STRIPE_SIZE, ptr);
+		if (!nozero) set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags);
+		else clear_bit(R5_UPTODATE, &sh->dev[dd_idx].flags);
+	}
+}
+
+/* Compute two missing blocks */
+static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2)
+{
+	raid6_conf_t *conf = sh->raid_conf;
+	int i, count, disks = conf->raid_disks;
+	int pd_idx = sh->pd_idx;
+	int qd_idx = raid6_next_disk(pd_idx, disks);
+	int d0_idx = raid6_next_disk(qd_idx, disks);
+	int faila, failb;
+
+	/* faila and failb are disk numbers relative to d0_idx */
+	/* pd_idx become disks-2 and qd_idx become disks-1 */
+	faila = (dd_idx1 < d0_idx) ? dd_idx1+(disks-d0_idx) : dd_idx1-d0_idx;
+	failb = (dd_idx2 < d0_idx) ? dd_idx2+(disks-d0_idx) : dd_idx2-d0_idx;
+
+	BUG_ON(faila == failb);
+	if ( failb < faila ) { int tmp = faila; faila = failb; failb = tmp; }
+
+	PRINTK("compute_block_2, stripe %llu, idx %d,%d (%d,%d)\n",
+	       (unsigned long long)sh->sector, dd_idx1, dd_idx2, faila, failb);
+
+	if ( failb == disks-1 ) {
+		/* Q disk is one of the missing disks */
+		if ( faila == disks-2 ) {
+			/* Missing P+Q, just recompute */
+			compute_parity6(sh, UPDATE_PARITY);
+			return;
+		} else {
+			/* We're missing D+Q; recompute D from P */
+			compute_block_1(sh, (dd_idx1 == qd_idx) ? dd_idx2 : dd_idx1, 0);
+			compute_parity6(sh, UPDATE_PARITY); /* Is this necessary? */
+			return;
+		}
+	}
+
+	/* We're missing D+P or D+D; build pointer table */
+	{
+		/**** FIX THIS: This could be very bad if disks is close to 256 ****/
+		void *ptrs[disks];
+
+		count = 0;
+		i = d0_idx;
+		do {
+			ptrs[count++] = page_address(sh->dev[i].page);
+			i = raid6_next_disk(i, disks);
+			if (i != dd_idx1 && i != dd_idx2 &&
+			    !test_bit(R5_UPTODATE, &sh->dev[i].flags))
+				printk("compute_2 with missing block %d/%d\n", count, i);
+		} while ( i != d0_idx );
+
+		if ( failb == disks-2 ) {
+			/* We're missing D+P. */
+			raid6_datap_recov(disks, STRIPE_SIZE, faila, ptrs);
+		} else {
+			/* We're missing D+D. */
+			raid6_2data_recov(disks, STRIPE_SIZE, faila, failb, ptrs);
+		}
+
+		/* Both the above update both missing blocks */
+		set_bit(R5_UPTODATE, &sh->dev[dd_idx1].flags);
+		set_bit(R5_UPTODATE, &sh->dev[dd_idx2].flags);
+	}
+}
+
+
+
 /*
  * Each stripe/dev can have one or more bion attached.
- * toread/towrite point to the first in a chain. 
+ * toread/towrite point to the first in a chain.
  * The bi_next chain must be in order.
  */
 static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite)
@@ -1031,6 +1301,13 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
 
 static void end_reshape(raid5_conf_t *conf);
 
+static int page_is_zero(struct page *p)
+{
+	char *a = page_address(p);
+	return ((*(u32*)a) == 0 &&
+		memcmp(a, a+4, STRIPE_SIZE-4)==0);
+}
+
 static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks)
 {
 	int sectors_per_chunk = conf->chunk_size >> 9;
@@ -1062,7 +1339,7 @@ static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks)
  *
  */
  
-static void handle_stripe(struct stripe_head *sh)
+static void handle_stripe5(struct stripe_head *sh)
 {
 	raid5_conf_t *conf = sh->raid_conf;
 	int disks = sh->disks;
@@ -1394,7 +1671,7 @@ static void handle_stripe(struct stripe_head *sh)
 		if (locked == 0 && (rcw == 0 ||rmw == 0) &&
 		    !test_bit(STRIPE_BIT_DELAY, &sh->state)) {
 			PRINTK("Computing parity...\n");
-			compute_parity(sh, rcw==0 ? RECONSTRUCT_WRITE : READ_MODIFY_WRITE);
+			compute_parity5(sh, rcw==0 ? RECONSTRUCT_WRITE : READ_MODIFY_WRITE);
 			/* now every locked buffer is ready to be written */
 			for (i=disks; i--;)
 				if (test_bit(R5_LOCKED, &sh->dev[i].flags)) {
@@ -1421,13 +1698,10 @@ static void handle_stripe(struct stripe_head *sh)
 	    !test_bit(STRIPE_INSYNC, &sh->state)) {
 		set_bit(STRIPE_HANDLE, &sh->state);
 		if (failed == 0) {
-			char *pagea;
 			BUG_ON(uptodate != disks);
-			compute_parity(sh, CHECK_PARITY);
+			compute_parity5(sh, CHECK_PARITY);
 			uptodate--;
-			pagea = page_address(sh->dev[sh->pd_idx].page);
-			if ((*(u32*)pagea) == 0 &&
-			    !memcmp(pagea, pagea+4, STRIPE_SIZE-4)) {
+			if (page_is_zero(sh->dev[sh->pd_idx].page)) {
 				/* parity is correct (on disc, not in buffer any more) */
 				set_bit(STRIPE_INSYNC, &sh->state);
 			} else {
@@ -1487,7 +1761,7 @@ static void handle_stripe(struct stripe_head *sh)
 		/* Need to write out all blocks after computing parity */
 		sh->disks = conf->raid_disks;
 		sh->pd_idx = stripe_to_pdidx(sh->sector, conf, conf->raid_disks);
-		compute_parity(sh, RECONSTRUCT_WRITE);
+		compute_parity5(sh, RECONSTRUCT_WRITE);
 		for (i= conf->raid_disks; i--;) {
 			set_bit(R5_LOCKED, &sh->dev[i].flags);
 			locked++;
@@ -1615,71 +1889,634 @@ static void handle_stripe(struct stripe_head *sh)
 	}
 }
 
-static void raid5_activate_delayed(raid5_conf_t *conf)
+static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
 {
-	if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) {
-		while (!list_empty(&conf->delayed_list)) {
-			struct list_head *l = conf->delayed_list.next;
-			struct stripe_head *sh;
-			sh = list_entry(l, struct stripe_head, lru);
-			list_del_init(l);
-			clear_bit(STRIPE_DELAYED, &sh->state);
-			if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
-				atomic_inc(&conf->preread_active_stripes);
-			list_add_tail(&sh->lru, &conf->handle_list);
-		}
-	}
-}
+	raid6_conf_t *conf = sh->raid_conf;
+	int disks = conf->raid_disks;
+	struct bio *return_bi= NULL;
+	struct bio *bi;
+	int i;
+	int syncing;
+	int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0;
+	int non_overwrite = 0;
+	int failed_num[2] = {0, 0};
+	struct r5dev *dev, *pdev, *qdev;
+	int pd_idx = sh->pd_idx;
+	int qd_idx = raid6_next_disk(pd_idx, disks);
+	int p_failed, q_failed;
 
-static void activate_bit_delay(raid5_conf_t *conf)
-{
-	/* device_lock is held */
-	struct list_head head;
-	list_add(&head, &conf->bitmap_list);
-	list_del_init(&conf->bitmap_list);
-	while (!list_empty(&head)) {
-		struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru);
-		list_del_init(&sh->lru);
-		atomic_inc(&sh->count);
-		__release_stripe(conf, sh);
-	}
-}
+	PRINTK("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d, qd_idx=%d\n",
+	       (unsigned long long)sh->sector, sh->state, atomic_read(&sh->count),
+	       pd_idx, qd_idx);
 
-static void unplug_slaves(mddev_t *mddev)
-{
-	raid5_conf_t *conf = mddev_to_conf(mddev);
-	int i;
+	spin_lock(&sh->lock);
+	clear_bit(STRIPE_HANDLE, &sh->state);
+	clear_bit(STRIPE_DELAYED, &sh->state);
+
+	syncing = test_bit(STRIPE_SYNCING, &sh->state);
+	/* Now to look around and see what can be done */
 
 	rcu_read_lock();
-	for (i=0; i<mddev->raid_disks; i++) {
-		mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev);
-		if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) {
-			request_queue_t *r_queue = bdev_get_queue(rdev->bdev);
+	for (i=disks; i--; ) {
+		mdk_rdev_t *rdev;
+		dev = &sh->dev[i];
+		clear_bit(R5_Insync, &dev->flags);
 
-			atomic_inc(&rdev->nr_pending);
-			rcu_read_unlock();
+		PRINTK("check %d: state 0x%lx read %p write %p written %p\n",
+			i, dev->flags, dev->toread, dev->towrite, dev->written);
+		/* maybe we can reply to a read */
+		if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread) {
+			struct bio *rbi, *rbi2;
+			PRINTK("Return read for disc %d\n", i);
+			spin_lock_irq(&conf->device_lock);
+			rbi = dev->toread;
+			dev->toread = NULL;
+			if (test_and_clear_bit(R5_Overlap, &dev->flags))
+				wake_up(&conf->wait_for_overlap);
+			spin_unlock_irq(&conf->device_lock);
+			while (rbi && rbi->bi_sector < dev->sector + STRIPE_SECTORS) {
+				copy_data(0, rbi, dev->page, dev->sector);
+				rbi2 = r5_next_bio(rbi, dev->sector);
+				spin_lock_irq(&conf->device_lock);
+				if (--rbi->bi_phys_segments == 0) {
+					rbi->bi_next = return_bi;
+					return_bi = rbi;
+				}
+				spin_unlock_irq(&conf->device_lock);
+				rbi = rbi2;
+			}
+		}
 
-			if (r_queue->unplug_fn)
-				r_queue->unplug_fn(r_queue);
+		/* now count some things */
+		if (test_bit(R5_LOCKED, &dev->flags)) locked++;
+		if (test_bit(R5_UPTODATE, &dev->flags)) uptodate++;
 
-			rdev_dec_pending(rdev, mddev);
-			rcu_read_lock();
+
+		if (dev->toread) to_read++;
+		if (dev->towrite) {
+			to_write++;
+			if (!test_bit(R5_OVERWRITE, &dev->flags))
+				non_overwrite++;
+		}
+		if (dev->written) written++;
+		rdev = rcu_dereference(conf->disks[i].rdev);
+		if (!rdev || !test_bit(In_sync, &rdev->flags)) {
+			/* The ReadError flag will just be confusing now */
+			clear_bit(R5_ReadError, &dev->flags);
+			clear_bit(R5_ReWrite, &dev->flags);
 		}
+		if (!rdev || !test_bit(In_sync, &rdev->flags)
+		    || test_bit(R5_ReadError, &dev->flags)) {
+			if ( failed < 2 )
+				failed_num[failed] = i;
+			failed++;
+		} else
+			set_bit(R5_Insync, &dev->flags);
 	}
 	rcu_read_unlock();
-}
+	PRINTK("locked=%d uptodate=%d to_read=%d"
+	       " to_write=%d failed=%d failed_num=%d,%d\n",
+	       locked, uptodate, to_read, to_write, failed,
+	       failed_num[0], failed_num[1]);
+	/* check if the array has lost >2 devices and, if so, some requests might
+	 * need to be failed
+	 */
+	if (failed > 2 && to_read+to_write+written) {
+		for (i=disks; i--; ) {
+			int bitmap_end = 0;
 
-static void raid5_unplug_device(request_queue_t *q)
-{
-	mddev_t *mddev = q->queuedata;
-	raid5_conf_t *conf = mddev_to_conf(mddev);
-	unsigned long flags;
+			if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
+				mdk_rdev_t *rdev;
+				rcu_read_lock();
+				rdev = rcu_dereference(conf->disks[i].rdev);
+				if (rdev && test_bit(In_sync, &rdev->flags))
+					/* multiple read failures in one stripe */
+					md_error(conf->mddev, rdev);
+				rcu_read_unlock();
+			}
 
-	spin_lock_irqsave(&conf->device_lock, flags);
+			spin_lock_irq(&conf->device_lock);
+			/* fail all writes first */
+			bi = sh->dev[i].towrite;
+			sh->dev[i].towrite = NULL;
+			if (bi) { to_write--; bitmap_end = 1; }
 
-	if (blk_remove_plug(q)) {
-		conf->seq_flush++;
-		raid5_activate_delayed(conf);
+			if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
+				wake_up(&conf->wait_for_overlap);
+
+			while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){
+				struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
+				clear_bit(BIO_UPTODATE, &bi->bi_flags);
+				if (--bi->bi_phys_segments == 0) {
+					md_write_end(conf->mddev);
+					bi->bi_next = return_bi;
+					return_bi = bi;
+				}
+				bi = nextbi;
+			}
+			/* and fail all 'written' */
+			bi = sh->dev[i].written;
+			sh->dev[i].written = NULL;
+			if (bi) bitmap_end = 1;
+			while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS) {
+				struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
+				clear_bit(BIO_UPTODATE, &bi->bi_flags);
+				if (--bi->bi_phys_segments == 0) {
+					md_write_end(conf->mddev);
+					bi->bi_next = return_bi;
+					return_bi = bi;
+				}
+				bi = bi2;
+			}
+
+			/* fail any reads if this device is non-operational */
+			if (!test_bit(R5_Insync, &sh->dev[i].flags) ||
+			    test_bit(R5_ReadError, &sh->dev[i].flags)) {
+				bi = sh->dev[i].toread;
+				sh->dev[i].toread = NULL;
+				if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
+					wake_up(&conf->wait_for_overlap);
+				if (bi) to_read--;
+				while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){
+					struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
+					clear_bit(BIO_UPTODATE, &bi->bi_flags);
+					if (--bi->bi_phys_segments == 0) {
+						bi->bi_next = return_bi;
+						return_bi = bi;
+					}
+					bi = nextbi;
+				}
+			}
+			spin_unlock_irq(&conf->device_lock);
+			if (bitmap_end)
+				bitmap_endwrite(conf->mddev->bitmap, sh->sector,
+						STRIPE_SECTORS, 0, 0);
+		}
+	}
+	if (failed > 2 && syncing) {
+		md_done_sync(conf->mddev, STRIPE_SECTORS,0);
+		clear_bit(STRIPE_SYNCING, &sh->state);
+		syncing = 0;
+	}
+
+	/*
+	 * might be able to return some write requests if the parity blocks
+	 * are safe, or on a failed drive
+	 */
+	pdev = &sh->dev[pd_idx];
+	p_failed = (failed >= 1 && failed_num[0] == pd_idx)
+		|| (failed >= 2 && failed_num[1] == pd_idx);
+	qdev = &sh->dev[qd_idx];
+	q_failed = (failed >= 1 && failed_num[0] == qd_idx)
+		|| (failed >= 2 && failed_num[1] == qd_idx);
+
+	if ( written &&
+	     ( p_failed || ((test_bit(R5_Insync, &pdev->flags)
+			     && !test_bit(R5_LOCKED, &pdev->flags)
+			     && test_bit(R5_UPTODATE, &pdev->flags))) ) &&
+	     ( q_failed || ((test_bit(R5_Insync, &qdev->flags)
+			     && !test_bit(R5_LOCKED, &qdev->flags)
+			     && test_bit(R5_UPTODATE, &qdev->flags))) ) ) {
+		/* any written block on an uptodate or failed drive can be
+		 * returned.  Note that if we 'wrote' to a failed drive,
+		 * it will be UPTODATE, but never LOCKED, so we don't need
+		 * to test 'failed' directly.
+		 */
+		for (i=disks; i--; )
+			if (sh->dev[i].written) {
+				dev = &sh->dev[i];
+				if (!test_bit(R5_LOCKED, &dev->flags) &&
+				    test_bit(R5_UPTODATE, &dev->flags) ) {
+					/* We can return any write requests */
+					int bitmap_end = 0;
+					struct bio *wbi, *wbi2;
+					PRINTK("Return write for stripe %llu disc %d\n",
+					       (unsigned long long)sh->sector, i);
+					spin_lock_irq(&conf->device_lock);
+					wbi = dev->written;
+					dev->written = NULL;
+					while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) {
+						wbi2 = r5_next_bio(wbi, dev->sector);
+						if (--wbi->bi_phys_segments == 0) {
+							md_write_end(conf->mddev);
+							wbi->bi_next = return_bi;
+							return_bi = wbi;
+						}
+						wbi = wbi2;
+					}
+					if (dev->towrite == NULL)
+						bitmap_end = 1;
+					spin_unlock_irq(&conf->device_lock);
+					if (bitmap_end)
+						bitmap_endwrite(conf->mddev->bitmap, sh->sector,
+								STRIPE_SECTORS,
+								!test_bit(STRIPE_DEGRADED, &sh->state), 0);
+				}
+			}
+	}
+
+	/* Now we might consider reading some blocks, either to check/generate
+	 * parity, or to satisfy requests
+	 * or to load a block that is being partially written.
+	 */
+	if (to_read || non_overwrite || (to_write && failed) || (syncing && (uptodate < disks))) {
+		for (i=disks; i--;) {
+			dev = &sh->dev[i];
+			if (!test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&
+			    (dev->toread ||
+			     (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
+			     syncing ||
+			     (failed >= 1 && (sh->dev[failed_num[0]].toread || to_write)) ||
+			     (failed >= 2 && (sh->dev[failed_num[1]].toread || to_write))
+				    )
+				) {
+				/* we would like to get this block, possibly
+				 * by computing it, but we might not be able to
+				 */
+				if (uptodate == disks-1) {
+					PRINTK("Computing stripe %llu block %d\n",
+					       (unsigned long long)sh->sector, i);
+					compute_block_1(sh, i, 0);
+					uptodate++;
+				} else if ( uptodate == disks-2 && failed >= 2 ) {
+					/* Computing 2-failure is *very* expensive; only do it if failed >= 2 */
+					int other;
+					for (other=disks; other--;) {
+						if ( other == i )
+							continue;
+						if ( !test_bit(R5_UPTODATE, &sh->dev[other].flags) )
+							break;
+					}
+					BUG_ON(other < 0);
+					PRINTK("Computing stripe %llu blocks %d,%d\n",
+					       (unsigned long long)sh->sector, i, other);
+					compute_block_2(sh, i, other);
+					uptodate += 2;
+				} else if (test_bit(R5_Insync, &dev->flags)) {
+					set_bit(R5_LOCKED, &dev->flags);
+					set_bit(R5_Wantread, &dev->flags);
+#if 0
+					/* if I am just reading this block and we don't have
+					   a failed drive, or any pending writes then sidestep the cache */
+					if (sh->bh_read[i] && !sh->bh_read[i]->b_reqnext &&
+					    ! syncing && !failed && !to_write) {
+						sh->bh_cache[i]->b_page =  sh->bh_read[i]->b_page;
+						sh->bh_cache[i]->b_data =  sh->bh_read[i]->b_data;
+					}
+#endif
+					locked++;
+					PRINTK("Reading block %d (sync=%d)\n",
+						i, syncing);
+				}
+			}
+		}
+		set_bit(STRIPE_HANDLE, &sh->state);
+	}
+
+	/* now to consider writing and what else, if anything should be read */
+	if (to_write) {
+		int rcw=0, must_compute=0;
+		for (i=disks ; i--;) {
+			dev = &sh->dev[i];
+			/* Would I have to read this buffer for reconstruct_write */
+			if (!test_bit(R5_OVERWRITE, &dev->flags)
+			    && i != pd_idx && i != qd_idx
+			    && (!test_bit(R5_LOCKED, &dev->flags)
+#if 0
+				|| sh->bh_page[i] != bh->b_page
+#endif
+				    ) &&
+			    !test_bit(R5_UPTODATE, &dev->flags)) {
+				if (test_bit(R5_Insync, &dev->flags)) rcw++;
+				else {
+					PRINTK("raid6: must_compute: disk %d flags=%#lx\n", i, dev->flags);
+					must_compute++;
+				}
+			}
+		}
+		PRINTK("for sector %llu, rcw=%d, must_compute=%d\n",
+		       (unsigned long long)sh->sector, rcw, must_compute);
+		set_bit(STRIPE_HANDLE, &sh->state);
+
+		if (rcw > 0)
+			/* want reconstruct write, but need to get some data */
+			for (i=disks; i--;) {
+				dev = &sh->dev[i];
+				if (!test_bit(R5_OVERWRITE, &dev->flags)
+				    && !(failed == 0 && (i == pd_idx || i == qd_idx))
+				    && !test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&
+				    test_bit(R5_Insync, &dev->flags)) {
+					if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
+					{
+						PRINTK("Read_old stripe %llu block %d for Reconstruct\n",
+						       (unsigned long long)sh->sector, i);
+						set_bit(R5_LOCKED, &dev->flags);
+						set_bit(R5_Wantread, &dev->flags);
+						locked++;
+					} else {
+						PRINTK("Request delayed stripe %llu block %d for Reconstruct\n",
+						       (unsigned long long)sh->sector, i);
+						set_bit(STRIPE_DELAYED, &sh->state);
+						set_bit(STRIPE_HANDLE, &sh->state);
+					}
+				}
+			}
+		/* now if nothing is locked, and if we have enough data, we can start a write request */
+		if (locked == 0 && rcw == 0 &&
+		    !test_bit(STRIPE_BIT_DELAY, &sh->state)) {
+			if ( must_compute > 0 ) {
+				/* We have failed blocks and need to compute them */
+				switch ( failed ) {
+				case 0:	BUG();
+				case 1: compute_block_1(sh, failed_num[0], 0); break;
+				case 2: compute_block_2(sh, failed_num[0], failed_num[1]); break;
+				default: BUG();	/* This request should have been failed? */
+				}
+			}
+
+			PRINTK("Computing parity for stripe %llu\n", (unsigned long long)sh->sector);
+			compute_parity6(sh, RECONSTRUCT_WRITE);
+			/* now every locked buffer is ready to be written */
+			for (i=disks; i--;)
+				if (test_bit(R5_LOCKED, &sh->dev[i].flags)) {
+					PRINTK("Writing stripe %llu block %d\n",
+					       (unsigned long long)sh->sector, i);
+					locked++;
+					set_bit(R5_Wantwrite, &sh->dev[i].flags);
+				}
+			/* after a RECONSTRUCT_WRITE, the stripe MUST be in-sync */
+			set_bit(STRIPE_INSYNC, &sh->state);
+
+			if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
+				atomic_dec(&conf->preread_active_stripes);
+				if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
+					md_wakeup_thread(conf->mddev->thread);
+			}
+		}
+	}
+
+	/* maybe we need to check and possibly fix the parity for this stripe
+	 * Any reads will already have been scheduled, so we just see if enough data
+	 * is available
+	 */
+	if (syncing && locked == 0 && !test_bit(STRIPE_INSYNC, &sh->state)) {
+		int update_p = 0, update_q = 0;
+		struct r5dev *dev;
+
+		set_bit(STRIPE_HANDLE, &sh->state);
+
+		BUG_ON(failed>2);
+		BUG_ON(uptodate < disks);
+		/* Want to check and possibly repair P and Q.
+		 * However there could be one 'failed' device, in which
+		 * case we can only check one of them, possibly using the
+		 * other to generate missing data
+		 */
+
+		/* If !tmp_page, we cannot do the calculations,
+		 * but as we have set STRIPE_HANDLE, we will soon be called
+		 * by stripe_handle with a tmp_page - just wait until then.
+		 */
+		if (tmp_page) {
+			if (failed == q_failed) {
+				/* The only possible failed device holds 'Q', so it makes
+				 * sense to check P (If anything else were failed, we would
+				 * have used P to recreate it).
+				 */
+				compute_block_1(sh, pd_idx, 1);
+				if (!page_is_zero(sh->dev[pd_idx].page)) {
+					compute_block_1(sh,pd_idx,0);
+					update_p = 1;
+				}
+			}
+			if (!q_failed && failed < 2) {
+				/* q is not failed, and we didn't use it to generate
+				 * anything, so it makes sense to check it
+				 */
+				memcpy(page_address(tmp_page),
+				       page_address(sh->dev[qd_idx].page),
+				       STRIPE_SIZE);
+				compute_parity6(sh, UPDATE_PARITY);
+				if (memcmp(page_address(tmp_page),
+					   page_address(sh->dev[qd_idx].page),
+					   STRIPE_SIZE)!= 0) {
+					clear_bit(STRIPE_INSYNC, &sh->state);
+					update_q = 1;
+				}
+			}
+			if (update_p || update_q) {
+				conf->mddev->resync_mismatches += STRIPE_SECTORS;
+				if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
+					/* don't try to repair!! */
+					update_p = update_q = 0;
+			}
+
+			/* now write out any block on a failed drive,
+			 * or P or Q if they need it
+			 */
+
+			if (failed == 2) {
+				dev = &sh->dev[failed_num[1]];
+				locked++;
+				set_bit(R5_LOCKED, &dev->flags);
+				set_bit(R5_Wantwrite, &dev->flags);
+			}
+			if (failed >= 1) {
+				dev = &sh->dev[failed_num[0]];
+				locked++;
+				set_bit(R5_LOCKED, &dev->flags);
+				set_bit(R5_Wantwrite, &dev->flags);
+			}
+
+			if (update_p) {
+				dev = &sh->dev[pd_idx];
+				locked ++;
+				set_bit(R5_LOCKED, &dev->flags);
+				set_bit(R5_Wantwrite, &dev->flags);
+			}
+			if (update_q) {
+				dev = &sh->dev[qd_idx];
+				locked++;
+				set_bit(R5_LOCKED, &dev->flags);
+				set_bit(R5_Wantwrite, &dev->flags);
+			}
+			clear_bit(STRIPE_DEGRADED, &sh->state);
+
+			set_bit(STRIPE_INSYNC, &sh->state);
+		}
+	}
+
+	if (syncing && locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
+		md_done_sync(conf->mddev, STRIPE_SECTORS,1);
+		clear_bit(STRIPE_SYNCING, &sh->state);
+	}
+
+	/* If the failed drives are just a ReadError, then we might need
+	 * to progress the repair/check process
+	 */
+	if (failed <= 2 && ! conf->mddev->ro)
+		for (i=0; i<failed;i++) {
+			dev = &sh->dev[failed_num[i]];
+			if (test_bit(R5_ReadError, &dev->flags)
+			    && !test_bit(R5_LOCKED, &dev->flags)
+			    && test_bit(R5_UPTODATE, &dev->flags)
+				) {
+				if (!test_bit(R5_ReWrite, &dev->flags)) {
+					set_bit(R5_Wantwrite, &dev->flags);
+					set_bit(R5_ReWrite, &dev->flags);
+					set_bit(R5_LOCKED, &dev->flags);
+				} else {
+					/* let's read it back */
+					set_bit(R5_Wantread, &dev->flags);
+					set_bit(R5_LOCKED, &dev->flags);
+				}
+			}
+		}
+	spin_unlock(&sh->lock);
+
+	while ((bi=return_bi)) {
+		int bytes = bi->bi_size;
+
+		return_bi = bi->bi_next;
+		bi->bi_next = NULL;
+		bi->bi_size = 0;
+		bi->bi_end_io(bi, bytes, 0);
+	}
+	for (i=disks; i-- ;) {
+		int rw;
+		struct bio *bi;
+		mdk_rdev_t *rdev;
+		if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags))
+			rw = 1;
+		else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
+			rw = 0;
+		else
+			continue;
+
+		bi = &sh->dev[i].req;
+
+		bi->bi_rw = rw;
+		if (rw)
+			bi->bi_end_io = raid5_end_write_request;
+		else
+			bi->bi_end_io = raid5_end_read_request;
+
+		rcu_read_lock();
+		rdev = rcu_dereference(conf->disks[i].rdev);
+		if (rdev && test_bit(Faulty, &rdev->flags))
+			rdev = NULL;
+		if (rdev)
+			atomic_inc(&rdev->nr_pending);
+		rcu_read_unlock();
+
+		if (rdev) {
+			if (syncing)
+				md_sync_acct(rdev->bdev, STRIPE_SECTORS);
+
+			bi->bi_bdev = rdev->bdev;
+			PRINTK("for %llu schedule op %ld on disc %d\n",
+				(unsigned long long)sh->sector, bi->bi_rw, i);
+			atomic_inc(&sh->count);
+			bi->bi_sector = sh->sector + rdev->data_offset;
+			bi->bi_flags = 1 << BIO_UPTODATE;
+			bi->bi_vcnt = 1;
+			bi->bi_max_vecs = 1;
+			bi->bi_idx = 0;
+			bi->bi_io_vec = &sh->dev[i].vec;
+			bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
+			bi->bi_io_vec[0].bv_offset = 0;
+			bi->bi_size = STRIPE_SIZE;
+			bi->bi_next = NULL;
+			if (rw == WRITE &&
+			    test_bit(R5_ReWrite, &sh->dev[i].flags))
+				atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
+			generic_make_request(bi);
+		} else {
+			if (rw == 1)
+				set_bit(STRIPE_DEGRADED, &sh->state);
+			PRINTK("skip op %ld on disc %d for sector %llu\n",
+				bi->bi_rw, i, (unsigned long long)sh->sector);
+			clear_bit(R5_LOCKED, &sh->dev[i].flags);
+			set_bit(STRIPE_HANDLE, &sh->state);
+		}
+	}
+}
+
+static void handle_stripe(struct stripe_head *sh, struct page *tmp_page)
+{
+	if (sh->raid_conf->level == 6)
+		handle_stripe6(sh, tmp_page);
+	else
+		handle_stripe5(sh);
+}
+
+
+
+static void raid5_activate_delayed(raid5_conf_t *conf)
+{
+	if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) {
+		while (!list_empty(&conf->delayed_list)) {
+			struct list_head *l = conf->delayed_list.next;
+			struct stripe_head *sh;
+			sh = list_entry(l, struct stripe_head, lru);
+			list_del_init(l);
+			clear_bit(STRIPE_DELAYED, &sh->state);
+			if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
+				atomic_inc(&conf->preread_active_stripes);
+			list_add_tail(&sh->lru, &conf->handle_list);
+		}
+	}
+}
+
+static void activate_bit_delay(raid5_conf_t *conf)
+{
+	/* device_lock is held */
+	struct list_head head;
+	list_add(&head, &conf->bitmap_list);
+	list_del_init(&conf->bitmap_list);
+	while (!list_empty(&head)) {
+		struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru);
+		list_del_init(&sh->lru);
+		atomic_inc(&sh->count);
+		__release_stripe(conf, sh);
+	}
+}
+
+static void unplug_slaves(mddev_t *mddev)
+{
+	raid5_conf_t *conf = mddev_to_conf(mddev);
+	int i;
+
+	rcu_read_lock();
+	for (i=0; i<mddev->raid_disks; i++) {
+		mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev);
+		if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) {
+			request_queue_t *r_queue = bdev_get_queue(rdev->bdev);
+
+			atomic_inc(&rdev->nr_pending);
+			rcu_read_unlock();
+
+			if (r_queue->unplug_fn)
+				r_queue->unplug_fn(r_queue);
+
+			rdev_dec_pending(rdev, mddev);
+			rcu_read_lock();
+		}
+	}
+	rcu_read_unlock();
+}
+
+static void raid5_unplug_device(request_queue_t *q)
+{
+	mddev_t *mddev = q->queuedata;
+	raid5_conf_t *conf = mddev_to_conf(mddev);
+	unsigned long flags;
+
+	spin_lock_irqsave(&conf->device_lock, flags);
+
+	if (blk_remove_plug(q)) {
+		conf->seq_flush++;
+		raid5_activate_delayed(conf);
 	}
 	md_wakeup_thread(mddev->thread);
 
@@ -1753,7 +2590,7 @@ static int make_request(request_queue_t *q, struct bio * bi)
 
 	for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
 		DEFINE_WAIT(w);
-		int disks;
+		int disks, data_disks;
 
 	retry:
 		prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
@@ -1781,7 +2618,9 @@ static int make_request(request_queue_t *q, struct bio * bi)
 			}
 			spin_unlock_irq(&conf->device_lock);
 		}
- 		new_sector = raid5_compute_sector(logical_sector, disks, disks - 1,
+		data_disks = disks - conf->max_degraded;
+
+ 		new_sector = raid5_compute_sector(logical_sector, disks, data_disks,
 						  &dd_idx, &pd_idx, conf);
 		PRINTK("raid5: make_request, sector %llu logical %llu\n",
 			(unsigned long long)new_sector, 
@@ -1833,7 +2672,7 @@ static int make_request(request_queue_t *q, struct bio * bi)
 			}
 			finish_wait(&conf->wait_for_overlap, &w);
 			raid5_plug_device(conf);
-			handle_stripe(sh);
+			handle_stripe(sh, NULL);
 			release_stripe(sh);
 		} else {
 			/* cannot get stripe for read-ahead, just give-up */
@@ -1849,7 +2688,7 @@ static int make_request(request_queue_t *q, struct bio * bi)
 	if (remaining == 0) {
 		int bytes = bi->bi_size;
 
-		if ( bio_data_dir(bi) == WRITE )
+		if ( rw == WRITE )
 			md_write_end(mddev);
 		bi->bi_size = 0;
 		bi->bi_end_io(bi, bytes, 0);
@@ -1865,9 +2704,11 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 	int pd_idx;
 	sector_t first_sector, last_sector;
 	int raid_disks = conf->raid_disks;
-	int data_disks = raid_disks-1;
+	int data_disks = raid_disks - conf->max_degraded;
 	sector_t max_sector = mddev->size << 1;
 	int sync_blocks;
+	int still_degraded = 0;
+	int i;
 
 	if (sector_nr >= max_sector) {
 		/* just being told to finish up .. nothing much to do */
@@ -1880,7 +2721,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 		if (mddev->curr_resync < max_sector) /* aborted */
 			bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
 					&sync_blocks, 1);
-		else /* compelted sync */
+		else /* completed sync */
 			conf->fullsync = 0;
 		bitmap_close_sync(mddev->bitmap);
 
@@ -2003,11 +2844,12 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 		}
 		return conf->chunk_size>>9;
 	}
-	/* if there is 1 or more failed drives and we are trying
+	/* if there is too many failed drives and we are trying
 	 * to resync, then assert that we are finished, because there is
 	 * nothing we can do.
 	 */
-	if (mddev->degraded >= 1 && test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
+	if (mddev->degraded >= (data_disks - raid_disks) &&
+	    test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
 		sector_t rv = (mddev->size << 1) - sector_nr;
 		*skipped = 1;
 		return rv;
@@ -2026,17 +2868,26 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 	if (sh == NULL) {
 		sh = get_active_stripe(conf, sector_nr, raid_disks, pd_idx, 0);
 		/* make sure we don't swamp the stripe cache if someone else
-		 * is trying to get access 
+		 * is trying to get access
 		 */
 		schedule_timeout_uninterruptible(1);
 	}
-	bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 0);
-	spin_lock(&sh->lock);	
+	/* Need to check if array will still be degraded after recovery/resync
+	 * We don't need to check the 'failed' flag as when that gets set,
+	 * recovery aborts.
+	 */
+	for (i=0; i<mddev->raid_disks; i++)
+		if (conf->disks[i].rdev == NULL)
+			still_degraded = 1;
+
+	bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded);
+
+	spin_lock(&sh->lock);
 	set_bit(STRIPE_SYNCING, &sh->state);
 	clear_bit(STRIPE_INSYNC, &sh->state);
 	spin_unlock(&sh->lock);
 
-	handle_stripe(sh);
+	handle_stripe(sh, NULL);
 	release_stripe(sh);
 
 	return STRIPE_SECTORS;
@@ -2091,7 +2942,7 @@ static void raid5d (mddev_t *mddev)
 		spin_unlock_irq(&conf->device_lock);
 		
 		handled++;
-		handle_stripe(sh);
+		handle_stripe(sh, conf->spare_page);
 		release_stripe(sh);
 
 		spin_lock_irq(&conf->device_lock);
@@ -2181,8 +3032,8 @@ static int run(mddev_t *mddev)
 	struct disk_info *disk;
 	struct list_head *tmp;
 
-	if (mddev->level != 5 && mddev->level != 4) {
-		printk(KERN_ERR "raid5: %s: raid level not set to 4/5 (%d)\n",
+	if (mddev->level != 5 && mddev->level != 4 && mddev->level != 6) {
+		printk(KERN_ERR "raid5: %s: raid level not set to 4/5/6 (%d)\n",
 		       mdname(mddev), mddev->level);
 		return -EIO;
 	}
@@ -2251,6 +3102,11 @@ static int run(mddev_t *mddev)
 	if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL)
 		goto abort;
 
+	if (mddev->level == 6) {
+		conf->spare_page = alloc_page(GFP_KERNEL);
+		if (!conf->spare_page)
+			goto abort;
+	}
 	spin_lock_init(&conf->device_lock);
 	init_waitqueue_head(&conf->wait_for_stripe);
 	init_waitqueue_head(&conf->wait_for_overlap);
@@ -2282,12 +3138,16 @@ static int run(mddev_t *mddev)
 	}
 
 	/*
-	 * 0 for a fully functional array, 1 for a degraded array.
+	 * 0 for a fully functional array, 1 or 2 for a degraded array.
 	 */
 	mddev->degraded = conf->failed_disks = conf->raid_disks - conf->working_disks;
 	conf->mddev = mddev;
 	conf->chunk_size = mddev->chunk_size;
 	conf->level = mddev->level;
+	if (conf->level == 6)
+		conf->max_degraded = 2;
+	else
+		conf->max_degraded = 1;
 	conf->algorithm = mddev->layout;
 	conf->max_nr_stripes = NR_STRIPES;
 	conf->expand_progress = mddev->reshape_position;
@@ -2296,6 +3156,11 @@ static int run(mddev_t *mddev)
 	mddev->size &= ~(mddev->chunk_size/1024 -1);
 	mddev->resync_max_sectors = mddev->size << 1;
 
+	if (conf->level == 6 && conf->raid_disks < 4) {
+		printk(KERN_ERR "raid6: not enough configured devices for %s (%d, minimum 4)\n",
+		       mdname(mddev), conf->raid_disks);
+		goto abort;
+	}
 	if (!conf->chunk_size || conf->chunk_size % 4) {
 		printk(KERN_ERR "raid5: invalid chunk size %d for %s\n",
 			conf->chunk_size, mdname(mddev));
@@ -2307,14 +3172,14 @@ static int run(mddev_t *mddev)
 			conf->algorithm, mdname(mddev));
 		goto abort;
 	}
-	if (mddev->degraded > 1) {
+	if (mddev->degraded > conf->max_degraded) {
 		printk(KERN_ERR "raid5: not enough operational devices for %s"
 			" (%d/%d failed)\n",
 			mdname(mddev), conf->failed_disks, conf->raid_disks);
 		goto abort;
 	}
 
-	if (mddev->degraded == 1 &&
+	if (mddev->degraded > 0 &&
 	    mddev->recovery_cp != MaxSector) {
 		if (mddev->ok_start_degraded)
 			printk(KERN_WARNING
@@ -2379,10 +3244,11 @@ static int run(mddev_t *mddev)
 	}
 
 	/* read-ahead size must cover two whole stripes, which is
-	 * 2 * (n-1) * chunksize where 'n' is the number of raid devices
+	 * 2 * (datadisks) * chunksize where 'n' is the number of raid devices
 	 */
 	{
-		int stripe = (mddev->raid_disks-1) *
+		int data_disks = conf->previous_raid_disks - conf->max_degraded;
+		int stripe = data_disks *
 			(mddev->chunk_size / PAGE_SIZE);
 		if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
 			mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
@@ -2393,12 +3259,14 @@ static int run(mddev_t *mddev)
 
 	mddev->queue->unplug_fn = raid5_unplug_device;
 	mddev->queue->issue_flush_fn = raid5_issue_flush;
-	mddev->array_size =  mddev->size * (conf->previous_raid_disks - 1);
+	mddev->array_size =  mddev->size * (conf->previous_raid_disks -
+					    conf->max_degraded);
 
 	return 0;
 abort:
 	if (conf) {
 		print_raid5_conf(conf);
+		safe_put_page(conf->spare_page);
 		kfree(conf->disks);
 		kfree(conf->stripe_hashtbl);
 		kfree(conf);
@@ -2427,23 +3295,23 @@ static int stop(mddev_t *mddev)
 }
 
 #if RAID5_DEBUG
-static void print_sh (struct stripe_head *sh)
+static void print_sh (struct seq_file *seq, struct stripe_head *sh)
 {
 	int i;
 
-	printk("sh %llu, pd_idx %d, state %ld.\n",
-		(unsigned long long)sh->sector, sh->pd_idx, sh->state);
-	printk("sh %llu,  count %d.\n",
-		(unsigned long long)sh->sector, atomic_read(&sh->count));
-	printk("sh %llu, ", (unsigned long long)sh->sector);
+	seq_printf(seq, "sh %llu, pd_idx %d, state %ld.\n",
+		   (unsigned long long)sh->sector, sh->pd_idx, sh->state);
+	seq_printf(seq, "sh %llu,  count %d.\n",
+		   (unsigned long long)sh->sector, atomic_read(&sh->count));
+	seq_printf(seq, "sh %llu, ", (unsigned long long)sh->sector);
 	for (i = 0; i < sh->disks; i++) {
-		printk("(cache%d: %p %ld) ", 
-			i, sh->dev[i].page, sh->dev[i].flags);
+		seq_printf(seq, "(cache%d: %p %ld) ",
+			   i, sh->dev[i].page, sh->dev[i].flags);
 	}
-	printk("\n");
+	seq_printf(seq, "\n");
 }
 
-static void printall (raid5_conf_t *conf)
+static void printall (struct seq_file *seq, raid5_conf_t *conf)
 {
 	struct stripe_head *sh;
 	struct hlist_node *hn;
@@ -2454,7 +3322,7 @@ static void printall (raid5_conf_t *conf)
 		hlist_for_each_entry(sh, hn, &conf->stripe_hashtbl[i], hash) {
 			if (sh->raid_conf != conf)
 				continue;
-			print_sh(sh);
+			print_sh(seq, sh);
 		}
 	}
 	spin_unlock_irq(&conf->device_lock);
@@ -2474,9 +3342,8 @@ static void status (struct seq_file *seq, mddev_t *mddev)
 			       test_bit(In_sync, &conf->disks[i].rdev->flags) ? "U" : "_");
 	seq_printf (seq, "]");
 #if RAID5_DEBUG
-#define D(x) \
-	seq_printf (seq, "<"#x":%d>", atomic_read(&conf->x))
-	printall(conf);
+	seq_printf (seq, "\n");
+	printall(seq, conf);
 #endif
 }
 
@@ -2560,14 +3427,20 @@ static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
 	int disk;
 	struct disk_info *p;
 
-	if (mddev->degraded > 1)
+	if (mddev->degraded > conf->max_degraded)
 		/* no point adding a device */
 		return 0;
 
 	/*
-	 * find the disk ...
+	 * find the disk ... but prefer rdev->saved_raid_disk
+	 * if possible.
 	 */
-	for (disk=0; disk < conf->raid_disks; disk++)
+	if (rdev->saved_raid_disk >= 0 &&
+	    conf->disks[rdev->saved_raid_disk].rdev == NULL)
+		disk = rdev->saved_raid_disk;
+	else
+		disk = 0;
+	for ( ; disk < conf->raid_disks; disk++)
 		if ((p=conf->disks + disk)->rdev == NULL) {
 			clear_bit(In_sync, &rdev->flags);
 			rdev->raid_disk = disk;
@@ -2590,8 +3463,10 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors)
 	 * any io in the removed space completes, but it hardly seems
 	 * worth it.
 	 */
+	raid5_conf_t *conf = mddev_to_conf(mddev);
+
 	sectors &= ~((sector_t)mddev->chunk_size/512 - 1);
-	mddev->array_size = (sectors * (mddev->raid_disks-1))>>1;
+	mddev->array_size = (sectors * (mddev->raid_disks-conf->max_degraded))>>1;
 	set_capacity(mddev->gendisk, mddev->array_size << 1);
 	mddev->changed = 1;
 	if (sectors/2  > mddev->size && mddev->recovery_cp == MaxSector) {
@@ -2731,6 +3606,17 @@ static void end_reshape(raid5_conf_t *conf)
 		conf->expand_progress = MaxSector;
 		spin_unlock_irq(&conf->device_lock);
 		conf->mddev->reshape_position = MaxSector;
+
+		/* read-ahead size must cover two whole stripes, which is
+		 * 2 * (datadisks) * chunksize where 'n' is the number of raid devices
+		 */
+		{
+			int data_disks = conf->previous_raid_disks - conf->max_degraded;
+			int stripe = data_disks *
+				(conf->mddev->chunk_size / PAGE_SIZE);
+			if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
+				conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
+		}
 	}
 }
 
@@ -2762,6 +3648,23 @@ static void raid5_quiesce(mddev_t *mddev, int state)
 	}
 }
 
+static struct mdk_personality raid6_personality =
+{
+	.name		= "raid6",
+	.level		= 6,
+	.owner		= THIS_MODULE,
+	.make_request	= make_request,
+	.run		= run,
+	.stop		= stop,
+	.status		= status,
+	.error_handler	= error,
+	.hot_add_disk	= raid5_add_disk,
+	.hot_remove_disk= raid5_remove_disk,
+	.spare_active	= raid5_spare_active,
+	.sync_request	= sync_request,
+	.resize		= raid5_resize,
+	.quiesce	= raid5_quiesce,
+};
 static struct mdk_personality raid5_personality =
 {
 	.name		= "raid5",
@@ -2804,6 +3707,12 @@ static struct mdk_personality raid4_personality =
 
 static int __init raid5_init(void)
 {
+	int e;
+
+	e = raid6_select_algo();
+	if ( e )
+		return e;
+	register_md_personality(&raid6_personality);
 	register_md_personality(&raid5_personality);
 	register_md_personality(&raid4_personality);
 	return 0;
@@ -2811,6 +3720,7 @@ static int __init raid5_init(void)
 
 static void raid5_exit(void)
 {
+	unregister_md_personality(&raid6_personality);
 	unregister_md_personality(&raid5_personality);
 	unregister_md_personality(&raid4_personality);
 }
@@ -2823,3 +3733,10 @@ MODULE_ALIAS("md-raid5");
 MODULE_ALIAS("md-raid4");
 MODULE_ALIAS("md-level-5");
 MODULE_ALIAS("md-level-4");
+MODULE_ALIAS("md-personality-8"); /* RAID6 */
+MODULE_ALIAS("md-raid6");
+MODULE_ALIAS("md-level-6");
+
+/* This used to be two separate modules, they were: */
+MODULE_ALIAS("raid5");
+MODULE_ALIAS("raid6");
diff --git a/drivers/md/raid6main.c b/drivers/md/raid6main.c
deleted file mode 100644
index e53d2d96ea3a..000000000000
--- a/drivers/md/raid6main.c
+++ /dev/null
@@ -1,2427 +0,0 @@
-/*
- * raid6main.c : Multiple Devices driver for Linux
- *	   Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
- *	   Copyright (C) 1999, 2000 Ingo Molnar
- *	   Copyright (C) 2002, 2003 H. Peter Anvin
- *
- * RAID-6 management functions.  This code is derived from raid5.c.
- * Last merge from raid5.c bkcvs version 1.79 (kernel 2.6.1).
- *
- * Thanks to Penguin Computing for making the RAID-6 development possible
- * by donating a test server!
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2, or (at your option)
- * any later version.
- *
- * You should have received a copy of the GNU General Public License
- * (for example /usr/src/linux/COPYING); if not, write to the Free
- * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-
-#include <linux/config.h>
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <linux/highmem.h>
-#include <linux/bitops.h>
-#include <asm/atomic.h>
-#include "raid6.h"
-
-#include <linux/raid/bitmap.h>
-
-/*
- * Stripe cache
- */
-
-#define NR_STRIPES		256
-#define STRIPE_SIZE		PAGE_SIZE
-#define STRIPE_SHIFT		(PAGE_SHIFT - 9)
-#define STRIPE_SECTORS		(STRIPE_SIZE>>9)
-#define	IO_THRESHOLD		1
-#define NR_HASH			(PAGE_SIZE / sizeof(struct hlist_head))
-#define HASH_MASK		(NR_HASH - 1)
-
-#define stripe_hash(conf, sect)	(&((conf)->stripe_hashtbl[((sect) >> STRIPE_SHIFT) & HASH_MASK]))
-
-/* bio's attached to a stripe+device for I/O are linked together in bi_sector
- * order without overlap.  There may be several bio's per stripe+device, and
- * a bio could span several devices.
- * When walking this list for a particular stripe+device, we must never proceed
- * beyond a bio that extends past this device, as the next bio might no longer
- * be valid.
- * This macro is used to determine the 'next' bio in the list, given the sector
- * of the current stripe+device
- */
-#define r5_next_bio(bio, sect) ( ( (bio)->bi_sector + ((bio)->bi_size>>9) < sect + STRIPE_SECTORS) ? (bio)->bi_next : NULL)
-/*
- * The following can be used to debug the driver
- */
-#define RAID6_DEBUG	0	/* Extremely verbose printk */
-#define RAID6_PARANOIA	1	/* Check spinlocks */
-#define RAID6_DUMPSTATE 0	/* Include stripe cache state in /proc/mdstat */
-#if RAID6_PARANOIA && defined(CONFIG_SMP)
-# define CHECK_DEVLOCK() assert_spin_locked(&conf->device_lock)
-#else
-# define CHECK_DEVLOCK()
-#endif
-
-#define PRINTK(x...) ((void)(RAID6_DEBUG && printk(KERN_DEBUG x)))
-#if RAID6_DEBUG
-#undef inline
-#undef __inline__
-#define inline
-#define __inline__
-#endif
-
-#if !RAID6_USE_EMPTY_ZERO_PAGE
-/* In .bss so it's zeroed */
-const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256)));
-#endif
-
-static inline int raid6_next_disk(int disk, int raid_disks)
-{
-	disk++;
-	return (disk < raid_disks) ? disk : 0;
-}
-
-static void print_raid6_conf (raid6_conf_t *conf);
-
-static void __release_stripe(raid6_conf_t *conf, struct stripe_head *sh)
-{
-	if (atomic_dec_and_test(&sh->count)) {
-		BUG_ON(!list_empty(&sh->lru));
-		BUG_ON(atomic_read(&conf->active_stripes)==0);
-		if (test_bit(STRIPE_HANDLE, &sh->state)) {
-			if (test_bit(STRIPE_DELAYED, &sh->state))
-				list_add_tail(&sh->lru, &conf->delayed_list);
-			else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
-				 conf->seq_write == sh->bm_seq)
-				list_add_tail(&sh->lru, &conf->bitmap_list);
-			else {
-				clear_bit(STRIPE_BIT_DELAY, &sh->state);
-				list_add_tail(&sh->lru, &conf->handle_list);
-			}
-			md_wakeup_thread(conf->mddev->thread);
-		} else {
-			if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
-				atomic_dec(&conf->preread_active_stripes);
-				if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
-					md_wakeup_thread(conf->mddev->thread);
-			}
-			list_add_tail(&sh->lru, &conf->inactive_list);
-			atomic_dec(&conf->active_stripes);
-			if (!conf->inactive_blocked ||
-			    atomic_read(&conf->active_stripes) < (conf->max_nr_stripes*3/4))
-				wake_up(&conf->wait_for_stripe);
-		}
-	}
-}
-static void release_stripe(struct stripe_head *sh)
-{
-	raid6_conf_t *conf = sh->raid_conf;
-	unsigned long flags;
-
-	spin_lock_irqsave(&conf->device_lock, flags);
-	__release_stripe(conf, sh);
-	spin_unlock_irqrestore(&conf->device_lock, flags);
-}
-
-static inline void remove_hash(struct stripe_head *sh)
-{
-	PRINTK("remove_hash(), stripe %llu\n", (unsigned long long)sh->sector);
-
-	hlist_del_init(&sh->hash);
-}
-
-static inline void insert_hash(raid6_conf_t *conf, struct stripe_head *sh)
-{
-	struct hlist_head *hp = stripe_hash(conf, sh->sector);
-
-	PRINTK("insert_hash(), stripe %llu\n", (unsigned long long)sh->sector);
-
-	CHECK_DEVLOCK();
-	hlist_add_head(&sh->hash, hp);
-}
-
-
-/* find an idle stripe, make sure it is unhashed, and return it. */
-static struct stripe_head *get_free_stripe(raid6_conf_t *conf)
-{
-	struct stripe_head *sh = NULL;
-	struct list_head *first;
-
-	CHECK_DEVLOCK();
-	if (list_empty(&conf->inactive_list))
-		goto out;
-	first = conf->inactive_list.next;
-	sh = list_entry(first, struct stripe_head, lru);
-	list_del_init(first);
-	remove_hash(sh);
-	atomic_inc(&conf->active_stripes);
-out:
-	return sh;
-}
-
-static void shrink_buffers(struct stripe_head *sh, int num)
-{
-	struct page *p;
-	int i;
-
-	for (i=0; i<num ; i++) {
-		p = sh->dev[i].page;
-		if (!p)
-			continue;
-		sh->dev[i].page = NULL;
-		put_page(p);
-	}
-}
-
-static int grow_buffers(struct stripe_head *sh, int num)
-{
-	int i;
-
-	for (i=0; i<num; i++) {
-		struct page *page;
-
-		if (!(page = alloc_page(GFP_KERNEL))) {
-			return 1;
-		}
-		sh->dev[i].page = page;
-	}
-	return 0;
-}
-
-static void raid6_build_block (struct stripe_head *sh, int i);
-
-static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx)
-{
-	raid6_conf_t *conf = sh->raid_conf;
-	int disks = conf->raid_disks, i;
-
-	BUG_ON(atomic_read(&sh->count) != 0);
-	BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
-
-	CHECK_DEVLOCK();
-	PRINTK("init_stripe called, stripe %llu\n",
-		(unsigned long long)sh->sector);
-
-	remove_hash(sh);
-
-	sh->sector = sector;
-	sh->pd_idx = pd_idx;
-	sh->state = 0;
-
-	for (i=disks; i--; ) {
-		struct r5dev *dev = &sh->dev[i];
-
-		if (dev->toread || dev->towrite || dev->written ||
-		    test_bit(R5_LOCKED, &dev->flags)) {
-			PRINTK("sector=%llx i=%d %p %p %p %d\n",
-			       (unsigned long long)sh->sector, i, dev->toread,
-			       dev->towrite, dev->written,
-			       test_bit(R5_LOCKED, &dev->flags));
-			BUG();
-		}
-		dev->flags = 0;
-		raid6_build_block(sh, i);
-	}
-	insert_hash(conf, sh);
-}
-
-static struct stripe_head *__find_stripe(raid6_conf_t *conf, sector_t sector)
-{
-	struct stripe_head *sh;
-	struct hlist_node *hn;
-
-	CHECK_DEVLOCK();
-	PRINTK("__find_stripe, sector %llu\n", (unsigned long long)sector);
-	hlist_for_each_entry (sh, hn,  stripe_hash(conf, sector), hash)
-		if (sh->sector == sector)
-			return sh;
-	PRINTK("__stripe %llu not in cache\n", (unsigned long long)sector);
-	return NULL;
-}
-
-static void unplug_slaves(mddev_t *mddev);
-
-static struct stripe_head *get_active_stripe(raid6_conf_t *conf, sector_t sector,
-					     int pd_idx, int noblock)
-{
-	struct stripe_head *sh;
-
-	PRINTK("get_stripe, sector %llu\n", (unsigned long long)sector);
-
-	spin_lock_irq(&conf->device_lock);
-
-	do {
-		wait_event_lock_irq(conf->wait_for_stripe,
-				    conf->quiesce == 0,
-				    conf->device_lock, /* nothing */);
-		sh = __find_stripe(conf, sector);
-		if (!sh) {
-			if (!conf->inactive_blocked)
-				sh = get_free_stripe(conf);
-			if (noblock && sh == NULL)
-				break;
-			if (!sh) {
-				conf->inactive_blocked = 1;
-				wait_event_lock_irq(conf->wait_for_stripe,
-						    !list_empty(&conf->inactive_list) &&
-						    (atomic_read(&conf->active_stripes)
-						     < (conf->max_nr_stripes *3/4)
-						     || !conf->inactive_blocked),
-						    conf->device_lock,
-						    unplug_slaves(conf->mddev);
-					);
-				conf->inactive_blocked = 0;
-			} else
-				init_stripe(sh, sector, pd_idx);
-		} else {
-			if (atomic_read(&sh->count)) {
-				BUG_ON(!list_empty(&sh->lru));
-			} else {
-				if (!test_bit(STRIPE_HANDLE, &sh->state))
-					atomic_inc(&conf->active_stripes);
-				BUG_ON(list_empty(&sh->lru));
-				list_del_init(&sh->lru);
-			}
-		}
-	} while (sh == NULL);
-
-	if (sh)
-		atomic_inc(&sh->count);
-
-	spin_unlock_irq(&conf->device_lock);
-	return sh;
-}
-
-static int grow_one_stripe(raid6_conf_t *conf)
-{
-	struct stripe_head *sh;
-	sh = kmem_cache_alloc(conf->slab_cache, GFP_KERNEL);
-	if (!sh)
-		return 0;
-	memset(sh, 0, sizeof(*sh) + (conf->raid_disks-1)*sizeof(struct r5dev));
-	sh->raid_conf = conf;
-	spin_lock_init(&sh->lock);
-
-	if (grow_buffers(sh, conf->raid_disks)) {
-		shrink_buffers(sh, conf->raid_disks);
-		kmem_cache_free(conf->slab_cache, sh);
-		return 0;
-	}
-	/* we just created an active stripe so... */
-	atomic_set(&sh->count, 1);
-	atomic_inc(&conf->active_stripes);
-	INIT_LIST_HEAD(&sh->lru);
-	release_stripe(sh);
-	return 1;
-}
-
-static int grow_stripes(raid6_conf_t *conf, int num)
-{
-	kmem_cache_t *sc;
-	int devs = conf->raid_disks;
-
-	sprintf(conf->cache_name[0], "raid6/%s", mdname(conf->mddev));
-
-	sc = kmem_cache_create(conf->cache_name[0],
-			       sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev),
-			       0, 0, NULL, NULL);
-	if (!sc)
-		return 1;
-	conf->slab_cache = sc;
-	while (num--)
-		if (!grow_one_stripe(conf))
-			return 1;
-	return 0;
-}
-
-static int drop_one_stripe(raid6_conf_t *conf)
-{
-	struct stripe_head *sh;
-	spin_lock_irq(&conf->device_lock);
-	sh = get_free_stripe(conf);
-	spin_unlock_irq(&conf->device_lock);
-	if (!sh)
-		return 0;
-	BUG_ON(atomic_read(&sh->count));
-	shrink_buffers(sh, conf->raid_disks);
-	kmem_cache_free(conf->slab_cache, sh);
-	atomic_dec(&conf->active_stripes);
-	return 1;
-}
-
-static void shrink_stripes(raid6_conf_t *conf)
-{
-	while (drop_one_stripe(conf))
-		;
-
-	if (conf->slab_cache)
-		kmem_cache_destroy(conf->slab_cache);
-	conf->slab_cache = NULL;
-}
-
-static int raid6_end_read_request(struct bio * bi, unsigned int bytes_done,
-				  int error)
-{
- 	struct stripe_head *sh = bi->bi_private;
-	raid6_conf_t *conf = sh->raid_conf;
-	int disks = conf->raid_disks, i;
-	int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
-
-	if (bi->bi_size)
-		return 1;
-
-	for (i=0 ; i<disks; i++)
-		if (bi == &sh->dev[i].req)
-			break;
-
-	PRINTK("end_read_request %llu/%d, count: %d, uptodate %d.\n",
-		(unsigned long long)sh->sector, i, atomic_read(&sh->count),
-		uptodate);
-	if (i == disks) {
-		BUG();
-		return 0;
-	}
-
-	if (uptodate) {
-#if 0
-		struct bio *bio;
-		unsigned long flags;
-		spin_lock_irqsave(&conf->device_lock, flags);
-		/* we can return a buffer if we bypassed the cache or
-		 * if the top buffer is not in highmem.  If there are
-		 * multiple buffers, leave the extra work to
-		 * handle_stripe
-		 */
-		buffer = sh->bh_read[i];
-		if (buffer &&
-		    (!PageHighMem(buffer->b_page)
-		     || buffer->b_page == bh->b_page )
-			) {
-			sh->bh_read[i] = buffer->b_reqnext;
-			buffer->b_reqnext = NULL;
-		} else
-			buffer = NULL;
-		spin_unlock_irqrestore(&conf->device_lock, flags);
-		if (sh->bh_page[i]==bh->b_page)
-			set_buffer_uptodate(bh);
-		if (buffer) {
-			if (buffer->b_page != bh->b_page)
-				memcpy(buffer->b_data, bh->b_data, bh->b_size);
-			buffer->b_end_io(buffer, 1);
-		}
-#else
-		set_bit(R5_UPTODATE, &sh->dev[i].flags);
-#endif
-		if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
-			printk(KERN_INFO "raid6: read error corrected!!\n");
-			clear_bit(R5_ReadError, &sh->dev[i].flags);
-			clear_bit(R5_ReWrite, &sh->dev[i].flags);
-		}
-		if (atomic_read(&conf->disks[i].rdev->read_errors))
-			atomic_set(&conf->disks[i].rdev->read_errors, 0);
-	} else {
-		int retry = 0;
-		clear_bit(R5_UPTODATE, &sh->dev[i].flags);
-		atomic_inc(&conf->disks[i].rdev->read_errors);
-		if (conf->mddev->degraded)
-			printk(KERN_WARNING "raid6: read error not correctable.\n");
-		else if (test_bit(R5_ReWrite, &sh->dev[i].flags))
-			/* Oh, no!!! */
-			printk(KERN_WARNING "raid6: read error NOT corrected!!\n");
-		else if (atomic_read(&conf->disks[i].rdev->read_errors)
-			 > conf->max_nr_stripes)
-			printk(KERN_WARNING
-			       "raid6: Too many read errors, failing device.\n");
-		else
-			retry = 1;
-		if (retry)
-			set_bit(R5_ReadError, &sh->dev[i].flags);
-		else {
-			clear_bit(R5_ReadError, &sh->dev[i].flags);
-			clear_bit(R5_ReWrite, &sh->dev[i].flags);
-			md_error(conf->mddev, conf->disks[i].rdev);
-		}
-	}
-	rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
-#if 0
-	/* must restore b_page before unlocking buffer... */
-	if (sh->bh_page[i] != bh->b_page) {
-		bh->b_page = sh->bh_page[i];
-		bh->b_data = page_address(bh->b_page);
-		clear_buffer_uptodate(bh);
-	}
-#endif
-	clear_bit(R5_LOCKED, &sh->dev[i].flags);
-	set_bit(STRIPE_HANDLE, &sh->state);
-	release_stripe(sh);
-	return 0;
-}
-
-static int raid6_end_write_request (struct bio *bi, unsigned int bytes_done,
-				    int error)
-{
- 	struct stripe_head *sh = bi->bi_private;
-	raid6_conf_t *conf = sh->raid_conf;
-	int disks = conf->raid_disks, i;
-	unsigned long flags;
-	int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
-
-	if (bi->bi_size)
-		return 1;
-
-	for (i=0 ; i<disks; i++)
-		if (bi == &sh->dev[i].req)
-			break;
-
-	PRINTK("end_write_request %llu/%d, count %d, uptodate: %d.\n",
-		(unsigned long long)sh->sector, i, atomic_read(&sh->count),
-		uptodate);
-	if (i == disks) {
-		BUG();
-		return 0;
-	}
-
-	spin_lock_irqsave(&conf->device_lock, flags);
-	if (!uptodate)
-		md_error(conf->mddev, conf->disks[i].rdev);
-
-	rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
-
-	clear_bit(R5_LOCKED, &sh->dev[i].flags);
-	set_bit(STRIPE_HANDLE, &sh->state);
-	__release_stripe(conf, sh);
-	spin_unlock_irqrestore(&conf->device_lock, flags);
-	return 0;
-}
-
-
-static sector_t compute_blocknr(struct stripe_head *sh, int i);
-
-static void raid6_build_block (struct stripe_head *sh, int i)
-{
-	struct r5dev *dev = &sh->dev[i];
-	int pd_idx = sh->pd_idx;
-	int qd_idx = raid6_next_disk(pd_idx, sh->raid_conf->raid_disks);
-
-	bio_init(&dev->req);
-	dev->req.bi_io_vec = &dev->vec;
-	dev->req.bi_vcnt++;
-	dev->req.bi_max_vecs++;
-	dev->vec.bv_page = dev->page;
-	dev->vec.bv_len = STRIPE_SIZE;
-	dev->vec.bv_offset = 0;
-
-	dev->req.bi_sector = sh->sector;
-	dev->req.bi_private = sh;
-
-	dev->flags = 0;
-	if (i != pd_idx && i != qd_idx)
-		dev->sector = compute_blocknr(sh, i);
-}
-
-static void error(mddev_t *mddev, mdk_rdev_t *rdev)
-{
-	char b[BDEVNAME_SIZE];
-	raid6_conf_t *conf = (raid6_conf_t *) mddev->private;
-	PRINTK("raid6: error called\n");
-
-	if (!test_bit(Faulty, &rdev->flags)) {
-		mddev->sb_dirty = 1;
-		if (test_bit(In_sync, &rdev->flags)) {
-			conf->working_disks--;
-			mddev->degraded++;
-			conf->failed_disks++;
-			clear_bit(In_sync, &rdev->flags);
-			/*
-			 * if recovery was running, make sure it aborts.
-			 */
-			set_bit(MD_RECOVERY_ERR, &mddev->recovery);
-		}
-		set_bit(Faulty, &rdev->flags);
-		printk (KERN_ALERT
-			"raid6: Disk failure on %s, disabling device."
-			" Operation continuing on %d devices\n",
-			bdevname(rdev->bdev,b), conf->working_disks);
-	}
-}
-
-/*
- * Input: a 'big' sector number,
- * Output: index of the data and parity disk, and the sector # in them.
- */
-static sector_t raid6_compute_sector(sector_t r_sector, unsigned int raid_disks,
-			unsigned int data_disks, unsigned int * dd_idx,
-			unsigned int * pd_idx, raid6_conf_t *conf)
-{
-	long stripe;
-	unsigned long chunk_number;
-	unsigned int chunk_offset;
-	sector_t new_sector;
-	int sectors_per_chunk = conf->chunk_size >> 9;
-
-	/* First compute the information on this sector */
-
-	/*
-	 * Compute the chunk number and the sector offset inside the chunk
-	 */
-	chunk_offset = sector_div(r_sector, sectors_per_chunk);
-	chunk_number = r_sector;
-	if ( r_sector != chunk_number ) {
-		printk(KERN_CRIT "raid6: ERROR: r_sector = %llu, chunk_number = %lu\n",
-		       (unsigned long long)r_sector, (unsigned long)chunk_number);
-		BUG();
-	}
-
-	/*
-	 * Compute the stripe number
-	 */
-	stripe = chunk_number / data_disks;
-
-	/*
-	 * Compute the data disk and parity disk indexes inside the stripe
-	 */
-	*dd_idx = chunk_number % data_disks;
-
-	/*
-	 * Select the parity disk based on the user selected algorithm.
-	 */
-
-	/**** FIX THIS ****/
-	switch (conf->algorithm) {
-	case ALGORITHM_LEFT_ASYMMETRIC:
-		*pd_idx = raid_disks - 1 - (stripe % raid_disks);
-		if (*pd_idx == raid_disks-1)
-		  	(*dd_idx)++; 	/* Q D D D P */
-		else if (*dd_idx >= *pd_idx)
-		  	(*dd_idx) += 2; /* D D P Q D */
-		break;
-	case ALGORITHM_RIGHT_ASYMMETRIC:
-		*pd_idx = stripe % raid_disks;
-		if (*pd_idx == raid_disks-1)
-		  	(*dd_idx)++; 	/* Q D D D P */
-		else if (*dd_idx >= *pd_idx)
-		  	(*dd_idx) += 2; /* D D P Q D */
-		break;
-	case ALGORITHM_LEFT_SYMMETRIC:
-		*pd_idx = raid_disks - 1 - (stripe % raid_disks);
-		*dd_idx = (*pd_idx + 2 + *dd_idx) % raid_disks;
-		break;
-	case ALGORITHM_RIGHT_SYMMETRIC:
-		*pd_idx = stripe % raid_disks;
-		*dd_idx = (*pd_idx + 2 + *dd_idx) % raid_disks;
-		break;
-	default:
-		printk (KERN_CRIT "raid6: unsupported algorithm %d\n",
-			conf->algorithm);
-	}
-
-	PRINTK("raid6: chunk_number = %lu, pd_idx = %u, dd_idx = %u\n",
-	       chunk_number, *pd_idx, *dd_idx);
-
-	/*
-	 * Finally, compute the new sector number
-	 */
-	new_sector = (sector_t) stripe * sectors_per_chunk + chunk_offset;
-	return new_sector;
-}
-
-
-static sector_t compute_blocknr(struct stripe_head *sh, int i)
-{
-	raid6_conf_t *conf = sh->raid_conf;
-	int raid_disks = conf->raid_disks, data_disks = raid_disks - 2;
-	sector_t new_sector = sh->sector, check;
-	int sectors_per_chunk = conf->chunk_size >> 9;
-	sector_t stripe;
-	int chunk_offset;
-	int chunk_number, dummy1, dummy2, dd_idx = i;
-	sector_t r_sector;
-	int i0 = i;
-
-	chunk_offset = sector_div(new_sector, sectors_per_chunk);
-	stripe = new_sector;
-	if ( new_sector != stripe ) {
-		printk(KERN_CRIT "raid6: ERROR: new_sector = %llu, stripe = %lu\n",
-		       (unsigned long long)new_sector, (unsigned long)stripe);
-		BUG();
-	}
-
-	switch (conf->algorithm) {
-		case ALGORITHM_LEFT_ASYMMETRIC:
-		case ALGORITHM_RIGHT_ASYMMETRIC:
-		  	if (sh->pd_idx == raid_disks-1)
-				i--; 	/* Q D D D P */
-			else if (i > sh->pd_idx)
-				i -= 2; /* D D P Q D */
-			break;
-		case ALGORITHM_LEFT_SYMMETRIC:
-		case ALGORITHM_RIGHT_SYMMETRIC:
-			if (sh->pd_idx == raid_disks-1)
-				i--; /* Q D D D P */
-			else {
-				/* D D P Q D */
-				if (i < sh->pd_idx)
-					i += raid_disks;
-				i -= (sh->pd_idx + 2);
-			}
-			break;
-		default:
-			printk (KERN_CRIT "raid6: unsupported algorithm %d\n",
-				conf->algorithm);
-	}
-
-	PRINTK("raid6: compute_blocknr: pd_idx = %u, i0 = %u, i = %u\n", sh->pd_idx, i0, i);
-
-	chunk_number = stripe * data_disks + i;
-	r_sector = (sector_t)chunk_number * sectors_per_chunk + chunk_offset;
-
-	check = raid6_compute_sector (r_sector, raid_disks, data_disks, &dummy1, &dummy2, conf);
-	if (check != sh->sector || dummy1 != dd_idx || dummy2 != sh->pd_idx) {
-		printk(KERN_CRIT "raid6: compute_blocknr: map not correct\n");
-		return 0;
-	}
-	return r_sector;
-}
-
-
-
-/*
- * Copy data between a page in the stripe cache, and one or more bion
- * The page could align with the middle of the bio, or there could be
- * several bion, each with several bio_vecs, which cover part of the page
- * Multiple bion are linked together on bi_next.  There may be extras
- * at the end of this list.  We ignore them.
- */
-static void copy_data(int frombio, struct bio *bio,
-		     struct page *page,
-		     sector_t sector)
-{
-	char *pa = page_address(page);
-	struct bio_vec *bvl;
-	int i;
-	int page_offset;
-
-	if (bio->bi_sector >= sector)
-		page_offset = (signed)(bio->bi_sector - sector) * 512;
-	else
-		page_offset = (signed)(sector - bio->bi_sector) * -512;
-	bio_for_each_segment(bvl, bio, i) {
-		int len = bio_iovec_idx(bio,i)->bv_len;
-		int clen;
-		int b_offset = 0;
-
-		if (page_offset < 0) {
-			b_offset = -page_offset;
-			page_offset += b_offset;
-			len -= b_offset;
-		}
-
-		if (len > 0 && page_offset + len > STRIPE_SIZE)
-			clen = STRIPE_SIZE - page_offset;
-		else clen = len;
-
-		if (clen > 0) {
-			char *ba = __bio_kmap_atomic(bio, i, KM_USER0);
-			if (frombio)
-				memcpy(pa+page_offset, ba+b_offset, clen);
-			else
-				memcpy(ba+b_offset, pa+page_offset, clen);
-			__bio_kunmap_atomic(ba, KM_USER0);
-		}
-		if (clen < len) /* hit end of page */
-			break;
-		page_offset +=  len;
-	}
-}
-
-#define check_xor() 	do { 						\
-			   if (count == MAX_XOR_BLOCKS) {		\
-				xor_block(count, STRIPE_SIZE, ptr);	\
-				count = 1;				\
-			   }						\
-			} while(0)
-
-/* Compute P and Q syndromes */
-static void compute_parity(struct stripe_head *sh, int method)
-{
-	raid6_conf_t *conf = sh->raid_conf;
-	int i, pd_idx = sh->pd_idx, qd_idx, d0_idx, disks = conf->raid_disks, count;
-	struct bio *chosen;
-	/**** FIX THIS: This could be very bad if disks is close to 256 ****/
-	void *ptrs[disks];
-
-	qd_idx = raid6_next_disk(pd_idx, disks);
-	d0_idx = raid6_next_disk(qd_idx, disks);
-
-	PRINTK("compute_parity, stripe %llu, method %d\n",
-		(unsigned long long)sh->sector, method);
-
-	switch(method) {
-	case READ_MODIFY_WRITE:
-		BUG();		/* READ_MODIFY_WRITE N/A for RAID-6 */
-	case RECONSTRUCT_WRITE:
-		for (i= disks; i-- ;)
-			if ( i != pd_idx && i != qd_idx && sh->dev[i].towrite ) {
-				chosen = sh->dev[i].towrite;
-				sh->dev[i].towrite = NULL;
-
-				if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
-					wake_up(&conf->wait_for_overlap);
-
-				BUG_ON(sh->dev[i].written);
-				sh->dev[i].written = chosen;
-			}
-		break;
-	case CHECK_PARITY:
-		BUG();		/* Not implemented yet */
-	}
-
-	for (i = disks; i--;)
-		if (sh->dev[i].written) {
-			sector_t sector = sh->dev[i].sector;
-			struct bio *wbi = sh->dev[i].written;
-			while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
-				copy_data(1, wbi, sh->dev[i].page, sector);
-				wbi = r5_next_bio(wbi, sector);
-			}
-
-			set_bit(R5_LOCKED, &sh->dev[i].flags);
-			set_bit(R5_UPTODATE, &sh->dev[i].flags);
-		}
-
-//	switch(method) {
-//	case RECONSTRUCT_WRITE:
-//	case CHECK_PARITY:
-//	case UPDATE_PARITY:
-		/* Note that unlike RAID-5, the ordering of the disks matters greatly. */
-		/* FIX: Is this ordering of drives even remotely optimal? */
-		count = 0;
-		i = d0_idx;
-		do {
-			ptrs[count++] = page_address(sh->dev[i].page);
-			if (count <= disks-2 && !test_bit(R5_UPTODATE, &sh->dev[i].flags))
-				printk("block %d/%d not uptodate on parity calc\n", i,count);
-			i = raid6_next_disk(i, disks);
-		} while ( i != d0_idx );
-//		break;
-//	}
-
-	raid6_call.gen_syndrome(disks, STRIPE_SIZE, ptrs);
-
-	switch(method) {
-	case RECONSTRUCT_WRITE:
-		set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
-		set_bit(R5_UPTODATE, &sh->dev[qd_idx].flags);
-		set_bit(R5_LOCKED,   &sh->dev[pd_idx].flags);
-		set_bit(R5_LOCKED,   &sh->dev[qd_idx].flags);
-		break;
-	case UPDATE_PARITY:
-		set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
-		set_bit(R5_UPTODATE, &sh->dev[qd_idx].flags);
-		break;
-	}
-}
-
-/* Compute one missing block */
-static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero)
-{
-	raid6_conf_t *conf = sh->raid_conf;
-	int i, count, disks = conf->raid_disks;
-	void *ptr[MAX_XOR_BLOCKS], *p;
-	int pd_idx = sh->pd_idx;
-	int qd_idx = raid6_next_disk(pd_idx, disks);
-
-	PRINTK("compute_block_1, stripe %llu, idx %d\n",
-		(unsigned long long)sh->sector, dd_idx);
-
-	if ( dd_idx == qd_idx ) {
-		/* We're actually computing the Q drive */
-		compute_parity(sh, UPDATE_PARITY);
-	} else {
-		ptr[0] = page_address(sh->dev[dd_idx].page);
-		if (!nozero) memset(ptr[0], 0, STRIPE_SIZE);
-		count = 1;
-		for (i = disks ; i--; ) {
-			if (i == dd_idx || i == qd_idx)
-				continue;
-			p = page_address(sh->dev[i].page);
-			if (test_bit(R5_UPTODATE, &sh->dev[i].flags))
-				ptr[count++] = p;
-			else
-				printk("compute_block() %d, stripe %llu, %d"
-				       " not present\n", dd_idx,
-				       (unsigned long long)sh->sector, i);
-
-			check_xor();
-		}
-		if (count != 1)
-			xor_block(count, STRIPE_SIZE, ptr);
-		if (!nozero) set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags);
-		else clear_bit(R5_UPTODATE, &sh->dev[dd_idx].flags);
-	}
-}
-
-/* Compute two missing blocks */
-static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2)
-{
-	raid6_conf_t *conf = sh->raid_conf;
-	int i, count, disks = conf->raid_disks;
-	int pd_idx = sh->pd_idx;
-	int qd_idx = raid6_next_disk(pd_idx, disks);
-	int d0_idx = raid6_next_disk(qd_idx, disks);
-	int faila, failb;
-
-	/* faila and failb are disk numbers relative to d0_idx */
-	/* pd_idx become disks-2 and qd_idx become disks-1 */
-	faila = (dd_idx1 < d0_idx) ? dd_idx1+(disks-d0_idx) : dd_idx1-d0_idx;
-	failb = (dd_idx2 < d0_idx) ? dd_idx2+(disks-d0_idx) : dd_idx2-d0_idx;
-
-	BUG_ON(faila == failb);
-	if ( failb < faila ) { int tmp = faila; faila = failb; failb = tmp; }
-
-	PRINTK("compute_block_2, stripe %llu, idx %d,%d (%d,%d)\n",
-	       (unsigned long long)sh->sector, dd_idx1, dd_idx2, faila, failb);
-
-	if ( failb == disks-1 ) {
-		/* Q disk is one of the missing disks */
-		if ( faila == disks-2 ) {
-			/* Missing P+Q, just recompute */
-			compute_parity(sh, UPDATE_PARITY);
-			return;
-		} else {
-			/* We're missing D+Q; recompute D from P */
-			compute_block_1(sh, (dd_idx1 == qd_idx) ? dd_idx2 : dd_idx1, 0);
-			compute_parity(sh, UPDATE_PARITY); /* Is this necessary? */
-			return;
-		}
-	}
-
-	/* We're missing D+P or D+D; build pointer table */
-	{
-		/**** FIX THIS: This could be very bad if disks is close to 256 ****/
-		void *ptrs[disks];
-
-		count = 0;
-		i = d0_idx;
-		do {
-			ptrs[count++] = page_address(sh->dev[i].page);
-			i = raid6_next_disk(i, disks);
-			if (i != dd_idx1 && i != dd_idx2 &&
-			    !test_bit(R5_UPTODATE, &sh->dev[i].flags))
-				printk("compute_2 with missing block %d/%d\n", count, i);
-		} while ( i != d0_idx );
-
-		if ( failb == disks-2 ) {
-			/* We're missing D+P. */
-			raid6_datap_recov(disks, STRIPE_SIZE, faila, ptrs);
-		} else {
-			/* We're missing D+D. */
-			raid6_2data_recov(disks, STRIPE_SIZE, faila, failb, ptrs);
-		}
-
-		/* Both the above update both missing blocks */
-		set_bit(R5_UPTODATE, &sh->dev[dd_idx1].flags);
-		set_bit(R5_UPTODATE, &sh->dev[dd_idx2].flags);
-	}
-}
-
-
-/*
- * Each stripe/dev can have one or more bion attached.
- * toread/towrite point to the first in a chain.
- * The bi_next chain must be in order.
- */
-static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite)
-{
-	struct bio **bip;
-	raid6_conf_t *conf = sh->raid_conf;
-	int firstwrite=0;
-
-	PRINTK("adding bh b#%llu to stripe s#%llu\n",
-		(unsigned long long)bi->bi_sector,
-		(unsigned long long)sh->sector);
-
-
-	spin_lock(&sh->lock);
-	spin_lock_irq(&conf->device_lock);
-	if (forwrite) {
-		bip = &sh->dev[dd_idx].towrite;
-		if (*bip == NULL && sh->dev[dd_idx].written == NULL)
-			firstwrite = 1;
-	} else
-		bip = &sh->dev[dd_idx].toread;
-	while (*bip && (*bip)->bi_sector < bi->bi_sector) {
-		if ((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector)
-			goto overlap;
-		bip = &(*bip)->bi_next;
-	}
-	if (*bip && (*bip)->bi_sector < bi->bi_sector + ((bi->bi_size)>>9))
-		goto overlap;
-
-	BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next);
-	if (*bip)
-		bi->bi_next = *bip;
-	*bip = bi;
-	bi->bi_phys_segments ++;
-	spin_unlock_irq(&conf->device_lock);
-	spin_unlock(&sh->lock);
-
-	PRINTK("added bi b#%llu to stripe s#%llu, disk %d.\n",
-		(unsigned long long)bi->bi_sector,
-		(unsigned long long)sh->sector, dd_idx);
-
-	if (conf->mddev->bitmap && firstwrite) {
-		sh->bm_seq = conf->seq_write;
-		bitmap_startwrite(conf->mddev->bitmap, sh->sector,
-				  STRIPE_SECTORS, 0);
-		set_bit(STRIPE_BIT_DELAY, &sh->state);
-	}
-
-	if (forwrite) {
-		/* check if page is covered */
-		sector_t sector = sh->dev[dd_idx].sector;
-		for (bi=sh->dev[dd_idx].towrite;
-		     sector < sh->dev[dd_idx].sector + STRIPE_SECTORS &&
-			     bi && bi->bi_sector <= sector;
-		     bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) {
-			if (bi->bi_sector + (bi->bi_size>>9) >= sector)
-				sector = bi->bi_sector + (bi->bi_size>>9);
-		}
-		if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS)
-			set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags);
-	}
-	return 1;
-
- overlap:
-	set_bit(R5_Overlap, &sh->dev[dd_idx].flags);
-	spin_unlock_irq(&conf->device_lock);
-	spin_unlock(&sh->lock);
-	return 0;
-}
-
-
-static int page_is_zero(struct page *p)
-{
-	char *a = page_address(p);
-	return ((*(u32*)a) == 0 &&
-		memcmp(a, a+4, STRIPE_SIZE-4)==0);
-}
-/*
- * handle_stripe - do things to a stripe.
- *
- * We lock the stripe and then examine the state of various bits
- * to see what needs to be done.
- * Possible results:
- *    return some read request which now have data
- *    return some write requests which are safely on disc
- *    schedule a read on some buffers
- *    schedule a write of some buffers
- *    return confirmation of parity correctness
- *
- * Parity calculations are done inside the stripe lock
- * buffers are taken off read_list or write_list, and bh_cache buffers
- * get BH_Lock set before the stripe lock is released.
- *
- */
-
-static void handle_stripe(struct stripe_head *sh, struct page *tmp_page)
-{
-	raid6_conf_t *conf = sh->raid_conf;
-	int disks = conf->raid_disks;
-	struct bio *return_bi= NULL;
-	struct bio *bi;
-	int i;
-	int syncing;
-	int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0;
-	int non_overwrite = 0;
-	int failed_num[2] = {0, 0};
-	struct r5dev *dev, *pdev, *qdev;
-	int pd_idx = sh->pd_idx;
-	int qd_idx = raid6_next_disk(pd_idx, disks);
-	int p_failed, q_failed;
-
-	PRINTK("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d, qd_idx=%d\n",
-	       (unsigned long long)sh->sector, sh->state, atomic_read(&sh->count),
-	       pd_idx, qd_idx);
-
-	spin_lock(&sh->lock);
-	clear_bit(STRIPE_HANDLE, &sh->state);
-	clear_bit(STRIPE_DELAYED, &sh->state);
-
-	syncing = test_bit(STRIPE_SYNCING, &sh->state);
-	/* Now to look around and see what can be done */
-
-	rcu_read_lock();
-	for (i=disks; i--; ) {
-		mdk_rdev_t *rdev;
-		dev = &sh->dev[i];
-		clear_bit(R5_Insync, &dev->flags);
-
-		PRINTK("check %d: state 0x%lx read %p write %p written %p\n",
-			i, dev->flags, dev->toread, dev->towrite, dev->written);
-		/* maybe we can reply to a read */
-		if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread) {
-			struct bio *rbi, *rbi2;
-			PRINTK("Return read for disc %d\n", i);
-			spin_lock_irq(&conf->device_lock);
-			rbi = dev->toread;
-			dev->toread = NULL;
-			if (test_and_clear_bit(R5_Overlap, &dev->flags))
-				wake_up(&conf->wait_for_overlap);
-			spin_unlock_irq(&conf->device_lock);
-			while (rbi && rbi->bi_sector < dev->sector + STRIPE_SECTORS) {
-				copy_data(0, rbi, dev->page, dev->sector);
-				rbi2 = r5_next_bio(rbi, dev->sector);
-				spin_lock_irq(&conf->device_lock);
-				if (--rbi->bi_phys_segments == 0) {
-					rbi->bi_next = return_bi;
-					return_bi = rbi;
-				}
-				spin_unlock_irq(&conf->device_lock);
-				rbi = rbi2;
-			}
-		}
-
-		/* now count some things */
-		if (test_bit(R5_LOCKED, &dev->flags)) locked++;
-		if (test_bit(R5_UPTODATE, &dev->flags)) uptodate++;
-
-
-		if (dev->toread) to_read++;
-		if (dev->towrite) {
-			to_write++;
-			if (!test_bit(R5_OVERWRITE, &dev->flags))
-				non_overwrite++;
-		}
-		if (dev->written) written++;
-		rdev = rcu_dereference(conf->disks[i].rdev);
-		if (!rdev || !test_bit(In_sync, &rdev->flags)) {
-			/* The ReadError flag will just be confusing now */
-			clear_bit(R5_ReadError, &dev->flags);
-			clear_bit(R5_ReWrite, &dev->flags);
-		}
-		if (!rdev || !test_bit(In_sync, &rdev->flags)
-		    || test_bit(R5_ReadError, &dev->flags)) {
-			if ( failed < 2 )
-				failed_num[failed] = i;
-			failed++;
-		} else
-			set_bit(R5_Insync, &dev->flags);
-	}
-	rcu_read_unlock();
-	PRINTK("locked=%d uptodate=%d to_read=%d"
-	       " to_write=%d failed=%d failed_num=%d,%d\n",
-	       locked, uptodate, to_read, to_write, failed,
-	       failed_num[0], failed_num[1]);
-	/* check if the array has lost >2 devices and, if so, some requests might
-	 * need to be failed
-	 */
-	if (failed > 2 && to_read+to_write+written) {
-		for (i=disks; i--; ) {
-			int bitmap_end = 0;
-
-			if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
-				mdk_rdev_t *rdev;
-				rcu_read_lock();
-				rdev = rcu_dereference(conf->disks[i].rdev);
-				if (rdev && test_bit(In_sync, &rdev->flags))
-					/* multiple read failures in one stripe */
-					md_error(conf->mddev, rdev);
-				rcu_read_unlock();
-			}
-
-			spin_lock_irq(&conf->device_lock);
-			/* fail all writes first */
-			bi = sh->dev[i].towrite;
-			sh->dev[i].towrite = NULL;
-			if (bi) { to_write--; bitmap_end = 1; }
-
-			if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
-				wake_up(&conf->wait_for_overlap);
-
-			while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){
-				struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
-				clear_bit(BIO_UPTODATE, &bi->bi_flags);
-				if (--bi->bi_phys_segments == 0) {
-					md_write_end(conf->mddev);
-					bi->bi_next = return_bi;
-					return_bi = bi;
-				}
-				bi = nextbi;
-			}
-			/* and fail all 'written' */
-			bi = sh->dev[i].written;
-			sh->dev[i].written = NULL;
-			if (bi) bitmap_end = 1;
-			while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS) {
-				struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
-				clear_bit(BIO_UPTODATE, &bi->bi_flags);
-				if (--bi->bi_phys_segments == 0) {
-					md_write_end(conf->mddev);
-					bi->bi_next = return_bi;
-					return_bi = bi;
-				}
-				bi = bi2;
-			}
-
-			/* fail any reads if this device is non-operational */
-			if (!test_bit(R5_Insync, &sh->dev[i].flags) ||
-			    test_bit(R5_ReadError, &sh->dev[i].flags)) {
-				bi = sh->dev[i].toread;
-				sh->dev[i].toread = NULL;
-				if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
-					wake_up(&conf->wait_for_overlap);
-				if (bi) to_read--;
-				while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){
-					struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
-					clear_bit(BIO_UPTODATE, &bi->bi_flags);
-					if (--bi->bi_phys_segments == 0) {
-						bi->bi_next = return_bi;
-						return_bi = bi;
-					}
-					bi = nextbi;
-				}
-			}
-			spin_unlock_irq(&conf->device_lock);
-			if (bitmap_end)
-				bitmap_endwrite(conf->mddev->bitmap, sh->sector,
-						STRIPE_SECTORS, 0, 0);
-		}
-	}
-	if (failed > 2 && syncing) {
-		md_done_sync(conf->mddev, STRIPE_SECTORS,0);
-		clear_bit(STRIPE_SYNCING, &sh->state);
-		syncing = 0;
-	}
-
-	/*
-	 * might be able to return some write requests if the parity blocks
-	 * are safe, or on a failed drive
-	 */
-	pdev = &sh->dev[pd_idx];
-	p_failed = (failed >= 1 && failed_num[0] == pd_idx)
-		|| (failed >= 2 && failed_num[1] == pd_idx);
-	qdev = &sh->dev[qd_idx];
-	q_failed = (failed >= 1 && failed_num[0] == qd_idx)
-		|| (failed >= 2 && failed_num[1] == qd_idx);
-
-	if ( written &&
-	     ( p_failed || ((test_bit(R5_Insync, &pdev->flags)
-			     && !test_bit(R5_LOCKED, &pdev->flags)
-			     && test_bit(R5_UPTODATE, &pdev->flags))) ) &&
-	     ( q_failed || ((test_bit(R5_Insync, &qdev->flags)
-			     && !test_bit(R5_LOCKED, &qdev->flags)
-			     && test_bit(R5_UPTODATE, &qdev->flags))) ) ) {
-		/* any written block on an uptodate or failed drive can be
-		 * returned.  Note that if we 'wrote' to a failed drive,
-		 * it will be UPTODATE, but never LOCKED, so we don't need
-		 * to test 'failed' directly.
-		 */
-		for (i=disks; i--; )
-			if (sh->dev[i].written) {
-				dev = &sh->dev[i];
-				if (!test_bit(R5_LOCKED, &dev->flags) &&
-				    test_bit(R5_UPTODATE, &dev->flags) ) {
-					/* We can return any write requests */
-					int bitmap_end = 0;
-					struct bio *wbi, *wbi2;
-					PRINTK("Return write for stripe %llu disc %d\n",
-					       (unsigned long long)sh->sector, i);
-					spin_lock_irq(&conf->device_lock);
-					wbi = dev->written;
-					dev->written = NULL;
-					while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) {
-						wbi2 = r5_next_bio(wbi, dev->sector);
-						if (--wbi->bi_phys_segments == 0) {
-							md_write_end(conf->mddev);
-							wbi->bi_next = return_bi;
-							return_bi = wbi;
-						}
-						wbi = wbi2;
-					}
-					if (dev->towrite == NULL)
-						bitmap_end = 1;
-					spin_unlock_irq(&conf->device_lock);
-					if (bitmap_end)
-						bitmap_endwrite(conf->mddev->bitmap, sh->sector,
-								STRIPE_SECTORS,
-								!test_bit(STRIPE_DEGRADED, &sh->state), 0);
-				}
-			}
-	}
-
-	/* Now we might consider reading some blocks, either to check/generate
-	 * parity, or to satisfy requests
-	 * or to load a block that is being partially written.
-	 */
-	if (to_read || non_overwrite || (to_write && failed) || (syncing && (uptodate < disks))) {
-		for (i=disks; i--;) {
-			dev = &sh->dev[i];
-			if (!test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&
-			    (dev->toread ||
-			     (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
-			     syncing ||
-			     (failed >= 1 && (sh->dev[failed_num[0]].toread || to_write)) ||
-			     (failed >= 2 && (sh->dev[failed_num[1]].toread || to_write))
-				    )
-				) {
-				/* we would like to get this block, possibly
-				 * by computing it, but we might not be able to
-				 */
-				if (uptodate == disks-1) {
-					PRINTK("Computing stripe %llu block %d\n",
-					       (unsigned long long)sh->sector, i);
-					compute_block_1(sh, i, 0);
-					uptodate++;
-				} else if ( uptodate == disks-2 && failed >= 2 ) {
-					/* Computing 2-failure is *very* expensive; only do it if failed >= 2 */
-					int other;
-					for (other=disks; other--;) {
-						if ( other == i )
-							continue;
-						if ( !test_bit(R5_UPTODATE, &sh->dev[other].flags) )
-							break;
-					}
-					BUG_ON(other < 0);
-					PRINTK("Computing stripe %llu blocks %d,%d\n",
-					       (unsigned long long)sh->sector, i, other);
-					compute_block_2(sh, i, other);
-					uptodate += 2;
-				} else if (test_bit(R5_Insync, &dev->flags)) {
-					set_bit(R5_LOCKED, &dev->flags);
-					set_bit(R5_Wantread, &dev->flags);
-#if 0
-					/* if I am just reading this block and we don't have
-					   a failed drive, or any pending writes then sidestep the cache */
-					if (sh->bh_read[i] && !sh->bh_read[i]->b_reqnext &&
-					    ! syncing && !failed && !to_write) {
-						sh->bh_cache[i]->b_page =  sh->bh_read[i]->b_page;
-						sh->bh_cache[i]->b_data =  sh->bh_read[i]->b_data;
-					}
-#endif
-					locked++;
-					PRINTK("Reading block %d (sync=%d)\n",
-						i, syncing);
-				}
-			}
-		}
-		set_bit(STRIPE_HANDLE, &sh->state);
-	}
-
-	/* now to consider writing and what else, if anything should be read */
-	if (to_write) {
-		int rcw=0, must_compute=0;
-		for (i=disks ; i--;) {
-			dev = &sh->dev[i];
-			/* Would I have to read this buffer for reconstruct_write */
-			if (!test_bit(R5_OVERWRITE, &dev->flags)
-			    && i != pd_idx && i != qd_idx
-			    && (!test_bit(R5_LOCKED, &dev->flags)
-#if 0
-				|| sh->bh_page[i] != bh->b_page
-#endif
-				    ) &&
-			    !test_bit(R5_UPTODATE, &dev->flags)) {
-				if (test_bit(R5_Insync, &dev->flags)) rcw++;
-				else {
-					PRINTK("raid6: must_compute: disk %d flags=%#lx\n", i, dev->flags);
-					must_compute++;
-				}
-			}
-		}
-		PRINTK("for sector %llu, rcw=%d, must_compute=%d\n",
-		       (unsigned long long)sh->sector, rcw, must_compute);
-		set_bit(STRIPE_HANDLE, &sh->state);
-
-		if (rcw > 0)
-			/* want reconstruct write, but need to get some data */
-			for (i=disks; i--;) {
-				dev = &sh->dev[i];
-				if (!test_bit(R5_OVERWRITE, &dev->flags)
-				    && !(failed == 0 && (i == pd_idx || i == qd_idx))
-				    && !test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&
-				    test_bit(R5_Insync, &dev->flags)) {
-					if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
-					{
-						PRINTK("Read_old stripe %llu block %d for Reconstruct\n",
-						       (unsigned long long)sh->sector, i);
-						set_bit(R5_LOCKED, &dev->flags);
-						set_bit(R5_Wantread, &dev->flags);
-						locked++;
-					} else {
-						PRINTK("Request delayed stripe %llu block %d for Reconstruct\n",
-						       (unsigned long long)sh->sector, i);
-						set_bit(STRIPE_DELAYED, &sh->state);
-						set_bit(STRIPE_HANDLE, &sh->state);
-					}
-				}
-			}
-		/* now if nothing is locked, and if we have enough data, we can start a write request */
-		if (locked == 0 && rcw == 0 &&
-		    !test_bit(STRIPE_BIT_DELAY, &sh->state)) {
-			if ( must_compute > 0 ) {
-				/* We have failed blocks and need to compute them */
-				switch ( failed ) {
-				case 0:	BUG();
-				case 1: compute_block_1(sh, failed_num[0], 0); break;
-				case 2: compute_block_2(sh, failed_num[0], failed_num[1]); break;
-				default: BUG();	/* This request should have been failed? */
-				}
-			}
-
-			PRINTK("Computing parity for stripe %llu\n", (unsigned long long)sh->sector);
-			compute_parity(sh, RECONSTRUCT_WRITE);
-			/* now every locked buffer is ready to be written */
-			for (i=disks; i--;)
-				if (test_bit(R5_LOCKED, &sh->dev[i].flags)) {
-					PRINTK("Writing stripe %llu block %d\n",
-					       (unsigned long long)sh->sector, i);
-					locked++;
-					set_bit(R5_Wantwrite, &sh->dev[i].flags);
-				}
-			/* after a RECONSTRUCT_WRITE, the stripe MUST be in-sync */
-			set_bit(STRIPE_INSYNC, &sh->state);
-
-			if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
-				atomic_dec(&conf->preread_active_stripes);
-				if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
-					md_wakeup_thread(conf->mddev->thread);
-			}
-		}
-	}
-
-	/* maybe we need to check and possibly fix the parity for this stripe
-	 * Any reads will already have been scheduled, so we just see if enough data
-	 * is available
-	 */
-	if (syncing && locked == 0 && !test_bit(STRIPE_INSYNC, &sh->state)) {
-		int update_p = 0, update_q = 0;
-		struct r5dev *dev;
-
-		set_bit(STRIPE_HANDLE, &sh->state);
-
-		BUG_ON(failed>2);
-		BUG_ON(uptodate < disks);
-		/* Want to check and possibly repair P and Q.
-		 * However there could be one 'failed' device, in which
-		 * case we can only check one of them, possibly using the
-		 * other to generate missing data
-		 */
-
-		/* If !tmp_page, we cannot do the calculations,
-		 * but as we have set STRIPE_HANDLE, we will soon be called
-		 * by stripe_handle with a tmp_page - just wait until then.
-		 */
-		if (tmp_page) {
-			if (failed == q_failed) {
-				/* The only possible failed device holds 'Q', so it makes
-				 * sense to check P (If anything else were failed, we would
-				 * have used P to recreate it).
-				 */
-				compute_block_1(sh, pd_idx, 1);
-				if (!page_is_zero(sh->dev[pd_idx].page)) {
-					compute_block_1(sh,pd_idx,0);
-					update_p = 1;
-				}
-			}
-			if (!q_failed && failed < 2) {
-				/* q is not failed, and we didn't use it to generate
-				 * anything, so it makes sense to check it
-				 */
-				memcpy(page_address(tmp_page),
-				       page_address(sh->dev[qd_idx].page),
-				       STRIPE_SIZE);
-				compute_parity(sh, UPDATE_PARITY);
-				if (memcmp(page_address(tmp_page),
-					   page_address(sh->dev[qd_idx].page),
-					   STRIPE_SIZE)!= 0) {
-					clear_bit(STRIPE_INSYNC, &sh->state);
-					update_q = 1;
-				}
-			}
-			if (update_p || update_q) {
-				conf->mddev->resync_mismatches += STRIPE_SECTORS;
-				if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
-					/* don't try to repair!! */
-					update_p = update_q = 0;
-			}
-
-			/* now write out any block on a failed drive,
-			 * or P or Q if they need it
-			 */
-
-			if (failed == 2) {
-				dev = &sh->dev[failed_num[1]];
-				locked++;
-				set_bit(R5_LOCKED, &dev->flags);
-				set_bit(R5_Wantwrite, &dev->flags);
-			}
-			if (failed >= 1) {
-				dev = &sh->dev[failed_num[0]];
-				locked++;
-				set_bit(R5_LOCKED, &dev->flags);
-				set_bit(R5_Wantwrite, &dev->flags);
-			}
-
-			if (update_p) {
-				dev = &sh->dev[pd_idx];
-				locked ++;
-				set_bit(R5_LOCKED, &dev->flags);
-				set_bit(R5_Wantwrite, &dev->flags);
-			}
-			if (update_q) {
-				dev = &sh->dev[qd_idx];
-				locked++;
-				set_bit(R5_LOCKED, &dev->flags);
-				set_bit(R5_Wantwrite, &dev->flags);
-			}
-			clear_bit(STRIPE_DEGRADED, &sh->state);
-
-			set_bit(STRIPE_INSYNC, &sh->state);
-		}
-	}
-
-	if (syncing && locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
-		md_done_sync(conf->mddev, STRIPE_SECTORS,1);
-		clear_bit(STRIPE_SYNCING, &sh->state);
-	}
-
-	/* If the failed drives are just a ReadError, then we might need
-	 * to progress the repair/check process
-	 */
-	if (failed <= 2 && ! conf->mddev->ro)
-		for (i=0; i<failed;i++) {
-			dev = &sh->dev[failed_num[i]];
-			if (test_bit(R5_ReadError, &dev->flags)
-			    && !test_bit(R5_LOCKED, &dev->flags)
-			    && test_bit(R5_UPTODATE, &dev->flags)
-				) {
-				if (!test_bit(R5_ReWrite, &dev->flags)) {
-					set_bit(R5_Wantwrite, &dev->flags);
-					set_bit(R5_ReWrite, &dev->flags);
-					set_bit(R5_LOCKED, &dev->flags);
-				} else {
-					/* let's read it back */
-					set_bit(R5_Wantread, &dev->flags);
-					set_bit(R5_LOCKED, &dev->flags);
-				}
-			}
-		}
-	spin_unlock(&sh->lock);
-
-	while ((bi=return_bi)) {
-		int bytes = bi->bi_size;
-
-		return_bi = bi->bi_next;
-		bi->bi_next = NULL;
-		bi->bi_size = 0;
-		bi->bi_end_io(bi, bytes, 0);
-	}
-	for (i=disks; i-- ;) {
-		int rw;
-		struct bio *bi;
-		mdk_rdev_t *rdev;
-		if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags))
-			rw = 1;
-		else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
-			rw = 0;
-		else
-			continue;
-
-		bi = &sh->dev[i].req;
-
-		bi->bi_rw = rw;
-		if (rw)
-			bi->bi_end_io = raid6_end_write_request;
-		else
-			bi->bi_end_io = raid6_end_read_request;
-
-		rcu_read_lock();
-		rdev = rcu_dereference(conf->disks[i].rdev);
-		if (rdev && test_bit(Faulty, &rdev->flags))
-			rdev = NULL;
-		if (rdev)
-			atomic_inc(&rdev->nr_pending);
-		rcu_read_unlock();
-
-		if (rdev) {
-			if (syncing)
-				md_sync_acct(rdev->bdev, STRIPE_SECTORS);
-
-			bi->bi_bdev = rdev->bdev;
-			PRINTK("for %llu schedule op %ld on disc %d\n",
-				(unsigned long long)sh->sector, bi->bi_rw, i);
-			atomic_inc(&sh->count);
-			bi->bi_sector = sh->sector + rdev->data_offset;
-			bi->bi_flags = 1 << BIO_UPTODATE;
-			bi->bi_vcnt = 1;
-			bi->bi_max_vecs = 1;
-			bi->bi_idx = 0;
-			bi->bi_io_vec = &sh->dev[i].vec;
-			bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
-			bi->bi_io_vec[0].bv_offset = 0;
-			bi->bi_size = STRIPE_SIZE;
-			bi->bi_next = NULL;
-			if (rw == WRITE &&
-			    test_bit(R5_ReWrite, &sh->dev[i].flags))
-				atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
-			generic_make_request(bi);
-		} else {
-			if (rw == 1)
-				set_bit(STRIPE_DEGRADED, &sh->state);
-			PRINTK("skip op %ld on disc %d for sector %llu\n",
-				bi->bi_rw, i, (unsigned long long)sh->sector);
-			clear_bit(R5_LOCKED, &sh->dev[i].flags);
-			set_bit(STRIPE_HANDLE, &sh->state);
-		}
-	}
-}
-
-static void raid6_activate_delayed(raid6_conf_t *conf)
-{
-	if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) {
-		while (!list_empty(&conf->delayed_list)) {
-			struct list_head *l = conf->delayed_list.next;
-			struct stripe_head *sh;
-			sh = list_entry(l, struct stripe_head, lru);
-			list_del_init(l);
-			clear_bit(STRIPE_DELAYED, &sh->state);
-			if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
-				atomic_inc(&conf->preread_active_stripes);
-			list_add_tail(&sh->lru, &conf->handle_list);
-		}
-	}
-}
-
-static void activate_bit_delay(raid6_conf_t *conf)
-{
-	/* device_lock is held */
-	struct list_head head;
-	list_add(&head, &conf->bitmap_list);
-	list_del_init(&conf->bitmap_list);
-	while (!list_empty(&head)) {
-		struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru);
-		list_del_init(&sh->lru);
-		atomic_inc(&sh->count);
-		__release_stripe(conf, sh);
-	}
-}
-
-static void unplug_slaves(mddev_t *mddev)
-{
-	raid6_conf_t *conf = mddev_to_conf(mddev);
-	int i;
-
-	rcu_read_lock();
-	for (i=0; i<mddev->raid_disks; i++) {
-		mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev);
-		if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) {
-			request_queue_t *r_queue = bdev_get_queue(rdev->bdev);
-
-			atomic_inc(&rdev->nr_pending);
-			rcu_read_unlock();
-
-			if (r_queue->unplug_fn)
-				r_queue->unplug_fn(r_queue);
-
-			rdev_dec_pending(rdev, mddev);
-			rcu_read_lock();
-		}
-	}
-	rcu_read_unlock();
-}
-
-static void raid6_unplug_device(request_queue_t *q)
-{
-	mddev_t *mddev = q->queuedata;
-	raid6_conf_t *conf = mddev_to_conf(mddev);
-	unsigned long flags;
-
-	spin_lock_irqsave(&conf->device_lock, flags);
-
-	if (blk_remove_plug(q)) {
-		conf->seq_flush++;
-		raid6_activate_delayed(conf);
-	}
-	md_wakeup_thread(mddev->thread);
-
-	spin_unlock_irqrestore(&conf->device_lock, flags);
-
-	unplug_slaves(mddev);
-}
-
-static int raid6_issue_flush(request_queue_t *q, struct gendisk *disk,
-			     sector_t *error_sector)
-{
-	mddev_t *mddev = q->queuedata;
-	raid6_conf_t *conf = mddev_to_conf(mddev);
-	int i, ret = 0;
-
-	rcu_read_lock();
-	for (i=0; i<mddev->raid_disks && ret == 0; i++) {
-		mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev);
-		if (rdev && !test_bit(Faulty, &rdev->flags)) {
-			struct block_device *bdev = rdev->bdev;
-			request_queue_t *r_queue = bdev_get_queue(bdev);
-
-			if (!r_queue->issue_flush_fn)
-				ret = -EOPNOTSUPP;
-			else {
-				atomic_inc(&rdev->nr_pending);
-				rcu_read_unlock();
-				ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk,
-							      error_sector);
-				rdev_dec_pending(rdev, mddev);
-				rcu_read_lock();
-			}
-		}
-	}
-	rcu_read_unlock();
-	return ret;
-}
-
-static inline void raid6_plug_device(raid6_conf_t *conf)
-{
-	spin_lock_irq(&conf->device_lock);
-	blk_plug_device(conf->mddev->queue);
-	spin_unlock_irq(&conf->device_lock);
-}
-
-static int make_request (request_queue_t *q, struct bio * bi)
-{
-	mddev_t *mddev = q->queuedata;
-	raid6_conf_t *conf = mddev_to_conf(mddev);
-	const unsigned int raid_disks = conf->raid_disks;
-	const unsigned int data_disks = raid_disks - 2;
-	unsigned int dd_idx, pd_idx;
-	sector_t new_sector;
-	sector_t logical_sector, last_sector;
-	struct stripe_head *sh;
-	const int rw = bio_data_dir(bi);
-
-	if (unlikely(bio_barrier(bi))) {
-		bio_endio(bi, bi->bi_size, -EOPNOTSUPP);
-		return 0;
-	}
-
-	md_write_start(mddev, bi);
-
-	disk_stat_inc(mddev->gendisk, ios[rw]);
-	disk_stat_add(mddev->gendisk, sectors[rw], bio_sectors(bi));
-
-	logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
-	last_sector = bi->bi_sector + (bi->bi_size>>9);
-
-	bi->bi_next = NULL;
-	bi->bi_phys_segments = 1;	/* over-loaded to count active stripes */
-
-	for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
-		DEFINE_WAIT(w);
-
-		new_sector = raid6_compute_sector(logical_sector,
-						  raid_disks, data_disks, &dd_idx, &pd_idx, conf);
-
-		PRINTK("raid6: make_request, sector %llu logical %llu\n",
-		       (unsigned long long)new_sector,
-		       (unsigned long long)logical_sector);
-
-	retry:
-		prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
-		sh = get_active_stripe(conf, new_sector, pd_idx, (bi->bi_rw&RWA_MASK));
-		if (sh) {
-			if (!add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK))) {
-				/* Add failed due to overlap.  Flush everything
-				 * and wait a while
-				 */
-				raid6_unplug_device(mddev->queue);
-				release_stripe(sh);
-				schedule();
-				goto retry;
-			}
-			finish_wait(&conf->wait_for_overlap, &w);
-			raid6_plug_device(conf);
-			handle_stripe(sh, NULL);
-			release_stripe(sh);
-		} else {
-			/* cannot get stripe for read-ahead, just give-up */
-			clear_bit(BIO_UPTODATE, &bi->bi_flags);
-			finish_wait(&conf->wait_for_overlap, &w);
-			break;
-		}
-
-	}
-	spin_lock_irq(&conf->device_lock);
-	if (--bi->bi_phys_segments == 0) {
-		int bytes = bi->bi_size;
-
-		if (rw == WRITE )
-			md_write_end(mddev);
-		bi->bi_size = 0;
-		bi->bi_end_io(bi, bytes, 0);
-	}
-	spin_unlock_irq(&conf->device_lock);
-	return 0;
-}
-
-/* FIXME go_faster isn't used */
-static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster)
-{
-	raid6_conf_t *conf = (raid6_conf_t *) mddev->private;
-	struct stripe_head *sh;
-	int sectors_per_chunk = conf->chunk_size >> 9;
-	sector_t x;
-	unsigned long stripe;
-	int chunk_offset;
-	int dd_idx, pd_idx;
-	sector_t first_sector;
-	int raid_disks = conf->raid_disks;
-	int data_disks = raid_disks - 2;
-	sector_t max_sector = mddev->size << 1;
-	int sync_blocks;
-	int still_degraded = 0;
-	int i;
-
-	if (sector_nr >= max_sector) {
-		/* just being told to finish up .. nothing much to do */
-		unplug_slaves(mddev);
-
-		if (mddev->curr_resync < max_sector) /* aborted */
-			bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
-					&sync_blocks, 1);
-		else /* completed sync */
-			conf->fullsync = 0;
-		bitmap_close_sync(mddev->bitmap);
-
-		return 0;
-	}
-	/* if there are 2 or more failed drives and we are trying
-	 * to resync, then assert that we are finished, because there is
-	 * nothing we can do.
-	 */
-	if (mddev->degraded >= 2 && test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
-		sector_t rv = (mddev->size << 1) - sector_nr;
-		*skipped = 1;
-		return rv;
-	}
-	if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
-	    !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
-	    !conf->fullsync && sync_blocks >= STRIPE_SECTORS) {
-		/* we can skip this block, and probably more */
-		sync_blocks /= STRIPE_SECTORS;
-		*skipped = 1;
-		return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */
-	}
-
-	x = sector_nr;
-	chunk_offset = sector_div(x, sectors_per_chunk);
-	stripe = x;
-	BUG_ON(x != stripe);
-
-	first_sector = raid6_compute_sector((sector_t)stripe*data_disks*sectors_per_chunk
-		+ chunk_offset, raid_disks, data_disks, &dd_idx, &pd_idx, conf);
-	sh = get_active_stripe(conf, sector_nr, pd_idx, 1);
-	if (sh == NULL) {
-		sh = get_active_stripe(conf, sector_nr, pd_idx, 0);
-		/* make sure we don't swamp the stripe cache if someone else
-		 * is trying to get access
-		 */
-		schedule_timeout_uninterruptible(1);
-	}
-	/* Need to check if array will still be degraded after recovery/resync
-	 * We don't need to check the 'failed' flag as when that gets set,
-	 * recovery aborts.
-	 */
-	for (i=0; i<mddev->raid_disks; i++)
-		if (conf->disks[i].rdev == NULL)
-			still_degraded = 1;
-
-	bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded);
-
-	spin_lock(&sh->lock);
-	set_bit(STRIPE_SYNCING, &sh->state);
-	clear_bit(STRIPE_INSYNC, &sh->state);
-	spin_unlock(&sh->lock);
-
-	handle_stripe(sh, NULL);
-	release_stripe(sh);
-
-	return STRIPE_SECTORS;
-}
-
-/*
- * This is our raid6 kernel thread.
- *
- * We scan the hash table for stripes which can be handled now.
- * During the scan, completed stripes are saved for us by the interrupt
- * handler, so that they will not have to wait for our next wakeup.
- */
-static void raid6d (mddev_t *mddev)
-{
-	struct stripe_head *sh;
-	raid6_conf_t *conf = mddev_to_conf(mddev);
-	int handled;
-
-	PRINTK("+++ raid6d active\n");
-
-	md_check_recovery(mddev);
-
-	handled = 0;
-	spin_lock_irq(&conf->device_lock);
-	while (1) {
-		struct list_head *first;
-
-		if (conf->seq_flush - conf->seq_write > 0) {
-			int seq = conf->seq_flush;
-			spin_unlock_irq(&conf->device_lock);
-			bitmap_unplug(mddev->bitmap);
-			spin_lock_irq(&conf->device_lock);
-			conf->seq_write = seq;
-			activate_bit_delay(conf);
-		}
-
-		if (list_empty(&conf->handle_list) &&
-		    atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD &&
-		    !blk_queue_plugged(mddev->queue) &&
-		    !list_empty(&conf->delayed_list))
-			raid6_activate_delayed(conf);
-
-		if (list_empty(&conf->handle_list))
-			break;
-
-		first = conf->handle_list.next;
-		sh = list_entry(first, struct stripe_head, lru);
-
-		list_del_init(first);
-		atomic_inc(&sh->count);
-		BUG_ON(atomic_read(&sh->count)!= 1);
-		spin_unlock_irq(&conf->device_lock);
-
-		handled++;
-		handle_stripe(sh, conf->spare_page);
-		release_stripe(sh);
-
-		spin_lock_irq(&conf->device_lock);
-	}
-	PRINTK("%d stripes handled\n", handled);
-
-	spin_unlock_irq(&conf->device_lock);
-
-	unplug_slaves(mddev);
-
-	PRINTK("--- raid6d inactive\n");
-}
-
-static ssize_t
-raid6_show_stripe_cache_size(mddev_t *mddev, char *page)
-{
-	raid6_conf_t *conf = mddev_to_conf(mddev);
-	if (conf)
-		return sprintf(page, "%d\n", conf->max_nr_stripes);
-	else
-		return 0;
-}
-
-static ssize_t
-raid6_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len)
-{
-	raid6_conf_t *conf = mddev_to_conf(mddev);
-	char *end;
-	int new;
-	if (len >= PAGE_SIZE)
-		return -EINVAL;
-	if (!conf)
-		return -ENODEV;
-
-	new = simple_strtoul(page, &end, 10);
-	if (!*page || (*end && *end != '\n') )
-		return -EINVAL;
-	if (new <= 16 || new > 32768)
-		return -EINVAL;
-	while (new < conf->max_nr_stripes) {
-		if (drop_one_stripe(conf))
-			conf->max_nr_stripes--;
-		else
-			break;
-	}
-	while (new > conf->max_nr_stripes) {
-		if (grow_one_stripe(conf))
-			conf->max_nr_stripes++;
-		else break;
-	}
-	return len;
-}
-
-static struct md_sysfs_entry
-raid6_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR,
-				raid6_show_stripe_cache_size,
-				raid6_store_stripe_cache_size);
-
-static ssize_t
-stripe_cache_active_show(mddev_t *mddev, char *page)
-{
-	raid6_conf_t *conf = mddev_to_conf(mddev);
-	if (conf)
-		return sprintf(page, "%d\n", atomic_read(&conf->active_stripes));
-	else
-		return 0;
-}
-
-static struct md_sysfs_entry
-raid6_stripecache_active = __ATTR_RO(stripe_cache_active);
-
-static struct attribute *raid6_attrs[] =  {
-	&raid6_stripecache_size.attr,
-	&raid6_stripecache_active.attr,
-	NULL,
-};
-static struct attribute_group raid6_attrs_group = {
-	.name = NULL,
-	.attrs = raid6_attrs,
-};
-
-static int run(mddev_t *mddev)
-{
-	raid6_conf_t *conf;
-	int raid_disk, memory;
-	mdk_rdev_t *rdev;
-	struct disk_info *disk;
-	struct list_head *tmp;
-
-	if (mddev->level != 6) {
-		PRINTK("raid6: %s: raid level not set to 6 (%d)\n", mdname(mddev), mddev->level);
-		return -EIO;
-	}
-
-	mddev->private = kzalloc(sizeof (raid6_conf_t), GFP_KERNEL);
-	if ((conf = mddev->private) == NULL)
-		goto abort;
-	conf->disks = kzalloc(mddev->raid_disks * sizeof(struct disk_info),
-				 GFP_KERNEL);
-	if (!conf->disks)
-		goto abort;
-
-	conf->mddev = mddev;
-
-	if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL)
-		goto abort;
-
-	conf->spare_page = alloc_page(GFP_KERNEL);
-	if (!conf->spare_page)
-		goto abort;
-
-	spin_lock_init(&conf->device_lock);
-	init_waitqueue_head(&conf->wait_for_stripe);
-	init_waitqueue_head(&conf->wait_for_overlap);
-	INIT_LIST_HEAD(&conf->handle_list);
-	INIT_LIST_HEAD(&conf->delayed_list);
-	INIT_LIST_HEAD(&conf->bitmap_list);
-	INIT_LIST_HEAD(&conf->inactive_list);
-	atomic_set(&conf->active_stripes, 0);
-	atomic_set(&conf->preread_active_stripes, 0);
-
-	PRINTK("raid6: run(%s) called.\n", mdname(mddev));
-
-	ITERATE_RDEV(mddev,rdev,tmp) {
-		raid_disk = rdev->raid_disk;
-		if (raid_disk >= mddev->raid_disks
-		    || raid_disk < 0)
-			continue;
-		disk = conf->disks + raid_disk;
-
-		disk->rdev = rdev;
-
-		if (test_bit(In_sync, &rdev->flags)) {
-			char b[BDEVNAME_SIZE];
-			printk(KERN_INFO "raid6: device %s operational as raid"
-			       " disk %d\n", bdevname(rdev->bdev,b),
-			       raid_disk);
-			conf->working_disks++;
-		}
-	}
-
-	conf->raid_disks = mddev->raid_disks;
-
-	/*
-	 * 0 for a fully functional array, 1 or 2 for a degraded array.
-	 */
-	mddev->degraded = conf->failed_disks = conf->raid_disks - conf->working_disks;
-	conf->mddev = mddev;
-	conf->chunk_size = mddev->chunk_size;
-	conf->level = mddev->level;
-	conf->algorithm = mddev->layout;
-	conf->max_nr_stripes = NR_STRIPES;
-
-	/* device size must be a multiple of chunk size */
-	mddev->size &= ~(mddev->chunk_size/1024 -1);
-	mddev->resync_max_sectors = mddev->size << 1;
-
-	if (conf->raid_disks < 4) {
-		printk(KERN_ERR "raid6: not enough configured devices for %s (%d, minimum 4)\n",
-		       mdname(mddev), conf->raid_disks);
-		goto abort;
-	}
-	if (!conf->chunk_size || conf->chunk_size % 4) {
-		printk(KERN_ERR "raid6: invalid chunk size %d for %s\n",
-		       conf->chunk_size, mdname(mddev));
-		goto abort;
-	}
-	if (conf->algorithm > ALGORITHM_RIGHT_SYMMETRIC) {
-		printk(KERN_ERR
-		       "raid6: unsupported parity algorithm %d for %s\n",
-		       conf->algorithm, mdname(mddev));
-		goto abort;
-	}
-	if (mddev->degraded > 2) {
-		printk(KERN_ERR "raid6: not enough operational devices for %s"
-		       " (%d/%d failed)\n",
-		       mdname(mddev), conf->failed_disks, conf->raid_disks);
-		goto abort;
-	}
-
-	if (mddev->degraded > 0 &&
-	    mddev->recovery_cp != MaxSector) {
-		if (mddev->ok_start_degraded)
-			printk(KERN_WARNING "raid6: starting dirty degraded array:%s"
-			       "- data corruption possible.\n",
-			       mdname(mddev));
-		else {
-			printk(KERN_ERR "raid6: cannot start dirty degraded array"
-			       " for %s\n", mdname(mddev));
-			goto abort;
-		}
-	}
-
-	{
-		mddev->thread = md_register_thread(raid6d, mddev, "%s_raid6");
-		if (!mddev->thread) {
-			printk(KERN_ERR
-			       "raid6: couldn't allocate thread for %s\n",
-			       mdname(mddev));
-			goto abort;
-		}
-	}
-
-	memory = conf->max_nr_stripes * (sizeof(struct stripe_head) +
-		 conf->raid_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;
-	if (grow_stripes(conf, conf->max_nr_stripes)) {
-		printk(KERN_ERR
-		       "raid6: couldn't allocate %dkB for buffers\n", memory);
-		shrink_stripes(conf);
-		md_unregister_thread(mddev->thread);
-		goto abort;
-	} else
-		printk(KERN_INFO "raid6: allocated %dkB for %s\n",
-		       memory, mdname(mddev));
-
-	if (mddev->degraded == 0)
-		printk(KERN_INFO "raid6: raid level %d set %s active with %d out of %d"
-		       " devices, algorithm %d\n", conf->level, mdname(mddev),
-		       mddev->raid_disks-mddev->degraded, mddev->raid_disks,
-		       conf->algorithm);
-	else
-		printk(KERN_ALERT "raid6: raid level %d set %s active with %d"
-		       " out of %d devices, algorithm %d\n", conf->level,
-		       mdname(mddev), mddev->raid_disks - mddev->degraded,
-		       mddev->raid_disks, conf->algorithm);
-
-	print_raid6_conf(conf);
-
-	/* read-ahead size must cover two whole stripes, which is
-	 * 2 * (n-2) * chunksize where 'n' is the number of raid devices
-	 */
-	{
-		int stripe = (mddev->raid_disks-2) *
-			(mddev->chunk_size / PAGE_SIZE);
-		if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
-			mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
-	}
-
-	/* Ok, everything is just fine now */
-	sysfs_create_group(&mddev->kobj, &raid6_attrs_group);
-
-	mddev->array_size =  mddev->size * (mddev->raid_disks - 2);
-
-	mddev->queue->unplug_fn = raid6_unplug_device;
-	mddev->queue->issue_flush_fn = raid6_issue_flush;
-	return 0;
-abort:
-	if (conf) {
-		print_raid6_conf(conf);
-		safe_put_page(conf->spare_page);
-		kfree(conf->stripe_hashtbl);
-		kfree(conf->disks);
-		kfree(conf);
-	}
-	mddev->private = NULL;
-	printk(KERN_ALERT "raid6: failed to run raid set %s\n", mdname(mddev));
-	return -EIO;
-}
-
-
-
-static int stop (mddev_t *mddev)
-{
-	raid6_conf_t *conf = (raid6_conf_t *) mddev->private;
-
-	md_unregister_thread(mddev->thread);
-	mddev->thread = NULL;
-	shrink_stripes(conf);
-	kfree(conf->stripe_hashtbl);
-	blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
-	sysfs_remove_group(&mddev->kobj, &raid6_attrs_group);
-	kfree(conf);
-	mddev->private = NULL;
-	return 0;
-}
-
-#if RAID6_DUMPSTATE
-static void print_sh (struct seq_file *seq, struct stripe_head *sh)
-{
-	int i;
-
-	seq_printf(seq, "sh %llu, pd_idx %d, state %ld.\n",
-		   (unsigned long long)sh->sector, sh->pd_idx, sh->state);
-	seq_printf(seq, "sh %llu,  count %d.\n",
-		   (unsigned long long)sh->sector, atomic_read(&sh->count));
-	seq_printf(seq, "sh %llu, ", (unsigned long long)sh->sector);
-	for (i = 0; i < sh->raid_conf->raid_disks; i++) {
-		seq_printf(seq, "(cache%d: %p %ld) ",
-			   i, sh->dev[i].page, sh->dev[i].flags);
-	}
-	seq_printf(seq, "\n");
-}
-
-static void printall (struct seq_file *seq, raid6_conf_t *conf)
-{
-	struct stripe_head *sh;
-	struct hlist_node *hn;
-	int i;
-
-	spin_lock_irq(&conf->device_lock);
-	for (i = 0; i < NR_HASH; i++) {
-		sh = conf->stripe_hashtbl[i];
-		hlist_for_each_entry(sh, hn, &conf->stripe_hashtbl[i], hash) {
-			if (sh->raid_conf != conf)
-				continue;
-			print_sh(seq, sh);
-		}
-	}
-	spin_unlock_irq(&conf->device_lock);
-}
-#endif
-
-static void status (struct seq_file *seq, mddev_t *mddev)
-{
-	raid6_conf_t *conf = (raid6_conf_t *) mddev->private;
-	int i;
-
-	seq_printf (seq, " level %d, %dk chunk, algorithm %d", mddev->level, mddev->chunk_size >> 10, mddev->layout);
-	seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->working_disks);
-	for (i = 0; i < conf->raid_disks; i++)
- 		seq_printf (seq, "%s",
-			    conf->disks[i].rdev &&
-			    test_bit(In_sync, &conf->disks[i].rdev->flags) ? "U" : "_");
-	seq_printf (seq, "]");
-#if RAID6_DUMPSTATE
-	seq_printf (seq, "\n");
-	printall(seq, conf);
-#endif
-}
-
-static void print_raid6_conf (raid6_conf_t *conf)
-{
-	int i;
-	struct disk_info *tmp;
-
-	printk("RAID6 conf printout:\n");
-	if (!conf) {
-		printk("(conf==NULL)\n");
-		return;
-	}
-	printk(" --- rd:%d wd:%d fd:%d\n", conf->raid_disks,
-		 conf->working_disks, conf->failed_disks);
-
-	for (i = 0; i < conf->raid_disks; i++) {
-		char b[BDEVNAME_SIZE];
-		tmp = conf->disks + i;
-		if (tmp->rdev)
-		printk(" disk %d, o:%d, dev:%s\n",
-			i, !test_bit(Faulty, &tmp->rdev->flags),
-			bdevname(tmp->rdev->bdev,b));
-	}
-}
-
-static int raid6_spare_active(mddev_t *mddev)
-{
-	int i;
-	raid6_conf_t *conf = mddev->private;
-	struct disk_info *tmp;
-
-	for (i = 0; i < conf->raid_disks; i++) {
-		tmp = conf->disks + i;
-		if (tmp->rdev
-		    && !test_bit(Faulty, &tmp->rdev->flags)
-		    && !test_bit(In_sync, &tmp->rdev->flags)) {
-			mddev->degraded--;
-			conf->failed_disks--;
-			conf->working_disks++;
-			set_bit(In_sync, &tmp->rdev->flags);
-		}
-	}
-	print_raid6_conf(conf);
-	return 0;
-}
-
-static int raid6_remove_disk(mddev_t *mddev, int number)
-{
-	raid6_conf_t *conf = mddev->private;
-	int err = 0;
-	mdk_rdev_t *rdev;
-	struct disk_info *p = conf->disks + number;
-
-	print_raid6_conf(conf);
-	rdev = p->rdev;
-	if (rdev) {
-		if (test_bit(In_sync, &rdev->flags) ||
-		    atomic_read(&rdev->nr_pending)) {
-			err = -EBUSY;
-			goto abort;
-		}
-		p->rdev = NULL;
-		synchronize_rcu();
-		if (atomic_read(&rdev->nr_pending)) {
-			/* lost the race, try later */
-			err = -EBUSY;
-			p->rdev = rdev;
-		}
-	}
-
-abort:
-
-	print_raid6_conf(conf);
-	return err;
-}
-
-static int raid6_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
-{
-	raid6_conf_t *conf = mddev->private;
-	int found = 0;
-	int disk;
-	struct disk_info *p;
-
-	if (mddev->degraded > 2)
-		/* no point adding a device */
-		return 0;
-	/*
-	 * find the disk ... but prefer rdev->saved_raid_disk
-	 * if possible.
-	 */
-	if (rdev->saved_raid_disk >= 0 &&
-	    conf->disks[rdev->saved_raid_disk].rdev == NULL)
-		disk = rdev->saved_raid_disk;
-	else
-		disk = 0;
-	for ( ; disk < mddev->raid_disks; disk++)
-		if ((p=conf->disks + disk)->rdev == NULL) {
-			clear_bit(In_sync, &rdev->flags);
-			rdev->raid_disk = disk;
-			found = 1;
-			if (rdev->saved_raid_disk != disk)
-				conf->fullsync = 1;
-			rcu_assign_pointer(p->rdev, rdev);
-			break;
-		}
-	print_raid6_conf(conf);
-	return found;
-}
-
-static int raid6_resize(mddev_t *mddev, sector_t sectors)
-{
-	/* no resync is happening, and there is enough space
-	 * on all devices, so we can resize.
-	 * We need to make sure resync covers any new space.
-	 * If the array is shrinking we should possibly wait until
-	 * any io in the removed space completes, but it hardly seems
-	 * worth it.
-	 */
-	sectors &= ~((sector_t)mddev->chunk_size/512 - 1);
-	mddev->array_size = (sectors * (mddev->raid_disks-2))>>1;
-	set_capacity(mddev->gendisk, mddev->array_size << 1);
-	mddev->changed = 1;
-	if (sectors/2  > mddev->size && mddev->recovery_cp == MaxSector) {
-		mddev->recovery_cp = mddev->size << 1;
-		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
-	}
-	mddev->size = sectors /2;
-	mddev->resync_max_sectors = sectors;
-	return 0;
-}
-
-static void raid6_quiesce(mddev_t *mddev, int state)
-{
-	raid6_conf_t *conf = mddev_to_conf(mddev);
-
-	switch(state) {
-	case 1: /* stop all writes */
-		spin_lock_irq(&conf->device_lock);
-		conf->quiesce = 1;
-		wait_event_lock_irq(conf->wait_for_stripe,
-				    atomic_read(&conf->active_stripes) == 0,
-				    conf->device_lock, /* nothing */);
-		spin_unlock_irq(&conf->device_lock);
-		break;
-
-	case 0: /* re-enable writes */
-		spin_lock_irq(&conf->device_lock);
-		conf->quiesce = 0;
-		wake_up(&conf->wait_for_stripe);
-		spin_unlock_irq(&conf->device_lock);
-		break;
-	}
-}
-
-static struct mdk_personality raid6_personality =
-{
-	.name		= "raid6",
-	.level		= 6,
-	.owner		= THIS_MODULE,
-	.make_request	= make_request,
-	.run		= run,
-	.stop		= stop,
-	.status		= status,
-	.error_handler	= error,
-	.hot_add_disk	= raid6_add_disk,
-	.hot_remove_disk= raid6_remove_disk,
-	.spare_active	= raid6_spare_active,
-	.sync_request	= sync_request,
-	.resize		= raid6_resize,
-	.quiesce	= raid6_quiesce,
-};
-
-static int __init raid6_init(void)
-{
-	int e;
-
-	e = raid6_select_algo();
-	if ( e )
-		return e;
-
-	return register_md_personality(&raid6_personality);
-}
-
-static void raid6_exit (void)
-{
-	unregister_md_personality(&raid6_personality);
-}
-
-module_init(raid6_init);
-module_exit(raid6_exit);
-MODULE_LICENSE("GPL");
-MODULE_ALIAS("md-personality-8"); /* RAID6 */
-MODULE_ALIAS("md-raid6");
-MODULE_ALIAS("md-level-6");
diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h
index 914af667044f..20ed4c997636 100644
--- a/include/linux/raid/raid5.h
+++ b/include/linux/raid/raid5.h
@@ -212,6 +212,7 @@ struct raid5_private_data {
 	mddev_t			*mddev;
 	struct disk_info	*spare;
 	int			chunk_size, level, algorithm;
+	int			max_degraded;
 	int			raid_disks, working_disks, failed_disks;
 	int			max_nr_stripes;
 
-- 
cgit v1.2.3


From 5fd6c1dce06ec24ef3de20fe0c7ecf2ba9fe5ef9 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 26 Jun 2006 00:27:40 -0700
Subject: [PATCH] md: allow checkpoint of recovery with version-1 superblock

For a while we have had checkpointing of resync.  The version-1 superblock
allows recovery to be checkpointed as well, and this patch implements that.

Due to early carelessness we need to add a feature flag to signal that the
recovery_offset field is in use, otherwise older kernels would assume that a
partially recovered array is in fact fully recovered.

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/md/md.c           | 115 +++++++++++++++++++++++++++++++++++++---------
 drivers/md/raid1.c        |   3 +-
 drivers/md/raid10.c       |   3 +-
 drivers/md/raid5.c        |   1 +
 include/linux/raid/md_k.h |   6 +++
 include/linux/raid/md_p.h |   5 +-
 6 files changed, 109 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 71d46eb2c438..a296edd7e1c3 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1175,7 +1175,11 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 			set_bit(Faulty, &rdev->flags);
 			break;
 		default:
-			set_bit(In_sync, &rdev->flags);
+			if ((le32_to_cpu(sb->feature_map) &
+			     MD_FEATURE_RECOVERY_OFFSET))
+				rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
+			else
+				set_bit(In_sync, &rdev->flags);
 			rdev->raid_disk = role;
 			break;
 		}
@@ -1199,6 +1203,7 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
 
 	sb->feature_map = 0;
 	sb->pad0 = 0;
+	sb->recovery_offset = cpu_to_le64(0);
 	memset(sb->pad1, 0, sizeof(sb->pad1));
 	memset(sb->pad2, 0, sizeof(sb->pad2));
 	memset(sb->pad3, 0, sizeof(sb->pad3));
@@ -1219,6 +1224,14 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
 		sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset);
 		sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
 	}
+
+	if (rdev->raid_disk >= 0 &&
+	    !test_bit(In_sync, &rdev->flags) &&
+	    rdev->recovery_offset > 0) {
+		sb->feature_map |= cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
+		sb->recovery_offset = cpu_to_le64(rdev->recovery_offset);
+	}
+
 	if (mddev->reshape_position != MaxSector) {
 		sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE);
 		sb->reshape_position = cpu_to_le64(mddev->reshape_position);
@@ -1243,11 +1256,12 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
 			sb->dev_roles[i] = cpu_to_le16(0xfffe);
 		else if (test_bit(In_sync, &rdev2->flags))
 			sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
+		else if (rdev2->raid_disk >= 0 && rdev2->recovery_offset > 0)
+			sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
 		else
 			sb->dev_roles[i] = cpu_to_le16(0xffff);
 	}
 
-	sb->recovery_offset = cpu_to_le64(0); /* not supported yet */
 	sb->sb_csum = calc_sb_1_csum(sb);
 }
 
@@ -2603,8 +2617,6 @@ static struct kobject *md_probe(dev_t dev, int *part, void *data)
 	return NULL;
 }
 
-void md_wakeup_thread(mdk_thread_t *thread);
-
 static void md_safemode_timeout(unsigned long data)
 {
 	mddev_t *mddev = (mddev_t *) data;
@@ -2786,6 +2798,36 @@ static int do_md_run(mddev_t * mddev)
 	mddev->queue->queuedata = mddev;
 	mddev->queue->make_request_fn = mddev->pers->make_request;
 
+	/* If there is a partially-recovered drive we need to
+	 * start recovery here.  If we leave it to md_check_recovery,
+	 * it will remove the drives and not do the right thing
+	 */
+	if (mddev->degraded) {
+		struct list_head *rtmp;
+		int spares = 0;
+		ITERATE_RDEV(mddev,rdev,rtmp)
+			if (rdev->raid_disk >= 0 &&
+			    !test_bit(In_sync, &rdev->flags) &&
+			    !test_bit(Faulty, &rdev->flags))
+				/* complete an interrupted recovery */
+				spares++;
+		if (spares && mddev->pers->sync_request) {
+			mddev->recovery = 0;
+			set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
+			mddev->sync_thread = md_register_thread(md_do_sync,
+								mddev,
+								"%s_resync");
+			if (!mddev->sync_thread) {
+				printk(KERN_ERR "%s: could not start resync"
+				       " thread...\n",
+				       mdname(mddev));
+				/* leave the spares where they are, it shouldn't hurt */
+				mddev->recovery = 0;
+			} else
+				md_wakeup_thread(mddev->sync_thread);
+		}
+	}
+
 	mddev->changed = 1;
 	md_new_event(mddev);
 	return 0;
@@ -2819,6 +2861,7 @@ static int restart_array(mddev_t *mddev)
 		 */
 		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 		md_wakeup_thread(mddev->thread);
+		md_wakeup_thread(mddev->sync_thread);
 		err = 0;
 	} else {
 		printk(KERN_ERR "md: %s has no personality assigned.\n",
@@ -2842,6 +2885,7 @@ static int do_md_stop(mddev_t * mddev, int ro)
 		}
 
 		if (mddev->sync_thread) {
+			set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
 			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
 			md_unregister_thread(mddev->sync_thread);
 			mddev->sync_thread = NULL;
@@ -2871,13 +2915,14 @@ static int do_md_stop(mddev_t * mddev, int ro)
 			if (mddev->ro)
 				mddev->ro = 0;
 		}
-		if (!mddev->in_sync) {
+		if (!mddev->in_sync || mddev->sb_dirty) {
 			/* mark array as shutdown cleanly */
 			mddev->in_sync = 1;
 			md_update_sb(mddev);
 		}
 		if (ro)
 			set_disk_ro(disk, 1);
+		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
 	}
 
 	/*
@@ -4665,10 +4710,14 @@ void md_do_sync(mddev_t *mddev)
 	struct list_head *tmp;
 	sector_t last_check;
 	int skipped = 0;
+	struct list_head *rtmp;
+	mdk_rdev_t *rdev;
 
 	/* just incase thread restarts... */
 	if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
 		return;
+	if (mddev->ro) /* never try to sync a read-only array */
+		return;
 
 	/* we overload curr_resync somewhat here.
 	 * 0 == not engaged in resync at all
@@ -4727,17 +4776,30 @@ void md_do_sync(mddev_t *mddev)
 		}
 	} while (mddev->curr_resync < 2);
 
+	j = 0;
 	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
 		/* resync follows the size requested by the personality,
 		 * which defaults to physical size, but can be virtual size
 		 */
 		max_sectors = mddev->resync_max_sectors;
 		mddev->resync_mismatches = 0;
+		/* we don't use the checkpoint if there's a bitmap */
+		if (!mddev->bitmap &&
+		    !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
+			j = mddev->recovery_cp;
 	} else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
 		max_sectors = mddev->size << 1;
-	else
+	else {
 		/* recovery follows the physical size of devices */
 		max_sectors = mddev->size << 1;
+		j = MaxSector;
+		ITERATE_RDEV(mddev,rdev,rtmp)
+			if (rdev->raid_disk >= 0 &&
+			    !test_bit(Faulty, &rdev->flags) &&
+			    !test_bit(In_sync, &rdev->flags) &&
+			    rdev->recovery_offset < j)
+				j = rdev->recovery_offset;
+	}
 
 	printk(KERN_INFO "md: syncing RAID array %s\n", mdname(mddev));
 	printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed:"
@@ -4747,12 +4809,7 @@ void md_do_sync(mddev_t *mddev)
 	       speed_max(mddev));
 
 	is_mddev_idle(mddev); /* this also initializes IO event counters */
-	/* we don't use the checkpoint if there's a bitmap */
-	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && !mddev->bitmap
-	    && ! test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
-		j = mddev->recovery_cp;
-	else
-		j = 0;
+
 	io_sectors = 0;
 	for (m = 0; m < SYNC_MARKS; m++) {
 		mark[m] = jiffies;
@@ -4873,15 +4930,28 @@ void md_do_sync(mddev_t *mddev)
 	if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) &&
 	    test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
 	    !test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
-	    mddev->curr_resync > 2 &&
-	    mddev->curr_resync >= mddev->recovery_cp) {
-		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
-			printk(KERN_INFO 
-				"md: checkpointing recovery of %s.\n",
-				mdname(mddev));
-			mddev->recovery_cp = mddev->curr_resync;
-		} else
-			mddev->recovery_cp = MaxSector;
+	    mddev->curr_resync > 2) {
+		if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
+			if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
+				if (mddev->curr_resync >= mddev->recovery_cp) {
+					printk(KERN_INFO
+					       "md: checkpointing recovery of %s.\n",
+					       mdname(mddev));
+					mddev->recovery_cp = mddev->curr_resync;
+				}
+			} else
+				mddev->recovery_cp = MaxSector;
+		} else {
+			if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
+				mddev->curr_resync = MaxSector;
+			ITERATE_RDEV(mddev,rdev,rtmp)
+				if (rdev->raid_disk >= 0 &&
+				    !test_bit(Faulty, &rdev->flags) &&
+				    !test_bit(In_sync, &rdev->flags) &&
+				    rdev->recovery_offset < mddev->curr_resync)
+					rdev->recovery_offset = mddev->curr_resync;
+			mddev->sb_dirty = 1;
+		}
 	}
 
  skip:
@@ -5002,6 +5072,8 @@ void md_check_recovery(mddev_t *mddev)
 		clear_bit(MD_RECOVERY_INTR, &mddev->recovery);
 		clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
 
+		if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
+			goto unlock;
 		/* no recovery is running.
 		 * remove any failed drives, then
 		 * add spares if possible.
@@ -5024,6 +5096,7 @@ void md_check_recovery(mddev_t *mddev)
 			ITERATE_RDEV(mddev,rdev,rtmp)
 				if (rdev->raid_disk < 0
 				    && !test_bit(Faulty, &rdev->flags)) {
+					rdev->recovery_offset = 0;
 					if (mddev->pers->hot_add_disk(mddev,rdev)) {
 						char nm[20];
 						sprintf(nm, "rd%d", rdev->raid_disk);
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 4dd53fcb791a..b3cfae41f769 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1888,7 +1888,8 @@ static int run(mddev_t *mddev)
 
 		disk = conf->mirrors + i;
 
-		if (!disk->rdev) {
+		if (!disk->rdev ||
+		    !test_bit(In_sync, &disk->rdev->flags)) {
 			disk->head_position = 0;
 			mddev->degraded++;
 		}
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index affeaefd4033..2ca18770575f 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -2015,7 +2015,8 @@ static int run(mddev_t *mddev)
 
 		disk = conf->mirrors + i;
 
-		if (!disk->rdev) {
+		if (!disk->rdev ||
+		    !test_bit(In_sync, &rdev->flags)) {
 			disk->head_position = 0;
 			mddev->degraded++;
 		}
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 9ba73074df04..970eb03ec6c0 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3555,6 +3555,7 @@ static int raid5_start_reshape(mddev_t *mddev)
 				set_bit(In_sync, &rdev->flags);
 				conf->working_disks++;
 				added_devices++;
+				rdev->recovery_offset = 0;
 				sprintf(nm, "rd%d", rdev->raid_disk);
 				sysfs_create_link(&mddev->kobj, &rdev->kobj, nm);
 			} else
diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h
index db2ca2d9066e..682574f3bd36 100644
--- a/include/linux/raid/md_k.h
+++ b/include/linux/raid/md_k.h
@@ -88,6 +88,10 @@ struct mdk_rdev_s
 					 * array and could again if we did a partial
 					 * resync from the bitmap
 					 */
+	sector_t	recovery_offset;/* If this device has been partially
+					 * recovered, this is where we were
+					 * up to.
+					 */
 
 	atomic_t	nr_pending;	/* number of pending requests.
 					 * only maintained for arrays that
@@ -183,6 +187,8 @@ struct mddev_s
 #define	MD_RECOVERY_REQUESTED	6
 #define	MD_RECOVERY_CHECK	7
 #define MD_RECOVERY_RESHAPE	8
+#define	MD_RECOVERY_FROZEN	9
+
 	unsigned long			recovery;
 
 	int				in_sync;	/* know to not need resync */
diff --git a/include/linux/raid/md_p.h b/include/linux/raid/md_p.h
index f1fbae7e390e..b6ebc69bae54 100644
--- a/include/linux/raid/md_p.h
+++ b/include/linux/raid/md_p.h
@@ -265,9 +265,12 @@ struct mdp_superblock_1 {
 
 /* feature_map bits */
 #define MD_FEATURE_BITMAP_OFFSET	1
+#define	MD_FEATURE_RECOVERY_OFFSET	2 /* recovery_offset is present and
+					   * must be honoured
+					   */
 #define	MD_FEATURE_RESHAPE_ACTIVE	4
 
-#define	MD_FEATURE_ALL			5
+#define	MD_FEATURE_ALL			(1|2|4)
 
 #endif 
 
-- 
cgit v1.2.3


From 7c7546ccf6463edbeee8d9aac6de7be1cd80d08a Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 26 Jun 2006 00:27:41 -0700
Subject: [PATCH] md: allow a linear array to have drives added while active

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/md/linear.c         | 74 +++++++++++++++++++++++++++++++++++++--------
 drivers/md/md.c             | 15 +++++++--
 include/linux/raid/linear.h |  2 ++
 3 files changed, 76 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 777585458c85..ff83c9b5979e 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -111,7 +111,7 @@ static int linear_issue_flush(request_queue_t *q, struct gendisk *disk,
 	return ret;
 }
 
-static int linear_run (mddev_t *mddev)
+static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
 {
 	linear_conf_t *conf;
 	dev_info_t **table;
@@ -121,20 +121,21 @@ static int linear_run (mddev_t *mddev)
 	sector_t curr_offset;
 	struct list_head *tmp;
 
-	conf = kzalloc (sizeof (*conf) + mddev->raid_disks*sizeof(dev_info_t),
+	conf = kzalloc (sizeof (*conf) + raid_disks*sizeof(dev_info_t),
 			GFP_KERNEL);
 	if (!conf)
-		goto out;
+		return NULL;
+
 	mddev->private = conf;
 
 	cnt = 0;
-	mddev->array_size = 0;
+	conf->array_size = 0;
 
 	ITERATE_RDEV(mddev,rdev,tmp) {
 		int j = rdev->raid_disk;
 		dev_info_t *disk = conf->disks + j;
 
-		if (j < 0 || j > mddev->raid_disks || disk->rdev) {
+		if (j < 0 || j > raid_disks || disk->rdev) {
 			printk("linear: disk numbering problem. Aborting!\n");
 			goto out;
 		}
@@ -152,11 +153,11 @@ static int linear_run (mddev_t *mddev)
 			blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
 
 		disk->size = rdev->size;
-		mddev->array_size += rdev->size;
+		conf->array_size += rdev->size;
 
 		cnt++;
 	}
-	if (cnt != mddev->raid_disks) {
+	if (cnt != raid_disks) {
 		printk("linear: not enough drives present. Aborting!\n");
 		goto out;
 	}
@@ -200,7 +201,7 @@ static int linear_run (mddev_t *mddev)
 		unsigned round;
 		unsigned long base;
 
-		sz = mddev->array_size >> conf->preshift;
+		sz = conf->array_size >> conf->preshift;
 		sz += 1; /* force round-up */
 		base = conf->hash_spacing >> conf->preshift;
 		round = sector_div(sz, base);
@@ -247,14 +248,56 @@ static int linear_run (mddev_t *mddev)
 
 	BUG_ON(table - conf->hash_table > nb_zone);
 
+	return conf;
+
+out:
+	kfree(conf);
+	return NULL;
+}
+
+static int linear_run (mddev_t *mddev)
+{
+	linear_conf_t *conf;
+
+	conf = linear_conf(mddev, mddev->raid_disks);
+
+	if (!conf)
+		return 1;
+	mddev->private = conf;
+	mddev->array_size = conf->array_size;
+
 	blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec);
 	mddev->queue->unplug_fn = linear_unplug;
 	mddev->queue->issue_flush_fn = linear_issue_flush;
 	return 0;
+}
 
-out:
-	kfree(conf);
-	return 1;
+static int linear_add(mddev_t *mddev, mdk_rdev_t *rdev)
+{
+	/* Adding a drive to a linear array allows the array to grow.
+	 * It is permitted if the new drive has a matching superblock
+	 * already on it, with raid_disk equal to raid_disks.
+	 * It is achieved by creating a new linear_private_data structure
+	 * and swapping it in in-place of the current one.
+	 * The current one is never freed until the array is stopped.
+	 * This avoids races.
+	 */
+	linear_conf_t *newconf;
+
+	if (rdev->raid_disk != mddev->raid_disks)
+		return -EINVAL;
+
+	newconf = linear_conf(mddev,mddev->raid_disks+1);
+
+	if (!newconf)
+		return -ENOMEM;
+
+	newconf->prev = mddev_to_conf(mddev);
+	mddev->private = newconf;
+	mddev->raid_disks++;
+	mddev->array_size = newconf->array_size;
+	set_capacity(mddev->gendisk, mddev->array_size << 1);
+	return 0;
 }
 
 static int linear_stop (mddev_t *mddev)
@@ -262,8 +305,12 @@ static int linear_stop (mddev_t *mddev)
 	linear_conf_t *conf = mddev_to_conf(mddev);
   
 	blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
-	kfree(conf->hash_table);
-	kfree(conf);
+	do {
+		linear_conf_t *t = conf->prev;
+		kfree(conf->hash_table);
+		kfree(conf);
+		conf = t;
+	} while (conf);
 
 	return 0;
 }
@@ -360,6 +407,7 @@ static struct mdk_personality linear_personality =
 	.run		= linear_run,
 	.stop		= linear_stop,
 	.status		= linear_status,
+	.hot_add_disk	= linear_add,
 };
 
 static int __init linear_init (void)
diff --git a/drivers/md/md.c b/drivers/md/md.c
index a296edd7e1c3..2ce750d4be02 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -817,8 +817,8 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 
 		if (desc->state & (1<<MD_DISK_FAULTY))
 			set_bit(Faulty, &rdev->flags);
-		else if (desc->state & (1<<MD_DISK_SYNC) &&
-			 desc->raid_disk < mddev->raid_disks) {
+		else if (desc->state & (1<<MD_DISK_SYNC) /* &&
+			    desc->raid_disk < mddev->raid_disks */) {
 			set_bit(In_sync, &rdev->flags);
 			rdev->raid_disk = desc->raid_disk;
 		}
@@ -3359,6 +3359,17 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
 
 		rdev->raid_disk = -1;
 		err = bind_rdev_to_array(rdev, mddev);
+		if (!err && !mddev->pers->hot_remove_disk) {
+			/* If there is hot_add_disk but no hot_remove_disk
+			 * then added disks for geometry changes,
+			 * and should be added immediately.
+			 */
+			super_types[mddev->major_version].
+				validate_super(mddev, rdev);
+			err = mddev->pers->hot_add_disk(mddev, rdev);
+			if (err)
+				unbind_rdev_from_array(rdev);
+		}
 		if (err)
 			export_rdev(rdev);
 
diff --git a/include/linux/raid/linear.h b/include/linux/raid/linear.h
index 7eaf290e10e7..ba15469daf11 100644
--- a/include/linux/raid/linear.h
+++ b/include/linux/raid/linear.h
@@ -13,8 +13,10 @@ typedef struct dev_info dev_info_t;
 
 struct linear_private_data
 {
+	struct linear_private_data *prev;	/* earlier version */
 	dev_info_t		**hash_table;
 	sector_t		hash_spacing;
+	sector_t		array_size;
 	int			preshift; /* shift before dividing by hash_spacing */
 	dev_info_t		disks[0];
 };
-- 
cgit v1.2.3


From c93983bf517c100a31e40ef087e19bd3d7aa2d28 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 26 Jun 2006 00:27:41 -0700
Subject: [PATCH] md: support stripe/offset mode in raid10

The "industry standard" DDF format allows for a stripe/offset layout where
data is duplicated on different stripes.  e.g.

  A  B  C  D
  D  A  B  C
  E  F  G  H
  H  E  F  G

(columns are drives, rows are stripes, LETTERS are chunks of data).

This is similar to raid10's 'far' mode, but not quite the same.  So enhance
'far' mode with a 'far/offset' option which follows the layout of DDFs
stripe/offset.

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/md/raid10.c         | 64 ++++++++++++++++++++++++++++++---------------
 include/linux/raid/raid10.h |  7 ++++-
 2 files changed, 49 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 2ca18770575f..4db34edfd87f 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -29,6 +29,7 @@
  *    raid_disks
  *    near_copies (stored in low byte of layout)
  *    far_copies (stored in second byte of layout)
+ *    far_offset (stored in bit 16 of layout )
  *
  * The data to be stored is divided into chunks using chunksize.
  * Each device is divided into far_copies sections.
@@ -36,10 +37,14 @@
  * near_copies copies of each chunk is stored (each on a different drive).
  * The starting device for each section is offset near_copies from the starting
  * device of the previous section.
- * Thus there are (near_copies*far_copies) of each chunk, and each is on a different
+ * Thus they are (near_copies*far_copies) of each chunk, and each is on a different
  * drive.
  * near_copies and far_copies must be at least one, and their product is at most
  * raid_disks.
+ *
+ * If far_offset is true, then the far_copies are handled a bit differently.
+ * The copies are still in different stripes, but instead of be very far apart
+ * on disk, there are adjacent stripes.
  */
 
 /*
@@ -357,8 +362,7 @@ static int raid10_end_write_request(struct bio *bio, unsigned int bytes_done, in
  * With this layout, and block is never stored twice on the one device.
  *
  * raid10_find_phys finds the sector offset of a given virtual sector
- * on each device that it is on. If a block isn't on a device,
- * that entry in the array is set to MaxSector.
+ * on each device that it is on.
  *
  * raid10_find_virt does the reverse mapping, from a device and a
  * sector offset to a virtual address
@@ -381,6 +385,8 @@ static void raid10_find_phys(conf_t *conf, r10bio_t *r10bio)
 	chunk *= conf->near_copies;
 	stripe = chunk;
 	dev = sector_div(stripe, conf->raid_disks);
+	if (conf->far_offset)
+		stripe *= conf->far_copies;
 
 	sector += stripe << conf->chunk_shift;
 
@@ -414,16 +420,24 @@ static sector_t raid10_find_virt(conf_t *conf, sector_t sector, int dev)
 {
 	sector_t offset, chunk, vchunk;
 
-	while (sector > conf->stride) {
-		sector -= conf->stride;
-		if (dev < conf->near_copies)
-			dev += conf->raid_disks - conf->near_copies;
-		else
-			dev -= conf->near_copies;
-	}
-
 	offset = sector & conf->chunk_mask;
-	chunk = sector >> conf->chunk_shift;
+	if (conf->far_offset) {
+		int fc;
+		chunk = sector >> conf->chunk_shift;
+		fc = sector_div(chunk, conf->far_copies);
+		dev -= fc * conf->near_copies;
+		if (dev < 0)
+			dev += conf->raid_disks;
+	} else {
+		while (sector > conf->stride) {
+			sector -= conf->stride;
+			if (dev < conf->near_copies)
+				dev += conf->raid_disks - conf->near_copies;
+			else
+				dev -= conf->near_copies;
+		}
+		chunk = sector >> conf->chunk_shift;
+	}
 	vchunk = chunk * conf->raid_disks + dev;
 	sector_div(vchunk, conf->near_copies);
 	return (vchunk << conf->chunk_shift) + offset;
@@ -900,9 +914,12 @@ static void status(struct seq_file *seq, mddev_t *mddev)
 		seq_printf(seq, " %dK chunks", mddev->chunk_size/1024);
 	if (conf->near_copies > 1)
 		seq_printf(seq, " %d near-copies", conf->near_copies);
-	if (conf->far_copies > 1)
-		seq_printf(seq, " %d far-copies", conf->far_copies);
-
+	if (conf->far_copies > 1) {
+		if (conf->far_offset)
+			seq_printf(seq, " %d offset-copies", conf->far_copies);
+		else
+			seq_printf(seq, " %d far-copies", conf->far_copies);
+	}
 	seq_printf(seq, " [%d/%d] [", conf->raid_disks,
 						conf->working_disks);
 	for (i = 0; i < conf->raid_disks; i++)
@@ -1915,7 +1932,7 @@ static int run(mddev_t *mddev)
 	mirror_info_t *disk;
 	mdk_rdev_t *rdev;
 	struct list_head *tmp;
-	int nc, fc;
+	int nc, fc, fo;
 	sector_t stride, size;
 
 	if (mddev->chunk_size == 0) {
@@ -1925,8 +1942,9 @@ static int run(mddev_t *mddev)
 
 	nc = mddev->layout & 255;
 	fc = (mddev->layout >> 8) & 255;
+	fo = mddev->layout & (1<<16);
 	if ((nc*fc) <2 || (nc*fc) > mddev->raid_disks ||
-	    (mddev->layout >> 16)) {
+	    (mddev->layout >> 17)) {
 		printk(KERN_ERR "raid10: %s: unsupported raid10 layout: 0x%8x\n",
 		       mdname(mddev), mddev->layout);
 		goto out;
@@ -1958,12 +1976,16 @@ static int run(mddev_t *mddev)
 	conf->near_copies = nc;
 	conf->far_copies = fc;
 	conf->copies = nc*fc;
+	conf->far_offset = fo;
 	conf->chunk_mask = (sector_t)(mddev->chunk_size>>9)-1;
 	conf->chunk_shift = ffz(~mddev->chunk_size) - 9;
-	stride = mddev->size >> (conf->chunk_shift-1);
-	sector_div(stride, fc);
-	conf->stride = stride << conf->chunk_shift;
-
+	if (fo)
+		conf->stride = 1 << conf->chunk_shift;
+	else {
+		stride = mddev->size >> (conf->chunk_shift-1);
+		sector_div(stride, fc);
+		conf->stride = stride << conf->chunk_shift;
+	}
 	conf->r10bio_pool = mempool_create(NR_RAID10_BIOS, r10bio_pool_alloc,
 						r10bio_pool_free, conf);
 	if (!conf->r10bio_pool) {
diff --git a/include/linux/raid/raid10.h b/include/linux/raid/raid10.h
index b1103298a8c2..c41e56a7c090 100644
--- a/include/linux/raid/raid10.h
+++ b/include/linux/raid/raid10.h
@@ -24,11 +24,16 @@ struct r10_private_data_s {
 	int 			far_copies;   /* number of copies layed out
 					       * at large strides across drives
 					       */
+	int			far_offset;   /* far_copies are offset by 1 stripe
+					       * instead of many
+					       */
 	int			copies;	      /* near_copies * far_copies.
 					       * must be <= raid_disks
 					       */
 	sector_t		stride;	      /* distance between far copies.
-					       * This is size / far_copies
+					       * This is size / far_copies unless
+					       * far_offset, in which case it is
+					       * 1 stripe.
 					       */
 
 	int chunk_shift; /* shift from chunks to sectors */
-- 
cgit v1.2.3


From 5e56341d029f0c2cf31e78dc01d4c861ba4d6a5e Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Mon, 26 Jun 2006 00:27:42 -0700
Subject: [PATCH] md: make md_print_devices() static

This patch makes the needlessly global md_print_devices() static.

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/md/md.c         | 7 +++++--
 include/linux/raid/md.h | 4 ----
 2 files changed, 5 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 2ce750d4be02..8c1e1900893d 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -73,6 +73,10 @@ static void autostart_arrays (int part);
 static LIST_HEAD(pers_list);
 static DEFINE_SPINLOCK(pers_lock);
 
+static void md_print_devices(void);
+
+#define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); }
+
 /*
  * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
  * is 1000 KB/sec, so the extra system load does not show up that much.
@@ -1522,7 +1526,7 @@ static void print_rdev(mdk_rdev_t *rdev)
 		printk(KERN_INFO "md: no rdev superblock!\n");
 }
 
-void md_print_devices(void)
+static void md_print_devices(void)
 {
 	struct list_head *tmp, *tmp2;
 	mdk_rdev_t *rdev;
@@ -5345,7 +5349,6 @@ EXPORT_SYMBOL(md_write_end);
 EXPORT_SYMBOL(md_register_thread);
 EXPORT_SYMBOL(md_unregister_thread);
 EXPORT_SYMBOL(md_wakeup_thread);
-EXPORT_SYMBOL(md_print_devices);
 EXPORT_SYMBOL(md_check_recovery);
 MODULE_LICENSE("GPL");
 MODULE_ALIAS("md");
diff --git a/include/linux/raid/md.h b/include/linux/raid/md.h
index 66b44e5e0d6e..eb3e547c8fee 100644
--- a/include/linux/raid/md.h
+++ b/include/linux/raid/md.h
@@ -85,8 +85,6 @@ extern void md_done_sync(mddev_t *mddev, int blocks, int ok);
 extern void md_error (mddev_t *mddev, mdk_rdev_t *rdev);
 extern void md_unplug_mddev(mddev_t *mddev);
 
-extern void md_print_devices (void);
-
 extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
 			   sector_t sector, int size, struct page *page);
 extern void md_super_wait(mddev_t *mddev);
@@ -97,7 +95,5 @@ extern void md_new_event(mddev_t *mddev);
 
 extern void md_update_sb(mddev_t * mddev);
 
-#define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); }
-
 #endif 
 
-- 
cgit v1.2.3


From 0b79ccf0cdd9f59e5f99017e1a5d23da336544b2 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 26 Jun 2006 00:27:44 -0700
Subject: [PATCH] md/bitmap: remove bitmap writeback daemon

md/bitmap currently has a separate thread to wait for writes to the bitmap
file to complete (as we cannot get a callback on that action).

However this isn't needed as bitmap_unplug is called from process context and
waits for the writeback thread to do it's work.  The same result can be
achieved by doing the waiting directly in bitmap_unplug.

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/md/bitmap.c         | 115 +++-----------------------------------------
 include/linux/raid/bitmap.h |   6 ---
 2 files changed, 8 insertions(+), 113 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index f8ffaee20ff8..814f412a410c 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -7,7 +7,6 @@
  * additions, Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.:
  * - added disk storage for bitmap
  * - changes to allow various bitmap chunk sizes
- * - added bitmap daemon (to asynchronously clear bitmap bits from disk)
  */
 
 /*
@@ -330,14 +329,13 @@ static int write_page(struct bitmap *bitmap, struct page *page, int wait)
 	set_page_dirty(page); /* force it to be written out */
 
 	if (!wait) {
-		/* add to list to be waited for by daemon */
+		/* add to list to be waited for */
 		struct page_list *item = mempool_alloc(bitmap->write_pool, GFP_NOIO);
 		item->page = page;
 		get_page(page);
 		spin_lock(&bitmap->write_lock);
 		list_add(&item->list, &bitmap->complete_pages);
 		spin_unlock(&bitmap->write_lock);
-		md_wakeup_thread(bitmap->writeback_daemon);
 	}
 	return write_one_page(page, wait);
 }
@@ -621,8 +619,6 @@ static void bitmap_file_unmap(struct bitmap *bitmap)
 	safe_put_page(sb_page);
 }
 
-static void bitmap_stop_daemon(struct bitmap *bitmap);
-
 /* dequeue the next item in a page list -- don't call from irq context */
 static struct page_list *dequeue_page(struct bitmap *bitmap)
 {
@@ -648,8 +644,6 @@ static void drain_write_queues(struct bitmap *bitmap)
 		put_page(item->page);
 		mempool_free(item, bitmap->write_pool);
 	}
-
-	wake_up(&bitmap->write_wait);
 }
 
 static void bitmap_file_put(struct bitmap *bitmap)
@@ -663,8 +657,6 @@ static void bitmap_file_put(struct bitmap *bitmap)
 	bitmap->file = NULL;
 	spin_unlock_irqrestore(&bitmap->lock, flags);
 
-	bitmap_stop_daemon(bitmap);
-
 	drain_write_queues(bitmap);
 
 	bitmap_file_unmap(bitmap);
@@ -770,6 +762,8 @@ static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block)
 
 }
 
+static void bitmap_writeback(struct bitmap *bitmap);
+
 /* this gets called when the md device is ready to unplug its underlying
  * (slave) device queues -- before we let any writes go down, we need to
  * sync the dirty pages of the bitmap file to disk */
@@ -812,13 +806,9 @@ int bitmap_unplug(struct bitmap *bitmap)
 		}
 	}
 	if (wait) { /* if any writes were performed, we need to wait on them */
-		if (bitmap->file) {
-			spin_lock_irq(&bitmap->write_lock);
-			wait_event_lock_irq(bitmap->write_wait,
-					    list_empty(&bitmap->complete_pages), bitmap->write_lock,
-					    wake_up_process(bitmap->writeback_daemon->tsk));
-			spin_unlock_irq(&bitmap->write_lock);
-		} else
+		if (bitmap->file)
+			bitmap_writeback(bitmap);
+		else
 			md_super_wait(bitmap->mddev);
 	}
 	return 0;
@@ -1126,41 +1116,12 @@ int bitmap_daemon_work(struct bitmap *bitmap)
 	return err;
 }
 
-static void daemon_exit(struct bitmap *bitmap, mdk_thread_t **daemon)
+static void bitmap_writeback(struct bitmap *bitmap)
 {
-	mdk_thread_t *dmn;
-	unsigned long flags;
-
-	/* if no one is waiting on us, we'll free the md thread struct
-	 * and exit, otherwise we let the waiter clean things up */
-	spin_lock_irqsave(&bitmap->lock, flags);
-	if ((dmn = *daemon)) { /* no one is waiting, cleanup and exit */
-		*daemon = NULL;
-		spin_unlock_irqrestore(&bitmap->lock, flags);
-		kfree(dmn);
-		complete_and_exit(NULL, 0); /* do_exit not exported */
-	}
-	spin_unlock_irqrestore(&bitmap->lock, flags);
-}
-
-static void bitmap_writeback_daemon(mddev_t *mddev)
-{
-	struct bitmap *bitmap = mddev->bitmap;
 	struct page *page;
 	struct page_list *item;
 	int err = 0;
 
-	if (signal_pending(current)) {
-		printk(KERN_INFO
-		       "%s: bitmap writeback daemon got signal, exiting...\n",
-		       bmname(bitmap));
-		err = -EINTR;
-		goto out;
-	}
-	if (bitmap == NULL)
-		/* about to be stopped. */
-		return;
-
 	PRINTK("%s: bitmap writeback daemon woke up...\n", bmname(bitmap));
 	/* wait on bitmap page writebacks */
 	while ((item = dequeue_page(bitmap))) {
@@ -1177,59 +1138,9 @@ static void bitmap_writeback_daemon(mddev_t *mddev)
 			       "failed (page %lu): %d\n",
 			       bmname(bitmap), page->index, err);
 			bitmap_file_kick(bitmap);
-			goto out;
+			break;
 		}
 	}
- out:
-	wake_up(&bitmap->write_wait);
-	if (err) {
-		printk(KERN_INFO "%s: bitmap writeback daemon exiting (%d)\n",
-		       bmname(bitmap), err);
-		daemon_exit(bitmap, &bitmap->writeback_daemon);
-	}
-}
-
-static mdk_thread_t *bitmap_start_daemon(struct bitmap *bitmap,
-				void (*func)(mddev_t *), char *name)
-{
-	mdk_thread_t *daemon;
-	char namebuf[32];
-
-#ifdef INJECT_FATAL_FAULT_2
-	daemon = NULL;
-#else
-	sprintf(namebuf, "%%s_%s", name);
-	daemon = md_register_thread(func, bitmap->mddev, namebuf);
-#endif
-	if (!daemon) {
-		printk(KERN_ERR "%s: failed to start bitmap daemon\n",
-			bmname(bitmap));
-		return ERR_PTR(-ECHILD);
-	}
-
-	md_wakeup_thread(daemon); /* start it running */
-
-	PRINTK("%s: %s daemon (pid %d) started...\n",
-		bmname(bitmap), name, daemon->tsk->pid);
-
-	return daemon;
-}
-
-static void bitmap_stop_daemon(struct bitmap *bitmap)
-{
-	/* the daemon can't stop itself... it'll just exit instead... */
-	if (bitmap->writeback_daemon && ! IS_ERR(bitmap->writeback_daemon) &&
-	    current->pid != bitmap->writeback_daemon->tsk->pid) {
-		mdk_thread_t *daemon;
-		unsigned long flags;
-
-		spin_lock_irqsave(&bitmap->lock, flags);
-		daemon = bitmap->writeback_daemon;
-		bitmap->writeback_daemon = NULL;
-		spin_unlock_irqrestore(&bitmap->lock, flags);
-		if (daemon && ! IS_ERR(daemon))
-			md_unregister_thread(daemon); /* destroy the thread */
-	}
 }
 
 static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap,
@@ -1553,7 +1464,6 @@ int bitmap_create(mddev_t *mddev)
 
 	spin_lock_init(&bitmap->write_lock);
 	INIT_LIST_HEAD(&bitmap->complete_pages);
-	init_waitqueue_head(&bitmap->write_wait);
 	bitmap->write_pool = mempool_create_kmalloc_pool(WRITE_POOL_SIZE,
 						sizeof(struct page_list));
 	err = -ENOMEM;
@@ -1613,15 +1523,6 @@ int bitmap_create(mddev_t *mddev)
 
 	mddev->bitmap = bitmap;
 
-	if (file)
-		/* kick off the bitmap writeback daemon */
-		bitmap->writeback_daemon =
-			bitmap_start_daemon(bitmap,
-					    bitmap_writeback_daemon,
-					    "bitmap_wb");
-
-	if (IS_ERR(bitmap->writeback_daemon))
-		return PTR_ERR(bitmap->writeback_daemon);
 	mddev->thread->timeout = bitmap->daemon_sleep * HZ;
 
 	return bitmap_update_sb(bitmap);
diff --git a/include/linux/raid/bitmap.h b/include/linux/raid/bitmap.h
index 899437802aea..9c8907ca60a7 100644
--- a/include/linux/raid/bitmap.h
+++ b/include/linux/raid/bitmap.h
@@ -244,13 +244,7 @@ struct bitmap {
 	unsigned long daemon_lastrun; /* jiffies of last run */
 	unsigned long daemon_sleep; /* how many seconds between updates? */
 
-	/*
-	 * bitmap_writeback_daemon waits for file-pages that have been written,
-	 * as there is no way to get a call-back when a page write completes.
-	 */
-	mdk_thread_t *writeback_daemon;
 	spinlock_t write_lock;
-	wait_queue_head_t write_wait;
 	struct list_head complete_pages;
 	mempool_t *write_pool;
 };
-- 
cgit v1.2.3


From d785a06a0b9d0cd86b3cc1bf8e236e62af7b47ed Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 26 Jun 2006 00:27:48 -0700
Subject: [PATCH] md/bitmap: change md/bitmap file handling to use bmap to file
 blocks

If md is asked to store a bitmap in a file, it tries to hold onto the page
cache pages for that file, manipulate them directly, and call a cocktail of
operations to write the file out.  I don't believe this is a supportable
approach.

This patch changes the approach to use the same approach as swap files.  i.e.
bmap is used to enumerate all the block address of parts of the file and we
write directly to those blocks of the device.

swapfile only uses parts of the file that provide a full pages at contiguous
addresses.  We don't have that luxury so we have to cope with pages that are
non-contiguous in storage.  To handle this we attach buffers to each page, and
store the addresses in those buffers.

With this approach the pagecache may contain data which is inconsistent with
what is on disk.  To alleviate the problems this can cause, md invalidates the
pagecache when releasing the file.  If the file is to be examined while the
array is active (a non-critical but occasionally useful function), O_DIRECT io
must be used.  And new version of mdadm will have support for this.

This approach simplifies a lot of code:
 - we no longer need to keep a list of pages which we need to wait for,
   as the b_endio function can keep track of how many outstanding
   writes there are.  This saves a mempool.
 - -EAGAIN returns from write_page are no longer possible (not sure if
    they ever were actually).

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/md/bitmap.c         | 283 +++++++++++++++++++++-----------------------
 include/linux/raid/bitmap.h |   7 +-
 2 files changed, 139 insertions(+), 151 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index e40d3e4b8d27..bedd66e9e8ac 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -67,7 +67,6 @@ static inline char * bmname(struct bitmap *bitmap)
 	return bitmap->mddev ? mdname(bitmap->mddev) : "mdX";
 }
 
-#define WRITE_POOL_SIZE 256
 
 /*
  * just a placeholder - calls kmalloc for bitmap pages
@@ -279,75 +278,137 @@ static int write_sb_page(mddev_t *mddev, long offset, struct page *page, int wai
  */
 static int write_page(struct bitmap *bitmap, struct page *page, int wait)
 {
-	int ret = -ENOMEM;
+	struct buffer_head *bh;
 
 	if (bitmap->file == NULL)
 		return write_sb_page(bitmap->mddev, bitmap->offset, page, wait);
 
-	flush_dcache_page(page); /* make sure visible to anyone reading the file */
+	bh = page_buffers(page);
 
-	if (wait)
-		lock_page(page);
-	else {
-		if (TestSetPageLocked(page))
-			return -EAGAIN; /* already locked */
-		if (PageWriteback(page)) {
-			unlock_page(page);
-			return -EAGAIN;
-		}
+	while (bh && bh->b_blocknr) {
+		atomic_inc(&bitmap->pending_writes);
+		set_buffer_locked(bh);
+		set_buffer_mapped(bh);
+		submit_bh(WRITE, bh);
+		bh = bh->b_this_page;
+	}
+
+	if (wait) {
+		wait_event(bitmap->write_wait,
+			   atomic_read(&bitmap->pending_writes)==0);
+		return (bitmap->flags & BITMAP_WRITE_ERROR) ? -EIO : 0;
 	}
+	return 0;
+}
+
+static void end_bitmap_write(struct buffer_head *bh, int uptodate)
+{
+	struct bitmap *bitmap = bh->b_private;
+	unsigned long flags;
 
-	ret = page->mapping->a_ops->prepare_write(bitmap->file, page, 0, PAGE_SIZE);
-	if (!ret)
-		ret = page->mapping->a_ops->commit_write(bitmap->file, page, 0,
-			PAGE_SIZE);
-	if (ret) {
-		unlock_page(page);
-		return ret;
+	if (!uptodate) {
+		spin_lock_irqsave(&bitmap->lock, flags);
+		bitmap->flags |= BITMAP_WRITE_ERROR;
+		spin_unlock_irqrestore(&bitmap->lock, flags);
 	}
+	if (atomic_dec_and_test(&bitmap->pending_writes))
+		wake_up(&bitmap->write_wait);
+}
 
-	set_page_dirty(page); /* force it to be written out */
+/* copied from buffer.c */
+static void
+__clear_page_buffers(struct page *page)
+{
+	ClearPagePrivate(page);
+	set_page_private(page, 0);
+	page_cache_release(page);
+}
+static void free_buffers(struct page *page)
+{
+	struct buffer_head *bh = page_buffers(page);
 
-	if (!wait) {
-		/* add to list to be waited for */
-		struct page_list *item = mempool_alloc(bitmap->write_pool, GFP_NOIO);
-		item->page = page;
-		spin_lock(&bitmap->write_lock);
-		list_add(&item->list, &bitmap->complete_pages);
-		spin_unlock(&bitmap->write_lock);
+	while (bh) {
+		struct buffer_head *next = bh->b_this_page;
+		free_buffer_head(bh);
+		bh = next;
 	}
-	return write_one_page(page, wait);
+	__clear_page_buffers(page);
+	put_page(page);
 }
 
-/* read a page from a file, pinning it into cache, and return bytes_read */
+/* read a page from a file.
+ * We both read the page, and attach buffers to the page to record the
+ * address of each block (using bmap).  These addresses will be used
+ * to write the block later, completely bypassing the filesystem.
+ * This usage is similar to how swap files are handled, and allows us
+ * to write to a file with no concerns of memory allocation failing.
+ */
 static struct page *read_page(struct file *file, unsigned long index,
-					unsigned long *bytes_read)
+			      struct bitmap *bitmap,
+			      unsigned long count)
 {
-	struct inode *inode = file->f_mapping->host;
 	struct page *page = NULL;
-	loff_t isize = i_size_read(inode);
-	unsigned long end_index = isize >> PAGE_SHIFT;
+	struct inode *inode = file->f_dentry->d_inode;
+	loff_t pos = index << PAGE_SHIFT;
+	int ret;
+	struct buffer_head *bh;
+	sector_t block;
+	mm_segment_t oldfs;
 
 	PRINTK("read bitmap file (%dB @ %Lu)\n", (int)PAGE_SIZE,
 			(unsigned long long)index << PAGE_SHIFT);
 
-	page = read_cache_page(inode->i_mapping, index,
-			(filler_t *)inode->i_mapping->a_ops->readpage, file);
+	page = alloc_page(GFP_KERNEL);
+	if (!page)
+		page = ERR_PTR(-ENOMEM);
 	if (IS_ERR(page))
 		goto out;
-	wait_on_page_locked(page);
-	if (!PageUptodate(page) || PageError(page)) {
+
+	oldfs = get_fs();
+	set_fs(KERNEL_DS);
+	ret = vfs_read(file, (char __user*) page_address(page), count, &pos);
+	set_fs(oldfs);
+
+	if (ret >= 0 && ret != count)
+		ret = -EIO;
+	if (ret < 0) {
+		put_page(page);
+		page = ERR_PTR(ret);
+		goto out;
+	}
+	bh = alloc_page_buffers(page, 1<<inode->i_blkbits, 0);
+	if (!bh) {
 		put_page(page);
-		page = ERR_PTR(-EIO);
+		page = ERR_PTR(-ENOMEM);
 		goto out;
 	}
+	attach_page_buffers(page, bh);
+	block = index << (PAGE_SHIFT - inode->i_blkbits);
+	while (bh) {
+		if (count == 0)
+			bh->b_blocknr = 0;
+		else {
+			bh->b_blocknr = bmap(inode, block);
+			if (bh->b_blocknr == 0) {
+				/* Cannot use this file! */
+				free_buffers(page);
+				page = ERR_PTR(-EINVAL);
+				goto out;
+			}
+			bh->b_bdev = inode->i_sb->s_bdev;
+			if (count < (1<<inode->i_blkbits))
+				count = 0;
+			else
+				count -= (1<<inode->i_blkbits);
+
+			bh->b_end_io = end_bitmap_write;
+			bh->b_private = bitmap;
+		}
+		block++;
+		bh = bh->b_this_page;
+	}
 
-	if (index > end_index) /* we have read beyond EOF */
-		*bytes_read = 0;
-	else if (index == end_index) /* possible short read */
-		*bytes_read = isize & ~PAGE_MASK;
-	else
-		*bytes_read = PAGE_SIZE; /* got a full page */
+	page->index = index;
 out:
 	if (IS_ERR(page))
 		printk(KERN_ALERT "md: bitmap read error: (%dB @ %Lu): %ld\n",
@@ -418,16 +479,14 @@ static int bitmap_read_sb(struct bitmap *bitmap)
 	char *reason = NULL;
 	bitmap_super_t *sb;
 	unsigned long chunksize, daemon_sleep, write_behind;
-	unsigned long bytes_read;
 	unsigned long long events;
 	int err = -EINVAL;
 
 	/* page 0 is the superblock, read it... */
 	if (bitmap->file)
-		bitmap->sb_page = read_page(bitmap->file, 0, &bytes_read);
+		bitmap->sb_page = read_page(bitmap->file, 0, bitmap, PAGE_SIZE);
 	else {
 		bitmap->sb_page = read_sb_page(bitmap->mddev, bitmap->offset, 0);
-		bytes_read = PAGE_SIZE;
 	}
 	if (IS_ERR(bitmap->sb_page)) {
 		err = PTR_ERR(bitmap->sb_page);
@@ -437,13 +496,6 @@ static int bitmap_read_sb(struct bitmap *bitmap)
 
 	sb = (bitmap_super_t *)kmap_atomic(bitmap->sb_page, KM_USER0);
 
-	if (bytes_read < sizeof(*sb)) { /* short read */
-		printk(KERN_INFO "%s: bitmap file superblock truncated\n",
-			bmname(bitmap));
-		err = -ENOSPC;
-		goto out;
-	}
-
 	chunksize = le32_to_cpu(sb->chunksize);
 	daemon_sleep = le32_to_cpu(sb->daemon_sleep);
 	write_behind = le32_to_cpu(sb->write_behind);
@@ -589,37 +641,12 @@ static void bitmap_file_unmap(struct bitmap *bitmap)
 
 	while (pages--)
 		if (map[pages]->index != 0) /* 0 is sb_page, release it below */
-			put_page(map[pages]);
+			free_buffers(map[pages]);
 	kfree(map);
 	kfree(attr);
 
-	safe_put_page(sb_page);
-}
-
-/* dequeue the next item in a page list -- don't call from irq context */
-static struct page_list *dequeue_page(struct bitmap *bitmap)
-{
-	struct page_list *item = NULL;
-	struct list_head *head = &bitmap->complete_pages;
-
-	spin_lock(&bitmap->write_lock);
-	if (list_empty(head))
-		goto out;
-	item = list_entry(head->prev, struct page_list, list);
-	list_del(head->prev);
-out:
-	spin_unlock(&bitmap->write_lock);
-	return item;
-}
-
-static void drain_write_queues(struct bitmap *bitmap)
-{
-	struct page_list *item;
-
-	while ((item = dequeue_page(bitmap))) {
-		/* don't bother to wait */
-		mempool_free(item, bitmap->write_pool);
-	}
+	if (sb_page)
+		free_buffers(sb_page);
 }
 
 static void bitmap_file_put(struct bitmap *bitmap)
@@ -632,12 +659,16 @@ static void bitmap_file_put(struct bitmap *bitmap)
 	bitmap->file = NULL;
 	spin_unlock_irqrestore(&bitmap->lock, flags);
 
-	drain_write_queues(bitmap);
-
+	if (file)
+		wait_event(bitmap->write_wait,
+			   atomic_read(&bitmap->pending_writes)==0);
 	bitmap_file_unmap(bitmap);
 
-	if (file)
+	if (file) {
+		struct inode *inode = file->f_dentry->d_inode;
+		invalidate_inode_pages(inode->i_mapping);
 		fput(file);
+	}
 }
 
 
@@ -728,8 +759,6 @@ static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block)
 
 }
 
-static void bitmap_writeback(struct bitmap *bitmap);
-
 /* this gets called when the md device is ready to unplug its underlying
  * (slave) device queues -- before we let any writes go down, we need to
  * sync the dirty pages of the bitmap file to disk */
@@ -761,24 +790,18 @@ int bitmap_unplug(struct bitmap *bitmap)
 			wait = 1;
 		spin_unlock_irqrestore(&bitmap->lock, flags);
 
-		if (dirty | need_write) {
+		if (dirty | need_write)
 			err = write_page(bitmap, page, 0);
-			if (err == -EAGAIN) {
-				if (dirty)
-					err = write_page(bitmap, page, 1);
-				else
-					err = 0;
-			}
-			if (err)
-				return 1;
-		}
 	}
 	if (wait) { /* if any writes were performed, we need to wait on them */
 		if (bitmap->file)
-			bitmap_writeback(bitmap);
+			wait_event(bitmap->write_wait,
+				   atomic_read(&bitmap->pending_writes)==0);
 		else
 			md_super_wait(bitmap->mddev);
 	}
+	if (bitmap->flags & BITMAP_WRITE_ERROR)
+		bitmap_file_kick(bitmap);
 	return 0;
 }
 
@@ -800,7 +823,7 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
 	struct page *page = NULL, *oldpage = NULL;
 	unsigned long num_pages, bit_cnt = 0;
 	struct file *file;
-	unsigned long bytes, offset, dummy;
+	unsigned long bytes, offset;
 	int outofdate;
 	int ret = -ENOSPC;
 	void *paddr;
@@ -853,7 +876,12 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
 		index = file_page_index(i);
 		bit = file_page_offset(i);
 		if (index != oldindex) { /* this is a new page, read it in */
+			int count;
 			/* unmap the old page, we're done with it */
+			if (index == num_pages-1)
+				count = bytes - index * PAGE_SIZE;
+			else
+				count = PAGE_SIZE;
 			if (index == 0) {
 				/*
 				 * if we're here then the superblock page
@@ -863,7 +891,7 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
 				page = bitmap->sb_page;
 				offset = sizeof(bitmap_super_t);
 			} else if (file) {
-				page = read_page(file, index, &dummy);
+				page = read_page(file, index, bitmap, count);
 				offset = 0;
 			} else {
 				page = read_sb_page(bitmap->mddev, bitmap->offset, index);
@@ -999,9 +1027,6 @@ int bitmap_daemon_work(struct bitmap *bitmap)
 				spin_unlock_irqrestore(&bitmap->lock, flags);
 				if (need_write) {
 					switch (write_page(bitmap, page, 0)) {
-					case -EAGAIN:
-						set_page_attr(bitmap, page, BITMAP_PAGE_NEEDWRITE);
-						break;
 					case 0:
 						break;
 					default:
@@ -1017,10 +1042,6 @@ int bitmap_daemon_work(struct bitmap *bitmap)
 					clear_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE);
 					spin_unlock_irqrestore(&bitmap->lock, flags);
 					err = write_page(bitmap, lastpage, 0);
-					if (err == -EAGAIN) {
-						err = 0;
-						set_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE);
-					}
 				} else {
 					set_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE);
 					spin_unlock_irqrestore(&bitmap->lock, flags);
@@ -1070,10 +1091,6 @@ int bitmap_daemon_work(struct bitmap *bitmap)
 			clear_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE);
 			spin_unlock_irqrestore(&bitmap->lock, flags);
 			err = write_page(bitmap, lastpage, 0);
-			if (err == -EAGAIN) {
-				set_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE);
-				err = 0;
-			}
 		} else {
 			set_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE);
 			spin_unlock_irqrestore(&bitmap->lock, flags);
@@ -1083,32 +1100,6 @@ int bitmap_daemon_work(struct bitmap *bitmap)
 	return err;
 }
 
-static void bitmap_writeback(struct bitmap *bitmap)
-{
-	struct page *page;
-	struct page_list *item;
-	int err = 0;
-
-	PRINTK("%s: bitmap writeback daemon woke up...\n", bmname(bitmap));
-	/* wait on bitmap page writebacks */
-	while ((item = dequeue_page(bitmap))) {
-		page = item->page;
-		mempool_free(item, bitmap->write_pool);
-		PRINTK("wait on page writeback: %p\n", page);
-		wait_on_page_writeback(page);
-		PRINTK("finished page writeback: %p\n", page);
-
-		err = PageError(page);
-		if (err) {
-			printk(KERN_WARNING "%s: bitmap file writeback "
-			       "failed (page %lu): %d\n",
-			       bmname(bitmap), page->index, err);
-			bitmap_file_kick(bitmap);
-			break;
-		}
-	}
-}
-
 static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap,
 					    sector_t offset, int *blocks,
 					    int create)
@@ -1377,8 +1368,6 @@ static void bitmap_free(struct bitmap *bitmap)
 
 	/* free all allocated memory */
 
-	mempool_destroy(bitmap->write_pool);
-
 	if (bp) /* deallocate the page memory */
 		for (k = 0; k < pages; k++)
 			if (bp[k].map && !bp[k].hijacked)
@@ -1428,17 +1417,13 @@ int bitmap_create(mddev_t *mddev)
 	spin_lock_init(&bitmap->lock);
 	bitmap->mddev = mddev;
 
-	spin_lock_init(&bitmap->write_lock);
-	INIT_LIST_HEAD(&bitmap->complete_pages);
-	bitmap->write_pool = mempool_create_kmalloc_pool(WRITE_POOL_SIZE,
-						sizeof(struct page_list));
-	err = -ENOMEM;
-	if (!bitmap->write_pool)
-		goto error;
-
 	bitmap->file = file;
 	bitmap->offset = mddev->bitmap_offset;
 	if (file) get_file(file);
+
+	/* Ensure we read fresh data */
+	invalidate_inode_pages(file->f_dentry->d_inode->i_mapping);
+
 	/* read superblock from bitmap file (this sets bitmap->chunksize) */
 	err = bitmap_read_sb(bitmap);
 	if (err)
@@ -1461,6 +1446,9 @@ int bitmap_create(mddev_t *mddev)
 
 	bitmap->syncchunk = ~0UL;
 
+	atomic_set(&bitmap->pending_writes, 0);
+	init_waitqueue_head(&bitmap->write_wait);
+
 #ifdef INJECT_FATAL_FAULT_1
 	bitmap->bp = NULL;
 #else
@@ -1503,4 +1491,3 @@ EXPORT_SYMBOL(bitmap_start_sync);
 EXPORT_SYMBOL(bitmap_end_sync);
 EXPORT_SYMBOL(bitmap_unplug);
 EXPORT_SYMBOL(bitmap_close_sync);
-EXPORT_SYMBOL(bitmap_daemon_work);
diff --git a/include/linux/raid/bitmap.h b/include/linux/raid/bitmap.h
index 9c8907ca60a7..63df898fe2e9 100644
--- a/include/linux/raid/bitmap.h
+++ b/include/linux/raid/bitmap.h
@@ -140,6 +140,7 @@ typedef __u16 bitmap_counter_t;
 enum bitmap_state {
 	BITMAP_ACTIVE = 0x001, /* the bitmap is in use */
 	BITMAP_STALE  = 0x002,  /* the bitmap file is out of date or had -EIO */
+	BITMAP_WRITE_ERROR = 0x004, /* A write error has occurred */
 	BITMAP_HOSTENDIAN = 0x8000,
 };
 
@@ -244,9 +245,9 @@ struct bitmap {
 	unsigned long daemon_lastrun; /* jiffies of last run */
 	unsigned long daemon_sleep; /* how many seconds between updates? */
 
-	spinlock_t write_lock;
-	struct list_head complete_pages;
-	mempool_t *write_pool;
+	atomic_t pending_writes; /* pending writes to the bitmap file */
+	wait_queue_head_t write_wait;
+
 };
 
 /* the bitmap API */
-- 
cgit v1.2.3


From 42543769142d2375f2b5f8fc9cac999f84bd4c4c Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 26 Jun 2006 00:27:57 -0700
Subject: [PATCH] md: Don't write dirty/clean update to spares - leave them
 alone

- record the 'event' count on each individual device (they
  might sometimes be slightly different now)
- add a new value for 'sb_dirty': '3' means that the super
  block only needs to be updated to record a clean<->dirty
  transition.
- Prefer odd event numbers for dirty states and even numbers
  for clean states
- Using all the above, don't update the superblock on
  a spare device if the update is just doing a clean-dirty
  transition.  To accomodate this, a transition from
  dirty back to clean might now decrement the events counter
  if nothing else has changed.

The net effect of this is that spare drives will not see any IO requests
during normal running of the array, so they can go to sleep if that is what
they want to do.

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/md/md.c           | 65 +++++++++++++++++++++++++++++++++++++++++------
 include/linux/raid/md_k.h |  1 +
 2 files changed, 58 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 6f97817d28b9..34b6902cda46 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1558,15 +1558,30 @@ static void md_print_devices(void)
 }
 
 
-static void sync_sbs(mddev_t * mddev)
+static void sync_sbs(mddev_t * mddev, int nospares)
 {
+	/* Update each superblock (in-memory image), but
+	 * if we are allowed to, skip spares which already
+	 * have the right event counter, or have one earlier
+	 * (which would mean they aren't being marked as dirty
+	 * with the rest of the array)
+	 */
 	mdk_rdev_t *rdev;
 	struct list_head *tmp;
 
 	ITERATE_RDEV(mddev,rdev,tmp) {
-		super_types[mddev->major_version].
-			sync_super(mddev, rdev);
-		rdev->sb_loaded = 1;
+		if (rdev->sb_events == mddev->events ||
+		    (nospares &&
+		     rdev->raid_disk < 0 &&
+		     (rdev->sb_events&1)==0 &&
+		     rdev->sb_events+1 == mddev->events)) {
+			/* Don't update this superblock */
+			rdev->sb_loaded = 2;
+		} else {
+			super_types[mddev->major_version].
+				sync_super(mddev, rdev);
+			rdev->sb_loaded = 1;
+		}
 	}
 }
 
@@ -1576,12 +1591,42 @@ void md_update_sb(mddev_t * mddev)
 	struct list_head *tmp;
 	mdk_rdev_t *rdev;
 	int sync_req;
+	int nospares = 0;
 
 repeat:
 	spin_lock_irq(&mddev->write_lock);
 	sync_req = mddev->in_sync;
 	mddev->utime = get_seconds();
-	mddev->events ++;
+	if (mddev->sb_dirty == 3)
+		/* just a clean<-> dirty transition, possibly leave spares alone,
+		 * though if events isn't the right even/odd, we will have to do
+		 * spares after all
+		 */
+		nospares = 1;
+
+	/* If this is just a dirty<->clean transition, and the array is clean
+	 * and 'events' is odd, we can roll back to the previous clean state */
+	if (mddev->sb_dirty == 3
+	    && (mddev->in_sync && mddev->recovery_cp == MaxSector)
+	    && (mddev->events & 1))
+		mddev->events--;
+	else {
+		/* otherwise we have to go forward and ... */
+		mddev->events ++;
+		if (!mddev->in_sync || mddev->recovery_cp != MaxSector) { /* not clean */
+			/* .. if the array isn't clean, insist on an odd 'events' */
+			if ((mddev->events&1)==0) {
+				mddev->events++;
+				nospares = 0;
+			}
+		} else {
+			/* otherwise insist on an even 'events' (for clean states) */
+			if ((mddev->events&1)) {
+				mddev->events++;
+				nospares = 0;
+			}
+		}
+	}
 
 	if (!mddev->events) {
 		/*
@@ -1593,7 +1638,7 @@ repeat:
 		mddev->events --;
 	}
 	mddev->sb_dirty = 2;
-	sync_sbs(mddev);
+	sync_sbs(mddev, nospares);
 
 	/*
 	 * do not write anything to disk if using
@@ -1615,6 +1660,8 @@ repeat:
 	ITERATE_RDEV(mddev,rdev,tmp) {
 		char b[BDEVNAME_SIZE];
 		dprintk(KERN_INFO "md: ");
+		if (rdev->sb_loaded != 1)
+			continue; /* no noise on spare devices */
 		if (test_bit(Faulty, &rdev->flags))
 			dprintk("(skipping faulty ");
 
@@ -1626,6 +1673,7 @@ repeat:
 			dprintk(KERN_INFO "(write) %s's sb offset: %llu\n",
 				bdevname(rdev->bdev,b),
 				(unsigned long long)rdev->sb_offset);
+			rdev->sb_events = mddev->events;
 
 		} else
 			dprintk(")\n");
@@ -1895,6 +1943,7 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi
 	rdev->desc_nr = -1;
 	rdev->flags = 0;
 	rdev->data_offset = 0;
+	rdev->sb_events = 0;
 	atomic_set(&rdev->nr_pending, 0);
 	atomic_set(&rdev->read_errors, 0);
 	atomic_set(&rdev->corrected_errors, 0);
@@ -4708,7 +4757,7 @@ void md_write_start(mddev_t *mddev, struct bio *bi)
 		spin_lock_irq(&mddev->write_lock);
 		if (mddev->in_sync) {
 			mddev->in_sync = 0;
-			mddev->sb_dirty = 1;
+			mddev->sb_dirty = 3;
 			md_wakeup_thread(mddev->thread);
 		}
 		spin_unlock_irq(&mddev->write_lock);
@@ -5055,7 +5104,7 @@ void md_check_recovery(mddev_t *mddev)
 		if (mddev->safemode && !atomic_read(&mddev->writes_pending) &&
 		    !mddev->in_sync && mddev->recovery_cp == MaxSector) {
 			mddev->in_sync = 1;
-			mddev->sb_dirty = 1;
+			mddev->sb_dirty = 3;
 		}
 		if (mddev->safemode == 1)
 			mddev->safemode = 0;
diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h
index 682574f3bd36..c1e0ac55bab5 100644
--- a/include/linux/raid/md_k.h
+++ b/include/linux/raid/md_k.h
@@ -58,6 +58,7 @@ struct mdk_rdev_s
 
 	struct page	*sb_page;
 	int		sb_loaded;
+	__u64		sb_events;
 	sector_t	data_offset;	/* start of data in array */
 	sector_t	sb_offset;
 	int		sb_size;	/* bytes in the superblock */
-- 
cgit v1.2.3


From bebfa1013eee1d91b3242e5801cc8fbdfaf148ec Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Mon, 26 Jun 2006 13:56:52 +0200
Subject: [PATCH] x86_64: Add compat_printk and sysctl to turn off compat layer
 warnings

Sometimes e.g. with crashme the compat layer warnings can be noisy.
Add a way to turn them off by gating all output through compat_printk
that checks a global sysctl. The default is not changed.

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/ia32/sys_ia32.c |  4 ++--
 fs/compat.c                 | 16 +++++++++++++++-
 include/linux/compat.h      |  2 ++
 include/linux/sysctl.h      |  2 ++
 kernel/sysctl.c             | 11 +++++++++++
 5 files changed, 32 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86_64/ia32/sys_ia32.c b/arch/x86_64/ia32/sys_ia32.c
index f182b20858e2..ee30557629dc 100644
--- a/arch/x86_64/ia32/sys_ia32.c
+++ b/arch/x86_64/ia32/sys_ia32.c
@@ -514,7 +514,7 @@ int sys32_ni_syscall(int call)
 	static char lastcomm[sizeof(me->comm)];
 
 	if (strncmp(lastcomm, me->comm, sizeof(lastcomm))) {
-		printk(KERN_INFO "IA32 syscall %d from %s not implemented\n",
+		compat_printk(KERN_INFO "IA32 syscall %d from %s not implemented\n",
 		       call, me->comm);
 		strncpy(lastcomm, me->comm, sizeof(lastcomm));
 	} 
@@ -916,7 +916,7 @@ long sys32_vm86_warning(void)
 	struct task_struct *me = current;
 	static char lastcomm[sizeof(me->comm)];
 	if (strncmp(lastcomm, me->comm, sizeof(lastcomm))) {
-		printk(KERN_INFO "%s: vm86 mode not supported on 64 bit kernel\n",
+		compat_printk(KERN_INFO "%s: vm86 mode not supported on 64 bit kernel\n",
 		       me->comm);
 		strncpy(lastcomm, me->comm, sizeof(lastcomm));
 	} 
diff --git a/fs/compat.c b/fs/compat.c
index 7e7e5bc4f3cf..e31e9cf96647 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -55,6 +55,20 @@
 
 extern void sigset_from_compat(sigset_t *set, compat_sigset_t *compat);
 
+int compat_log = 1;
+
+int compat_printk(const char *fmt, ...)
+{
+	va_list ap;
+	int ret;
+	if (!compat_log)
+		return 0;
+	va_start(ap, fmt);
+	ret = vprintk(fmt, ap);
+	va_end(ap);
+	return ret;
+}
+
 /*
  * Not all architectures have sys_utime, so implement this in terms
  * of sys_utimes.
@@ -359,7 +373,7 @@ static void compat_ioctl_error(struct file *filp, unsigned int fd,
 	sprintf(buf,"'%c'", (cmd>>24) & 0x3f);
 	if (!isprint(buf[1]))
 		sprintf(buf, "%02x", buf[1]);
-	printk("ioctl32(%s:%d): Unknown cmd fd(%d) "
+	compat_printk("ioctl32(%s:%d): Unknown cmd fd(%d) "
 			"cmd(%08x){%s} arg(%08x) on %s\n",
 			current->comm, current->pid,
 			(int)fd, (unsigned int)cmd, buf,
diff --git a/include/linux/compat.h b/include/linux/compat.h
index dda1697ec753..9760753e662b 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -226,5 +226,7 @@ static inline int compat_timespec_compare(struct compat_timespec *lhs,
 
 asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp);
 
+extern int compat_printk(const char *fmt, ...);
+
 #endif /* CONFIG_COMPAT */
 #endif /* _LINUX_COMPAT_H */
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 6a60770984e9..349ef908a222 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -148,9 +148,11 @@ enum
 	KERN_SPIN_RETRY=70,	/* int: number of spinlock retries */
 	KERN_ACPI_VIDEO_FLAGS=71, /* int: flags for setting up video after ACPI sleep */
 	KERN_IA64_UNALIGNED=72, /* int: ia64 unaligned userland trap enable */
+	KERN_COMPAT_LOG=73,	/* int: print compat layer  messages */
 };
 
 
+
 /* CTL_VM names: */
 enum
 {
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 2c0e65819448..f1a4eb1a655e 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -73,6 +73,7 @@ extern int printk_ratelimit_burst;
 extern int pid_max_min, pid_max_max;
 extern int sysctl_drop_caches;
 extern int percpu_pagelist_fraction;
+extern int compat_log;
 
 #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
 int unknown_nmi_panic;
@@ -676,6 +677,16 @@ static ctl_table kern_table[] = {
 	 	.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
 	},
+#endif
+#ifdef CONFIG_COMPAT
+	{
+		.ctl_name	= KERN_COMPAT_LOG,
+		.procname	= "compat-log",
+		.data		= &compat_log,
+		.maxlen		= sizeof (int),
+	 	.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
 #endif
 	{ .ctl_name = 0 }
 };
-- 
cgit v1.2.3


From 08cd36570e47176c7b6bd3e80125aa46c4638097 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Mon, 26 Jun 2006 13:57:10 +0200
Subject: [PATCH] x86_64: Optimize bitmap_weight for small bitmaps

Use inline code bitmaps <= BITS_PER_LONG in bitmap_weight. This
gives _much_ better code.

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/bitmap.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h
index d9ed27969855..dcc5de7cc487 100644
--- a/include/linux/bitmap.h
+++ b/include/linux/bitmap.h
@@ -24,6 +24,9 @@
  * The available bitmap operations and their rough meaning in the
  * case that the bitmap is a single unsigned long are thus:
  *
+ * Note that nbits should be always a compile time evaluable constant.
+ * Otherwise many inlines will generate horrible code.
+ *
  * bitmap_zero(dst, nbits)			*dst = 0UL
  * bitmap_fill(dst, nbits)			*dst = ~0UL
  * bitmap_copy(dst, src, nbits)			*dst = *src
@@ -244,6 +247,8 @@ static inline int bitmap_full(const unsigned long *src, int nbits)
 
 static inline int bitmap_weight(const unsigned long *src, int nbits)
 {
+	if (nbits <= BITS_PER_LONG)
+		return hweight_long(*src & BITMAP_LAST_WORD_MASK(nbits));
 	return __bitmap_weight(src, nbits);
 }
 
-- 
cgit v1.2.3


From 4552d5dc08b79868829b4be8951b29b07284753f Mon Sep 17 00:00:00 2001
From: Jan Beulich <jbeulich@novell.com>
Date: Mon, 26 Jun 2006 13:57:28 +0200
Subject: [PATCH] x86_64: reliable stack trace support

These are the generic bits needed to enable reliable stack traces based
on Dwarf2-like (.eh_frame) unwind information. Subsequent patches will
enable x86-64 and i386 to make use of this.

Thanks to Andi Kleen and Ingo Molnar, who pointed out several possibilities
for improvement.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/kernel.h |   7 +
 include/linux/module.h |   3 +
 include/linux/unwind.h | 119 +++++++
 init/main.c            |   2 +
 kernel/Makefile        |   1 +
 kernel/module.c        |  16 +-
 kernel/unwind.c        | 915 +++++++++++++++++++++++++++++++++++++++++++++++++
 lib/Kconfig.debug      |  12 +-
 8 files changed, 1072 insertions(+), 3 deletions(-)
 create mode 100644 include/linux/unwind.h
 create mode 100644 kernel/unwind.c

(limited to 'include/linux')

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 3c5e4c2e517d..5c1ec1f84eab 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -32,6 +32,7 @@ extern const char linux_banner[];
 
 #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
 #define ALIGN(x,a) (((x)+(a)-1)&~((a)-1))
+#define FIELD_SIZEOF(t, f) (sizeof(((t*)0)->f))
 
 #define	KERN_EMERG	"<0>"	/* system is unusable			*/
 #define	KERN_ALERT	"<1>"	/* action must be taken immediately	*/
@@ -336,6 +337,12 @@ struct sysinfo {
 /* Force a compilation error if condition is true */
 #define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)]))
 
+/* Force a compilation error if condition is true, but also produce a
+   result (of value 0 and type size_t), so the expression can be used
+   e.g. in a structure initializer (or where-ever else comma expressions
+   aren't permitted). */
+#define BUILD_BUG_ON_ZERO(e) (sizeof(char[1 - 2 * !!(e)]) - 1)
+
 /* Trap pasters of __FUNCTION__ at compile-time */
 #define __FUNCTION__ (__func__)
 
diff --git a/include/linux/module.h b/include/linux/module.h
index 2d366098eab5..9ebbb74b7b72 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -285,6 +285,9 @@ struct module
 	/* The size of the executable code in each section.  */
 	unsigned long init_text_size, core_text_size;
 
+	/* The handle returned from unwind_add_table. */
+	void *unwind_info;
+
 	/* Arch-specific module values */
 	struct mod_arch_specific arch;
 
diff --git a/include/linux/unwind.h b/include/linux/unwind.h
new file mode 100644
index 000000000000..0295aa789ab4
--- /dev/null
+++ b/include/linux/unwind.h
@@ -0,0 +1,119 @@
+#ifndef _LINUX_UNWIND_H
+#define _LINUX_UNWIND_H
+
+/*
+ * Copyright (C) 2002-2006 Novell, Inc.
+ *	Jan Beulich <jbeulich@novell.com>
+ * This code is released under version 2 of the GNU GPL.
+ *
+ * A simple API for unwinding kernel stacks.  This is used for
+ * debugging and error reporting purposes.  The kernel doesn't need
+ * full-blown stack unwinding with all the bells and whistles, so there
+ * is not much point in implementing the full Dwarf2 unwind API.
+ */
+
+#include <linux/config.h>
+
+struct module;
+
+#ifdef CONFIG_STACK_UNWIND
+
+#include <asm/unwind.h>
+
+#ifndef ARCH_UNWIND_SECTION_NAME
+#define ARCH_UNWIND_SECTION_NAME ".eh_frame"
+#endif
+
+/*
+ * Initialize unwind support.
+ */
+extern void unwind_init(void);
+
+extern void *unwind_add_table(struct module *,
+                              const void *table_start,
+                              unsigned long table_size);
+
+extern void unwind_remove_table(void *handle, int init_only);
+
+extern int unwind_init_frame_info(struct unwind_frame_info *,
+                                  struct task_struct *,
+                                  /*const*/ struct pt_regs *);
+
+/*
+ * Prepare to unwind a blocked task.
+ */
+extern int unwind_init_blocked(struct unwind_frame_info *,
+                               struct task_struct *);
+
+/*
+ * Prepare to unwind the currently running thread.
+ */
+extern int unwind_init_running(struct unwind_frame_info *,
+                               asmlinkage void (*callback)(struct unwind_frame_info *,
+                                                           void *arg),
+                               void *arg);
+
+/*
+ * Unwind to previous to frame.  Returns 0 if successful, negative
+ * number in case of an error.
+ */
+extern int unwind(struct unwind_frame_info *);
+
+/*
+ * Unwind until the return pointer is in user-land (or until an error
+ * occurs).  Returns 0 if successful, negative number in case of
+ * error.
+ */
+extern int unwind_to_user(struct unwind_frame_info *);
+
+#else
+
+struct unwind_frame_info {};
+
+static inline void unwind_init(void) {}
+
+static inline void *unwind_add_table(struct module *mod,
+                                     const void *table_start,
+                                     unsigned long table_size)
+{
+	return NULL;
+}
+
+static inline void unwind_remove_table(void *handle, int init_only)
+{
+}
+
+static inline int unwind_init_frame_info(struct unwind_frame_info *info,
+                                         struct task_struct *tsk,
+                                         const struct pt_regs *regs)
+{
+	return -ENOSYS;
+}
+
+static inline int unwind_init_blocked(struct unwind_frame_info *info,
+                                      struct task_struct *tsk)
+{
+	return -ENOSYS;
+}
+
+static inline int unwind_init_running(struct unwind_frame_info *info,
+                                      asmlinkage void (*cb)(struct unwind_frame_info *,
+                                                            void *arg),
+                                      void *arg)
+{
+	return -ENOSYS;
+}
+
+static inline int unwind(struct unwind_frame_info *info)
+{
+	return -ENOSYS;
+}
+
+static inline int unwind_to_user(struct unwind_frame_info *info)
+{
+	return -ENOSYS;
+}
+
+#endif
+
+#endif /* _LINUX_UNWIND_H */
diff --git a/init/main.c b/init/main.c
index f715b9b89753..f556fd0a0b66 100644
--- a/init/main.c
+++ b/init/main.c
@@ -47,6 +47,7 @@
 #include <linux/rmap.h>
 #include <linux/mempolicy.h>
 #include <linux/key.h>
+#include <linux/unwind.h>
 
 #include <asm/io.h>
 #include <asm/bugs.h>
@@ -482,6 +483,7 @@ asmlinkage void __init start_kernel(void)
 		   __stop___param - __start___param,
 		   &unknown_bootoption);
 	sort_main_extable();
+	unwind_init();
 	trap_init();
 	rcu_init();
 	init_IRQ();
diff --git a/kernel/Makefile b/kernel/Makefile
index f6ef00f4f90f..a31276e190f5 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -21,6 +21,7 @@ obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
 obj-$(CONFIG_UID16) += uid16.o
 obj-$(CONFIG_MODULES) += module.o
 obj-$(CONFIG_KALLSYMS) += kallsyms.o
+obj-$(CONFIG_STACK_UNWIND) += unwind.o
 obj-$(CONFIG_PM) += power/
 obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
 obj-$(CONFIG_KEXEC) += kexec.o
diff --git a/kernel/module.c b/kernel/module.c
index d75275de1c28..08811e26ac9d 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -40,6 +40,7 @@
 #include <linux/string.h>
 #include <linux/sched.h>
 #include <linux/mutex.h>
+#include <linux/unwind.h>
 #include <asm/uaccess.h>
 #include <asm/semaphore.h>
 #include <asm/cacheflush.h>
@@ -1051,6 +1052,8 @@ static void free_module(struct module *mod)
 	remove_sect_attrs(mod);
 	mod_kobject_remove(mod);
 
+	unwind_remove_table(mod->unwind_info, 0);
+
 	/* Arch-specific cleanup. */
 	module_arch_cleanup(mod);
 
@@ -1412,7 +1415,7 @@ static struct module *load_module(void __user *umod,
 	unsigned int i, symindex = 0, strindex = 0, setupindex, exindex,
 		exportindex, modindex, obsparmindex, infoindex, gplindex,
 		crcindex, gplcrcindex, versindex, pcpuindex, gplfutureindex,
-		gplfuturecrcindex;
+		gplfuturecrcindex, unwindex = 0;
 	struct module *mod;
 	long err = 0;
 	void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */
@@ -1502,6 +1505,9 @@ static struct module *load_module(void __user *umod,
 	versindex = find_sec(hdr, sechdrs, secstrings, "__versions");
 	infoindex = find_sec(hdr, sechdrs, secstrings, ".modinfo");
 	pcpuindex = find_pcpusec(hdr, sechdrs, secstrings);
+#ifdef ARCH_UNWIND_SECTION_NAME
+	unwindex = find_sec(hdr, sechdrs, secstrings, ARCH_UNWIND_SECTION_NAME);
+#endif
 
 	/* Don't keep modinfo section */
 	sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
@@ -1510,6 +1516,8 @@ static struct module *load_module(void __user *umod,
 	sechdrs[symindex].sh_flags |= SHF_ALLOC;
 	sechdrs[strindex].sh_flags |= SHF_ALLOC;
 #endif
+	if (unwindex)
+		sechdrs[unwindex].sh_flags |= SHF_ALLOC;
 
 	/* Check module struct version now, before we try to use module. */
 	if (!check_modstruct_version(sechdrs, versindex, mod)) {
@@ -1738,6 +1746,11 @@ static struct module *load_module(void __user *umod,
 		goto arch_cleanup;
 	add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs);
 
+	/* Size of section 0 is 0, so this works well if no unwind info. */
+	mod->unwind_info = unwind_add_table(mod,
+	                                    (void *)sechdrs[unwindex].sh_addr,
+	                                    sechdrs[unwindex].sh_size);
+
 	/* Get rid of temporary copy */
 	vfree(hdr);
 
@@ -1836,6 +1849,7 @@ sys_init_module(void __user *umod,
 	mod->state = MODULE_STATE_LIVE;
 	/* Drop initial reference. */
 	module_put(mod);
+	unwind_remove_table(mod->unwind_info, 1);
 	module_free(mod, mod->module_init);
 	mod->module_init = NULL;
 	mod->init_size = 0;
diff --git a/kernel/unwind.c b/kernel/unwind.c
new file mode 100644
index 000000000000..d36bcd3ad3b5
--- /dev/null
+++ b/kernel/unwind.c
@@ -0,0 +1,915 @@
+/*
+ * Copyright (C) 2002-2006 Novell, Inc.
+ *	Jan Beulich <jbeulich@novell.com>
+ * This code is released under version 2 of the GNU GPL.
+ *
+ * A simple API for unwinding kernel stacks.  This is used for
+ * debugging and error reporting purposes.  The kernel doesn't need
+ * full-blown stack unwinding with all the bells and whistles, so there
+ * is not much point in implementing the full Dwarf2 unwind API.
+ */
+
+#include <linux/unwind.h>
+#include <linux/module.h>
+#include <linux/delay.h>
+#include <linux/stop_machine.h>
+#include <asm/sections.h>
+#include <asm/uaccess.h>
+#include <asm/unaligned.h>
+
+extern char __start_unwind[], __end_unwind[];
+
+#define MAX_STACK_DEPTH 8
+
+#define EXTRA_INFO(f) { \
+		BUILD_BUG_ON_ZERO(offsetof(struct unwind_frame_info, f) \
+		                  % FIELD_SIZEOF(struct unwind_frame_info, f)) \
+		+ offsetof(struct unwind_frame_info, f) \
+		  / FIELD_SIZEOF(struct unwind_frame_info, f), \
+		FIELD_SIZEOF(struct unwind_frame_info, f) \
+	}
+#define PTREGS_INFO(f) EXTRA_INFO(regs.f)
+
+static const struct {
+	unsigned offs:BITS_PER_LONG / 2;
+	unsigned width:BITS_PER_LONG / 2;
+} reg_info[] = {
+	UNW_REGISTER_INFO
+};
+
+#undef PTREGS_INFO
+#undef EXTRA_INFO
+
+#ifndef REG_INVALID
+#define REG_INVALID(r) (reg_info[r].width == 0)
+#endif
+
+#define DW_CFA_nop                          0x00
+#define DW_CFA_set_loc                      0x01
+#define DW_CFA_advance_loc1                 0x02
+#define DW_CFA_advance_loc2                 0x03
+#define DW_CFA_advance_loc4                 0x04
+#define DW_CFA_offset_extended              0x05
+#define DW_CFA_restore_extended             0x06
+#define DW_CFA_undefined                    0x07
+#define DW_CFA_same_value                   0x08
+#define DW_CFA_register                     0x09
+#define DW_CFA_remember_state               0x0a
+#define DW_CFA_restore_state                0x0b
+#define DW_CFA_def_cfa                      0x0c
+#define DW_CFA_def_cfa_register             0x0d
+#define DW_CFA_def_cfa_offset               0x0e
+#define DW_CFA_def_cfa_expression           0x0f
+#define DW_CFA_expression                   0x10
+#define DW_CFA_offset_extended_sf           0x11
+#define DW_CFA_def_cfa_sf                   0x12
+#define DW_CFA_def_cfa_offset_sf            0x13
+#define DW_CFA_val_offset                   0x14
+#define DW_CFA_val_offset_sf                0x15
+#define DW_CFA_val_expression               0x16
+#define DW_CFA_lo_user                      0x1c
+#define DW_CFA_GNU_window_save              0x2d
+#define DW_CFA_GNU_args_size                0x2e
+#define DW_CFA_GNU_negative_offset_extended 0x2f
+#define DW_CFA_hi_user                      0x3f
+
+#define DW_EH_PE_FORM     0x07
+#define DW_EH_PE_native   0x00
+#define DW_EH_PE_leb128   0x01
+#define DW_EH_PE_data2    0x02
+#define DW_EH_PE_data4    0x03
+#define DW_EH_PE_data8    0x04
+#define DW_EH_PE_signed   0x08
+#define DW_EH_PE_ADJUST   0x70
+#define DW_EH_PE_abs      0x00
+#define DW_EH_PE_pcrel    0x10
+#define DW_EH_PE_textrel  0x20
+#define DW_EH_PE_datarel  0x30
+#define DW_EH_PE_funcrel  0x40
+#define DW_EH_PE_aligned  0x50
+#define DW_EH_PE_indirect 0x80
+#define DW_EH_PE_omit     0xff
+
+typedef unsigned long uleb128_t;
+typedef   signed long sleb128_t;
+
+static struct unwind_table {
+	struct {
+		unsigned long pc;
+		unsigned long range;
+	} core, init;
+	const void *address;
+	unsigned long size;
+	struct unwind_table *link;
+	const char *name;
+} root_table, *last_table;
+
+struct unwind_item {
+	enum item_location {
+		Nowhere,
+		Memory,
+		Register,
+		Value
+	} where;
+	uleb128_t value;
+};
+
+struct unwind_state {
+	uleb128_t loc, org;
+	const u8 *cieStart, *cieEnd;
+	uleb128_t codeAlign;
+	sleb128_t dataAlign;
+	struct cfa {
+		uleb128_t reg, offs;
+	} cfa;
+	struct unwind_item regs[ARRAY_SIZE(reg_info)];
+	unsigned stackDepth:8;
+	unsigned version:8;
+	const u8 *label;
+	const u8 *stack[MAX_STACK_DEPTH];
+};
+
+static const struct cfa badCFA = { ARRAY_SIZE(reg_info), 1 };
+
+static struct unwind_table *find_table(unsigned long pc)
+{
+	struct unwind_table *table;
+
+	for (table = &root_table; table; table = table->link)
+		if ((pc >= table->core.pc
+		     && pc < table->core.pc + table->core.range)
+		    || (pc >= table->init.pc
+		        && pc < table->init.pc + table->init.range))
+			break;
+
+	return table;
+}
+
+static void init_unwind_table(struct unwind_table *table,
+                              const char *name,
+                              const void *core_start,
+                              unsigned long core_size,
+                              const void *init_start,
+                              unsigned long init_size,
+                              const void *table_start,
+                              unsigned long table_size)
+{
+	table->core.pc = (unsigned long)core_start;
+	table->core.range = core_size;
+	table->init.pc = (unsigned long)init_start;
+	table->init.range = init_size;
+	table->address = table_start;
+	table->size = table_size;
+	table->link = NULL;
+	table->name = name;
+}
+
+void __init unwind_init(void)
+{
+	init_unwind_table(&root_table, "kernel",
+	                  _text, _end - _text,
+	                  NULL, 0,
+	                  __start_unwind, __end_unwind - __start_unwind);
+}
+
+/* Must be called with module_mutex held. */
+void *unwind_add_table(struct module *module,
+                       const void *table_start,
+                       unsigned long table_size)
+{
+	struct unwind_table *table;
+
+	if (table_size <= 0)
+		return NULL;
+
+	table = kmalloc(sizeof(*table), GFP_KERNEL);
+	if (!table)
+		return NULL;
+
+	init_unwind_table(table, module->name,
+	                  module->module_core, module->core_size,
+	                  module->module_init, module->init_size,
+	                  table_start, table_size);
+
+	if (last_table)
+		last_table->link = table;
+	else
+		root_table.link = table;
+	last_table = table;
+
+	return table;
+}
+
+struct unlink_table_info
+{
+	struct unwind_table *table;
+	int init_only;
+};
+
+static int unlink_table(void *arg)
+{
+	struct unlink_table_info *info = arg;
+	struct unwind_table *table = info->table, *prev;
+
+	for (prev = &root_table; prev->link && prev->link != table; prev = prev->link)
+		;
+
+	if (prev->link) {
+		if (info->init_only) {
+			table->init.pc = 0;
+			table->init.range = 0;
+			info->table = NULL;
+		} else {
+			prev->link = table->link;
+			if (!prev->link)
+				last_table = prev;
+		}
+	} else
+		info->table = NULL;
+
+	return 0;
+}
+
+/* Must be called with module_mutex held. */
+void unwind_remove_table(void *handle, int init_only)
+{
+	struct unwind_table *table = handle;
+	struct unlink_table_info info;
+
+	if (!table || table == &root_table)
+		return;
+
+	if (init_only && table == last_table) {
+		table->init.pc = 0;
+		table->init.range = 0;
+		return;
+	}
+
+	info.table = table;
+	info.init_only = init_only;
+	stop_machine_run(unlink_table, &info, NR_CPUS);
+
+	if (info.table)
+		kfree(table);
+}
+
+static uleb128_t get_uleb128(const u8 **pcur, const u8 *end)
+{
+	const u8 *cur = *pcur;
+	uleb128_t value;
+	unsigned shift;
+
+	for (shift = 0, value = 0; cur < end; shift += 7) {
+		if (shift + 7 > 8 * sizeof(value)
+		    && (*cur & 0x7fU) >= (1U << (8 * sizeof(value) - shift))) {
+			cur = end + 1;
+			break;
+		}
+		value |= (uleb128_t)(*cur & 0x7f) << shift;
+		if (!(*cur++ & 0x80))
+			break;
+	}
+	*pcur = cur;
+
+	return value;
+}
+
+static sleb128_t get_sleb128(const u8 **pcur, const u8 *end)
+{
+	const u8 *cur = *pcur;
+	sleb128_t value;
+	unsigned shift;
+
+	for (shift = 0, value = 0; cur < end; shift += 7) {
+		if (shift + 7 > 8 * sizeof(value)
+		    && (*cur & 0x7fU) >= (1U << (8 * sizeof(value) - shift))) {
+			cur = end + 1;
+			break;
+		}
+		value |= (sleb128_t)(*cur & 0x7f) << shift;
+		if (!(*cur & 0x80)) {
+			value |= -(*cur++ & 0x40) << shift;
+			break;
+		}
+	}
+	*pcur = cur;
+
+	return value;
+}
+
+static unsigned long read_pointer(const u8 **pLoc,
+                                  const void *end,
+                                  signed ptrType)
+{
+	unsigned long value = 0;
+	union {
+		const u8 *p8;
+		const u16 *p16u;
+		const s16 *p16s;
+		const u32 *p32u;
+		const s32 *p32s;
+		const unsigned long *pul;
+	} ptr;
+
+	if (ptrType < 0 || ptrType == DW_EH_PE_omit)
+		return 0;
+	ptr.p8 = *pLoc;
+	switch(ptrType & DW_EH_PE_FORM) {
+	case DW_EH_PE_data2:
+		if (end < (const void *)(ptr.p16u + 1))
+			return 0;
+		if(ptrType & DW_EH_PE_signed)
+			value = get_unaligned(ptr.p16s++);
+		else
+			value = get_unaligned(ptr.p16u++);
+		break;
+	case DW_EH_PE_data4:
+#ifdef CONFIG_64BIT
+		if (end < (const void *)(ptr.p32u + 1))
+			return 0;
+		if(ptrType & DW_EH_PE_signed)
+			value = get_unaligned(ptr.p32s++);
+		else
+			value = get_unaligned(ptr.p32u++);
+		break;
+	case DW_EH_PE_data8:
+		BUILD_BUG_ON(sizeof(u64) != sizeof(value));
+#else
+		BUILD_BUG_ON(sizeof(u32) != sizeof(value));
+#endif
+	case DW_EH_PE_native:
+		if (end < (const void *)(ptr.pul + 1))
+			return 0;
+		value = get_unaligned(ptr.pul++);
+		break;
+	case DW_EH_PE_leb128:
+		BUILD_BUG_ON(sizeof(uleb128_t) > sizeof(value));
+		value = ptrType & DW_EH_PE_signed
+		        ? get_sleb128(&ptr.p8, end)
+		        : get_uleb128(&ptr.p8, end);
+		if ((const void *)ptr.p8 > end)
+			return 0;
+		break;
+	default:
+		return 0;
+	}
+	switch(ptrType & DW_EH_PE_ADJUST) {
+	case DW_EH_PE_abs:
+		break;
+	case DW_EH_PE_pcrel:
+		value += (unsigned long)*pLoc;
+		break;
+	default:
+		return 0;
+	}
+	if ((ptrType & DW_EH_PE_indirect)
+	    && __get_user(value, (unsigned long *)value))
+		return 0;
+	*pLoc = ptr.p8;
+
+	return value;
+}
+
+static signed fde_pointer_type(const u32 *cie)
+{
+	const u8 *ptr = (const u8 *)(cie + 2);
+	unsigned version = *ptr;
+
+	if (version != 1)
+		return -1; /* unsupported */
+	if (*++ptr) {
+		const char *aug;
+		const u8 *end = (const u8 *)(cie + 1) + *cie;
+		uleb128_t len;
+
+		/* check if augmentation size is first (and thus present) */
+		if (*ptr != 'z')
+			return -1;
+		/* check if augmentation string is nul-terminated */
+		if ((ptr = memchr(aug = (const void *)ptr, 0, end - ptr)) == NULL)
+			return -1;
+		++ptr; /* skip terminator */
+		get_uleb128(&ptr, end); /* skip code alignment */
+		get_sleb128(&ptr, end); /* skip data alignment */
+		/* skip return address column */
+		version <= 1 ? (void)++ptr : (void)get_uleb128(&ptr, end);
+		len = get_uleb128(&ptr, end); /* augmentation length */
+		if (ptr + len < ptr || ptr + len > end)
+			return -1;
+		end = ptr + len;
+		while (*++aug) {
+			if (ptr >= end)
+				return -1;
+			switch(*aug) {
+			case 'L':
+				++ptr;
+				break;
+			case 'P': {
+					signed ptrType = *ptr++;
+
+					if (!read_pointer(&ptr, end, ptrType) || ptr > end)
+						return -1;
+				}
+				break;
+			case 'R':
+				return *ptr;
+			default:
+				return -1;
+			}
+		}
+	}
+	return DW_EH_PE_native|DW_EH_PE_abs;
+}
+
+static int advance_loc(unsigned long delta, struct unwind_state *state)
+{
+	state->loc += delta * state->codeAlign;
+
+	return delta > 0;
+}
+
+static void set_rule(uleb128_t reg,
+                     enum item_location where,
+                     uleb128_t value,
+                     struct unwind_state *state)
+{
+	if (reg < ARRAY_SIZE(state->regs)) {
+		state->regs[reg].where = where;
+		state->regs[reg].value = value;
+	}
+}
+
+static int processCFI(const u8 *start,
+                      const u8 *end,
+                      unsigned long targetLoc,
+                      signed ptrType,
+                      struct unwind_state *state)
+{
+	union {
+		const u8 *p8;
+		const u16 *p16;
+		const u32 *p32;
+	} ptr;
+	int result = 1;
+
+	if (start != state->cieStart) {
+		state->loc = state->org;
+		result = processCFI(state->cieStart, state->cieEnd, 0, ptrType, state);
+		if (targetLoc == 0 && state->label == NULL)
+			return result;
+	}
+	for (ptr.p8 = start; result && ptr.p8 < end; ) {
+		switch(*ptr.p8 >> 6) {
+			uleb128_t value;
+
+		case 0:
+			switch(*ptr.p8++) {
+			case DW_CFA_nop:
+				break;
+			case DW_CFA_set_loc:
+				if ((state->loc = read_pointer(&ptr.p8, end, ptrType)) == 0)
+					result = 0;
+				break;
+			case DW_CFA_advance_loc1:
+				result = ptr.p8 < end && advance_loc(*ptr.p8++, state);
+				break;
+			case DW_CFA_advance_loc2:
+				result = ptr.p8 <= end + 2
+				         && advance_loc(*ptr.p16++, state);
+				break;
+			case DW_CFA_advance_loc4:
+				result = ptr.p8 <= end + 4
+				         && advance_loc(*ptr.p32++, state);
+				break;
+			case DW_CFA_offset_extended:
+				value = get_uleb128(&ptr.p8, end);
+				set_rule(value, Memory, get_uleb128(&ptr.p8, end), state);
+				break;
+			case DW_CFA_val_offset:
+				value = get_uleb128(&ptr.p8, end);
+				set_rule(value, Value, get_uleb128(&ptr.p8, end), state);
+				break;
+			case DW_CFA_offset_extended_sf:
+				value = get_uleb128(&ptr.p8, end);
+				set_rule(value, Memory, get_sleb128(&ptr.p8, end), state);
+				break;
+			case DW_CFA_val_offset_sf:
+				value = get_uleb128(&ptr.p8, end);
+				set_rule(value, Value, get_sleb128(&ptr.p8, end), state);
+				break;
+			case DW_CFA_restore_extended:
+			case DW_CFA_undefined:
+			case DW_CFA_same_value:
+				set_rule(get_uleb128(&ptr.p8, end), Nowhere, 0, state);
+				break;
+			case DW_CFA_register:
+				value = get_uleb128(&ptr.p8, end);
+				set_rule(value,
+				         Register,
+				         get_uleb128(&ptr.p8, end), state);
+				break;
+			case DW_CFA_remember_state:
+				if (ptr.p8 == state->label) {
+					state->label = NULL;
+					return 1;
+				}
+				if (state->stackDepth >= MAX_STACK_DEPTH)
+					return 0;
+				state->stack[state->stackDepth++] = ptr.p8;
+				break;
+			case DW_CFA_restore_state:
+				if (state->stackDepth) {
+					const uleb128_t loc = state->loc;
+					const u8 *label = state->label;
+
+					state->label = state->stack[state->stackDepth - 1];
+					memcpy(&state->cfa, &badCFA, sizeof(state->cfa));
+					memset(state->regs, 0, sizeof(state->regs));
+					state->stackDepth = 0;
+					result = processCFI(start, end, 0, ptrType, state);
+					state->loc = loc;
+					state->label = label;
+				} else
+					return 0;
+				break;
+			case DW_CFA_def_cfa:
+				state->cfa.reg = get_uleb128(&ptr.p8, end);
+				/*nobreak*/
+			case DW_CFA_def_cfa_offset:
+				state->cfa.offs = get_uleb128(&ptr.p8, end);
+				break;
+			case DW_CFA_def_cfa_sf:
+				state->cfa.reg = get_uleb128(&ptr.p8, end);
+				/*nobreak*/
+			case DW_CFA_def_cfa_offset_sf:
+				state->cfa.offs = get_sleb128(&ptr.p8, end)
+				                  * state->dataAlign;
+				break;
+			case DW_CFA_def_cfa_register:
+				state->cfa.reg = get_uleb128(&ptr.p8, end);
+				break;
+			/*todo case DW_CFA_def_cfa_expression: */
+			/*todo case DW_CFA_expression: */
+			/*todo case DW_CFA_val_expression: */
+			case DW_CFA_GNU_args_size:
+				get_uleb128(&ptr.p8, end);
+				break;
+			case DW_CFA_GNU_negative_offset_extended:
+				value = get_uleb128(&ptr.p8, end);
+				set_rule(value,
+				         Memory,
+				         (uleb128_t)0 - get_uleb128(&ptr.p8, end), state);
+				break;
+			case DW_CFA_GNU_window_save:
+			default:
+				result = 0;
+				break;
+			}
+			break;
+		case 1:
+			result = advance_loc(*ptr.p8++ & 0x3f, state);
+			break;
+		case 2:
+			value = *ptr.p8++ & 0x3f;
+			set_rule(value, Memory, get_uleb128(&ptr.p8, end), state);
+			break;
+		case 3:
+			set_rule(*ptr.p8++ & 0x3f, Nowhere, 0, state);
+			break;
+		}
+		if (ptr.p8 > end)
+			result = 0;
+		if (result && targetLoc != 0 && targetLoc < state->loc)
+			return 1;
+	}
+
+	return result
+	   && ptr.p8 == end
+	   && (targetLoc == 0
+	    || (/*todo While in theory this should apply, gcc in practice omits
+	          everything past the function prolog, and hence the location
+	          never reaches the end of the function.
+	        targetLoc < state->loc &&*/ state->label == NULL));
+}
+
+/* Unwind to previous to frame.  Returns 0 if successful, negative
+ * number in case of an error. */
+int unwind(struct unwind_frame_info *frame)
+{
+#define FRAME_REG(r, t) (((t *)frame)[reg_info[r].offs])
+	const u32 *fde = NULL, *cie = NULL;
+	const u8 *ptr = NULL, *end = NULL;
+	unsigned long startLoc = 0, endLoc = 0, cfa;
+	unsigned i;
+	signed ptrType = -1;
+	uleb128_t retAddrReg = 0;
+	struct unwind_table *table;
+	struct unwind_state state;
+
+	if (UNW_PC(frame) == 0)
+		return -EINVAL;
+	if ((table = find_table(UNW_PC(frame))) != NULL
+	    && !(table->size & (sizeof(*fde) - 1))) {
+		unsigned long tableSize = table->size;
+
+		for (fde = table->address;
+		     tableSize > sizeof(*fde) && tableSize - sizeof(*fde) >= *fde;
+		     tableSize -= sizeof(*fde) + *fde,
+		     fde += 1 + *fde / sizeof(*fde)) {
+			if (!*fde || (*fde & (sizeof(*fde) - 1)))
+				break;
+			if (!fde[1])
+				continue; /* this is a CIE */
+			if ((fde[1] & (sizeof(*fde) - 1))
+			    || fde[1] > (unsigned long)(fde + 1)
+			                - (unsigned long)table->address)
+				continue; /* this is not a valid FDE */
+			cie = fde + 1 - fde[1] / sizeof(*fde);
+			if (*cie <= sizeof(*cie) + 4
+			    || *cie >= fde[1] - sizeof(*fde)
+			    || (*cie & (sizeof(*cie) - 1))
+			    || cie[1]
+			    || (ptrType = fde_pointer_type(cie)) < 0) {
+				cie = NULL; /* this is not a (valid) CIE */
+				continue;
+			}
+			ptr = (const u8 *)(fde + 2);
+			startLoc = read_pointer(&ptr,
+			                        (const u8 *)(fde + 1) + *fde,
+			                        ptrType);
+			endLoc = startLoc
+			         + read_pointer(&ptr,
+			                        (const u8 *)(fde + 1) + *fde,
+			                        ptrType & DW_EH_PE_indirect
+			                        ? ptrType
+			                        : ptrType & (DW_EH_PE_FORM|DW_EH_PE_signed));
+			if (UNW_PC(frame) >= startLoc && UNW_PC(frame) < endLoc)
+				break;
+			cie = NULL;
+		}
+	}
+	if (cie != NULL) {
+		memset(&state, 0, sizeof(state));
+		state.cieEnd = ptr; /* keep here temporarily */
+		ptr = (const u8 *)(cie + 2);
+		end = (const u8 *)(cie + 1) + *cie;
+		if ((state.version = *ptr) != 1)
+			cie = NULL; /* unsupported version */
+		else if (*++ptr) {
+			/* check if augmentation size is first (and thus present) */
+			if (*ptr == 'z') {
+				/* check for ignorable (or already handled)
+				 * nul-terminated augmentation string */
+				while (++ptr < end && *ptr)
+					if (strchr("LPR", *ptr) == NULL)
+						break;
+			}
+			if (ptr >= end || *ptr)
+				cie = NULL;
+		}
+		++ptr;
+	}
+	if (cie != NULL) {
+		/* get code aligment factor */
+		state.codeAlign = get_uleb128(&ptr, end);
+		/* get data aligment factor */
+		state.dataAlign = get_sleb128(&ptr, end);
+		if (state.codeAlign == 0 || state.dataAlign == 0 || ptr >= end)
+			cie = NULL;
+		else {
+			retAddrReg = state.version <= 1 ? *ptr++ : get_uleb128(&ptr, end);
+			/* skip augmentation */
+			if (((const char *)(cie + 2))[1] == 'z')
+				ptr += get_uleb128(&ptr, end);
+			if (ptr > end
+			   || retAddrReg >= ARRAY_SIZE(reg_info)
+			   || REG_INVALID(retAddrReg)
+			   || reg_info[retAddrReg].width != sizeof(unsigned long))
+				cie = NULL;
+		}
+	}
+	if (cie != NULL) {
+		state.cieStart = ptr;
+		ptr = state.cieEnd;
+		state.cieEnd = end;
+		end = (const u8 *)(fde + 1) + *fde;
+		/* skip augmentation */
+		if (((const char *)(cie + 2))[1] == 'z') {
+			uleb128_t augSize = get_uleb128(&ptr, end);
+
+			if ((ptr += augSize) > end)
+				fde = NULL;
+		}
+	}
+	if (cie == NULL || fde == NULL) {
+#ifdef CONFIG_FRAME_POINTER
+		unsigned long top, bottom;
+#endif
+
+#ifdef CONFIG_FRAME_POINTER
+		top = STACK_TOP(frame->task);
+		bottom = STACK_BOTTOM(frame->task);
+# if FRAME_RETADDR_OFFSET < 0
+		if (UNW_SP(frame) < top
+		    && UNW_FP(frame) <= UNW_SP(frame)
+		    && bottom < UNW_FP(frame)
+# else
+		if (UNW_SP(frame) > top
+		    && UNW_FP(frame) >= UNW_SP(frame)
+		    && bottom > UNW_FP(frame)
+# endif
+		   && !((UNW_SP(frame) | UNW_FP(frame))
+		        & (sizeof(unsigned long) - 1))) {
+			unsigned long link;
+
+			if (!__get_user(link,
+			                (unsigned long *)(UNW_FP(frame)
+			                                  + FRAME_LINK_OFFSET))
+# if FRAME_RETADDR_OFFSET < 0
+			   && link > bottom && link < UNW_FP(frame)
+# else
+			   && link > UNW_FP(frame) && link < bottom
+# endif
+			   && !(link & (sizeof(link) - 1))
+			   && !__get_user(UNW_PC(frame),
+			                  (unsigned long *)(UNW_FP(frame)
+			                                    + FRAME_RETADDR_OFFSET))) {
+				UNW_SP(frame) = UNW_FP(frame) + FRAME_RETADDR_OFFSET
+# if FRAME_RETADDR_OFFSET < 0
+					-
+# else
+					+
+# endif
+					  sizeof(UNW_PC(frame));
+				UNW_FP(frame) = link;
+				return 0;
+			}
+		}
+#endif
+		return -ENXIO;
+	}
+	state.org = startLoc;
+	memcpy(&state.cfa, &badCFA, sizeof(state.cfa));
+	/* process instructions */
+	if (!processCFI(ptr, end, UNW_PC(frame), ptrType, &state)
+	   || state.loc > endLoc
+	   || state.regs[retAddrReg].where == Nowhere
+	   || state.cfa.reg >= ARRAY_SIZE(reg_info)
+	   || reg_info[state.cfa.reg].width != sizeof(unsigned long)
+	   || state.cfa.offs % sizeof(unsigned long))
+		return -EIO;
+	/* update frame */
+	cfa = FRAME_REG(state.cfa.reg, unsigned long) + state.cfa.offs;
+	startLoc = min((unsigned long)UNW_SP(frame), cfa);
+	endLoc = max((unsigned long)UNW_SP(frame), cfa);
+	if (STACK_LIMIT(startLoc) != STACK_LIMIT(endLoc)) {
+		startLoc = min(STACK_LIMIT(cfa), cfa);
+		endLoc = max(STACK_LIMIT(cfa), cfa);
+	}
+#ifndef CONFIG_64BIT
+# define CASES CASE(8); CASE(16); CASE(32)
+#else
+# define CASES CASE(8); CASE(16); CASE(32); CASE(64)
+#endif
+	for (i = 0; i < ARRAY_SIZE(state.regs); ++i) {
+		if (REG_INVALID(i)) {
+			if (state.regs[i].where == Nowhere)
+				continue;
+			return -EIO;
+		}
+		switch(state.regs[i].where) {
+		default:
+			break;
+		case Register:
+			if (state.regs[i].value >= ARRAY_SIZE(reg_info)
+			   || REG_INVALID(state.regs[i].value)
+			   || reg_info[i].width > reg_info[state.regs[i].value].width)
+				return -EIO;
+			switch(reg_info[state.regs[i].value].width) {
+#define CASE(n) \
+			case sizeof(u##n): \
+				state.regs[i].value = FRAME_REG(state.regs[i].value, \
+				                                const u##n); \
+				break
+			CASES;
+#undef CASE
+			default:
+				return -EIO;
+			}
+			break;
+		}
+	}
+	for (i = 0; i < ARRAY_SIZE(state.regs); ++i) {
+		if (REG_INVALID(i))
+			continue;
+		switch(state.regs[i].where) {
+		case Nowhere:
+			if (reg_info[i].width != sizeof(UNW_SP(frame))
+			   || &FRAME_REG(i, __typeof__(UNW_SP(frame)))
+			      != &UNW_SP(frame))
+				continue;
+			UNW_SP(frame) = cfa;
+			break;
+		case Register:
+			switch(reg_info[i].width) {
+#define CASE(n) case sizeof(u##n): \
+				FRAME_REG(i, u##n) = state.regs[i].value; \
+				break
+			CASES;
+#undef CASE
+			default:
+				return -EIO;
+			}
+			break;
+		case Value:
+			if (reg_info[i].width != sizeof(unsigned long))
+				return -EIO;
+			FRAME_REG(i, unsigned long) = cfa + state.regs[i].value
+			                                    * state.dataAlign;
+			break;
+		case Memory: {
+				unsigned long addr = cfa + state.regs[i].value
+				                           * state.dataAlign;
+
+				if ((state.regs[i].value * state.dataAlign)
+				    % sizeof(unsigned long)
+				    || addr < startLoc
+				    || addr + sizeof(unsigned long) < addr
+				    || addr + sizeof(unsigned long) > endLoc)
+					return -EIO;
+				switch(reg_info[i].width) {
+#define CASE(n)     case sizeof(u##n): \
+					__get_user(FRAME_REG(i, u##n), (u##n *)addr); \
+					break
+				CASES;
+#undef CASE
+				default:
+					return -EIO;
+				}
+			}
+			break;
+		}
+	}
+
+	return 0;
+#undef CASES
+#undef FRAME_REG
+}
+EXPORT_SYMBOL(unwind);
+
+int unwind_init_frame_info(struct unwind_frame_info *info,
+                           struct task_struct *tsk,
+                           /*const*/ struct pt_regs *regs)
+{
+	info->task = tsk;
+	arch_unw_init_frame_info(info, regs);
+
+	return 0;
+}
+EXPORT_SYMBOL(unwind_init_frame_info);
+
+/*
+ * Prepare to unwind a blocked task.
+ */
+int unwind_init_blocked(struct unwind_frame_info *info,
+                        struct task_struct *tsk)
+{
+	info->task = tsk;
+	arch_unw_init_blocked(info);
+
+	return 0;
+}
+EXPORT_SYMBOL(unwind_init_blocked);
+
+/*
+ * Prepare to unwind the currently running thread.
+ */
+int unwind_init_running(struct unwind_frame_info *info,
+                        asmlinkage void (*callback)(struct unwind_frame_info *,
+                                                    void *arg),
+                        void *arg)
+{
+	info->task = current;
+	arch_unwind_init_running(info, callback, arg);
+
+	return 0;
+}
+EXPORT_SYMBOL(unwind_init_running);
+
+/*
+ * Unwind until the return pointer is in user-land (or until an error
+ * occurs).  Returns 0 if successful, negative number in case of
+ * error.
+ */
+int unwind_to_user(struct unwind_frame_info *info)
+{
+	while (!arch_unw_user_mode(info)) {
+		int err = unwind(info);
+
+		if (err < 0)
+			return err;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(unwind_to_user);
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index ccb0c1fdf1b5..256b3b805c5c 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -188,14 +188,22 @@ config FRAME_POINTER
 
 config UNWIND_INFO
 	bool "Compile the kernel with frame unwind information"
-	depends on !IA64
-	depends on !MODULES || !(MIPS || PARISC || PPC || SUPERH || V850)
+	depends on !IA64 && !PARISC
+	depends on !MODULES || !(MIPS || PPC || SUPERH || V850)
 	help
 	  If you say Y here the resulting kernel image will be slightly larger
 	  but not slower, and it will give very useful debugging information.
 	  If you don't debug the kernel, you can say N, but we may not be able
 	  to solve problems without frame unwind information or frame pointers.
 
+config STACK_UNWIND
+	bool "Stack unwind support"
+	depends on UNWIND_INFO
+	depends on n
+	help
+	  This enables more precise stack traces, omitting all unrelated
+	  occurrences of pointers into kernel code from the dump.
+
 config FORCED_INLINING
 	bool "Force gcc to inline functions marked 'inline'"
 	depends on DEBUG_KERNEL
-- 
cgit v1.2.3


From c33bd9aac0597eeedaaa01ea5aafe456894b2f2b Mon Sep 17 00:00:00 2001
From: Jan Beulich <jbeulich@novell.com>
Date: Mon, 26 Jun 2006 13:57:47 +0200
Subject: [PATCH] i386/x86-64: fall back to old-style call trace if no
 unwinding

If no unwinding is possible at all for a certain exception instance,
fall back to the old style call trace instead of not showing any trace
at all.

Also, allow setting the stack trace mode at the command line.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/traps.c    | 46 ++++++++++++++++++++++++++++------------
 arch/x86_64/kernel/traps.c  | 51 +++++++++++++++++++++++++++++++--------------
 include/asm-i386/unwind.h   |  8 +++----
 include/asm-x86_64/unwind.h |  8 +++----
 include/linux/unwind.h      |  8 +++----
 kernel/unwind.c             |  7 +++----
 6 files changed, 83 insertions(+), 45 deletions(-)

(limited to 'include/linux')

diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c
index 286584667865..78464097470a 100644
--- a/arch/i386/kernel/traps.c
+++ b/arch/i386/kernel/traps.c
@@ -93,6 +93,7 @@ asmlinkage void spurious_interrupt_bug(void);
 asmlinkage void machine_check(void);
 
 static int kstack_depth_to_print = 24;
+static int call_trace = 1;
 ATOMIC_NOTIFIER_HEAD(i386die_chain);
 
 int register_die_notifier(struct notifier_block *nb)
@@ -171,40 +172,47 @@ static inline unsigned long print_context_stack(struct thread_info *tinfo,
 	return ebp;
 }
 
-static asmlinkage void show_trace_unwind(struct unwind_frame_info *info, void *log_lvl)
+static asmlinkage int show_trace_unwind(struct unwind_frame_info *info, void *log_lvl)
 {
+	int n = 0;
 	int printed = 0; /* nr of entries already printed on current line */
 
 	while (unwind(info) == 0 && UNW_PC(info)) {
+		++n;
 		printed = print_addr_and_symbol(UNW_PC(info), log_lvl, printed);
 		if (arch_unw_user_mode(info))
 			break;
 	}
 	if (printed)
 		printk("\n");
+	return n;
 }
 
 static void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
 			       unsigned long *stack, char *log_lvl)
 {
 	unsigned long ebp;
-	struct unwind_frame_info info;
 
 	if (!task)
 		task = current;
 
-	if (regs) {
-		if (unwind_init_frame_info(&info, task, regs) == 0) {
-			show_trace_unwind(&info, log_lvl);
-			return;
+	if (call_trace >= 0) {
+		int unw_ret = 0;
+		struct unwind_frame_info info;
+
+		if (regs) {
+			if (unwind_init_frame_info(&info, task, regs) == 0)
+				unw_ret = show_trace_unwind(&info, log_lvl);
+		} else if (task == current)
+			unw_ret = unwind_init_running(&info, show_trace_unwind, log_lvl);
+		else {
+			if (unwind_init_blocked(&info, task) == 0)
+				unw_ret = show_trace_unwind(&info, log_lvl);
 		}
-	} else if (task == current) {
-		if (unwind_init_running(&info, show_trace_unwind, log_lvl) == 0)
-			return;
-	} else {
-		if (unwind_init_blocked(&info, task) == 0) {
-			show_trace_unwind(&info, log_lvl);
-			return;
+		if (unw_ret > 0) {
+			if (call_trace > 0)
+				return;
+			printk("%sLegacy call trace:\n", log_lvl);
 		}
 	}
 
@@ -1245,3 +1253,15 @@ static int __init kstack_setup(char *s)
 	return 1;
 }
 __setup("kstack=", kstack_setup);
+
+static int __init call_trace_setup(char *s)
+{
+	if (strcmp(s, "old") == 0)
+		call_trace = -1;
+	else if (strcmp(s, "both") == 0)
+		call_trace = 0;
+	else if (strcmp(s, "new") == 0)
+		call_trace = 1;
+	return 1;
+}
+__setup("call_trace=", call_trace_setup);
diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c
index eb1534ff1f5f..bd0891f4c2c7 100644
--- a/arch/x86_64/kernel/traps.c
+++ b/arch/x86_64/kernel/traps.c
@@ -107,6 +107,7 @@ static inline void preempt_conditional_cli(struct pt_regs *regs)
 }
 
 static int kstack_depth_to_print = 10;
+static int call_trace = 1;
 
 #ifdef CONFIG_KALLSYMS
 #include <linux/kallsyms.h> 
@@ -190,11 +191,12 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
 	return NULL;
 }
 
-static void show_trace_unwind(struct unwind_frame_info *info, void *context)
+static int show_trace_unwind(struct unwind_frame_info *info, void *context)
 {
-	int i = 11;
+	int i = 11, n = 0;
 
 	while (unwind(info) == 0 && UNW_PC(info)) {
+		++n;
 		if (i > 50) {
 			printk("\n       ");
 			i = 7;
@@ -205,6 +207,7 @@ static void show_trace_unwind(struct unwind_frame_info *info, void *context)
 			break;
 	}
 	printk("\n");
+	return n;
 }
 
 /*
@@ -218,27 +221,32 @@ void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * s
 {
 	const unsigned cpu = safe_smp_processor_id();
 	unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr;
-	int i;
+	int i = 11;
 	unsigned used = 0;
-	struct unwind_frame_info info;
 
 	printk("\nCall Trace:");
 
 	if (!tsk)
 		tsk = current;
 
-	if (regs) {
-		if (unwind_init_frame_info(&info, tsk, regs) == 0) {
-			show_trace_unwind(&info, NULL);
-			return;
+	if (call_trace >= 0) {
+		int unw_ret = 0;
+		struct unwind_frame_info info;
+
+		if (regs) {
+			if (unwind_init_frame_info(&info, tsk, regs) == 0)
+				unw_ret = show_trace_unwind(&info, NULL);
+		} else if (tsk == current)
+			unw_ret = unwind_init_running(&info, show_trace_unwind, NULL);
+		else {
+			if (unwind_init_blocked(&info, tsk) == 0)
+				unw_ret = show_trace_unwind(&info, NULL);
 		}
-	} else if (tsk == current) {
-		if (unwind_init_running(&info, show_trace_unwind, NULL) == 0)
-			return;
-	} else {
-		if (unwind_init_blocked(&info, tsk) == 0) {
-			show_trace_unwind(&info, NULL);
-			return;
+		if (unw_ret > 0) {
+			if (call_trace > 0)
+				return;
+			printk("Legacy call trace:");
+			i = 18;
 		}
 	}
 
@@ -264,7 +272,7 @@ void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * s
 		} \
 	} while (0)
 
-	for(i = 11; ; ) {
+	for(; ; ) {
 		const char *id;
 		unsigned long *estack_end;
 		estack_end = in_exception_stack(cpu, (unsigned long)stack,
@@ -1052,3 +1060,14 @@ static int __init kstack_setup(char *s)
 }
 __setup("kstack=", kstack_setup);
 
+static int __init call_trace_setup(char *s)
+{
+	if (strcmp(s, "old") == 0)
+		call_trace = -1;
+	else if (strcmp(s, "both") == 0)
+		call_trace = 0;
+	else if (strcmp(s, "new") == 0)
+		call_trace = 1;
+	return 1;
+}
+__setup("call_trace=", call_trace_setup);
diff --git a/include/asm-i386/unwind.h b/include/asm-i386/unwind.h
index 1c076897ac21..d480f2e38215 100644
--- a/include/asm-i386/unwind.h
+++ b/include/asm-i386/unwind.h
@@ -66,10 +66,10 @@ static inline void arch_unw_init_blocked(struct unwind_frame_info *info)
 	info->regs.xes = __USER_DS;
 }
 
-extern asmlinkage void arch_unwind_init_running(struct unwind_frame_info *,
-                                                asmlinkage void (*callback)(struct unwind_frame_info *,
-                                                                            void *arg),
-                                                void *arg);
+extern asmlinkage int arch_unwind_init_running(struct unwind_frame_info *,
+                                               asmlinkage int (*callback)(struct unwind_frame_info *,
+                                                                          void *arg),
+                                               void *arg);
 
 static inline int arch_unw_user_mode(const struct unwind_frame_info *info)
 {
diff --git a/include/asm-x86_64/unwind.h b/include/asm-x86_64/unwind.h
index 4f61de246179..f3e7124effe3 100644
--- a/include/asm-x86_64/unwind.h
+++ b/include/asm-x86_64/unwind.h
@@ -75,10 +75,10 @@ static inline void arch_unw_init_blocked(struct unwind_frame_info *info)
 	info->regs.ss = __KERNEL_DS;
 }
 
-extern void arch_unwind_init_running(struct unwind_frame_info *,
-                                     void (*callback)(struct unwind_frame_info *,
-                                                      void *arg),
-                                     void *arg);
+extern int arch_unwind_init_running(struct unwind_frame_info *,
+                                    int (*callback)(struct unwind_frame_info *,
+                                                    void *arg),
+                                    void *arg);
 
 static inline int arch_unw_user_mode(const struct unwind_frame_info *info)
 {
diff --git a/include/linux/unwind.h b/include/linux/unwind.h
index 0295aa789ab4..13c7b2cd87ce 100644
--- a/include/linux/unwind.h
+++ b/include/linux/unwind.h
@@ -49,8 +49,8 @@ extern int unwind_init_blocked(struct unwind_frame_info *,
  * Prepare to unwind the currently running thread.
  */
 extern int unwind_init_running(struct unwind_frame_info *,
-                               asmlinkage void (*callback)(struct unwind_frame_info *,
-                                                           void *arg),
+                               asmlinkage int (*callback)(struct unwind_frame_info *,
+                                                          void *arg),
                                void *arg);
 
 /*
@@ -97,8 +97,8 @@ static inline int unwind_init_blocked(struct unwind_frame_info *info,
 }
 
 static inline int unwind_init_running(struct unwind_frame_info *info,
-                                      asmlinkage void (*cb)(struct unwind_frame_info *,
-                                                            void *arg),
+                                      asmlinkage int (*cb)(struct unwind_frame_info *,
+                                                           void *arg),
                                       void *arg)
 {
 	return -ENOSYS;
diff --git a/kernel/unwind.c b/kernel/unwind.c
index d36bcd3ad3b5..0421035272d9 100644
--- a/kernel/unwind.c
+++ b/kernel/unwind.c
@@ -885,14 +885,13 @@ EXPORT_SYMBOL(unwind_init_blocked);
  * Prepare to unwind the currently running thread.
  */
 int unwind_init_running(struct unwind_frame_info *info,
-                        asmlinkage void (*callback)(struct unwind_frame_info *,
-                                                    void *arg),
+                        asmlinkage int (*callback)(struct unwind_frame_info *,
+                                                   void *arg),
                         void *arg)
 {
 	info->task = current;
-	arch_unwind_init_running(info, callback, arg);
 
-	return 0;
+	return arch_unwind_init_running(info, callback, arg);
 }
 EXPORT_SYMBOL(unwind_init_running);
 
-- 
cgit v1.2.3


From 83f4fcce7fdd213bd570b899862c3838871f8cf7 Mon Sep 17 00:00:00 2001
From: Jan Beulich <jbeulich@novell.com>
Date: Mon, 26 Jun 2006 13:57:50 +0200
Subject: [PATCH] x86_64: allow unwinder to build without module support

Add proper conditionals to be able to build with CONFIG_MODULES=n.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/unwind.h | 8 ++++++++
 kernel/unwind.c        | 4 ++++
 2 files changed, 12 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/unwind.h b/include/linux/unwind.h
index 13c7b2cd87ce..ce48e2cd37a2 100644
--- a/include/linux/unwind.h
+++ b/include/linux/unwind.h
@@ -29,12 +29,16 @@ struct module;
  */
 extern void unwind_init(void);
 
+#ifdef CONFIG_MODULES
+
 extern void *unwind_add_table(struct module *,
                               const void *table_start,
                               unsigned long table_size);
 
 extern void unwind_remove_table(void *handle, int init_only);
 
+#endif
+
 extern int unwind_init_frame_info(struct unwind_frame_info *,
                                   struct task_struct *,
                                   /*const*/ struct pt_regs *);
@@ -72,6 +76,8 @@ struct unwind_frame_info {};
 
 static inline void unwind_init(void) {}
 
+#ifdef CONFIG_MODULES
+
 static inline void *unwind_add_table(struct module *mod,
                                      const void *table_start,
                                      unsigned long table_size)
@@ -79,6 +85,8 @@ static inline void *unwind_add_table(struct module *mod,
 	return NULL;
 }
 
+#endif
+
 static inline void unwind_remove_table(void *handle, int init_only)
 {
 }
diff --git a/kernel/unwind.c b/kernel/unwind.c
index 0421035272d9..f69c804c8e62 100644
--- a/kernel/unwind.c
+++ b/kernel/unwind.c
@@ -172,6 +172,8 @@ void __init unwind_init(void)
 	                  __start_unwind, __end_unwind - __start_unwind);
 }
 
+#ifdef CONFIG_MODULES
+
 /* Must be called with module_mutex held. */
 void *unwind_add_table(struct module *module,
                        const void *table_start,
@@ -253,6 +255,8 @@ void unwind_remove_table(void *handle, int init_only)
 		kfree(table);
 }
 
+#endif /* CONFIG_MODULES */
+
 static uleb128_t get_uleb128(const u8 **pcur, const u8 *end)
 {
 	const u8 *cur = *pcur;
-- 
cgit v1.2.3


From 05ebb76109f302b949e745724bbf0f0634dba43f Mon Sep 17 00:00:00 2001
From: Vojtech Pavlik <vojtech@suse.cz>
Date: Mon, 26 Jun 2006 13:58:20 +0200
Subject: [PATCH] x86_64: Add useful constants to time.h

In timekeeping code, one often does need to use conversion constants. Naming
these leads to code that's easier to understand, showing the reader between
which units the conversion is made.

Signed-off-by: Vojtech Pavlik <vojtech@suse.cz>
Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/time.h | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/time.h b/include/linux/time.h
index 0cd696cee998..2fa2987c9a07 100644
--- a/include/linux/time.h
+++ b/include/linux/time.h
@@ -28,10 +28,13 @@ struct timezone {
 #ifdef __KERNEL__
 
 /* Parameters used to convert the timespec values: */
-#define MSEC_PER_SEC		1000L
-#define USEC_PER_SEC		1000000L
-#define NSEC_PER_SEC		1000000000L
-#define NSEC_PER_USEC		1000L
+#define MSEC_PER_SEC	1000L
+#define USEC_PER_MSEC	1000L
+#define NSEC_PER_USEC	1000L
+#define NSEC_PER_MSEC	1000000L
+#define USEC_PER_SEC	1000000L
+#define NSEC_PER_SEC	1000000000L
+#define FSEC_PER_SEC	1000000000000000L
 
 static inline int timespec_equal(struct timespec *a, struct timespec *b)
 {
-- 
cgit v1.2.3


From 09c0dc68625c06f5b1e786aad0d5369b592179e6 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@g5.osdl.org>
Date: Mon, 26 Jun 2006 11:55:42 -0700
Subject: Revert "[PATCH] kthread: update loop.c to use kthread"

This reverts commit c7b2eff059fcc2d1b7085ee3d84b79fd657a537b.

Hugh Dickins explains:

 "It seems too little tested: "losetup -d /dev/loop0" fails with
  EINVAL because nothing sets lo_thread; but even when you patch
  loop_thread() to set lo->lo_thread = current, it can't survive
  more than a few dozen iterations of the loop below (with a tmpfs
  mounted on /tst):

	j=0
	cp /dev/zero /tst
	while :
	do
	    let j=j+1
	    echo "Doing pass $j"
	    losetup /dev/loop0 /tst/zero
	    mkfs -t ext2 -b 1024 /dev/loop0 >/dev/null 2>&1
	    mount -t ext2 /dev/loop0 /mnt
	    umount /mnt
	    losetup -d /dev/loop0
	done

  it collapses with failed ioctl then BUG_ON(!bio).

  I think the original lo_done completion was more subtle and safe
  than the kthread conversion has allowed for."

Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/block/loop.c | 24 +++++++++++++-----------
 include/linux/loop.h |  2 +-
 2 files changed, 14 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 9dc294a74953..3c74ea729fc7 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -74,7 +74,6 @@
 #include <linux/completion.h>
 #include <linux/highmem.h>
 #include <linux/gfp.h>
-#include <linux/kthread.h>
 
 #include <asm/uaccess.h>
 
@@ -579,6 +578,8 @@ static int loop_thread(void *data)
 	struct loop_device *lo = data;
 	struct bio *bio;
 
+	daemonize("loop%d", lo->lo_number);
+
 	/*
 	 * loop can be used in an encrypted device,
 	 * hence, it mustn't be stopped at all
@@ -591,6 +592,11 @@ static int loop_thread(void *data)
 	lo->lo_state = Lo_bound;
 	lo->lo_pending = 1;
 
+	/*
+	 * complete it, we are running
+	 */
+	complete(&lo->lo_done);
+
 	for (;;) {
 		int pending;
 
@@ -623,6 +629,7 @@ static int loop_thread(void *data)
 			break;
 	}
 
+	complete(&lo->lo_done);
 	return 0;
 }
 
@@ -739,7 +746,6 @@ static int loop_set_fd(struct loop_device *lo, struct file *lo_file,
 	unsigned lo_blocksize;
 	int		lo_flags = 0;
 	int		error;
-	struct task_struct *tsk;
 	loff_t		size;
 
 	/* This is safe, since we have a reference from open(). */
@@ -833,11 +839,10 @@ static int loop_set_fd(struct loop_device *lo, struct file *lo_file,
 
 	set_blocksize(bdev, lo_blocksize);
 
-	tsk = kthread_run(loop_thread, lo, "loop%d", lo->lo_number);
-	if (IS_ERR(tsk)) {
-		error = PTR_ERR(tsk);
+	error = kernel_thread(loop_thread, lo, CLONE_KERNEL);
+	if (error < 0)
 		goto out_putf;
-	}
+	wait_for_completion(&lo->lo_done);
 	return 0;
 
  out_putf:
@@ -893,9 +898,6 @@ static int loop_clr_fd(struct loop_device *lo, struct block_device *bdev)
 	if (lo->lo_state != Lo_bound)
 		return -ENXIO;
 
-	if (!lo->lo_thread)
-		return -EINVAL;
-
 	if (lo->lo_refcnt > 1)	/* we needed one fd for the ioctl */
 		return -EBUSY;
 
@@ -909,7 +911,7 @@ static int loop_clr_fd(struct loop_device *lo, struct block_device *bdev)
 		complete(&lo->lo_bh_done);
 	spin_unlock_irq(&lo->lo_lock);
 
-	kthread_stop(lo->lo_thread);
+	wait_for_completion(&lo->lo_done);
 
 	lo->lo_backing_file = NULL;
 
@@ -922,7 +924,6 @@ static int loop_clr_fd(struct loop_device *lo, struct block_device *bdev)
 	lo->lo_sizelimit = 0;
 	lo->lo_encrypt_key_size = 0;
 	lo->lo_flags = 0;
-	lo->lo_thread = NULL;
 	memset(lo->lo_encrypt_key, 0, LO_KEY_SIZE);
 	memset(lo->lo_crypt_name, 0, LO_NAME_SIZE);
 	memset(lo->lo_file_name, 0, LO_NAME_SIZE);
@@ -1287,6 +1288,7 @@ static int __init loop_init(void)
 		if (!lo->lo_queue)
 			goto out_mem4;
 		mutex_init(&lo->lo_ctl_mutex);
+		init_completion(&lo->lo_done);
 		init_completion(&lo->lo_bh_done);
 		lo->lo_number = i;
 		spin_lock_init(&lo->lo_lock);
diff --git a/include/linux/loop.h b/include/linux/loop.h
index bf3d2345ce99..e76c7611d6cc 100644
--- a/include/linux/loop.h
+++ b/include/linux/loop.h
@@ -59,7 +59,7 @@ struct loop_device {
 	struct bio 		*lo_bio;
 	struct bio		*lo_biotail;
 	int			lo_state;
-	struct task_struct	*lo_thread;
+	struct completion	lo_done;
 	struct completion	lo_bh_done;
 	struct mutex		lo_ctl_mutex;
 	int			lo_pending;
-- 
cgit v1.2.3


From 438bc9c3dec27ab37f0ff78471d0b8b91addd2dd Mon Sep 17 00:00:00 2001
From: Jeff Garzik <jeff@garzik.org>
Date: Mon, 26 Jun 2006 20:52:17 -0400
Subject: [libata] sata_vsc: partially revert a PCI ID-related commit

Partially revert 74d0a988d3aa359b6b8a8536c8cb92cce02ca5d5:

	[PATCH] PCI: Move various PCI IDs to header file

libata policy is to avoid use of named PCI device ID constants.
These are often single-use constants, which have little value over
direct numeric constants save for constant include/linux/pci_ids.h
patching/merging headaches.

Signed-off-by: Jeff Garzik <jeff@garzik.org>
---
 drivers/scsi/sata_vsc.c | 10 +++-------
 include/linux/pci_ids.h |  1 -
 2 files changed, 3 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/scsi/sata_vsc.c b/drivers/scsi/sata_vsc.c
index 01893a0cf1a6..616fd9634b4b 100644
--- a/drivers/scsi/sata_vsc.c
+++ b/drivers/scsi/sata_vsc.c
@@ -443,16 +443,12 @@ err_out:
 }
 
 
-/*
- * Intel 31244 is supposed to be identical.
- * Compatibility is untested as of yet.
- */
 static const struct pci_device_id vsc_sata_pci_tbl[] = {
-	{ PCI_VENDOR_ID_VITESSE, PCI_DEVICE_ID_VITESSE_VSC7174,
+	{ PCI_VENDOR_ID_VITESSE, 0x7174,
 	  PCI_ANY_ID, PCI_ANY_ID, 0x10600, 0xFFFFFF, 0 },
-	{ PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_GD31244,
+	{ PCI_VENDOR_ID_INTEL, 0x3200,
 	  PCI_ANY_ID, PCI_ANY_ID, 0x10600, 0xFFFFFF, 0 },
-	{ }
+	{ }	/* terminate list */
 };
 
 
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index c2fd2d19938b..6abd2761e637 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -2170,7 +2170,6 @@
 #define PCI_DEVICE_ID_INTEL_ICH8_4	0x2815
 #define PCI_DEVICE_ID_INTEL_ICH8_5	0x283e
 #define PCI_DEVICE_ID_INTEL_ICH8_6	0x2850
-#define PCI_DEVICE_ID_INTEL_GD31244	0x3200
 #define PCI_DEVICE_ID_INTEL_82855PM_HB	0x3340
 #define PCI_DEVICE_ID_INTEL_82830_HB	0x3575
 #define PCI_DEVICE_ID_INTEL_82830_CGC	0x3577
-- 
cgit v1.2.3


From 41542dbe12e34165e586de1e3fe0a245707aa39e Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Fri, 23 Jun 2006 04:18:31 -0700
Subject: [PATCH] libata.h needs scatterlist.h

From: Andrew Morton <akpm@osdl.org>

s390:

In file included from drivers/scsi/libata-bmdma.c:39:                           include/linux/libata.h:391: error: field 'sgent' has incomplete type
include/linux/libata.h:392: error: field 'pad_sgent' has incomplete type
include/linux/libata.h: In function 'ata_sg_is_last':                           include/linux/libata.h:849: error: arithmetic on pointer to an incomplete type
include/linux/libata.h:849: error: arithmetic on pointer to an incomplete type
include/linux/libata.h: In function 'ata_qc_next_sg':
include/linux/libata.h:869: error: increment of pointer to unknown structure
include/linux/libata.h:869: error: arithmetic on pointer to an incomplete type
include/linux/libata.h:869: error: arithmetic on pointer to an incomplete type
include/linux/libata.h:869: error: arithmetic on pointer to an incomplete type

Cc: Jeff Garzik <jeff@garzik.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Jeff Garzik <jeff@garzik.org>
---
 include/linux/libata.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/libata.h b/include/linux/libata.h
index 20b1cf527c60..c227dace1d26 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -30,6 +30,7 @@
 #include <linux/interrupt.h>
 #include <linux/pci.h>
 #include <linux/dma-mapping.h>
+#include <asm/scatterlist.h>
 #include <asm/io.h>
 #include <linux/ata.h>
 #include <linux/workqueue.h>
-- 
cgit v1.2.3


From 5806db22cffc7557b675d3c9229f327980aee797 Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Sat, 24 Jun 2006 20:30:19 +0900
Subject: [PATCH] libata: implement ata_port_max_devices()

Implement ata_port_max_devices().  This function returns the number of
possible devices on a port.  This will be used by new PM
implementation.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Signed-off-by: Jeff Garzik <jeff@garzik.org>
---
 include/linux/libata.h | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/libata.h b/include/linux/libata.h
index c227dace1d26..f4284bf89758 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -888,6 +888,9 @@ static inline unsigned int ata_tag_internal(unsigned int tag)
 	return tag == ATA_MAX_QUEUE - 1;
 }
 
+/*
+ * device helpers
+ */
 static inline unsigned int ata_class_enabled(unsigned int class)
 {
 	return class == ATA_DEV_ATA || class == ATA_DEV_ATAPI;
@@ -918,6 +921,17 @@ static inline unsigned int ata_dev_absent(const struct ata_device *dev)
 	return ata_class_absent(dev->class);
 }
 
+/*
+ * port helpers
+ */
+static inline int ata_port_max_devices(const struct ata_port *ap)
+{
+	if (ap->flags & ATA_FLAG_SLAVE_POSS)
+		return 2;
+	return 1;
+}
+
+
 static inline u8 ata_chk_status(struct ata_port *ap)
 {
 	return ap->ops->check_status(ap);
-- 
cgit v1.2.3


From 353dcf7c89519a2cb9b7699f3bcf70bb685f22e1 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Sun, 25 Jun 2006 01:36:55 -0700
Subject: [PATCH] ata: add some NVIDIA chipset IDs

From: Randy Dunlap <randy.dunlap@oracle.com>

Add some nVidia chipset ID's support.

http://www.kernel.org/git/?p=linux/kernel/git/bcollins/ubuntu-dapper.git;a=commitdiff;h=b407680553280f9999a20706d5ab2a3be65312c1;hp=ce4cb48010ab2cca537432b5ccb47d4b1fb489e5

Snagged from lkml.

Cc: Jeff Garzik <jeff@garzik.org>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Cc: Tejun Heo <htejun@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Jeff Garzik <jeff@garzik.org>
---
 drivers/ide/pci/amd74xx.c | 7 +++++--
 include/linux/pci_ids.h   | 1 +
 2 files changed, 6 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ide/pci/amd74xx.c b/drivers/ide/pci/amd74xx.c
index 6e9dbf4d8077..85007cb12c52 100644
--- a/drivers/ide/pci/amd74xx.c
+++ b/drivers/ide/pci/amd74xx.c
@@ -75,6 +75,7 @@ static struct amd_ide_chip {
 	{ PCI_DEVICE_ID_NVIDIA_NFORCE_MCP51_IDE,	0x50, AMD_UDMA_133 },
 	{ PCI_DEVICE_ID_NVIDIA_NFORCE_MCP55_IDE,	0x50, AMD_UDMA_133 },
 	{ PCI_DEVICE_ID_NVIDIA_NFORCE_MCP61_IDE,	0x50, AMD_UDMA_133 },
+	{ PCI_DEVICE_ID_NVIDIA_NFORCE_MCP65_IDE,	0x50, AMD_UDMA_133 },
 	{ PCI_DEVICE_ID_AMD_CS5536_IDE,			0x40, AMD_UDMA_100 },
 	{ 0 }
 };
@@ -490,7 +491,8 @@ static ide_pci_device_t amd74xx_chipsets[] __devinitdata = {
 	/* 15 */ DECLARE_NV_DEV("NFORCE-MCP51"),
 	/* 16 */ DECLARE_NV_DEV("NFORCE-MCP55"),
 	/* 17 */ DECLARE_NV_DEV("NFORCE-MCP61"),
-	/* 18 */ DECLARE_AMD_DEV("AMD5536"),
+	/* 18 */ DECLARE_NV_DEV("NFORCE-MCP65"),
+	/* 19 */ DECLARE_AMD_DEV("AMD5536"),
 };
 
 static int __devinit amd74xx_probe(struct pci_dev *dev, const struct pci_device_id *id)
@@ -528,7 +530,8 @@ static struct pci_device_id amd74xx_pci_tbl[] = {
 	{ PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_NFORCE_MCP51_IDE,	PCI_ANY_ID, PCI_ANY_ID, 0, 0, 15 },
 	{ PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_NFORCE_MCP55_IDE,	PCI_ANY_ID, PCI_ANY_ID, 0, 0, 16 },
 	{ PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_NFORCE_MCP61_IDE,	PCI_ANY_ID, PCI_ANY_ID, 0, 0, 17 },
-	{ PCI_VENDOR_ID_AMD,	PCI_DEVICE_ID_AMD_CS5536_IDE,		PCI_ANY_ID, PCI_ANY_ID, 0, 0, 18 },
+	{ PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_NFORCE_MCP65_IDE,  PCI_ANY_ID, PCI_ANY_ID, 0, 0, 18 },
+	{ PCI_VENDOR_ID_AMD,	PCI_DEVICE_ID_AMD_CS5536_IDE,		PCI_ANY_ID, PCI_ANY_ID, 0, 0, 19 },
 	{ 0, },
 };
 MODULE_DEVICE_TABLE(pci, amd74xx_pci_tbl);
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 6abd2761e637..9ae6b1a75366 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -1202,6 +1202,7 @@
 #define PCI_DEVICE_ID_NVIDIA_NVENET_19              0x03EF
 #define PCI_DEVICE_ID_NVIDIA_NFORCE_MCP61_SATA2     0x03F6
 #define PCI_DEVICE_ID_NVIDIA_NFORCE_MCP61_SATA3     0x03F7
+#define PCI_DEVICE_ID_NVIDIA_NFORCE_MCP65_IDE	    0x0448
 #define PCI_DEVICE_ID_NVIDIA_NVENET_20              0x0450
 #define PCI_DEVICE_ID_NVIDIA_NVENET_21              0x0451
 #define PCI_DEVICE_ID_NVIDIA_NVENET_22              0x0452
-- 
cgit v1.2.3


From c9f700f840bd481b3e01fcad1ba8da01794a6726 Mon Sep 17 00:00:00 2001
From: KaiGai Kohei <kaigai@ak.jp.nec.com>
Date: Sun, 11 Jun 2006 10:35:15 +0900
Subject: [JFFS2][XATTR] using 'delete marker' for xdatum/xref deletion

- When xdatum is removed, a new xdatum with 'delete marker' is
  written. (version==0xffffffff means 'delete marker')
- When xref is removed, a new xref with 'delete marker' is written.
  (odd-numbered xseqno means 'delete marker')

- delete_xattr_(datum/xref)_delay() are new deletion functions
  are added. We can only use them if we can detect the target
  obsolete xdatum/xref as a orphan or errir one.
  (e.g when inode deletion, or detecting crc error)

[1/3] jffs2-xattr-v6-01-delete_marker.patch

Signed-off-by: KaiGai Kohei <kaigai@ak.jp.nec.com>
Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 fs/jffs2/erase.c       |  19 +-
 fs/jffs2/gc.c          |   5 +-
 fs/jffs2/jffs2_fs_sb.h |   3 +
 fs/jffs2/malloc.c      |   2 +
 fs/jffs2/nodemgmt.c    |  21 +-
 fs/jffs2/scan.c        |  57 ++--
 fs/jffs2/summary.c     |  39 +--
 fs/jffs2/xattr.c       | 736 +++++++++++++++++++++++++++++++------------------
 fs/jffs2/xattr.h       |  40 ++-
 include/linux/jffs2.h  |   1 +
 10 files changed, 584 insertions(+), 339 deletions(-)

(limited to 'include/linux')

diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c
index 1862e8bc101d..1644e3408852 100644
--- a/fs/jffs2/erase.c
+++ b/fs/jffs2/erase.c
@@ -230,7 +230,6 @@ static inline void jffs2_remove_node_refs_from_ino_list(struct jffs2_sb_info *c,
 			   at the end of the linked list. Stash it and continue
 			   from the beginning of the list */
 			ic = (struct jffs2_inode_cache *)(*prev);
-			BUG_ON(ic->class != RAWNODE_CLASS_INODE_CACHE);
 			prev = &ic->nodes;
 			continue;
 		}
@@ -254,7 +253,8 @@ static inline void jffs2_remove_node_refs_from_ino_list(struct jffs2_sb_info *c,
 
 	/* PARANOIA */
 	if (!ic) {
-		printk(KERN_WARNING "inode_cache not found in remove_node_refs()!!\n");
+		JFFS2_WARNING("inode_cache/xattr_datum/xattr_ref"
+			      " not found in remove_node_refs()!!\n");
 		return;
 	}
 
@@ -279,8 +279,19 @@ static inline void jffs2_remove_node_refs_from_ino_list(struct jffs2_sb_info *c,
 		printk("\n");
 	});
 
-	if (ic->nodes == (void *)ic && ic->nlink == 0)
-		jffs2_del_ino_cache(c, ic);
+	switch (ic->class) {
+#ifdef CONFIG_JFFS2_FS_XATTR
+		case RAWNODE_CLASS_XATTR_DATUM:
+			jffs2_release_xattr_datum(c, (struct jffs2_xattr_datum *)ic);
+			break;
+		case RAWNODE_CLASS_XATTR_REF:
+			jffs2_release_xattr_ref(c, (struct jffs2_xattr_ref *)ic);
+			break;
+#endif
+		default:
+			if (ic->nodes == (void *)ic && ic->nlink == 0)
+				jffs2_del_ino_cache(c, ic);
+	}
 }
 
 void jffs2_free_jeb_node_refs(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb)
diff --git a/fs/jffs2/gc.c b/fs/jffs2/gc.c
index 477c526d638b..f59b147661c9 100644
--- a/fs/jffs2/gc.c
+++ b/fs/jffs2/gc.c
@@ -275,13 +275,12 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c)
 	 * We can decide whether this node is inode or xattr by ic->class.     */
 	if (ic->class == RAWNODE_CLASS_XATTR_DATUM
 	    || ic->class == RAWNODE_CLASS_XATTR_REF) {
-		BUG_ON(raw->next_in_ino != (void *)ic);
 		spin_unlock(&c->erase_completion_lock);
 
 		if (ic->class == RAWNODE_CLASS_XATTR_DATUM) {
-			ret = jffs2_garbage_collect_xattr_datum(c, (struct jffs2_xattr_datum *)ic);
+			ret = jffs2_garbage_collect_xattr_datum(c, (struct jffs2_xattr_datum *)ic, raw);
 		} else {
-			ret = jffs2_garbage_collect_xattr_ref(c, (struct jffs2_xattr_ref *)ic);
+			ret = jffs2_garbage_collect_xattr_ref(c, (struct jffs2_xattr_ref *)ic, raw);
 		}
 		goto release_sem;
 	}
diff --git a/fs/jffs2/jffs2_fs_sb.h b/fs/jffs2/jffs2_fs_sb.h
index 935fec1b1201..b98594992eed 100644
--- a/fs/jffs2/jffs2_fs_sb.h
+++ b/fs/jffs2/jffs2_fs_sb.h
@@ -119,8 +119,11 @@ struct jffs2_sb_info {
 #ifdef CONFIG_JFFS2_FS_XATTR
 #define XATTRINDEX_HASHSIZE	(57)
 	uint32_t highest_xid;
+	uint32_t highest_xseqno;
 	struct list_head xattrindex[XATTRINDEX_HASHSIZE];
 	struct list_head xattr_unchecked;
+	struct list_head xattr_dead_list;
+	struct jffs2_xattr_ref *xref_dead_list;
 	struct jffs2_xattr_ref *xref_temp;
 	struct rw_semaphore xattr_sem;
 	uint32_t xdatum_mem_usage;
diff --git a/fs/jffs2/malloc.c b/fs/jffs2/malloc.c
index 4889d0700c0e..8310c95478e9 100644
--- a/fs/jffs2/malloc.c
+++ b/fs/jffs2/malloc.c
@@ -291,6 +291,7 @@ struct jffs2_xattr_datum *jffs2_alloc_xattr_datum(void)
 
 	memset(xd, 0, sizeof(struct jffs2_xattr_datum));
 	xd->class = RAWNODE_CLASS_XATTR_DATUM;
+	xd->node = (void *)xd;
 	INIT_LIST_HEAD(&xd->xindex);
 	return xd;
 }
@@ -309,6 +310,7 @@ struct jffs2_xattr_ref *jffs2_alloc_xattr_ref(void)
 
 	memset(ref, 0, sizeof(struct jffs2_xattr_ref));
 	ref->class = RAWNODE_CLASS_XATTR_REF;
+	ref->node = (void *)ref;
 	return ref;
 }
 
diff --git a/fs/jffs2/nodemgmt.c b/fs/jffs2/nodemgmt.c
index 8bedfd2ff689..01594a2256eb 100644
--- a/fs/jffs2/nodemgmt.c
+++ b/fs/jffs2/nodemgmt.c
@@ -684,19 +684,26 @@ void jffs2_mark_node_obsolete(struct jffs2_sb_info *c, struct jffs2_raw_node_ref
 		spin_lock(&c->erase_completion_lock);
 
 		ic = jffs2_raw_ref_to_ic(ref);
-		/* It seems we should never call jffs2_mark_node_obsolete() for
-		   XATTR nodes.... yet. Make sure we notice if/when we change
-		   that :) */
-		BUG_ON(ic->class != RAWNODE_CLASS_INODE_CACHE);
 		for (p = &ic->nodes; (*p) != ref; p = &((*p)->next_in_ino))
 			;
 
 		*p = ref->next_in_ino;
 		ref->next_in_ino = NULL;
 
-		if (ic->nodes == (void *)ic && ic->nlink == 0)
-			jffs2_del_ino_cache(c, ic);
-
+		switch (ic->class) {
+#ifdef CONFIG_JFFS2_FS_XATTR
+			case RAWNODE_CLASS_XATTR_DATUM:
+				jffs2_release_xattr_datum(c, (struct jffs2_xattr_datum *)ic);
+				break;
+			case RAWNODE_CLASS_XATTR_REF:
+				jffs2_release_xattr_ref(c, (struct jffs2_xattr_ref *)ic);
+				break;
+#endif
+			default:
+				if (ic->nodes == (void *)ic && ic->nlink == 0)
+					jffs2_del_ino_cache(c, ic);
+				break;
+		}
 		spin_unlock(&c->erase_completion_lock);
 	}
 
diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c
index 61618080b86f..79638f56c5ea 100644
--- a/fs/jffs2/scan.c
+++ b/fs/jffs2/scan.c
@@ -317,20 +317,25 @@ static int jffs2_scan_xattr_node(struct jffs2_sb_info *c, struct jffs2_erasebloc
 				 struct jffs2_summary *s)
 {
 	struct jffs2_xattr_datum *xd;
-	uint32_t totlen, crc;
+	uint32_t xid, version, totlen, crc;
 	int err;
 
 	crc = crc32(0, rx, sizeof(struct jffs2_raw_xattr) - 4);
 	if (crc != je32_to_cpu(rx->node_crc)) {
-		if (je32_to_cpu(rx->node_crc) != 0xffffffff)
-			JFFS2_WARNING("node CRC failed at %#08x, read=%#08x, calc=%#08x\n",
-				      ofs, je32_to_cpu(rx->node_crc), crc);
+		JFFS2_WARNING("node CRC failed at %#08x, read=%#08x, calc=%#08x\n",
+			      ofs, je32_to_cpu(rx->node_crc), crc);
 		if ((err = jffs2_scan_dirty_space(c, jeb, je32_to_cpu(rx->totlen))))
 			return err;
 		return 0;
 	}
 
-	totlen = PAD(sizeof(*rx) + rx->name_len + 1 + je16_to_cpu(rx->value_len));
+	xid = je32_to_cpu(rx->xid);
+	version = je32_to_cpu(rx->version);
+
+	totlen = sizeof(struct jffs2_raw_xattr);
+	if (version != XDATUM_DELETE_MARKER)
+		totlen += rx->name_len + 1 + je16_to_cpu(rx->value_len);
+	totlen = PAD(totlen);
 	if (totlen != je32_to_cpu(rx->totlen)) {
 		JFFS2_WARNING("node length mismatch at %#08x, read=%u, calc=%u\n",
 			      ofs, je32_to_cpu(rx->totlen), totlen);
@@ -339,22 +344,24 @@ static int jffs2_scan_xattr_node(struct jffs2_sb_info *c, struct jffs2_erasebloc
 		return 0;
 	}
 
-	xd = jffs2_setup_xattr_datum(c, je32_to_cpu(rx->xid), je32_to_cpu(rx->version));
-	if (IS_ERR(xd)) {
-		if (PTR_ERR(xd) == -EEXIST) {
-			if ((err = jffs2_scan_dirty_space(c, jeb, PAD(je32_to_cpu(rx->totlen)))))
-				return err;
-			return 0;
-		}
+	xd = jffs2_setup_xattr_datum(c, xid, version);
+	if (IS_ERR(xd))
 		return PTR_ERR(xd);
-	}
-	xd->xprefix = rx->xprefix;
-	xd->name_len = rx->name_len;
-	xd->value_len = je16_to_cpu(rx->value_len);
-	xd->data_crc = je32_to_cpu(rx->data_crc);
 
-	xd->node = jffs2_link_node_ref(c, jeb, ofs | REF_PRISTINE, totlen, NULL);
-	/* FIXME */ xd->node->next_in_ino = (void *)xd;
+	if (xd->version > version) {
+		struct jffs2_raw_node_ref *raw
+			= jffs2_link_node_ref(c, jeb, ofs | REF_PRISTINE, totlen, NULL);
+		raw->next_in_ino = xd->node->next_in_ino;
+		xd->node->next_in_ino = raw;
+	} else {
+		xd->version = version;
+		xd->xprefix = rx->xprefix;
+		xd->name_len = rx->name_len;
+		xd->value_len = je16_to_cpu(rx->value_len);
+		xd->data_crc = je32_to_cpu(rx->data_crc);
+
+		jffs2_link_node_ref(c, jeb, ofs | REF_PRISTINE, totlen, (void *)xd);
+	}
 
 	if (jffs2_sum_active())
 		jffs2_sum_add_xattr_mem(s, rx, ofs - jeb->offset);
@@ -373,9 +380,8 @@ static int jffs2_scan_xref_node(struct jffs2_sb_info *c, struct jffs2_eraseblock
 
 	crc = crc32(0, rr, sizeof(*rr) - 4);
 	if (crc != je32_to_cpu(rr->node_crc)) {
-		if (je32_to_cpu(rr->node_crc) != 0xffffffff)
-			JFFS2_WARNING("node CRC failed at %#08x, read=%#08x, calc=%#08x\n",
-				      ofs, je32_to_cpu(rr->node_crc), crc);
+		JFFS2_WARNING("node CRC failed at %#08x, read=%#08x, calc=%#08x\n",
+			      ofs, je32_to_cpu(rr->node_crc), crc);
 		if ((err = jffs2_scan_dirty_space(c, jeb, PAD(je32_to_cpu(rr->totlen)))))
 			return err;
 		return 0;
@@ -395,6 +401,7 @@ static int jffs2_scan_xref_node(struct jffs2_sb_info *c, struct jffs2_eraseblock
 		return -ENOMEM;
 
 	/* BEFORE jffs2_build_xattr_subsystem() called, 
+	 * and AFTER xattr_ref is marked as a dead xref,
 	 * ref->xid is used to store 32bit xid, xd is not used
 	 * ref->ino is used to store 32bit inode-number, ic is not used
 	 * Thoes variables are declared as union, thus using those
@@ -404,11 +411,13 @@ static int jffs2_scan_xref_node(struct jffs2_sb_info *c, struct jffs2_eraseblock
 	 */
 	ref->ino = je32_to_cpu(rr->ino);
 	ref->xid = je32_to_cpu(rr->xid);
+	ref->xseqno = je32_to_cpu(rr->xseqno);
+	if (ref->xseqno > c->highest_xseqno)
+		c->highest_xseqno = (ref->xseqno & ~XREF_DELETE_MARKER);
 	ref->next = c->xref_temp;
 	c->xref_temp = ref;
 
-	ref->node = jffs2_link_node_ref(c, jeb, ofs | REF_PRISTINE, PAD(je32_to_cpu(rr->totlen)), NULL);
-	/* FIXME */ ref->node->next_in_ino = (void *)ref;
+	jffs2_link_node_ref(c, jeb, ofs | REF_PRISTINE, PAD(je32_to_cpu(rr->totlen)), (void *)ref);
 
 	if (jffs2_sum_active())
 		jffs2_sum_add_xref_mem(s, rr, ofs - jeb->offset);
diff --git a/fs/jffs2/summary.c b/fs/jffs2/summary.c
index 0b02fc79e4d1..c430f1d217e2 100644
--- a/fs/jffs2/summary.c
+++ b/fs/jffs2/summary.c
@@ -310,8 +310,6 @@ int jffs2_sum_add_kvec(struct jffs2_sb_info *c, const struct kvec *invecs,
 #ifdef CONFIG_JFFS2_FS_XATTR
 		case JFFS2_NODETYPE_XATTR: {
 			struct jffs2_sum_xattr_mem *temp;
-			if (je32_to_cpu(node->x.version) == 0xffffffff)
-				return 0;
 			temp = kmalloc(sizeof(struct jffs2_sum_xattr_mem), GFP_KERNEL);
 			if (!temp)
 				goto no_mem;
@@ -327,10 +325,6 @@ int jffs2_sum_add_kvec(struct jffs2_sb_info *c, const struct kvec *invecs,
 		}
 		case JFFS2_NODETYPE_XREF: {
 			struct jffs2_sum_xref_mem *temp;
-
-			if (je32_to_cpu(node->r.ino) == 0xffffffff
-			    && je32_to_cpu(node->r.xid) == 0xffffffff)
-				return 0;
 			temp = kmalloc(sizeof(struct jffs2_sum_xref_mem), GFP_KERNEL);
 			if (!temp)
 				goto no_mem;
@@ -483,22 +477,20 @@ static int jffs2_sum_process_sum_data(struct jffs2_sb_info *c, struct jffs2_eras
 
 				xd = jffs2_setup_xattr_datum(c, je32_to_cpu(spx->xid),
 								je32_to_cpu(spx->version));
-				if (IS_ERR(xd)) {
-					if (PTR_ERR(xd) == -EEXIST) {
-						/* a newer version of xd exists */
-						if ((err = jffs2_scan_dirty_space(c, jeb, je32_to_cpu(spx->totlen))))
-							return err;
-						sp += JFFS2_SUMMARY_XATTR_SIZE;
-						break;
-					}
-					JFFS2_NOTICE("allocation of xattr_datum failed\n");
+				if (IS_ERR(xd))
 					return PTR_ERR(xd);
+				if (xd->version > je32_to_cpu(spx->version)) {
+					/* node is not the newest one */
+					struct jffs2_raw_node_ref *raw
+						= sum_link_node_ref(c, jeb, je32_to_cpu(spx->offset) | REF_UNCHECKED,
+								    PAD(je32_to_cpu(spx->totlen)), NULL);
+					raw->next_in_ino = xd->node->next_in_ino;
+					xd->node->next_in_ino = raw;
+				} else {
+					xd->version = je32_to_cpu(spx->version);
+					sum_link_node_ref(c, jeb, je32_to_cpu(spx->offset) | REF_UNCHECKED,
+							  PAD(je32_to_cpu(spx->totlen)), (void *)xd);
 				}
-
-				xd->node = sum_link_node_ref(c, jeb, je32_to_cpu(spx->offset) | REF_UNCHECKED,
-							     PAD(je32_to_cpu(spx->totlen)), NULL);
-				/* FIXME */ xd->node->next_in_ino = (void *)xd;
-
 				*pseudo_random += je32_to_cpu(spx->xid);
 				sp += JFFS2_SUMMARY_XATTR_SIZE;
 
@@ -519,14 +511,11 @@ static int jffs2_sum_process_sum_data(struct jffs2_sb_info *c, struct jffs2_eras
 					JFFS2_NOTICE("allocation of xattr_datum failed\n");
 					return -ENOMEM;
 				}
-				ref->ino = 0xfffffffe;
-				ref->xid = 0xfffffffd;
 				ref->next = c->xref_temp;
 				c->xref_temp = ref;
 
-				ref->node = sum_link_node_ref(c, jeb, je32_to_cpu(spr->offset) | REF_UNCHECKED,
-							      PAD(sizeof(struct jffs2_raw_xref)), NULL);
-				/* FIXME */ ref->node->next_in_ino = (void *)ref;
+				sum_link_node_ref(c, jeb, je32_to_cpu(spr->offset) | REF_UNCHECKED,
+						  PAD(sizeof(struct jffs2_raw_xref)), (void *)ref);
 
 				*pseudo_random += ref->node->flash_offset;
 				sp += JFFS2_SUMMARY_XREF_SIZE;
diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c
index 2d82e250be34..03871ab7c26d 100644
--- a/fs/jffs2/xattr.c
+++ b/fs/jffs2/xattr.c
@@ -23,18 +23,15 @@
  * xattr_datum_hashkey(xprefix, xname, xvalue, xsize)
  *   is used to calcurate xdatum hashkey. The reminder of hashkey into XATTRINDEX_HASHSIZE is
  *   the index of the xattr name/value pair cache (c->xattrindex).
+ * is_xattr_datum_unchecked(c, xd)
+ *   returns 1, if xdatum contains any unchecked raw nodes. if all raw nodes are not
+ *   unchecked, it returns 0.
  * unload_xattr_datum(c, xd)
  *   is used to release xattr name/value pair and detach from c->xattrindex.
  * reclaim_xattr_datum(c)
  *   is used to reclaim xattr name/value pairs on the xattr name/value pair cache when
  *   memory usage by cache is over c->xdatum_mem_threshold. Currentry, this threshold 
  *   is hard coded as 32KiB.
- * delete_xattr_datum_node(c, xd)
- *   is used to delete a jffs2 node is dominated by xdatum. When EBS(Erase Block Summary) is
- *   enabled, it overwrites the obsolete node by myself.
- * delete_xattr_datum(c, xd)
- *   is used to delete jffs2_xattr_datum object. It must be called with 0-value of reference
- *   counter. (It means how many jffs2_xattr_ref object refers this xdatum.)
  * do_verify_xattr_datum(c, xd)
  *   is used to load the xdatum informations without name/value pair from the medium.
  *   It's necessary once, because those informations are not collected during mounting
@@ -53,8 +50,13 @@
  *   is used to write xdatum to medium. xd->version will be incremented.
  * create_xattr_datum(c, xprefix, xname, xvalue, xsize)
  *   is used to create new xdatum and write to medium.
+ * delete_xattr_datum_delay(c, xd)
+ *   is used to delete a xdatum without 'delete marker'. It has a possibility to detect
+ *   orphan xdatum on next mounting.
+ * delete_xattr_datum(c, xd)
+ *   is used to delete a xdatum with 'delete marker'. Calling jffs2_reserve_space() is
+ *   necessary before this function.
  * -------------------------------------------------- */
-
 static uint32_t xattr_datum_hashkey(int xprefix, const char *xname, const char *xvalue, int xsize)
 {
 	int name_len = strlen(xname);
@@ -62,6 +64,22 @@ static uint32_t xattr_datum_hashkey(int xprefix, const char *xname, const char *
 	return crc32(xprefix, xname, name_len) ^ crc32(xprefix, xvalue, xsize);
 }
 
+static int is_xattr_datum_unchecked(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd)
+{
+	struct jffs2_raw_node_ref *raw;
+	int rc = 0;
+
+	spin_lock(&c->erase_completion_lock);
+	for (raw=xd->node; raw != (void *)xd; raw=raw->next_in_ino) {
+		if (ref_flags(raw) == REF_UNCHECKED) {
+			rc = 1;
+			break;
+		}
+	}
+	spin_unlock(&c->erase_completion_lock);
+	return rc;
+}
+
 static void unload_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd)
 {
 	/* must be called under down_write(xattr_sem) */
@@ -107,80 +125,39 @@ static void reclaim_xattr_datum(struct jffs2_sb_info *c)
 		     before, c->xdatum_mem_usage, before - c->xdatum_mem_usage);
 }
 
-static void delete_xattr_datum_node(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd)
-{
-	/* must be called under down_write(xattr_sem) */
-	struct jffs2_raw_xattr rx;
-	size_t length;
-	int rc;
-
-	if (!xd->node) {
-		JFFS2_WARNING("xdatum (xid=%u) is removed twice.\n", xd->xid);
-		return;
-	}
-	if (jffs2_sum_active()) {
-		memset(&rx, 0xff, sizeof(struct jffs2_raw_xattr));
-		rc = jffs2_flash_read(c, ref_offset(xd->node),
-				      sizeof(struct jffs2_unknown_node),
-				      &length, (char *)&rx);
-		if (rc || length != sizeof(struct jffs2_unknown_node)) {
-			JFFS2_ERROR("jffs2_flash_read()=%d, req=%zu, read=%zu at %#08x\n",
-				    rc, sizeof(struct jffs2_unknown_node),
-				    length, ref_offset(xd->node));
-		}
-		rc = jffs2_flash_write(c, ref_offset(xd->node), sizeof(rx),
-				       &length, (char *)&rx);
-		if (rc || length != sizeof(struct jffs2_raw_xattr)) {
-			JFFS2_ERROR("jffs2_flash_write()=%d, req=%zu, wrote=%zu ar %#08x\n",
-				    rc, sizeof(rx), length, ref_offset(xd->node));
-		}
-	}
-	spin_lock(&c->erase_completion_lock);
-	xd->node->next_in_ino = NULL;
-	spin_unlock(&c->erase_completion_lock);
-	jffs2_mark_node_obsolete(c, xd->node);
-	xd->node = NULL;
-}
-
-static void delete_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd)
-{
-	/* must be called under down_write(xattr_sem) */
-	BUG_ON(xd->refcnt);
-
-	unload_xattr_datum(c, xd);
-	if (xd->node) {
-		delete_xattr_datum_node(c, xd);
-		xd->node = NULL;
-	}
-	jffs2_free_xattr_datum(xd);
-}
-
 static int do_verify_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd)
 {
 	/* must be called under down_write(xattr_sem) */
 	struct jffs2_eraseblock *jeb;
+	struct jffs2_raw_node_ref *raw;
 	struct jffs2_raw_xattr rx;
 	size_t readlen;
-	uint32_t crc, totlen;
+	uint32_t crc, offset, totlen;
 	int rc;
 
-	BUG_ON(!xd->node);
-	BUG_ON(ref_flags(xd->node) != REF_UNCHECKED);
+	spin_lock(&c->erase_completion_lock);
+	offset = ref_offset(xd->node);
+	if (ref_flags(xd->node) == REF_PRISTINE)
+		goto complete;
+	spin_unlock(&c->erase_completion_lock);
 
-	rc = jffs2_flash_read(c, ref_offset(xd->node), sizeof(rx), &readlen, (char *)&rx);
+	rc = jffs2_flash_read(c, offset, sizeof(rx), &readlen, (char *)&rx);
 	if (rc || readlen != sizeof(rx)) {
 		JFFS2_WARNING("jffs2_flash_read()=%d, req=%zu, read=%zu at %#08x\n",
-			      rc, sizeof(rx), readlen, ref_offset(xd->node));
+			      rc, sizeof(rx), readlen, offset);
 		return rc ? rc : -EIO;
 	}
 	crc = crc32(0, &rx, sizeof(rx) - 4);
 	if (crc != je32_to_cpu(rx.node_crc)) {
-		if (je32_to_cpu(rx.node_crc) != 0xffffffff)
-			JFFS2_ERROR("node CRC failed at %#08x, read=%#08x, calc=%#08x\n",
-				    ref_offset(xd->node), je32_to_cpu(rx.hdr_crc), crc);
+		JFFS2_ERROR("node CRC failed at %#08x, read=%#08x, calc=%#08x\n",
+			    offset, je32_to_cpu(rx.hdr_crc), crc);
+		xd->flags |= JFFS2_XFLAGS_INVALID;
 		return EIO;
 	}
-	totlen = PAD(sizeof(rx) + rx.name_len + 1 + je16_to_cpu(rx.value_len));
+	totlen = sizeof(rx);
+	if (xd->version != XDATUM_DELETE_MARKER)
+		totlen += rx.name_len + 1 + je16_to_cpu(rx.value_len);
+	totlen = PAD(totlen);
 	if (je16_to_cpu(rx.magic) != JFFS2_MAGIC_BITMASK
 	    || je16_to_cpu(rx.nodetype) != JFFS2_NODETYPE_XATTR
 	    || je32_to_cpu(rx.totlen) != totlen
@@ -188,11 +165,12 @@ static int do_verify_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_dat
 	    || je32_to_cpu(rx.version) != xd->version) {
 		JFFS2_ERROR("inconsistent xdatum at %#08x, magic=%#04x/%#04x, "
 			    "nodetype=%#04x/%#04x, totlen=%u/%u, xid=%u/%u, version=%u/%u\n",
-			    ref_offset(xd->node), je16_to_cpu(rx.magic), JFFS2_MAGIC_BITMASK,
+			    offset, je16_to_cpu(rx.magic), JFFS2_MAGIC_BITMASK,
 			    je16_to_cpu(rx.nodetype), JFFS2_NODETYPE_XATTR,
 			    je32_to_cpu(rx.totlen), totlen,
 			    je32_to_cpu(rx.xid), xd->xid,
 			    je32_to_cpu(rx.version), xd->version);
+		xd->flags |= JFFS2_XFLAGS_INVALID;
 		return EIO;
 	}
 	xd->xprefix = rx.xprefix;
@@ -200,14 +178,17 @@ static int do_verify_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_dat
 	xd->value_len = je16_to_cpu(rx.value_len);
 	xd->data_crc = je32_to_cpu(rx.data_crc);
 
-	/* This JFFS2_NODETYPE_XATTR node is checked */
-	jeb = &c->blocks[ref_offset(xd->node) / c->sector_size];
-	totlen = PAD(je32_to_cpu(rx.totlen));
-
 	spin_lock(&c->erase_completion_lock);
-	c->unchecked_size -= totlen; c->used_size += totlen;
-	jeb->unchecked_size -= totlen; jeb->used_size += totlen;
-	xd->node->flash_offset = ref_offset(xd->node) | REF_PRISTINE;
+ complete:
+	for (raw=xd->node; raw != (void *)xd; raw=raw->next_in_ino) {
+		jeb = &c->blocks[ref_offset(raw) / c->sector_size];
+		totlen = PAD(ref_totlen(c, jeb, raw));
+		if (ref_flags(raw) == REF_UNCHECKED) {
+			c->unchecked_size -= totlen; c->used_size += totlen;
+			jeb->unchecked_size -= totlen; jeb->used_size += totlen;
+		}
+		raw->flash_offset = ref_offset(raw) | ((xd->node==raw) ? REF_PRISTINE : REF_NORMAL);
+	}
 	spin_unlock(&c->erase_completion_lock);
 
 	/* unchecked xdatum is chained with c->xattr_unchecked */
@@ -227,7 +208,6 @@ static int do_load_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum
 	uint32_t crc, length;
 	int i, ret, retry = 0;
 
-	BUG_ON(!xd->node);
 	BUG_ON(ref_flags(xd->node) != REF_PRISTINE);
 	BUG_ON(!list_empty(&xd->xindex));
  retry:
@@ -253,6 +233,7 @@ static int do_load_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum
 			      " at %#08x, read: 0x%08x calculated: 0x%08x\n",
 			      ref_offset(xd->node), xd->data_crc, crc);
 		kfree(data);
+		xd->flags |= JFFS2_XFLAGS_INVALID;
 		return EIO;
 	}
 
@@ -286,16 +267,13 @@ static int load_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *x
 	 * rc > 0 : Unrecoverable error, this node should be deleted.
 	 */
 	int rc = 0;
-	BUG_ON(xd->xname);
-	if (!xd->node)
+
+	if (xd->xname)
+		return 0;
+	if (xd->flags & JFFS2_XFLAGS_INVALID)
 		return EIO;
-	if (unlikely(ref_flags(xd->node) != REF_PRISTINE)) {
+	if (unlikely(is_xattr_datum_unchecked(c, xd)))
 		rc = do_verify_xattr_datum(c, xd);
-		if (rc > 0) {
-			list_del_init(&xd->xindex);
-			delete_xattr_datum_node(c, xd);
-		}
-	}
 	if (!rc)
 		rc = do_load_xattr_datum(c, xd);
 	return rc;
@@ -304,36 +282,43 @@ static int load_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *x
 static int save_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd)
 {
 	/* must be called under down_write(xattr_sem) */
-	struct jffs2_raw_node_ref *raw;
 	struct jffs2_raw_xattr rx;
 	struct kvec vecs[2];
 	size_t length;
-	int rc, totlen;
+	int rc, totlen, nvecs = 1;
 	uint32_t phys_ofs = write_ofs(c);
 
-	BUG_ON(!xd->xname);
+	BUG_ON(is_xattr_datum_dead(xd) || (xd->flags & JFFS2_XFLAGS_INVALID)
+	       ? !!xd->xname : !xd->xname);
 
 	vecs[0].iov_base = &rx;
-	vecs[0].iov_len = PAD(sizeof(rx));
-	vecs[1].iov_base = xd->xname;
-	vecs[1].iov_len = xd->name_len + 1 + xd->value_len;
-	totlen = vecs[0].iov_len + vecs[1].iov_len;
-
+	vecs[0].iov_len = totlen = sizeof(rx);
+	if (!is_xattr_datum_dead(xd) && !(xd->flags & JFFS2_XFLAGS_INVALID)) {
+		nvecs++;
+		vecs[1].iov_base = xd->xname;
+		vecs[1].iov_len = xd->name_len + 1 + xd->value_len;
+		totlen += vecs[1].iov_len;
+	}
 	/* Setup raw-xattr */
+	memset(&rx, 0, sizeof(rx));
 	rx.magic = cpu_to_je16(JFFS2_MAGIC_BITMASK);
 	rx.nodetype = cpu_to_je16(JFFS2_NODETYPE_XATTR);
 	rx.totlen = cpu_to_je32(PAD(totlen));
 	rx.hdr_crc = cpu_to_je32(crc32(0, &rx, sizeof(struct jffs2_unknown_node) - 4));
 
 	rx.xid = cpu_to_je32(xd->xid);
-	rx.version = cpu_to_je32(++xd->version);
-	rx.xprefix = xd->xprefix;
-	rx.name_len = xd->name_len;
-	rx.value_len = cpu_to_je16(xd->value_len);
-	rx.data_crc = cpu_to_je32(crc32(0, vecs[1].iov_base, vecs[1].iov_len));
+	if (!is_xattr_datum_dead(xd) && !(xd->flags & JFFS2_XFLAGS_INVALID)) {
+		rx.version = cpu_to_je32(++xd->version);
+		rx.xprefix = xd->xprefix;
+		rx.name_len = xd->name_len;
+		rx.value_len = cpu_to_je16(xd->value_len);
+		rx.data_crc = cpu_to_je32(crc32(0, vecs[1].iov_base, vecs[1].iov_len));
+	} else {
+		rx.version = cpu_to_je32(XDATUM_DELETE_MARKER);
+	}
 	rx.node_crc = cpu_to_je32(crc32(0, &rx, sizeof(struct jffs2_raw_xattr) - 4));
 
-	rc = jffs2_flash_writev(c, vecs, 2, phys_ofs, &length, 0);
+	rc = jffs2_flash_writev(c, vecs, nvecs, phys_ofs, &length, 0);
 	if (rc || totlen != length) {
 		JFFS2_WARNING("jffs2_flash_writev()=%d, req=%u, wrote=%zu, at %#08x\n",
 			      rc, totlen, length, phys_ofs);
@@ -343,14 +328,8 @@ static int save_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *x
 
 		return rc;
 	}
-
 	/* success */
-	raw = jffs2_add_physical_node_ref(c, phys_ofs | REF_PRISTINE, PAD(totlen), NULL);
-	/* FIXME */ raw->next_in_ino = (void *)xd;
-
-	if (xd->node)
-		delete_xattr_datum_node(c, xd);
-	xd->node = raw;
+	jffs2_add_physical_node_ref(c, phys_ofs | REF_PRISTINE, PAD(totlen), (void *)xd);
 
 	dbg_xattr("success on saving xdatum (xid=%u, version=%u, xprefix=%u, xname='%s')\n",
 		  xd->xid, xd->version, xd->xprefix, xd->xname);
@@ -426,6 +405,39 @@ static struct jffs2_xattr_datum *create_xattr_datum(struct jffs2_sb_info *c,
 	return xd;
 }
 
+static void delete_xattr_datum_delay(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd)
+{
+	/* must be called under down_write(xattr_sem) */
+	BUG_ON(xd->refcnt);
+
+	unload_xattr_datum(c, xd);
+	set_xattr_datum_dead(xd);
+	spin_lock(&c->erase_completion_lock);
+	list_add(&xd->xindex, &c->xattr_dead_list);
+	spin_unlock(&c->erase_completion_lock);
+	JFFS2_NOTICE("xdatum(xid=%u) was removed without delete marker. "
+		     "An orphan xdatum may be detected on next mounting.\n", xd->xid);
+}
+
+static void delete_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd)
+{
+	/* must be called under jffs2_reserve_space() and down_write(xattr_sem) */
+	int rc;
+	BUG_ON(xd->refcnt);
+
+	unload_xattr_datum(c, xd);
+	set_xattr_datum_dead(xd);	
+	rc = save_xattr_datum(c, xd);
+	if (rc) {
+		JFFS2_NOTICE("xdatum(xid=%u) was removed without delete marker. "
+			     "An orphan xdatum may be detected on next mounting.\n",
+			     xd->xid);
+	}
+	spin_lock(&c->erase_completion_lock);
+	list_add(&xd->xindex, &c->xattr_dead_list);
+	spin_unlock(&c->erase_completion_lock);
+}
+
 /* -------- xref related functions ------------------
  * verify_xattr_ref(c, ref)
  *   is used to load xref information from medium. Because summary data does not
@@ -450,25 +462,29 @@ static struct jffs2_xattr_datum *create_xattr_datum(struct jffs2_sb_info *c,
 static int verify_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref)
 {
 	struct jffs2_eraseblock *jeb;
+	struct jffs2_raw_node_ref *raw;
 	struct jffs2_raw_xref rr;
 	size_t readlen;
-	uint32_t crc, totlen;
+	uint32_t crc, offset, totlen;
 	int rc;
 
-	BUG_ON(ref_flags(ref->node) != REF_UNCHECKED);
+	spin_lock(&c->erase_completion_lock);
+	if (ref_flags(ref->node) != REF_UNCHECKED)
+		goto complete;
+	offset = ref_offset(ref->node);
+	spin_unlock(&c->erase_completion_lock);
 
-	rc = jffs2_flash_read(c, ref_offset(ref->node), sizeof(rr), &readlen, (char *)&rr);
+	rc = jffs2_flash_read(c, offset, sizeof(rr), &readlen, (char *)&rr);
 	if (rc || sizeof(rr) != readlen) {
 		JFFS2_WARNING("jffs2_flash_read()=%d, req=%zu, read=%zu, at %#08x\n",
-			      rc, sizeof(rr), readlen, ref_offset(ref->node));
+			      rc, sizeof(rr), readlen, offset);
 		return rc ? rc : -EIO;
 	}
 	/* obsolete node */
 	crc = crc32(0, &rr, sizeof(rr) - 4);
 	if (crc != je32_to_cpu(rr.node_crc)) {
-		if (je32_to_cpu(rr.node_crc) != 0xffffffff)
-			JFFS2_ERROR("node CRC failed at %#08x, read=%#08x, calc=%#08x\n",
-				    ref_offset(ref->node), je32_to_cpu(rr.node_crc), crc);
+		JFFS2_ERROR("node CRC failed at %#08x, read=%#08x, calc=%#08x\n",
+			    offset, je32_to_cpu(rr.node_crc), crc);
 		return EIO;
 	}
 	if (je16_to_cpu(rr.magic) != JFFS2_MAGIC_BITMASK
@@ -476,22 +492,28 @@ static int verify_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref
 	    || je32_to_cpu(rr.totlen) != PAD(sizeof(rr))) {
 		JFFS2_ERROR("inconsistent xref at %#08x, magic=%#04x/%#04x, "
 			    "nodetype=%#04x/%#04x, totlen=%u/%zu\n",
-			    ref_offset(ref->node), je16_to_cpu(rr.magic), JFFS2_MAGIC_BITMASK,
+			    offset, je16_to_cpu(rr.magic), JFFS2_MAGIC_BITMASK,
 			    je16_to_cpu(rr.nodetype), JFFS2_NODETYPE_XREF,
 			    je32_to_cpu(rr.totlen), PAD(sizeof(rr)));
 		return EIO;
 	}
 	ref->ino = je32_to_cpu(rr.ino);
 	ref->xid = je32_to_cpu(rr.xid);
-
-	/* fixup superblock/eraseblock info */
-	jeb = &c->blocks[ref_offset(ref->node) / c->sector_size];
-	totlen = PAD(sizeof(rr));
+	ref->xseqno = je32_to_cpu(rr.xseqno);
+	if (ref->xseqno > c->highest_xseqno)
+		c->highest_xseqno = (ref->xseqno & ~XREF_DELETE_MARKER);
 
 	spin_lock(&c->erase_completion_lock);
-	c->unchecked_size -= totlen; c->used_size += totlen;
-	jeb->unchecked_size -= totlen; jeb->used_size += totlen;
-	ref->node->flash_offset = ref_offset(ref->node) | REF_PRISTINE;
+ complete:
+	for (raw=ref->node; raw != (void *)ref; raw=raw->next_in_ino) {
+		jeb = &c->blocks[ref_offset(raw) / c->sector_size];
+		totlen = PAD(ref_totlen(c, jeb, raw));
+		if (ref_flags(raw) == REF_UNCHECKED) {
+			c->unchecked_size -= totlen; c->used_size += totlen;
+			jeb->unchecked_size -= totlen; jeb->used_size += totlen;
+		}
+		raw->flash_offset = ref_offset(raw) | ((ref->node==raw) ? REF_PRISTINE : REF_NORMAL);
+	}
 	spin_unlock(&c->erase_completion_lock);
 
 	dbg_xattr("success on verifying xref (ino=%u, xid=%u) at %#08x\n",
@@ -499,58 +521,12 @@ static int verify_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref
 	return 0;
 }
 
-static void delete_xattr_ref_node(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref)
-{
-	struct jffs2_raw_xref rr;
-	size_t length;
-	int rc;
-
-	if (jffs2_sum_active()) {
-		memset(&rr, 0xff, sizeof(rr));
-		rc = jffs2_flash_read(c, ref_offset(ref->node),
-				      sizeof(struct jffs2_unknown_node),
-				      &length, (char *)&rr);
-		if (rc || length != sizeof(struct jffs2_unknown_node)) {
-			JFFS2_ERROR("jffs2_flash_read()=%d, req=%zu, read=%zu at %#08x\n",
-				    rc, sizeof(struct jffs2_unknown_node),
-				    length, ref_offset(ref->node));
-		}
-		rc = jffs2_flash_write(c, ref_offset(ref->node), sizeof(rr),
-				       &length, (char *)&rr);
-		if (rc || length != sizeof(struct jffs2_raw_xref)) {
-			JFFS2_ERROR("jffs2_flash_write()=%d, req=%zu, wrote=%zu at %#08x\n",
-				    rc, sizeof(rr), length, ref_offset(ref->node));
-		}
-	}
-	spin_lock(&c->erase_completion_lock);
-	ref->node->next_in_ino = NULL;
-	spin_unlock(&c->erase_completion_lock);
-	jffs2_mark_node_obsolete(c, ref->node);
-	ref->node = NULL;
-}
-
-static void delete_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref)
-{
-	/* must be called under down_write(xattr_sem) */
-	struct jffs2_xattr_datum *xd;
-
-	BUG_ON(!ref->node);
-	delete_xattr_ref_node(c, ref);
-
-	xd = ref->xd;
-	xd->refcnt--;
-	if (!xd->refcnt)
-		delete_xattr_datum(c, xd);
-	jffs2_free_xattr_ref(ref);
-}
-
 static int save_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref)
 {
 	/* must be called under down_write(xattr_sem) */
-	struct jffs2_raw_node_ref *raw;
 	struct jffs2_raw_xref rr;
 	size_t length;
-	uint32_t phys_ofs = write_ofs(c);
+	uint32_t xseqno, phys_ofs = write_ofs(c);
 	int ret;
 
 	rr.magic = cpu_to_je16(JFFS2_MAGIC_BITMASK);
@@ -558,8 +534,16 @@ static int save_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref)
 	rr.totlen = cpu_to_je32(PAD(sizeof(rr)));
 	rr.hdr_crc = cpu_to_je32(crc32(0, &rr, sizeof(struct jffs2_unknown_node) - 4));
 
-	rr.ino = cpu_to_je32(ref->ic->ino);
-	rr.xid = cpu_to_je32(ref->xd->xid);
+	xseqno = (c->highest_xseqno += 2);
+	if (is_xattr_ref_dead(ref)) {
+		xseqno |= XREF_DELETE_MARKER;
+		rr.ino = cpu_to_je32(ref->ino);
+		rr.xid = cpu_to_je32(ref->xid);
+	} else {
+		rr.ino = cpu_to_je32(ref->ic->ino);
+		rr.xid = cpu_to_je32(ref->xd->xid);
+	}
+	rr.xseqno = cpu_to_je32(xseqno);
 	rr.node_crc = cpu_to_je32(crc32(0, &rr, sizeof(rr) - 4));
 
 	ret = jffs2_flash_write(c, phys_ofs, sizeof(rr), &length, (char *)&rr);
@@ -572,12 +556,9 @@ static int save_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref)
 
 		return ret;
 	}
-
-	raw = jffs2_add_physical_node_ref(c, phys_ofs | REF_PRISTINE, PAD(sizeof(rr)), NULL);
-	/* FIXME */ raw->next_in_ino = (void *)ref;
-	if (ref->node)
-		delete_xattr_ref_node(c, ref);
-	ref->node = raw;
+	/* success */
+	ref->xseqno = xseqno;
+	jffs2_add_physical_node_ref(c, phys_ofs | REF_PRISTINE, PAD(sizeof(rr)), (void *)ref);
 
 	dbg_xattr("success on saving xref (ino=%u, xid=%u)\n", ref->ic->ino, ref->xd->xid);
 
@@ -610,22 +591,116 @@ static struct jffs2_xattr_ref *create_xattr_ref(struct jffs2_sb_info *c, struct
 	return ref; /* success */
 }
 
+static void delete_xattr_ref_delay(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref)
+{
+	/* must be called under down_write(xattr_sem) */
+	struct jffs2_xattr_datum *xd;
+
+	set_xattr_ref_dead(ref);
+	xd = ref->xd;
+	ref->ino = ref->ic->ino;
+	ref->xid = ref->xd->xid;
+	spin_lock(&c->erase_completion_lock);
+	ref->next = c->xref_dead_list;
+	c->xref_dead_list = ref;
+	spin_unlock(&c->erase_completion_lock);
+
+	JFFS2_NOTICE("xref(ino=%u, xid=%u) was removed without delete marker. "
+		     "An orphan xref may be detected on next mounting.\n",
+		     ref->ino, ref->xid);
+
+	if (!--xd->refcnt)
+		delete_xattr_datum_delay(c, xd);
+}
+
+static int delete_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref, int enforce)
+{
+	/* must be called under jffs2_reserve_space() and down_write(xattr_sem) */
+	struct jffs2_inode_cache *ic;
+	struct jffs2_xattr_datum *xd;
+	uint32_t length;
+	int rc;
+
+	set_xattr_ref_dead(ref);
+	ic = ref->ic;
+	xd = ref->xd;
+	ref->ino = ic->ino;
+	ref->xid = xd->xid;
+	rc = save_xattr_ref(c, ref);
+	if (rc) {
+		if (!enforce) {
+			clr_xattr_ref_dead(ref);
+			ref->ic = ic;
+			ref->xd = xd;
+			return rc;
+		}
+		JFFS2_WARNING("could not write delete marker of xref(ino=%u, xid=%u). "
+			      "An orphan xref may be detected on next mounting.\n",
+			      ref->ic->ino, ref->xd->xid);
+	}
+	spin_lock(&c->erase_completion_lock);
+	ref->next = c->xref_dead_list;
+	c->xref_dead_list = ref;
+	spin_unlock(&c->erase_completion_lock);
+
+	xd->refcnt--;
+	if (xd->refcnt)
+		return 0;
+
+	/* delete xdatum */
+	unload_xattr_datum(c, xd);
+	up_write(&c->xattr_sem);
+	jffs2_complete_reservation(c);
+
+	rc = jffs2_reserve_space(c, PAD(sizeof(struct jffs2_raw_xattr)), &length,
+				 ALLOC_DELETION, JFFS2_SUMMARY_XATTR_SIZE);
+	if (rc) {
+		down(&c->alloc_sem);
+		down_write(&c->xattr_sem);
+		delete_xattr_datum_delay(c, xd);
+	} else {
+		down_write(&c->xattr_sem);
+		delete_xattr_datum(c, xd);
+	}
+	return 0;
+}
+
 void jffs2_xattr_delete_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic)
 {
 	/* It's called from jffs2_clear_inode() on inode removing.
 	   When an inode with XATTR is removed, those XATTRs must be removed. */
-	struct jffs2_xattr_ref *ref, *_ref;
+	struct jffs2_xattr_ref *ref;
+	uint32_t length;
+	int rc, retry;
 
 	if (!ic || ic->nlink > 0)
 		return;
 
+	down_read(&c->xattr_sem);
+	if (!ic->xref) {
+		up_read(&c->xattr_sem);
+		return;
+	}
+	up_read(&c->xattr_sem);
+ retry:
+	rc = jffs2_reserve_space(c, PAD(sizeof(struct jffs2_raw_xref)), &length,
+				 ALLOC_DELETION, JFFS2_SUMMARY_XREF_SIZE);
 	down_write(&c->xattr_sem);
-	for (ref = ic->xref; ref; ref = _ref) {
-		_ref = ref->next;
-		delete_xattr_ref(c, ref);
+	if (ic->xref) {
+		ref = ic->xref;
+		ic->xref = ref->next;
+		if (rc) {
+			delete_xattr_ref_delay(c, ref);
+		} else {
+			delete_xattr_ref(c, ref, 1);
+		}
 	}
-	ic->xref = NULL;
+	retry = ic->xref ? 1 : 0;
 	up_write(&c->xattr_sem);
+	if (!rc)
+		jffs2_complete_reservation(c);
+	if (retry)
+		goto retry;
 }
 
 void jffs2_xattr_free_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic)
@@ -655,7 +730,7 @@ static int check_xattr_ref_inode(struct jffs2_sb_info *c, struct jffs2_inode_cac
 	 * duplicate name/value pairs. If duplicate name/value pair would be found,
 	 * one will be removed.
 	 */
-	struct jffs2_xattr_ref *ref, *cmp, **pref;
+	struct jffs2_xattr_ref *ref, *cmp, **pref, **pcmp;
 	int rc = 0;
 
 	if (likely(ic->flags & INO_FLAGS_XATTR_CHECKED))
@@ -668,27 +743,32 @@ static int check_xattr_ref_inode(struct jffs2_sb_info *c, struct jffs2_inode_cac
 			rc = load_xattr_datum(c, ref->xd);
 			if (unlikely(rc > 0)) {
 				*pref = ref->next;
-				delete_xattr_ref(c, ref);
+				delete_xattr_ref_delay(c, ref);
 				goto retry;
 			} else if (unlikely(rc < 0))
 				goto out;
 		}
-		for (cmp=ref->next, pref=&ref->next; cmp; pref=&cmp->next, cmp=cmp->next) {
+		for (cmp=ref->next, pcmp=&ref->next; cmp; pcmp=&cmp->next, cmp=cmp->next) {
 			if (!cmp->xd->xname) {
 				ref->xd->flags |= JFFS2_XFLAGS_BIND;
 				rc = load_xattr_datum(c, cmp->xd);
 				ref->xd->flags &= ~JFFS2_XFLAGS_BIND;
 				if (unlikely(rc > 0)) {
-					*pref = cmp->next;
-					delete_xattr_ref(c, cmp);
+					*pcmp = cmp->next;
+					delete_xattr_ref_delay(c, cmp);
 					goto retry;
 				} else if (unlikely(rc < 0))
 					goto out;
 			}
 			if (ref->xd->xprefix == cmp->xd->xprefix
 			    && !strcmp(ref->xd->xname, cmp->xd->xname)) {
-				*pref = cmp->next;
-				delete_xattr_ref(c, cmp);
+				if (ref->xseqno > cmp->xseqno) {
+					*pcmp = cmp->next;
+					delete_xattr_ref_delay(c, cmp);
+				} else {
+					*pref = ref->next;
+					delete_xattr_ref_delay(c, ref);
+				}
 				goto retry;
 			}
 		}
@@ -719,9 +799,13 @@ void jffs2_init_xattr_subsystem(struct jffs2_sb_info *c)
 	for (i=0; i < XATTRINDEX_HASHSIZE; i++)
 		INIT_LIST_HEAD(&c->xattrindex[i]);
 	INIT_LIST_HEAD(&c->xattr_unchecked);
+	INIT_LIST_HEAD(&c->xattr_dead_list);
+	c->xref_dead_list = NULL;
 	c->xref_temp = NULL;
 
 	init_rwsem(&c->xattr_sem);
+	c->highest_xid = 0;
+	c->highest_xseqno = 0;
 	c->xdatum_mem_usage = 0;
 	c->xdatum_mem_threshold = 32 * 1024;	/* Default 32KB */
 }
@@ -751,7 +835,11 @@ void jffs2_clear_xattr_subsystem(struct jffs2_sb_info *c)
 		_ref = ref->next;
 		jffs2_free_xattr_ref(ref);
 	}
-	c->xref_temp = NULL;
+
+	for (ref=c->xref_dead_list; ref; ref = _ref) {
+		_ref = ref->next;
+		jffs2_free_xattr_ref(ref);
+	}
 
 	for (i=0; i < XATTRINDEX_HASHSIZE; i++) {
 		list_for_each_entry_safe(xd, _xd, &c->xattrindex[i], xindex) {
@@ -761,64 +849,120 @@ void jffs2_clear_xattr_subsystem(struct jffs2_sb_info *c)
 			jffs2_free_xattr_datum(xd);
 		}
 	}
+
+	list_for_each_entry_safe(xd, _xd, &c->xattr_dead_list, xindex) {
+		list_del(&xd->xindex);
+		jffs2_free_xattr_datum(xd);
+	}
 }
 
+#define XREF_TMPHASH_SIZE	(128)
 void jffs2_build_xattr_subsystem(struct jffs2_sb_info *c)
 {
 	struct jffs2_xattr_ref *ref, *_ref;
+	struct jffs2_xattr_ref *xref_tmphash[XREF_TMPHASH_SIZE];
 	struct jffs2_xattr_datum *xd, *_xd;
 	struct jffs2_inode_cache *ic;
-	int i, xdatum_count =0, xdatum_unchecked_count = 0, xref_count = 0;
+	struct jffs2_raw_node_ref *raw;
+	int i, xdatum_count = 0, xdatum_unchecked_count = 0, xref_count = 0;
 
 	BUG_ON(!(c->flags & JFFS2_SB_FLAG_BUILDING));
+	/* Phase.1 : Drop dead xdatum */
+	for (i=0; i < XATTRINDEX_HASHSIZE; i++) {
+		list_for_each_entry_safe(xd, _xd, &c->xattrindex[i], xindex) {
+			BUG_ON(xd->node == (void *)xd);
+			if (is_xattr_datum_dead(xd)) {
+				list_del_init(&xd->xindex);
+				list_add(&xd->xindex, &c->xattr_unchecked);
+			}
+		}
+	}
 
-	/* Phase.1 */
+	/* Phase.2 : Merge same xref */
+	for (i=0; i < XREF_TMPHASH_SIZE; i++)
+		xref_tmphash[i] = NULL;
 	for (ref=c->xref_temp; ref; ref=_ref) {
+		struct jffs2_xattr_ref *tmp;
+
 		_ref = ref->next;
-		/* checking REF_UNCHECKED nodes */
 		if (ref_flags(ref->node) != REF_PRISTINE) {
 			if (verify_xattr_ref(c, ref)) {
-				delete_xattr_ref_node(c, ref);
+				BUG_ON(ref->node->next_in_ino != (void *)ref);
+				ref->node->next_in_ino = NULL;
+				jffs2_mark_node_obsolete(c, ref->node);
 				jffs2_free_xattr_ref(ref);
 				continue;
 			}
 		}
-		/* At this point, ref->xid and ref->ino contain XID and inode number.
-		   ref->xd and ref->ic are not valid yet. */
-		xd = jffs2_find_xattr_datum(c, ref->xid);
-		ic = jffs2_get_ino_cache(c, ref->ino);
-		if (!xd || !ic) {
-			if (ref_flags(ref->node) != REF_UNCHECKED)
-				JFFS2_WARNING("xref(ino=%u, xid=%u) is orphan. \n",
-					      ref->ino, ref->xid);
-			delete_xattr_ref_node(c, ref);
+
+		i = (ref->ino ^ ref->xid) % XREF_TMPHASH_SIZE;
+		for (tmp=xref_tmphash[i]; tmp; tmp=tmp->next) {
+			if (tmp->ino == ref->ino && tmp->xid == ref->xid)
+				break;
+		}
+		if (tmp) {
+			raw = ref->node;
+			if (ref->xseqno > tmp->xseqno) {
+				tmp->xseqno = ref->xseqno;
+				raw->next_in_ino = tmp->node;
+				tmp->node = raw;
+			} else {
+				raw->next_in_ino = tmp->node->next_in_ino;
+				tmp->node->next_in_ino = raw;
+			}
 			jffs2_free_xattr_ref(ref);
 			continue;
+		} else {
+			ref->next = xref_tmphash[i];
+			xref_tmphash[i] = ref;
 		}
-		ref->xd = xd;
-		ref->ic = ic;
-		xd->refcnt++;
-		ref->next = ic->xref;
-		ic->xref = ref;
-		xref_count++;
 	}
 	c->xref_temp = NULL;
-	/* After this, ref->xid/ino are NEVER used. */
 
-	/* Phase.2 */
+	/* Phase.3 : Bind xref with inode_cache and xattr_datum */
+	for (i=0; i < XREF_TMPHASH_SIZE; i++) {
+		for (ref=xref_tmphash[i]; ref; ref=_ref) {
+			_ref = ref->next;
+			if (is_xattr_ref_dead(ref)) {
+				ref->next = c->xref_dead_list;
+				c->xref_dead_list = ref;
+				continue;
+			}
+			/* At this point, ref->xid and ref->ino contain XID and inode number.
+			   ref->xd and ref->ic are not valid yet. */
+			xd = jffs2_find_xattr_datum(c, ref->xid);
+			ic = jffs2_get_ino_cache(c, ref->ino);
+			if (!xd || !ic) {
+				JFFS2_WARNING("xref(ino=%u, xid=%u, xseqno=%u) is orphan. \n",
+					      ref->ino, ref->xid, ref->xseqno);
+				set_xattr_ref_dead(ref);
+				ref->next = c->xref_dead_list;
+				c->xref_dead_list = ref;
+				continue;
+			}
+			ref->xd = xd;
+			ref->ic = ic;
+			xd->refcnt++;
+			ref->next = ic->xref;
+			ic->xref = ref;
+			xref_count++;
+		}
+	}
+
+	/* Phase.4 : Link unchecked xdatum to xattr_unchecked list */
 	for (i=0; i < XATTRINDEX_HASHSIZE; i++) {
 		list_for_each_entry_safe(xd, _xd, &c->xattrindex[i], xindex) {
 			list_del_init(&xd->xindex);
 			if (!xd->refcnt) {
-				if (ref_flags(xd->node) != REF_UNCHECKED)
-					JFFS2_WARNING("orphan xdatum(xid=%u, version=%u) at %#08x\n",
-						      xd->xid, xd->version, ref_offset(xd->node));
-				delete_xattr_datum(c, xd);
+				JFFS2_WARNING("orphan xdatum(xid=%u, version=%u)\n",
+					      xd->xid, xd->version);
+				set_xattr_datum_dead(xd);
+				list_add(&xd->xindex, &c->xattr_unchecked);
 				continue;
 			}
-			if (ref_flags(xd->node) != REF_PRISTINE) {
-				dbg_xattr("unchecked xdatum(xid=%u) at %#08x\n",
-					  xd->xid, ref_offset(xd->node));
+			if (is_xattr_datum_unchecked(c, xd)) {
+				dbg_xattr("unchecked xdatum(xid=%u, version=%u)\n",
+					  xd->xid, xd->version);
 				list_add(&xd->xindex, &c->xattr_unchecked);
 				xdatum_unchecked_count++;
 			}
@@ -833,28 +977,18 @@ void jffs2_build_xattr_subsystem(struct jffs2_sb_info *c)
 struct jffs2_xattr_datum *jffs2_setup_xattr_datum(struct jffs2_sb_info *c,
 						  uint32_t xid, uint32_t version)
 {
-	struct jffs2_xattr_datum *xd, *_xd;
+	struct jffs2_xattr_datum *xd;
 
-	_xd = jffs2_find_xattr_datum(c, xid);
-	if (_xd) {
-		dbg_xattr("duplicate xdatum (xid=%u, version=%u/%u) at %#08x\n",
-			  xid, version, _xd->version, ref_offset(_xd->node));
-		if (version < _xd->version)
-			return ERR_PTR(-EEXIST);
-	}
-	xd = jffs2_alloc_xattr_datum();
-	if (!xd)
-		return ERR_PTR(-ENOMEM);
-	xd->xid = xid;
-	xd->version = version;
-	if (xd->xid > c->highest_xid)
-		c->highest_xid = xd->xid;
-	list_add_tail(&xd->xindex, &c->xattrindex[xid % XATTRINDEX_HASHSIZE]);
-
-	if (_xd) {
-		list_del_init(&_xd->xindex);
-		delete_xattr_datum_node(c, _xd);
-		jffs2_free_xattr_datum(_xd);
+	xd = jffs2_find_xattr_datum(c, xid);
+	if (!xd) {
+		xd = jffs2_alloc_xattr_datum();
+		if (!xd)
+			return ERR_PTR(-ENOMEM);
+		xd->xid = xid;
+		xd->version = version;
+		if (xd->xid > c->highest_xid)
+			c->highest_xid = xd->xid;
+		list_add_tail(&xd->xindex, &c->xattrindex[xid % XATTRINDEX_HASHSIZE]);
 	}
 	return xd;
 }
@@ -945,7 +1079,7 @@ ssize_t jffs2_listxattr(struct dentry *dentry, char *buffer, size_t size)
 				rc = load_xattr_datum(c, xd);
 				if (unlikely(rc > 0)) {
 					*pref = ref->next;
-					delete_xattr_ref(c, ref);
+					delete_xattr_ref_delay(c, ref);
 					goto retry;
 				} else if (unlikely(rc < 0))
 					goto out;
@@ -1006,7 +1140,7 @@ int do_jffs2_getxattr(struct inode *inode, int xprefix, const char *xname,
 				rc = load_xattr_datum(c, xd);
 				if (unlikely(rc > 0)) {
 					*pref = ref->next;
-					delete_xattr_ref(c, ref);
+					delete_xattr_ref_delay(c, ref);
 					goto retry;
 				} else if (unlikely(rc < 0)) {
 					goto out;
@@ -1069,7 +1203,7 @@ int do_jffs2_setxattr(struct inode *inode, int xprefix, const char *xname,
 			rc = load_xattr_datum(c, xd);
 			if (unlikely(rc > 0)) {
 				*pref = ref->next;
-				delete_xattr_ref(c, ref);
+				delete_xattr_ref_delay(c, ref);
 				goto retry;
 			} else if (unlikely(rc < 0))
 				goto out;
@@ -1081,8 +1215,7 @@ int do_jffs2_setxattr(struct inode *inode, int xprefix, const char *xname,
 			}
 			if (!buffer) {
 				*pref = ref->next;
-				delete_xattr_ref(c, ref);
-				rc = 0;
+				rc = delete_xattr_ref(c, ref, 0);
 				goto out;
 			}
 			goto found;
@@ -1094,7 +1227,7 @@ int do_jffs2_setxattr(struct inode *inode, int xprefix, const char *xname,
 		goto out;
 	}
 	if (!buffer) {
-		rc = -EINVAL;
+		rc = -ENODATA;
 		goto out;
 	}
  found:
@@ -1110,16 +1243,15 @@ int do_jffs2_setxattr(struct inode *inode, int xprefix, const char *xname,
 	request = PAD(sizeof(struct jffs2_raw_xref));
 	rc = jffs2_reserve_space(c, request, &length,
 				 ALLOC_NORMAL, JFFS2_SUMMARY_XREF_SIZE);
+	down_write(&c->xattr_sem);
 	if (rc) {
 		JFFS2_WARNING("jffs2_reserve_space()=%d, request=%u\n", rc, request);
-		down_write(&c->xattr_sem);
 		xd->refcnt--;
 		if (!xd->refcnt)
-			delete_xattr_datum(c, xd);
+			delete_xattr_datum_delay(c, xd);
 		up_write(&c->xattr_sem);
 		return rc;
 	}
-	down_write(&c->xattr_sem);
 	if (ref)
 		*pref = ref->next;
 	newref = create_xattr_ref(c, ic, xd);
@@ -1131,9 +1263,21 @@ int do_jffs2_setxattr(struct inode *inode, int xprefix, const char *xname,
 		rc = PTR_ERR(newref);
 		xd->refcnt--;
 		if (!xd->refcnt)
-			delete_xattr_datum(c, xd);
+			delete_xattr_datum_delay(c, xd);
 	} else if (ref) {
-		delete_xattr_ref(c, ref);
+		up_write(&c->xattr_sem);
+		jffs2_complete_reservation(c);
+
+		rc = jffs2_reserve_space(c, request, &length,
+					 ALLOC_DELETION, JFFS2_SUMMARY_XREF_SIZE);
+		down_write(&c->xattr_sem);
+		if (rc) {
+			JFFS2_WARNING("jffs2_reserve_space()=%d, request=%u\n", rc, request);
+			delete_xattr_ref_delay(c, ref);
+			up_write(&c->xattr_sem);
+			return 0;
+		}
+		delete_xattr_ref(c, ref, 1);
 	}
  out:
 	up_write(&c->xattr_sem);
@@ -1142,38 +1286,37 @@ int do_jffs2_setxattr(struct inode *inode, int xprefix, const char *xname,
 }
 
 /* -------- garbage collector functions -------------
- * jffs2_garbage_collect_xattr_datum(c, xd)
+ * jffs2_garbage_collect_xattr_datum(c, xd, raw)
  *   is used to move xdatum into new node.
- * jffs2_garbage_collect_xattr_ref(c, ref)
+ * jffs2_garbage_collect_xattr_ref(c, ref, raw)
  *   is used to move xref into new node.
  * jffs2_verify_xattr(c)
  *   is used to call do_verify_xattr_datum() before garbage collecting.
  * -------------------------------------------------- */
-int jffs2_garbage_collect_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd)
+int jffs2_garbage_collect_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd,
+				      struct jffs2_raw_node_ref *raw)
 {
 	uint32_t totlen, length, old_ofs;
-	int rc = -EINVAL;
+	int rc = 0;
 
 	down_write(&c->xattr_sem);
-	BUG_ON(!xd->node);
+	if (xd->node != raw)
+		goto out;
+	if (is_xattr_datum_dead(xd) && (raw->next_in_ino == (void *)xd))
+		goto out;
 
 	old_ofs = ref_offset(xd->node);
 	totlen = ref_totlen(c, c->gcblock, xd->node);
-	if (totlen < sizeof(struct jffs2_raw_xattr))
-		goto out;
 
-	if (!xd->xname) {
+	if (!is_xattr_datum_dead(xd)) {
 		rc = load_xattr_datum(c, xd);
-		if (unlikely(rc > 0)) {
-			delete_xattr_datum_node(c, xd);
-			rc = 0;
-			goto out;
-		} else if (unlikely(rc < 0))
+		if (unlikely(rc < 0))
 			goto out;
 	}
+
 	rc = jffs2_reserve_space_gc(c, totlen, &length, JFFS2_SUMMARY_XATTR_SIZE);
-	if (rc || length < totlen) {
-		JFFS2_WARNING("jffs2_reserve_space()=%d, request=%u\n", rc, totlen);
+	if (rc) {
+		JFFS2_WARNING("jffs2_reserve_space_gc()=%d, request=%u\n", rc, totlen);
 		rc = rc ? rc : -EBADFD;
 		goto out;
 	}
@@ -1182,27 +1325,33 @@ int jffs2_garbage_collect_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xatt
 		dbg_xattr("xdatum (xid=%u, version=%u) GC'ed from %#08x to %08x\n",
 			  xd->xid, xd->version, old_ofs, ref_offset(xd->node));
  out:
+	if (!rc)
+		jffs2_mark_node_obsolete(c, raw);
 	up_write(&c->xattr_sem);
 	return rc;
 }
 
 
-int jffs2_garbage_collect_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref)
+int jffs2_garbage_collect_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref,
+				    struct jffs2_raw_node_ref *raw)
 {
 	uint32_t totlen, length, old_ofs;
-	int rc = -EINVAL;
+	int rc = 0;
 
 	down_write(&c->xattr_sem);
 	BUG_ON(!ref->node);
 
+	if (ref->node != raw)
+		goto out;
+	if (is_xattr_ref_dead(ref) && (raw->next_in_ino == (void *)ref))
+		goto out;
+
 	old_ofs = ref_offset(ref->node);
 	totlen = ref_totlen(c, c->gcblock, ref->node);
-	if (totlen != sizeof(struct jffs2_raw_xref))
-		goto out;
 
 	rc = jffs2_reserve_space_gc(c, totlen, &length, JFFS2_SUMMARY_XREF_SIZE);
-	if (rc || length < totlen) {
-		JFFS2_WARNING("%s: jffs2_reserve_space() = %d, request = %u\n",
+	if (rc) {
+		JFFS2_WARNING("%s: jffs2_reserve_space_gc() = %d, request = %u\n",
 			      __FUNCTION__, rc, totlen);
 		rc = rc ? rc : -EBADFD;
 		goto out;
@@ -1212,6 +1361,8 @@ int jffs2_garbage_collect_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_
 		dbg_xattr("xref (ino=%u, xid=%u) GC'ed from %#08x to %08x\n",
 			  ref->ic->ino, ref->xd->xid, old_ofs, ref_offset(ref->node));
  out:
+	if (!rc)
+		jffs2_mark_node_obsolete(c, raw);
 	up_write(&c->xattr_sem);
 	return rc;
 }
@@ -1219,20 +1370,59 @@ int jffs2_garbage_collect_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_
 int jffs2_verify_xattr(struct jffs2_sb_info *c)
 {
 	struct jffs2_xattr_datum *xd, *_xd;
+	struct jffs2_eraseblock *jeb;
+	struct jffs2_raw_node_ref *raw;
+	uint32_t totlen;
 	int rc;
 
 	down_write(&c->xattr_sem);
 	list_for_each_entry_safe(xd, _xd, &c->xattr_unchecked, xindex) {
 		rc = do_verify_xattr_datum(c, xd);
-		if (rc == 0) {
-			list_del_init(&xd->xindex);
-			break;
-		} else if (rc > 0) {
-			list_del_init(&xd->xindex);
-			delete_xattr_datum_node(c, xd);
+		if (rc < 0)
+			continue;
+		list_del_init(&xd->xindex);
+		spin_lock(&c->erase_completion_lock);
+		for (raw=xd->node; raw != (void *)xd; raw=raw->next_in_ino) {
+			if (ref_flags(raw) != REF_UNCHECKED)
+				continue;
+			jeb = &c->blocks[ref_offset(raw) / c->sector_size];
+			totlen = PAD(ref_totlen(c, jeb, raw));
+			c->unchecked_size -= totlen; c->used_size += totlen;
+			jeb->unchecked_size -= totlen; jeb->used_size += totlen;
+			raw->flash_offset = ref_offset(raw)
+				| ((xd->node == (void *)raw) ? REF_PRISTINE : REF_NORMAL);
 		}
+		if (is_xattr_datum_dead(xd))
+			list_add(&xd->xindex, &c->xattr_dead_list);
+		spin_unlock(&c->erase_completion_lock);
 	}
 	up_write(&c->xattr_sem);
-
 	return list_empty(&c->xattr_unchecked) ? 1 : 0;
 }
+
+void jffs2_release_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd)
+{
+	/* must be called under spin_lock(&c->erase_completion_lock) */
+	if (xd->node != (void *)xd)
+		return;
+
+	list_del(&xd->xindex);
+	jffs2_free_xattr_datum(xd);
+}
+
+void jffs2_release_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref)
+{
+	/* must be called under spin_lock(&c->erase_completion_lock) */
+	struct jffs2_xattr_ref *tmp, **ptmp;
+
+	if (ref->node != (void *)ref)
+		return;
+
+	for (tmp=c->xref_dead_list, ptmp=&c->xref_dead_list; tmp; ptmp=&tmp->next, tmp=tmp->next) {
+		if (ref == tmp) {
+			*ptmp = tmp->next;
+			jffs2_free_xattr_ref(ref);
+			break;
+		}
+	}
+}
diff --git a/fs/jffs2/xattr.h b/fs/jffs2/xattr.h
index 2c199856c582..06ab7b880212 100644
--- a/fs/jffs2/xattr.h
+++ b/fs/jffs2/xattr.h
@@ -16,6 +16,7 @@
 
 #define JFFS2_XFLAGS_HOT	(0x01)	/* This datum is HOT */
 #define JFFS2_XFLAGS_BIND	(0x02)	/* This datum is not reclaimed */
+#define JFFS2_XFLAGS_INVALID	(0x80)	/* This datum contains crc error */
 
 struct jffs2_xattr_datum
 {
@@ -23,7 +24,7 @@ struct jffs2_xattr_datum
 	struct jffs2_raw_node_ref *node;
 	uint8_t class;
 	uint8_t flags;
-	uint16_t xprefix;			/* see JFFS2_XATTR_PREFIX_* */
+	uint16_t xprefix;		/* see JFFS2_XATTR_PREFIX_* */
 
 	struct list_head xindex;	/* chained from c->xattrindex[n] */
 	uint32_t refcnt;		/* # of xattr_ref refers this */
@@ -47,6 +48,7 @@ struct jffs2_xattr_ref
 	uint8_t flags;		/* Currently unused */
 	u16 unused;
 
+	uint32_t xseqno;
 	union {
 		struct jffs2_inode_cache *ic;	/* reference to jffs2_inode_cache */
 		uint32_t ino;			/* only used in scanning/building  */
@@ -58,6 +60,34 @@ struct jffs2_xattr_ref
 	struct jffs2_xattr_ref *next;		/* chained from ic->xref_list */
 };
 
+#define XDATUM_DELETE_MARKER	(0xffffffff)
+#define XREF_DELETE_MARKER	(0x00000001)
+static inline int is_xattr_datum_dead(struct jffs2_xattr_datum *xd)
+{
+	return (xd->version == XDATUM_DELETE_MARKER);
+}
+
+static inline void set_xattr_datum_dead(struct jffs2_xattr_datum *xd)
+{
+	xd->version = XDATUM_DELETE_MARKER;
+}
+
+static inline int is_xattr_ref_dead(struct jffs2_xattr_ref *ref)
+{
+	return ((ref->xseqno & XREF_DELETE_MARKER) != 0);
+}
+
+static inline void set_xattr_ref_dead(struct jffs2_xattr_ref *ref)
+{
+	ref->xseqno |= XREF_DELETE_MARKER;
+}
+
+static inline void clr_xattr_ref_dead(struct jffs2_xattr_ref *ref)
+{
+	ref->xseqno &= ~XREF_DELETE_MARKER;
+}
+
+
 #ifdef CONFIG_JFFS2_FS_XATTR
 
 extern void jffs2_init_xattr_subsystem(struct jffs2_sb_info *c);
@@ -70,9 +100,13 @@ extern struct jffs2_xattr_datum *jffs2_setup_xattr_datum(struct jffs2_sb_info *c
 extern void jffs2_xattr_delete_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic);
 extern void jffs2_xattr_free_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic);
 
-extern int jffs2_garbage_collect_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd);
-extern int jffs2_garbage_collect_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref);
+extern int jffs2_garbage_collect_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd,
+					     struct jffs2_raw_node_ref *raw);
+extern int jffs2_garbage_collect_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref,
+					   struct jffs2_raw_node_ref *raw);
 extern int jffs2_verify_xattr(struct jffs2_sb_info *c);
+extern void jffs2_release_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd);
+extern void jffs2_release_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref);
 
 extern int do_jffs2_getxattr(struct inode *inode, int xprefix, const char *xname,
 			     char *buffer, size_t size);
diff --git a/include/linux/jffs2.h b/include/linux/jffs2.h
index c6f70660b371..c9c760700bc3 100644
--- a/include/linux/jffs2.h
+++ b/include/linux/jffs2.h
@@ -186,6 +186,7 @@ struct jffs2_raw_xref
 	jint32_t hdr_crc;
 	jint32_t ino;		/* inode number */
 	jint32_t xid;		/* XATTR identifier number */
+	jint32_t xseqno;	/* xref sequencial number */
 	jint32_t node_crc;
 } __attribute__((packed));
 
-- 
cgit v1.2.3


From cf7c712c11fb881842534efe98a07f36f1c86c65 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@suse.de>
Date: Mon, 12 Jun 2006 15:49:31 -0700
Subject: [PATCH] 64bit resource: introduce resource_size_t for the start and
 end of struct resource

But do not change it from what it currently is (unsigned long)

Based on a patch series originally from Vivek Goyal <vgoyal@in.ibm.com>

Cc: Vivek Goyal <vgoyal@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 include/linux/ioport.h | 4 +++-
 include/linux/types.h  | 2 ++
 2 files changed, 5 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/ioport.h b/include/linux/ioport.h
index cd6bd001ba4e..535bd9585897 100644
--- a/include/linux/ioport.h
+++ b/include/linux/ioport.h
@@ -9,13 +9,15 @@
 #define _LINUX_IOPORT_H
 
 #include <linux/compiler.h>
+#include <linux/types.h>
 /*
  * Resources are tree-like, allowing
  * nesting etc..
  */
 struct resource {
+	resource_size_t start;
+	resource_size_t end;
 	const char *name;
-	unsigned long start, end;
 	unsigned long flags;
 	struct resource *parent, *sibling, *child;
 };
diff --git a/include/linux/types.h b/include/linux/types.h
index a5e46e783ffa..a021e1577336 100644
--- a/include/linux/types.h
+++ b/include/linux/types.h
@@ -177,6 +177,8 @@ typedef __u64 __bitwise __be64;
 
 #ifdef __KERNEL__
 typedef unsigned __bitwise__ gfp_t;
+
+typedef unsigned long resource_size_t;
 #endif
 
 struct ustat {
-- 
cgit v1.2.3


From d75fc8bbccf7c019994bcfd6255d5b56335ed21d Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@suse.de>
Date: Mon, 12 Jun 2006 16:09:23 -0700
Subject: [PATCH] 64bit resource: change resource core to use resource_size_t

Based on a patch series originally from Vivek Goyal <vgoyal@in.ibm.com>

Cc: Vivek Goyal <vgoyal@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 arch/sparc/kernel/ioport.c |  4 ++--
 include/linux/ioport.h     | 23 +++++++++++++----------
 kernel/resource.c          | 34 ++++++++++++++++++----------------
 3 files changed, 33 insertions(+), 28 deletions(-)

(limited to 'include/linux')

diff --git a/arch/sparc/kernel/ioport.c b/arch/sparc/kernel/ioport.c
index 99d716b9507a..79d177149fdb 100644
--- a/arch/sparc/kernel/ioport.c
+++ b/arch/sparc/kernel/ioport.c
@@ -208,7 +208,7 @@ _sparc_ioremap(struct resource *res, u32 bus, u32 pa, int sz)
 	pa &= PAGE_MASK;
 	sparc_mapiorange(bus, pa, res->start, res->end - res->start + 1);
 
-	return (void __iomem *) (res->start + offset);
+	return (void __iomem *)(unsigned long)(res->start + offset);
 }
 
 /*
@@ -325,7 +325,7 @@ void *sbus_alloc_consistent(struct sbus_dev *sdev, long len, u32 *dma_addrp)
 		res->name = sdev->prom_name;
 	}
 
-	return (void *)res->start;
+	return (void *)(unsigned long)res->start;
 
 err_noiommu:
 	release_resource(res);
diff --git a/include/linux/ioport.h b/include/linux/ioport.h
index 535bd9585897..d4895236b7e9 100644
--- a/include/linux/ioport.h
+++ b/include/linux/ioport.h
@@ -98,31 +98,34 @@ extern struct resource * ____request_resource(struct resource *root, struct reso
 extern int release_resource(struct resource *new);
 extern __deprecated_for_modules int insert_resource(struct resource *parent, struct resource *new);
 extern int allocate_resource(struct resource *root, struct resource *new,
-			     unsigned long size,
-			     unsigned long min, unsigned long max,
-			     unsigned long align,
+			     resource_size_t size, resource_size_t min,
+			     resource_size_t max, resource_size_t align,
 			     void (*alignf)(void *, struct resource *,
-					    unsigned long, unsigned long),
+					    resource_size_t, resource_size_t),
 			     void *alignf_data);
-int adjust_resource(struct resource *res, unsigned long start,
-		    unsigned long size);
+int adjust_resource(struct resource *res, resource_size_t start,
+		    resource_size_t size);
 
 /* Convenience shorthand with allocation */
 #define request_region(start,n,name)	__request_region(&ioport_resource, (start), (n), (name))
 #define request_mem_region(start,n,name) __request_region(&iomem_resource, (start), (n), (name))
 #define rename_region(region, newname) do { (region)->name = (newname); } while (0)
 
-extern struct resource * __request_region(struct resource *, unsigned long start, unsigned long n, const char *name);
+extern struct resource * __request_region(struct resource *,
+					resource_size_t start,
+					resource_size_t n, const char *name);
 
 /* Compatibility cruft */
 #define release_region(start,n)	__release_region(&ioport_resource, (start), (n))
 #define check_mem_region(start,n)	__check_region(&iomem_resource, (start), (n))
 #define release_mem_region(start,n)	__release_region(&iomem_resource, (start), (n))
 
-extern int __check_region(struct resource *, unsigned long, unsigned long);
-extern void __release_region(struct resource *, unsigned long, unsigned long);
+extern int __check_region(struct resource *, resource_size_t, resource_size_t);
+extern void __release_region(struct resource *, resource_size_t,
+				resource_size_t);
 
-static inline int __deprecated check_region(unsigned long s, unsigned long n)
+static inline int __deprecated check_region(resource_size_t s,
+						resource_size_t n)
 {
 	return __check_region(&ioport_resource, s, n);
 }
diff --git a/kernel/resource.c b/kernel/resource.c
index ea5f7811a408..54835c02ab37 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -151,8 +151,8 @@ __initcall(ioresources_init);
 /* Return the conflict entry if you can't request it */
 static struct resource * __request_resource(struct resource *root, struct resource *new)
 {
-	unsigned long start = new->start;
-	unsigned long end = new->end;
+	resource_size_t start = new->start;
+	resource_size_t end = new->end;
 	struct resource *tmp, **p;
 
 	if (end < start)
@@ -236,11 +236,10 @@ EXPORT_SYMBOL(release_resource);
  * Find empty slot in the resource tree given range and alignment.
  */
 static int find_resource(struct resource *root, struct resource *new,
-			 unsigned long size,
-			 unsigned long min, unsigned long max,
-			 unsigned long align,
+			 resource_size_t size, resource_size_t min,
+			 resource_size_t max, resource_size_t align,
 			 void (*alignf)(void *, struct resource *,
-					unsigned long, unsigned long),
+					resource_size_t, resource_size_t),
 			 void *alignf_data)
 {
 	struct resource *this = root->child;
@@ -282,11 +281,10 @@ static int find_resource(struct resource *root, struct resource *new,
  * Allocate empty slot in the resource tree given range and alignment.
  */
 int allocate_resource(struct resource *root, struct resource *new,
-		      unsigned long size,
-		      unsigned long min, unsigned long max,
-		      unsigned long align,
+		      resource_size_t size, resource_size_t min,
+		      resource_size_t max, resource_size_t align,
 		      void (*alignf)(void *, struct resource *,
-				     unsigned long, unsigned long),
+				     resource_size_t, resource_size_t),
 		      void *alignf_data)
 {
 	int err;
@@ -378,10 +376,10 @@ EXPORT_SYMBOL(insert_resource);
  * arguments.  Returns -EBUSY if it can't fit.  Existing children of
  * the resource are assumed to be immutable.
  */
-int adjust_resource(struct resource *res, unsigned long start, unsigned long size)
+int adjust_resource(struct resource *res, resource_size_t start, resource_size_t size)
 {
 	struct resource *tmp, *parent = res->parent;
-	unsigned long end = start + size - 1;
+	resource_size_t end = start + size - 1;
 	int result = -EBUSY;
 
 	write_lock(&resource_lock);
@@ -428,7 +426,9 @@ EXPORT_SYMBOL(adjust_resource);
  *
  * Release-region releases a matching busy region.
  */
-struct resource * __request_region(struct resource *parent, unsigned long start, unsigned long n, const char *name)
+struct resource * __request_region(struct resource *parent,
+				   resource_size_t start, resource_size_t n,
+				   const char *name)
 {
 	struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL);
 
@@ -464,7 +464,8 @@ struct resource * __request_region(struct resource *parent, unsigned long start,
 
 EXPORT_SYMBOL(__request_region);
 
-int __check_region(struct resource *parent, unsigned long start, unsigned long n)
+int __check_region(struct resource *parent, resource_size_t start,
+			resource_size_t n)
 {
 	struct resource * res;
 
@@ -479,10 +480,11 @@ int __check_region(struct resource *parent, unsigned long start, unsigned long n
 
 EXPORT_SYMBOL(__check_region);
 
-void __release_region(struct resource *parent, unsigned long start, unsigned long n)
+void __release_region(struct resource *parent, resource_size_t start,
+			resource_size_t n)
 {
 	struct resource **p;
-	unsigned long end;
+	resource_size_t end;
 
 	p = &parent->child;
 	end = start + n - 1;
-- 
cgit v1.2.3


From e31dd6e4520439ceae4753f32dd2da2c345e929a Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@suse.de>
Date: Mon, 12 Jun 2006 17:06:02 -0700
Subject: [PATCH] 64bit resource: change pci core and arch code to use
 resource_size_t

Based on a patch series originally from Vivek Goyal <vgoyal@in.ibm.com>

Cc: Vivek Goyal <vgoyal@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 arch/alpha/kernel/pci.c               |  4 ++--
 arch/arm/kernel/bios32.c              |  6 +++---
 arch/cris/arch-v32/drivers/pci/bios.c |  4 ++--
 arch/frv/mb93090-mb00/pci-frv.c       |  4 ++--
 arch/i386/pci/i386.c                  |  4 ++--
 arch/ia64/pci/pci.c                   |  2 +-
 arch/m68knommu/kernel/comempci.c      |  3 ++-
 arch/mips/pci/pci.c                   |  4 ++--
 arch/mips/pmc-sierra/yosemite/ht.c    |  4 ++--
 arch/parisc/kernel/pci.c              |  2 +-
 arch/powerpc/kernel/pci_32.c          | 10 +++++-----
 arch/powerpc/kernel/pci_64.c          |  4 ++--
 arch/ppc/kernel/pci.c                 | 12 ++++++------
 arch/sh/boards/mpc1211/pci.c          |  4 ++--
 arch/sh/boards/overdrive/galileo.c    |  2 +-
 arch/sh/drivers/pci/pci.c             |  6 +++---
 arch/sh64/kernel/pcibios.c            |  4 ++--
 arch/sparc/kernel/pcic.c              |  2 +-
 arch/sparc64/kernel/pci.c             |  2 +-
 arch/v850/kernel/rte_mb_a_pci.c       |  2 +-
 arch/xtensa/kernel/pci.c              |  6 +++---
 drivers/pci/bus.c                     | 10 +++++-----
 drivers/pci/pci-sysfs.c               |  4 ++--
 drivers/pci/pci.h                     |  6 +++---
 drivers/pci/proc.c                    |  4 ++--
 drivers/pci/rom.c                     | 10 +++++-----
 drivers/pci/setup-res.c               |  6 +++---
 include/asm-arm/mach/pci.h            |  2 +-
 include/asm-powerpc/pci.h             |  2 +-
 include/asm-ppc/pci.h                 |  2 +-
 include/linux/pci.h                   | 13 +++++++------
 31 files changed, 76 insertions(+), 74 deletions(-)

(limited to 'include/linux')

diff --git a/arch/alpha/kernel/pci.c b/arch/alpha/kernel/pci.c
index 2a8b364c822e..4ea6711e55aa 100644
--- a/arch/alpha/kernel/pci.c
+++ b/arch/alpha/kernel/pci.c
@@ -124,12 +124,12 @@ DECLARE_PCI_FIXUP_FINAL(PCI_ANY_ID, PCI_ANY_ID, pcibios_fixup_final);
 
 void
 pcibios_align_resource(void *data, struct resource *res,
-		       unsigned long size, unsigned long align)
+		       resource_size_t size, resource_size_t align)
 {
 	struct pci_dev *dev = data;
 	struct pci_controller *hose = dev->sysdata;
 	unsigned long alignto;
-	unsigned long start = res->start;
+	resource_size_t start = res->start;
 
 	if (res->flags & IORESOURCE_IO) {
 		/* Make sure we start at our min on all hoses */
diff --git a/arch/arm/kernel/bios32.c b/arch/arm/kernel/bios32.c
index 302fc1401547..45da06fc1ba1 100644
--- a/arch/arm/kernel/bios32.c
+++ b/arch/arm/kernel/bios32.c
@@ -304,7 +304,7 @@ static inline int pdev_bad_for_parity(struct pci_dev *dev)
 static void __devinit
 pdev_fixup_device_resources(struct pci_sys_data *root, struct pci_dev *dev)
 {
-	unsigned long offset;
+	resource_size_t offset;
 	int i;
 
 	for (i = 0; i < PCI_NUM_RESOURCES; i++) {
@@ -634,9 +634,9 @@ char * __init pcibios_setup(char *str)
  * which might be mirrored at 0x0100-0x03ff..
  */
 void pcibios_align_resource(void *data, struct resource *res,
-			    unsigned long size, unsigned long align)
+			    resource_size_t size, resource_size_t align)
 {
-	unsigned long start = res->start;
+	resource_size_t start = res->start;
 
 	if (res->flags & IORESOURCE_IO && start & 0x300)
 		start = (start + 0x3ff) & ~0x3ff;
diff --git a/arch/cris/arch-v32/drivers/pci/bios.c b/arch/cris/arch-v32/drivers/pci/bios.c
index 1e9d062103ae..a2b9c60c2777 100644
--- a/arch/cris/arch-v32/drivers/pci/bios.c
+++ b/arch/cris/arch-v32/drivers/pci/bios.c
@@ -43,10 +43,10 @@ int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
 
 void
 pcibios_align_resource(void *data, struct resource *res,
-		       unsigned long size, unsigned long align)
+		       resource_size_t size, resource_size_t align)
 {
 	if (res->flags & IORESOURCE_IO) {
-		unsigned long start = res->start;
+		resource_size_t start = res->start;
 
 		if (start & 0x300) {
 			start = (start + 0x3ff) & ~0x3ff;
diff --git a/arch/frv/mb93090-mb00/pci-frv.c b/arch/frv/mb93090-mb00/pci-frv.c
index 0a26bf6f1cd4..4f165c93be42 100644
--- a/arch/frv/mb93090-mb00/pci-frv.c
+++ b/arch/frv/mb93090-mb00/pci-frv.c
@@ -64,10 +64,10 @@ pcibios_update_resource(struct pci_dev *dev, struct resource *root,
  */
 void
 pcibios_align_resource(void *data, struct resource *res,
-		       unsigned long size, unsigned long align)
+		       resource_size_t size, resource_size_t align)
 {
 	if (res->flags & IORESOURCE_IO) {
-		unsigned long start = res->start;
+		resource_size_t start = res->start;
 
 		if (start & 0x300) {
 			start = (start + 0x3ff) & ~0x3ff;
diff --git a/arch/i386/pci/i386.c b/arch/i386/pci/i386.c
index a151f7a99f5e..10154a2cac68 100644
--- a/arch/i386/pci/i386.c
+++ b/arch/i386/pci/i386.c
@@ -48,10 +48,10 @@
  */
 void
 pcibios_align_resource(void *data, struct resource *res,
-		       unsigned long size, unsigned long align)
+			resource_size_t size, resource_size_t align)
 {
 	if (res->flags & IORESOURCE_IO) {
-		unsigned long start = res->start;
+		resource_size_t start = res->start;
 
 		if (start & 0x300) {
 			start = (start + 0x3ff) & ~0x3ff;
diff --git a/arch/ia64/pci/pci.c b/arch/ia64/pci/pci.c
index 77375a55da31..5bef0e3603f2 100644
--- a/arch/ia64/pci/pci.c
+++ b/arch/ia64/pci/pci.c
@@ -568,7 +568,7 @@ pcibios_disable_device (struct pci_dev *dev)
 
 void
 pcibios_align_resource (void *data, struct resource *res,
-		        unsigned long size, unsigned long align)
+		        resource_size_t size, resource_size_t align)
 {
 }
 
diff --git a/arch/m68knommu/kernel/comempci.c b/arch/m68knommu/kernel/comempci.c
index 8670938f1107..db7a0c1cebae 100644
--- a/arch/m68knommu/kernel/comempci.c
+++ b/arch/m68knommu/kernel/comempci.c
@@ -357,7 +357,8 @@ void pcibios_fixup_bus(struct pci_bus *b)
 
 /*****************************************************************************/
 
-void pcibios_align_resource(void *data, struct resource *res, unsigned long size, unsigned long align)
+void pcibios_align_resource(void *data, struct resource *res,
+				resource_size_t size, resource_size_t align)
 {
 }
 
diff --git a/arch/mips/pci/pci.c b/arch/mips/pci/pci.c
index 4dfce154d4af..ba66f8c9bd4e 100644
--- a/arch/mips/pci/pci.c
+++ b/arch/mips/pci/pci.c
@@ -51,11 +51,11 @@ unsigned long PCIBIOS_MIN_MEM	= 0;
  */
 void
 pcibios_align_resource(void *data, struct resource *res,
-		       unsigned long size, unsigned long align)
+		       resource_size_t size, resource_size_t align)
 {
 	struct pci_dev *dev = data;
 	struct pci_controller *hose = dev->sysdata;
-	unsigned long start = res->start;
+	resource_size_t start = res->start;
 
 	if (res->flags & IORESOURCE_IO) {
 		/* Make sure we start at our min on all hoses */
diff --git a/arch/mips/pmc-sierra/yosemite/ht.c b/arch/mips/pmc-sierra/yosemite/ht.c
index 54b65a80abf5..fb523ebcafa8 100644
--- a/arch/mips/pmc-sierra/yosemite/ht.c
+++ b/arch/mips/pmc-sierra/yosemite/ht.c
@@ -383,12 +383,12 @@ void pcibios_update_resource(struct pci_dev *dev, struct resource *root,
 
 
 void pcibios_align_resource(void *data, struct resource *res,
-                            unsigned long size, unsigned long align)
+                            resource_size_t size, resource_size_t align)
 {
         struct pci_dev *dev = data;
 
         if (res->flags & IORESOURCE_IO) {
-                unsigned long start = res->start;
+                resource_size_t start = res->start;
 
                 /* We need to avoid collisions with `mirrored' VGA ports
                    and other strange ISA hardware, so we always want the
diff --git a/arch/parisc/kernel/pci.c b/arch/parisc/kernel/pci.c
index 79c7db2705fd..7d6967ee367c 100644
--- a/arch/parisc/kernel/pci.c
+++ b/arch/parisc/kernel/pci.c
@@ -289,7 +289,7 @@ EXPORT_SYMBOL(pcibios_bus_to_resource);
  * than res->start.
  */
 void pcibios_align_resource(void *data, struct resource *res,
-				unsigned long size, unsigned long alignment)
+				resource_size_t size, resource_size_t alignment)
 {
 	unsigned long mask, align;
 
diff --git a/arch/powerpc/kernel/pci_32.c b/arch/powerpc/kernel/pci_32.c
index d9e2506db5f7..8474355a1a4f 100644
--- a/arch/powerpc/kernel/pci_32.c
+++ b/arch/powerpc/kernel/pci_32.c
@@ -173,18 +173,18 @@ EXPORT_SYMBOL(pcibios_bus_to_resource);
  * but we want to try to avoid allocating at 0x2900-0x2bff
  * which might have be mirrored at 0x0100-0x03ff..
  */
-void pcibios_align_resource(void *data, struct resource *res, unsigned long size,
-		       unsigned long align)
+void pcibios_align_resource(void *data, struct resource *res,
+				resource_size_t size, resource_size_t align)
 {
 	struct pci_dev *dev = data;
 
 	if (res->flags & IORESOURCE_IO) {
-		unsigned long start = res->start;
+		resource_size_t start = res->start;
 
 		if (size > 0x100) {
 			printk(KERN_ERR "PCI: I/O Region %s/%d too large"
 			       " (%lld bytes)\n", pci_name(dev),
-			       dev->resource - res, size);
+			       dev->resource - res, (unsigned long long)size);
 		}
 
 		if (start & 0x300) {
@@ -1756,7 +1756,7 @@ long sys_pciconfig_iobase(long which, unsigned long bus, unsigned long devfn)
 
 void pci_resource_to_user(const struct pci_dev *dev, int bar,
 			  const struct resource *rsrc,
-			  u64 *start, u64 *end)
+			  resource_size_t *start, resource_size_t *end)
 {
 	struct pci_controller *hose = pci_bus_to_hose(dev->bus->number);
 	unsigned long offset = 0;
diff --git a/arch/powerpc/kernel/pci_64.c b/arch/powerpc/kernel/pci_64.c
index 247937dd8b73..286aa52aae33 100644
--- a/arch/powerpc/kernel/pci_64.c
+++ b/arch/powerpc/kernel/pci_64.c
@@ -138,11 +138,11 @@ EXPORT_SYMBOL(pcibios_bus_to_resource);
  * which might have be mirrored at 0x0100-0x03ff..
  */
 void pcibios_align_resource(void *data, struct resource *res,
-			    unsigned long size, unsigned long align)
+			    resource_size_t size, resource_size_t align)
 {
 	struct pci_dev *dev = data;
 	struct pci_controller *hose = pci_bus_to_host(dev->bus);
-	unsigned long start = res->start;
+	resource_size_t start = res->start;
 	unsigned long alignto;
 
 	if (res->flags & IORESOURCE_IO) {
diff --git a/arch/ppc/kernel/pci.c b/arch/ppc/kernel/pci.c
index 8544e100d713..242bb052be67 100644
--- a/arch/ppc/kernel/pci.c
+++ b/arch/ppc/kernel/pci.c
@@ -171,13 +171,13 @@ EXPORT_SYMBOL(pcibios_bus_to_resource);
  * but we want to try to avoid allocating at 0x2900-0x2bff
  * which might have be mirrored at 0x0100-0x03ff..
  */
-void pcibios_align_resource(void *data, struct resource *res, unsigned long size,
-		       unsigned long align)
+void pcibios_align_resource(void *data, struct resource *res,
+				resource_size_t size, resource_size_t align)
 {
 	struct pci_dev *dev = data;
 
 	if (res->flags & IORESOURCE_IO) {
-		unsigned long start = res->start;
+		resource_size_t start = res->start;
 
 		if (size > 0x100) {
 			printk(KERN_ERR "PCI: I/O Region %s/%d too large"
@@ -960,8 +960,8 @@ static pgprot_t __pci_mmap_set_pgprot(struct pci_dev *dev, struct resource *rp,
 	else
 		prot |= _PAGE_GUARDED;
 
-	printk("PCI map for %s:%llx, prot: %llx\n", pci_name(dev), rp->start,
-	       prot);
+	printk("PCI map for %s:%llx, prot: %lx\n", pci_name(dev),
+		(unsigned long long)rp->start, prot);
 
 	return __pgprot(prot);
 }
@@ -1130,7 +1130,7 @@ long sys_pciconfig_iobase(long which, unsigned long bus, unsigned long devfn)
 
 void pci_resource_to_user(const struct pci_dev *dev, int bar,
 			  const struct resource *rsrc,
-			  u64 *start, u64 *end)
+			  resource_size_t *start, resource_size_t *end)
 {
 	struct pci_controller *hose = pci_bus_to_hose(dev->bus->number);
 	unsigned long offset = 0;
diff --git a/arch/sh/boards/mpc1211/pci.c b/arch/sh/boards/mpc1211/pci.c
index ba3a65439752..9f7ccd33ffb6 100644
--- a/arch/sh/boards/mpc1211/pci.c
+++ b/arch/sh/boards/mpc1211/pci.c
@@ -273,9 +273,9 @@ void __init pcibios_fixup_irqs(void)
 }
 
 void pcibios_align_resource(void *data, struct resource *res,
-			    unsigned long size, unsigned long align)
+			    resource_size_t size, resource_size_t align)
 {
-	unsigned long start = res->start;
+	resource_size_t start = res->start;
 
 	if (res->flags & IORESOURCE_IO) {
 		if (start >= 0x10000UL) {
diff --git a/arch/sh/boards/overdrive/galileo.c b/arch/sh/boards/overdrive/galileo.c
index 276fa11ee4ce..b055809d2ac1 100644
--- a/arch/sh/boards/overdrive/galileo.c
+++ b/arch/sh/boards/overdrive/galileo.c
@@ -536,7 +536,7 @@ void __init pcibios_fixup_bus(struct pci_bus *bus)
 }
 
 void pcibios_align_resource(void *data, struct resource *res,
-			    unsigned long size)
+			    resource_size_t size)
 {
 }
 
diff --git a/arch/sh/drivers/pci/pci.c b/arch/sh/drivers/pci/pci.c
index c1669905abe4..3d546ba329cf 100644
--- a/arch/sh/drivers/pci/pci.c
+++ b/arch/sh/drivers/pci/pci.c
@@ -75,7 +75,7 @@ pcibios_update_resource(struct pci_dev *dev, struct resource *root,
 }
 
 void pcibios_align_resource(void *data, struct resource *res,
-			    unsigned long size, unsigned long align)
+			    resource_size_t size, resource_size_t align)
 			    __attribute__ ((weak));
 
 /*
@@ -85,10 +85,10 @@ void pcibios_align_resource(void *data, struct resource *res,
  * modulo 0x400.
  */
 void pcibios_align_resource(void *data, struct resource *res,
-			    unsigned long size, unsigned long align)
+			    resource_size_t size, resource_size_t align)
 {
 	if (res->flags & IORESOURCE_IO) {
-		unsigned long start = res->start;
+		resource_size_t start = res->start;
 
 		if (start & 0x300) {
 			start = (start + 0x3ff) & ~0x3ff;
diff --git a/arch/sh64/kernel/pcibios.c b/arch/sh64/kernel/pcibios.c
index 50c61dcb9fae..945920bc24db 100644
--- a/arch/sh64/kernel/pcibios.c
+++ b/arch/sh64/kernel/pcibios.c
@@ -69,10 +69,10 @@ pcibios_update_resource(struct pci_dev *dev, struct resource *root,
  * modulo 0x400.
  */
 void pcibios_align_resource(void *data, struct resource *res,
-			    unsigned long size, unsigned long align)
+			    resource_size_t size, resource_size_t align)
 {
 	if (res->flags & IORESOURCE_IO) {
-		unsigned long start = res->start;
+		resource_size_t start = res->start;
 
 		if (start & 0x300) {
 			start = (start + 0x3ff) & ~0x3ff;
diff --git a/arch/sparc/kernel/pcic.c b/arch/sparc/kernel/pcic.c
index bcfdddd0418a..5df3ebdc0ab1 100644
--- a/arch/sparc/kernel/pcic.c
+++ b/arch/sparc/kernel/pcic.c
@@ -860,7 +860,7 @@ char * __init pcibios_setup(char *str)
 }
 
 void pcibios_align_resource(void *data, struct resource *res,
-			    unsigned long size, unsigned long align)
+			    resource_size_t size, resource_size_t align)
 {
 }
 
diff --git a/arch/sparc64/kernel/pci.c b/arch/sparc64/kernel/pci.c
index 6c9e3e94abaa..20ca9ec8fd3b 100644
--- a/arch/sparc64/kernel/pci.c
+++ b/arch/sparc64/kernel/pci.c
@@ -357,7 +357,7 @@ void pcibios_update_irq(struct pci_dev *pdev, int irq)
 }
 
 void pcibios_align_resource(void *data, struct resource *res,
-			    unsigned long size, unsigned long align)
+			    resource_size_t size, resource_size_t align)
 {
 }
 
diff --git a/arch/v850/kernel/rte_mb_a_pci.c b/arch/v850/kernel/rte_mb_a_pci.c
index ffbb6d073bf2..3a7c5c9c3ac6 100644
--- a/arch/v850/kernel/rte_mb_a_pci.c
+++ b/arch/v850/kernel/rte_mb_a_pci.c
@@ -329,7 +329,7 @@ void pcibios_fixup_bus(struct pci_bus *b)
 
 void
 pcibios_align_resource (void *data, struct resource *res,
-			unsigned long size, unsigned long align)
+			resource_size_t size, resource_size_t align)
 {
 }
 
diff --git a/arch/xtensa/kernel/pci.c b/arch/xtensa/kernel/pci.c
index c6f471b9eaa0..eda029fc8972 100644
--- a/arch/xtensa/kernel/pci.c
+++ b/arch/xtensa/kernel/pci.c
@@ -71,13 +71,13 @@ static int pci_bus_count;
  * which might have be mirrored at 0x0100-0x03ff..
  */
 void
-pcibios_align_resource(void *data, struct resource *res, unsigned long size,
-    		       unsigned long align)
+pcibios_align_resource(void *data, struct resource *res, resource_size_t size,
+    		       resource_size_t align)
 {
 	struct pci_dev *dev = data;
 
 	if (res->flags & IORESOURCE_IO) {
-		unsigned long start = res->start;
+		resource_size_t start = res->start;
 
 		if (size > 0x100) {
 			printk(KERN_ERR "PCI: I/O Region %s/%d too large"
diff --git a/drivers/pci/bus.c b/drivers/pci/bus.c
index 723092682023..5f7db9d2436e 100644
--- a/drivers/pci/bus.c
+++ b/drivers/pci/bus.c
@@ -34,11 +34,11 @@
  */
 int
 pci_bus_alloc_resource(struct pci_bus *bus, struct resource *res,
-	unsigned long size, unsigned long align, unsigned long min,
-	unsigned int type_mask,
-	void (*alignf)(void *, struct resource *,
-			unsigned long, unsigned long),
-	void *alignf_data)
+		resource_size_t size, resource_size_t align,
+		resource_size_t min, unsigned int type_mask,
+		void (*alignf)(void *, struct resource *, resource_size_t,
+				resource_size_t),
+		void *alignf_data)
 {
 	int i, ret = -ENOMEM;
 
diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c
index bc405c035ce3..606f9b6f70eb 100644
--- a/drivers/pci/pci-sysfs.c
+++ b/drivers/pci/pci-sysfs.c
@@ -87,7 +87,7 @@ resource_show(struct device * dev, struct device_attribute *attr, char * buf)
 	char * str = buf;
 	int i;
 	int max = 7;
-	u64 start, end;
+	resource_size_t start, end;
 
 	if (pci_dev->subordinate)
 		max = DEVICE_COUNT_RESOURCE;
@@ -365,7 +365,7 @@ pci_mmap_resource(struct kobject *kobj, struct bin_attribute *attr,
 						       struct device, kobj));
 	struct resource *res = (struct resource *)attr->private;
 	enum pci_mmap_state mmap_type;
-	u64 start, end;
+	resource_size_t start, end;
 	int i;
 
 	for (i = 0; i < PCI_ROM_RESOURCE; i++)
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 29bdeca031a8..9cc842b666eb 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -6,10 +6,10 @@ extern int pci_create_sysfs_dev_files(struct pci_dev *pdev);
 extern void pci_remove_sysfs_dev_files(struct pci_dev *pdev);
 extern void pci_cleanup_rom(struct pci_dev *dev);
 extern int pci_bus_alloc_resource(struct pci_bus *bus, struct resource *res,
-				  unsigned long size, unsigned long align,
-				  unsigned long min, unsigned int type_mask,
+				  resource_size_t size, resource_size_t align,
+				  resource_size_t min, unsigned int type_mask,
 				  void (*alignf)(void *, struct resource *,
-					  	 unsigned long, unsigned long),
+					      resource_size_t, resource_size_t),
 				  void *alignf_data);
 /* Firmware callbacks */
 extern int (*platform_pci_choose_state)(struct pci_dev *dev, pm_message_t state);
diff --git a/drivers/pci/proc.c b/drivers/pci/proc.c
index 20dfd77bd8c2..99cf33379769 100644
--- a/drivers/pci/proc.c
+++ b/drivers/pci/proc.c
@@ -350,14 +350,14 @@ static int show_device(struct seq_file *m, void *v)
 			dev->irq);
 	/* Here should be 7 and not PCI_NUM_RESOURCES as we need to preserve compatibility */
 	for (i=0; i<7; i++) {
-		u64 start, end;
+		resource_size_t start, end;
 		pci_resource_to_user(dev, i, &dev->resource[i], &start, &end);
 		seq_printf(m, "\t%16llx",
 			(unsigned long long)(start |
 			(dev->resource[i].flags & PCI_REGION_FLAG_MASK)));
 	}
 	for (i=0; i<7; i++) {
-		u64 start, end;
+		resource_size_t start, end;
 		pci_resource_to_user(dev, i, &dev->resource[i], &start, &end);
 		seq_printf(m, "\t%16llx",
 			dev->resource[i].start < dev->resource[i].end ?
diff --git a/drivers/pci/rom.c b/drivers/pci/rom.c
index 598a115cd00e..cbb69cf41311 100644
--- a/drivers/pci/rom.c
+++ b/drivers/pci/rom.c
@@ -80,8 +80,8 @@ void __iomem *pci_map_rom(struct pci_dev *pdev, size_t *size)
 	} else {
 		if (res->flags & IORESOURCE_ROM_COPY) {
 			*size = pci_resource_len(pdev, PCI_ROM_RESOURCE);
-			return (void __iomem *)pci_resource_start(pdev,
-							     PCI_ROM_RESOURCE);
+			return (void __iomem *)(unsigned long)
+				pci_resource_start(pdev, PCI_ROM_RESOURCE);
 		} else {
 			/* assign the ROM an address if it doesn't have one */
 			if (res->parent == NULL &&
@@ -170,11 +170,11 @@ void __iomem *pci_map_rom_copy(struct pci_dev *pdev, size_t *size)
 		return rom;
 
 	res->end = res->start + *size;
-	memcpy_fromio((void*)res->start, rom, *size);
+	memcpy_fromio((void*)(unsigned long)res->start, rom, *size);
 	pci_unmap_rom(pdev, rom);
 	res->flags |= IORESOURCE_ROM_COPY;
 
-	return (void __iomem *)res->start;
+	return (void __iomem *)(unsigned long)res->start;
 }
 
 /**
@@ -227,7 +227,7 @@ void pci_cleanup_rom(struct pci_dev *pdev)
 {
 	struct resource *res = &pdev->resource[PCI_ROM_RESOURCE];
 	if (res->flags & IORESOURCE_ROM_COPY) {
-		kfree((void*)res->start);
+		kfree((void*)(unsigned long)res->start);
 		res->flags &= ~IORESOURCE_ROM_COPY;
 		res->start = 0;
 		res->end = 0;
diff --git a/drivers/pci/setup-res.c b/drivers/pci/setup-res.c
index f5ff0d3ba620..ab78e4bbdd83 100644
--- a/drivers/pci/setup-res.c
+++ b/drivers/pci/setup-res.c
@@ -121,7 +121,7 @@ int pci_assign_resource(struct pci_dev *dev, int resno)
 {
 	struct pci_bus *bus = dev->bus;
 	struct resource *res = dev->resource + resno;
-	unsigned long size, min, align;
+	resource_size_t size, min, align;
 	int ret;
 
 	size = res->end - res->start + 1;
@@ -209,7 +209,7 @@ pdev_sort_resources(struct pci_dev *dev, struct resource_list *head)
 	for (i = 0; i < PCI_NUM_RESOURCES; i++) {
 		struct resource *r;
 		struct resource_list *list, *tmp;
-		unsigned long r_align;
+		resource_size_t r_align;
 
 		r = &dev->resource[i];
 		r_align = r->end - r->start;
@@ -225,7 +225,7 @@ pdev_sort_resources(struct pci_dev *dev, struct resource_list *head)
 		}
 		r_align = (i < PCI_BRIDGE_RESOURCES) ? r_align + 1 : r->start;
 		for (list = head; ; list = list->next) {
-			unsigned long align = 0;
+			resource_size_t align = 0;
 			struct resource_list *ln = list->next;
 			int idx;
 
diff --git a/include/asm-arm/mach/pci.h b/include/asm-arm/mach/pci.h
index 25d540ed0079..923e0ca66200 100644
--- a/include/asm-arm/mach/pci.h
+++ b/include/asm-arm/mach/pci.h
@@ -28,7 +28,7 @@ struct hw_pci {
 struct pci_sys_data {
 	struct list_head node;
 	int		busnr;		/* primary bus number			*/
-	unsigned long	mem_offset;	/* bus->cpu memory mapping offset	*/
+	u64		mem_offset;	/* bus->cpu memory mapping offset	*/
 	unsigned long	io_offset;	/* bus->cpu IO mapping offset		*/
 	struct pci_bus	*bus;		/* PCI bus				*/
 	struct resource *resource[3];	/* Primary PCI bus resources		*/
diff --git a/include/asm-powerpc/pci.h b/include/asm-powerpc/pci.h
index 5d2c9e6c4be2..46afd29b904e 100644
--- a/include/asm-powerpc/pci.h
+++ b/include/asm-powerpc/pci.h
@@ -242,7 +242,7 @@ extern pgprot_t	pci_phys_mem_access_prot(struct file *file,
 #define HAVE_ARCH_PCI_RESOURCE_TO_USER
 extern void pci_resource_to_user(const struct pci_dev *dev, int bar,
 				 const struct resource *rsrc,
-				 u64 *start, u64 *end);
+				 resource_size_t *start, resource_size_t *end);
 #endif /* CONFIG_PPC_MULTIPLATFORM || CONFIG_PPC32 */
 
 #endif	/* __KERNEL__ */
diff --git a/include/asm-ppc/pci.h b/include/asm-ppc/pci.h
index 61434edbad7b..11ffaaa5da16 100644
--- a/include/asm-ppc/pci.h
+++ b/include/asm-ppc/pci.h
@@ -133,7 +133,7 @@ extern pgprot_t	pci_phys_mem_access_prot(struct file *file,
 #define HAVE_ARCH_PCI_RESOURCE_TO_USER
 extern void pci_resource_to_user(const struct pci_dev *dev, int bar,
 				 const struct resource *rsrc,
-				 u64 *start, u64 *end);
+				 resource_size_t *start, resource_size_t *end);
 
 
 #endif	/* __KERNEL__ */
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 62a8c22f5f60..983fca251b25 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -404,8 +404,8 @@ int pcibios_enable_device(struct pci_dev *, int mask);
 char *pcibios_setup (char *str);
 
 /* Used only when drivers/pci/setup.c is used */
-void pcibios_align_resource(void *, struct resource *,
-			    unsigned long, unsigned long);
+void pcibios_align_resource(void *, struct resource *, resource_size_t,
+				resource_size_t);
 void pcibios_update_irq(struct pci_dev *, int irq);
 
 /* Generic PCI functions used internally */
@@ -532,10 +532,10 @@ void pci_release_region(struct pci_dev *, int);
 
 /* drivers/pci/bus.c */
 int pci_bus_alloc_resource(struct pci_bus *bus, struct resource *res,
-			   unsigned long size, unsigned long align,
-			   unsigned long min, unsigned int type_mask,
+			   resource_size_t size, resource_size_t align,
+			   resource_size_t min, unsigned int type_mask,
 			   void (*alignf)(void *, struct resource *,
-					  unsigned long, unsigned long),
+					  resource_size_t, resource_size_t),
 			   void *alignf_data);
 void pci_enable_bridges(struct pci_bus *bus);
 
@@ -730,7 +730,8 @@ static inline char *pci_name(struct pci_dev *pdev)
  */
 #ifndef HAVE_ARCH_PCI_RESOURCE_TO_USER
 static inline void pci_resource_to_user(const struct pci_dev *dev, int bar,
-                const struct resource *rsrc, u64 *start, u64 *end)
+                const struct resource *rsrc, resource_size_t *start,
+		resource_size_t *end)
 {
 	*start = rsrc->start;
 	*end = rsrc->end;
-- 
cgit v1.2.3


From b60ba8343b78b182c03cf239d4342785376c1ad1 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@suse.de>
Date: Mon, 12 Jun 2006 17:07:07 -0700
Subject: [PATCH] 64bit resource: change pnp core to use resource_size_t

Based on a patch series originally from Vivek Goyal <vgoyal@in.ibm.com>

Cc: Vivek Goyal <vgoyal@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/pnp/interface.c |  8 ++++----
 drivers/pnp/manager.c   | 15 ++++++++++-----
 drivers/pnp/resource.c  |  8 ++++----
 include/linux/pnp.h     |  7 +++++--
 4 files changed, 23 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pnp/interface.c b/drivers/pnp/interface.c
index a2d8ce7fef9c..3163e3d73da1 100644
--- a/drivers/pnp/interface.c
+++ b/drivers/pnp/interface.c
@@ -264,7 +264,7 @@ static ssize_t pnp_show_current_resources(struct device *dmdev, struct device_at
 			if (pnp_port_flags(dev, i) & IORESOURCE_DISABLED)
 				pnp_printf(buffer," disabled\n");
 			else
-				pnp_printf(buffer," 0x%lx-0x%lx\n",
+				pnp_printf(buffer," 0x%llx-0x%llx\n",
 						pnp_port_start(dev, i),
 						pnp_port_end(dev, i));
 		}
@@ -275,7 +275,7 @@ static ssize_t pnp_show_current_resources(struct device *dmdev, struct device_at
 			if (pnp_mem_flags(dev, i) & IORESOURCE_DISABLED)
 				pnp_printf(buffer," disabled\n");
 			else
-				pnp_printf(buffer," 0x%lx-0x%lx\n",
+				pnp_printf(buffer," 0x%llx-0x%llx\n",
 						pnp_mem_start(dev, i),
 						pnp_mem_end(dev, i));
 		}
@@ -286,7 +286,7 @@ static ssize_t pnp_show_current_resources(struct device *dmdev, struct device_at
 			if (pnp_irq_flags(dev, i) & IORESOURCE_DISABLED)
 				pnp_printf(buffer," disabled\n");
 			else
-				pnp_printf(buffer," %ld\n",
+				pnp_printf(buffer," %lld\n",
 						pnp_irq(dev, i));
 		}
 	}
@@ -296,7 +296,7 @@ static ssize_t pnp_show_current_resources(struct device *dmdev, struct device_at
 			if (pnp_dma_flags(dev, i) & IORESOURCE_DISABLED)
 				pnp_printf(buffer," disabled\n");
 			else
-				pnp_printf(buffer," %ld\n",
+				pnp_printf(buffer," %lld\n",
 						pnp_dma(dev, i));
 		}
 	}
diff --git a/drivers/pnp/manager.c b/drivers/pnp/manager.c
index 6fff109bdab6..1d7a5b87f4cb 100644
--- a/drivers/pnp/manager.c
+++ b/drivers/pnp/manager.c
@@ -20,7 +20,8 @@ DECLARE_MUTEX(pnp_res_mutex);
 
 static int pnp_assign_port(struct pnp_dev *dev, struct pnp_port *rule, int idx)
 {
-	unsigned long *start, *end, *flags;
+	resource_size_t *start, *end;
+	unsigned long *flags;
 
 	if (!dev || !rule)
 		return -EINVAL;
@@ -63,7 +64,8 @@ static int pnp_assign_port(struct pnp_dev *dev, struct pnp_port *rule, int idx)
 
 static int pnp_assign_mem(struct pnp_dev *dev, struct pnp_mem *rule, int idx)
 {
-	unsigned long *start, *end, *flags;
+	resource_size_t *start, *end;
+	unsigned long *flags;
 
 	if (!dev || !rule)
 		return -EINVAL;
@@ -116,7 +118,8 @@ static int pnp_assign_mem(struct pnp_dev *dev, struct pnp_mem *rule, int idx)
 
 static int pnp_assign_irq(struct pnp_dev * dev, struct pnp_irq *rule, int idx)
 {
-	unsigned long *start, *end, *flags;
+	resource_size_t *start, *end;
+	unsigned long *flags;
 	int i;
 
 	/* IRQ priority: this table is good for i386 */
@@ -168,7 +171,8 @@ static int pnp_assign_irq(struct pnp_dev * dev, struct pnp_irq *rule, int idx)
 
 static int pnp_assign_dma(struct pnp_dev *dev, struct pnp_dma *rule, int idx)
 {
-	unsigned long *start, *end, *flags;
+	resource_size_t *start, *end;
+	unsigned long *flags;
 	int i;
 
 	/* DMA priority: this table is good for i386 */
@@ -582,7 +586,8 @@ int pnp_disable_dev(struct pnp_dev *dev)
  * @size: size of region
  *
  */
-void pnp_resource_change(struct resource *resource, unsigned long start, unsigned long size)
+void pnp_resource_change(struct resource *resource, resource_size_t start,
+				resource_size_t size)
 {
 	if (resource == NULL)
 		return;
diff --git a/drivers/pnp/resource.c b/drivers/pnp/resource.c
index 6ded527169f4..7bb892f58cc0 100644
--- a/drivers/pnp/resource.c
+++ b/drivers/pnp/resource.c
@@ -241,7 +241,7 @@ int pnp_check_port(struct pnp_dev * dev, int idx)
 {
 	int tmp;
 	struct pnp_dev *tdev;
-	unsigned long *port, *end, *tport, *tend;
+	resource_size_t *port, *end, *tport, *tend;
 	port = &dev->res.port_resource[idx].start;
 	end = &dev->res.port_resource[idx].end;
 
@@ -297,7 +297,7 @@ int pnp_check_mem(struct pnp_dev * dev, int idx)
 {
 	int tmp;
 	struct pnp_dev *tdev;
-	unsigned long *addr, *end, *taddr, *tend;
+	resource_size_t *addr, *end, *taddr, *tend;
 	addr = &dev->res.mem_resource[idx].start;
 	end = &dev->res.mem_resource[idx].end;
 
@@ -358,7 +358,7 @@ int pnp_check_irq(struct pnp_dev * dev, int idx)
 {
 	int tmp;
 	struct pnp_dev *tdev;
-	unsigned long * irq = &dev->res.irq_resource[idx].start;
+	resource_size_t * irq = &dev->res.irq_resource[idx].start;
 
 	/* if the resource doesn't exist, don't complain about it */
 	if (cannot_compare(dev->res.irq_resource[idx].flags))
@@ -423,7 +423,7 @@ int pnp_check_dma(struct pnp_dev * dev, int idx)
 #ifndef CONFIG_IA64
 	int tmp;
 	struct pnp_dev *tdev;
-	unsigned long * dma = &dev->res.dma_resource[idx].start;
+	resource_size_t * dma = &dev->res.dma_resource[idx].start;
 
 	/* if the resource doesn't exist, don't complain about it */
 	if (cannot_compare(dev->res.dma_resource[idx].flags))
diff --git a/include/linux/pnp.h b/include/linux/pnp.h
index 93b0959eb40f..ab8a8dd8d64c 100644
--- a/include/linux/pnp.h
+++ b/include/linux/pnp.h
@@ -389,7 +389,8 @@ int pnp_start_dev(struct pnp_dev *dev);
 int pnp_stop_dev(struct pnp_dev *dev);
 int pnp_activate_dev(struct pnp_dev *dev);
 int pnp_disable_dev(struct pnp_dev *dev);
-void pnp_resource_change(struct resource *resource, unsigned long start, unsigned long size);
+void pnp_resource_change(struct resource *resource, resource_size_t start,
+				resource_size_t size);
 
 /* protocol helpers */
 int pnp_is_active(struct pnp_dev * dev);
@@ -434,7 +435,9 @@ static inline int pnp_start_dev(struct pnp_dev *dev) { return -ENODEV; }
 static inline int pnp_stop_dev(struct pnp_dev *dev) { return -ENODEV; }
 static inline int pnp_activate_dev(struct pnp_dev *dev) { return -ENODEV; }
 static inline int pnp_disable_dev(struct pnp_dev *dev) { return -ENODEV; }
-static inline void pnp_resource_change(struct resource *resource, unsigned long start, unsigned long size) { }
+static inline void pnp_resource_change(struct resource *resource,
+					resource_size_t start,
+					resource_size_t size) { }
 
 /* protocol helpers */
 static inline int pnp_is_active(struct pnp_dev * dev) { return 0; }
-- 
cgit v1.2.3


From 6550e07f41ce8473ed684dac54fbfbd42183ffda Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@suse.de>
Date: Mon, 12 Jun 2006 17:11:31 -0700
Subject: [PATCH] 64bit Resource: finally enable 64bit resource sizes

Introduce the Kconfig entry and actually switch to a 64bit value, if
wanted, for resource_size_t.

Based on a patch series originally from Vivek Goyal <vgoyal@in.ibm.com>

Cc: Vivek Goyal <vgoyal@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 arch/i386/Kconfig     | 1 +
 include/linux/types.h | 7 ++++++-
 kernel/resource.c     | 8 +++-----
 mm/Kconfig            | 6 ++++++
 4 files changed, 16 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig
index 47c08bcd9b24..7e46ad7a7b69 100644
--- a/arch/i386/Kconfig
+++ b/arch/i386/Kconfig
@@ -529,6 +529,7 @@ config X86_PAE
 	bool
 	depends on HIGHMEM64G
 	default y
+	select RESOURCES_64BIT
 
 # Common NUMA Features
 config NUMA
diff --git a/include/linux/types.h b/include/linux/types.h
index a021e1577336..3f235660a3cd 100644
--- a/include/linux/types.h
+++ b/include/linux/types.h
@@ -178,9 +178,14 @@ typedef __u64 __bitwise __be64;
 #ifdef __KERNEL__
 typedef unsigned __bitwise__ gfp_t;
 
-typedef unsigned long resource_size_t;
+#ifdef CONFIG_RESOURCES_64BIT
+typedef u64 resource_size_t;
+#else
+typedef u32 resource_size_t;
 #endif
 
+#endif	/* __KERNEL__ */
+
 struct ustat {
 	__kernel_daddr_t	f_tfree;
 	__kernel_ino_t		f_tinode;
diff --git a/kernel/resource.c b/kernel/resource.c
index 54835c02ab37..cc73029088a7 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -23,20 +23,18 @@
 
 struct resource ioport_resource = {
 	.name	= "PCI IO",
-	.start	= 0x0000,
+	.start	= 0,
 	.end	= IO_SPACE_LIMIT,
 	.flags	= IORESOURCE_IO,
 };
-
 EXPORT_SYMBOL(ioport_resource);
 
 struct resource iomem_resource = {
 	.name	= "PCI mem",
-	.start	= 0UL,
-	.end	= ~0UL,
+	.start	= 0,
+	.end	= -1,
 	.flags	= IORESOURCE_MEM,
 };
-
 EXPORT_SYMBOL(iomem_resource);
 
 static DEFINE_RWLOCK(resource_lock);
diff --git a/mm/Kconfig b/mm/Kconfig
index 66e65ab39426..e3644b0062b1 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -145,3 +145,9 @@ config MIGRATION
 	  while the virtual addresses are not changed. This is useful for
 	  example on NUMA systems to put pages nearer to the processors accessing
 	  the page.
+
+config RESOURCES_64BIT
+	bool "64 bit Memory and IO resources (EXPERIMENTAL)" if (!64BIT && EXPERIMENTAL)
+	default 64BIT
+	help
+	  This option allows memory and IO resources to be 64 bit.
-- 
cgit v1.2.3


From bc02af93dd2bbddce1b55e0a493f833a1b7cf140 Mon Sep 17 00:00:00 2001
From: Yasunori Goto <y-goto@jp.fujitsu.com>
Date: Tue, 27 Jun 2006 02:53:30 -0700
Subject: [PATCH] pgdat allocation for new node add (specify node id)

Change the name of old add_memory() to arch_add_memory.  And use node id to
get pgdat for the node at NODE_DATA().

Note: Powerpc's old add_memory() is defined as __devinit. However,
      add_memory() is usually called only after bootup.
      I suppose it may be redundant. But, I'm not well known about powerpc.
      So, I keep it. (But, __meminit is better at least.)

Signed-off-by: Yasunori Goto <y-goto@jp.fujitsu.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: "Brown, Len" <len.brown@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/mm/init.c            |  2 +-
 arch/ia64/mm/init.c            |  5 ++--
 arch/powerpc/mm/mem.c          | 11 +++++--
 arch/x86_64/mm/init.c          | 66 ++++++++++++++++++++++++------------------
 drivers/acpi/Kconfig           |  2 +-
 drivers/acpi/acpi_memhotplug.c |  3 +-
 drivers/base/memory.c          |  4 ++-
 include/linux/memory_hotplug.h | 13 ++++++++-
 mm/memory_hotplug.c            | 11 +++++++
 9 files changed, 78 insertions(+), 39 deletions(-)

(limited to 'include/linux')

diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c
index bf19513f0cea..468592531793 100644
--- a/arch/i386/mm/init.c
+++ b/arch/i386/mm/init.c
@@ -654,7 +654,7 @@ void __init mem_init(void)
  */
 #ifdef CONFIG_MEMORY_HOTPLUG
 #ifndef CONFIG_NEED_MULTIPLE_NODES
-int add_memory(u64 start, u64 size)
+int arch_add_memory(int nid, u64 start, u64 size)
 {
 	struct pglist_data *pgdata = &contig_page_data;
 	struct zone *zone = pgdata->node_zones + MAX_NR_ZONES-1;
diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
index 11f08001f8c2..38306e98f04b 100644
--- a/arch/ia64/mm/init.c
+++ b/arch/ia64/mm/init.c
@@ -652,7 +652,7 @@ void online_page(struct page *page)
 	num_physpages++;
 }
 
-int add_memory(u64 start, u64 size)
+int arch_add_memory(int nid, u64 start, u64 size)
 {
 	pg_data_t *pgdat;
 	struct zone *zone;
@@ -660,7 +660,7 @@ int add_memory(u64 start, u64 size)
 	unsigned long nr_pages = size >> PAGE_SHIFT;
 	int ret;
 
-	pgdat = NODE_DATA(0);
+	pgdat = NODE_DATA(nid);
 
 	zone = pgdat->node_zones + ZONE_NORMAL;
 	ret = __add_pages(zone, start_pfn, nr_pages);
@@ -671,7 +671,6 @@ int add_memory(u64 start, u64 size)
 
 	return ret;
 }
-EXPORT_SYMBOL_GPL(add_memory);
 
 int remove_memory(u64 start, u64 size)
 {
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 69f3b9a20beb..089d939a0b3e 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -114,15 +114,20 @@ void online_page(struct page *page)
 	num_physpages++;
 }
 
-int __devinit add_memory(u64 start, u64 size)
+#ifdef CONFIG_NUMA
+int memory_add_physaddr_to_nid(u64 start)
+{
+	return hot_add_scn_to_nid(start);
+}
+#endif
+
+int __devinit arch_add_memory(int nid, u64 start, u64 size)
 {
 	struct pglist_data *pgdata;
 	struct zone *zone;
-	int nid;
 	unsigned long start_pfn = start >> PAGE_SHIFT;
 	unsigned long nr_pages = size >> PAGE_SHIFT;
 
-	nid = hot_add_scn_to_nid(start);
 	pgdata = NODE_DATA(nid);
 
 	start = (unsigned long)__va(start);
diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c
index 02add1d1dfa8..51fbf3eddf13 100644
--- a/arch/x86_64/mm/init.c
+++ b/arch/x86_64/mm/init.c
@@ -506,8 +506,6 @@ void __init clear_kernel_mapping(unsigned long address, unsigned long size)
 /*
  * Memory hotplug specific functions
  */
-#if defined(CONFIG_ACPI_HOTPLUG_MEMORY) || defined(CONFIG_ACPI_HOTPLUG_MEMORY_MODULE)
-
 void online_page(struct page *page)
 {
 	ClearPageReserved(page);
@@ -517,31 +515,17 @@ void online_page(struct page *page)
 	num_physpages++;
 }
 
-#ifndef CONFIG_MEMORY_HOTPLUG
+#ifdef CONFIG_MEMORY_HOTPLUG
 /*
- * Memory Hotadd without sparsemem. The mem_maps have been allocated in advance,
- * just online the pages.
+ * XXX: memory_add_physaddr_to_nid() is to find node id from physical address
+ *	via probe interface of sysfs. If acpi notifies hot-add event, then it
+ *	can tell node id by searching dsdt. But, probe interface doesn't have
+ *	node id. So, return 0 as node id at this time.
  */
-int __add_pages(struct zone *z, unsigned long start_pfn, unsigned long nr_pages)
+#ifdef CONFIG_NUMA
+int memory_add_physaddr_to_nid(u64 start)
 {
-	int err = -EIO;
-	unsigned long pfn;
-	unsigned long total = 0, mem = 0;
-	for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) {
-		if (pfn_valid(pfn)) {
-			online_page(pfn_to_page(pfn));
-			err = 0;
-			mem++;
-		}
-		total++;
-	}
-	if (!err) {
-		z->spanned_pages += total;
-		z->present_pages += mem;
-		z->zone_pgdat->node_spanned_pages += total;
-		z->zone_pgdat->node_present_pages += mem;
-	}
-	return err;
+	return 0;
 }
 #endif
 
@@ -549,9 +533,9 @@ int __add_pages(struct zone *z, unsigned long start_pfn, unsigned long nr_pages)
  * Memory is added always to NORMAL zone. This means you will never get
  * additional DMA/DMA32 memory.
  */
-int add_memory(u64 start, u64 size)
+int arch_add_memory(int nid, u64 start, u64 size)
 {
-	struct pglist_data *pgdat = NODE_DATA(0);
+	struct pglist_data *pgdat = NODE_DATA(nid);
 	struct zone *zone = pgdat->node_zones + MAX_NR_ZONES-2;
 	unsigned long start_pfn = start >> PAGE_SHIFT;
 	unsigned long nr_pages = size >> PAGE_SHIFT;
@@ -568,7 +552,7 @@ error:
 	printk("%s: Problem encountered in __add_pages!\n", __func__);
 	return ret;
 }
-EXPORT_SYMBOL_GPL(add_memory);
+EXPORT_SYMBOL_GPL(arch_add_memory);
 
 int remove_memory(u64 start, u64 size)
 {
@@ -576,7 +560,33 @@ int remove_memory(u64 start, u64 size)
 }
 EXPORT_SYMBOL_GPL(remove_memory);
 
-#endif
+#else /* CONFIG_MEMORY_HOTPLUG */
+/*
+ * Memory Hotadd without sparsemem. The mem_maps have been allocated in advance,
+ * just online the pages.
+ */
+int __add_pages(struct zone *z, unsigned long start_pfn, unsigned long nr_pages)
+{
+	int err = -EIO;
+	unsigned long pfn;
+	unsigned long total = 0, mem = 0;
+	for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) {
+		if (pfn_valid(pfn)) {
+			online_page(pfn_to_page(pfn));
+			err = 0;
+			mem++;
+		}
+		total++;
+	}
+	if (!err) {
+		z->spanned_pages += total;
+		z->present_pages += mem;
+		z->zone_pgdat->node_spanned_pages += total;
+		z->zone_pgdat->node_present_pages += mem;
+	}
+	return err;
+}
+#endif /* CONFIG_MEMORY_HOTPLUG */
 
 static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules,
 			 kcore_vsyscall;
diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig
index 94b8d820c512..610d2cc02cf8 100644
--- a/drivers/acpi/Kconfig
+++ b/drivers/acpi/Kconfig
@@ -328,7 +328,7 @@ config ACPI_CONTAINER
 config ACPI_HOTPLUG_MEMORY
 	tristate "Memory Hotplug"
 	depends on ACPI
-	depends on MEMORY_HOTPLUG || X86_64
+	depends on MEMORY_HOTPLUG
 	default n
 	help
 	  This driver adds supports for ACPI Memory Hotplug.  This driver
diff --git a/drivers/acpi/acpi_memhotplug.c b/drivers/acpi/acpi_memhotplug.c
index 5652569b3762..0424326eae15 100644
--- a/drivers/acpi/acpi_memhotplug.c
+++ b/drivers/acpi/acpi_memhotplug.c
@@ -215,6 +215,7 @@ static int acpi_memory_enable_device(struct acpi_memory_device *mem_device)
 {
 	int result, num_enabled = 0;
 	struct acpi_memory_info *info;
+	int node = 0;
 
 	ACPI_FUNCTION_TRACE("acpi_memory_enable_device");
 
@@ -245,7 +246,7 @@ static int acpi_memory_enable_device(struct acpi_memory_device *mem_device)
 			continue;
 		}
 
-		result = add_memory(info->start_addr, info->length);
+		result = add_memory(node, info->start_addr, info->length);
 		if (result)
 			continue;
 		info->enabled = 1;
diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index dd547af4681a..c6b7d9c4b651 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -306,11 +306,13 @@ static ssize_t
 memory_probe_store(struct class *class, const char *buf, size_t count)
 {
 	u64 phys_addr;
+	int nid;
 	int ret;
 
 	phys_addr = simple_strtoull(buf, NULL, 0);
 
-	ret = add_memory(phys_addr, PAGES_PER_SECTION << PAGE_SHIFT);
+	nid = memory_add_physaddr_to_nid(phys_addr);
+	ret = add_memory(nid, phys_addr, PAGES_PER_SECTION << PAGE_SHIFT);
 
 	if (ret)
 		count = ret;
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 911206386171..29c1472efad0 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -63,6 +63,16 @@ extern int online_pages(unsigned long, unsigned long);
 /* reasonably generic interface to expand the physical pages in a zone  */
 extern int __add_pages(struct zone *zone, unsigned long start_pfn,
 	unsigned long nr_pages);
+
+#ifdef CONFIG_NUMA
+extern int memory_add_physaddr_to_nid(u64 start);
+#else
+static inline int memory_add_physaddr_to_nid(u64 start)
+{
+	return 0;
+}
+#endif
+
 #else /* ! CONFIG_MEMORY_HOTPLUG */
 /*
  * Stub functions for when hotplug is off
@@ -99,7 +109,8 @@ static inline int __remove_pages(struct zone *zone, unsigned long start_pfn,
 	return -ENOSYS;
 }
 
-extern int add_memory(u64 start, u64 size);
+extern int add_memory(int nid, u64 start, u64 size);
+extern int arch_add_memory(int nid, u64 start, u64 size);
 extern int remove_memory(u64 start, u64 size);
 
 #endif /* __LINUX_MEMORY_HOTPLUG_H */
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 841a077d5aeb..6cdeabe9f6d4 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -163,3 +163,14 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
 	vm_total_pages = nr_free_pagecache_pages();
 	return 0;
 }
+
+int add_memory(int nid, u64 start, u64 size)
+{
+	int ret;
+
+	/* call arch's memory hotadd */
+	ret = arch_add_memory(nid, start, size);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(add_memory);
-- 
cgit v1.2.3


From 1e3590e2e4a38e8390fdac5bda23330bf2801838 Mon Sep 17 00:00:00 2001
From: Yasunori Goto <y-goto@jp.fujitsu.com>
Date: Tue, 27 Jun 2006 02:53:31 -0700
Subject: [PATCH] pgdat allocation for new node add (get node id by acpi)

This is to find node id from acpi's handle of memory_device in DSDT.  _PXM for
the new node can be found by acpi_get_pxm() by using new memory's handle.  So,
node id can be found by pxm_to_nid_map[].

  This patch becomes simpler than v2 of node hot-add patch.
  Because old add_memory() function doesn't have node id parameter.
  So, kernel must find its handle by physical address via DSDT again.
  But, v3 just give node id to add_memory() now.

Signed-off-by: Yasunori Goto <y-goto@jp.fujitsu.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: "Brown, Len" <len.brown@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/acpi/acpi_memhotplug.c |  3 ++-
 drivers/acpi/numa.c            | 15 ++++++++++++++-
 include/linux/acpi.h           |  6 ++++++
 3 files changed, 22 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/acpi/acpi_memhotplug.c b/drivers/acpi/acpi_memhotplug.c
index 0424326eae15..1012284ff4f7 100644
--- a/drivers/acpi/acpi_memhotplug.c
+++ b/drivers/acpi/acpi_memhotplug.c
@@ -215,7 +215,7 @@ static int acpi_memory_enable_device(struct acpi_memory_device *mem_device)
 {
 	int result, num_enabled = 0;
 	struct acpi_memory_info *info;
-	int node = 0;
+	int node;
 
 	ACPI_FUNCTION_TRACE("acpi_memory_enable_device");
 
@@ -228,6 +228,7 @@ static int acpi_memory_enable_device(struct acpi_memory_device *mem_device)
 		return result;
 	}
 
+	node = acpi_get_node(mem_device->handle);
 	/*
 	 * Tell the VM there is more memory here...
 	 * Note: Assume that this function returns zero on success
diff --git a/drivers/acpi/numa.c b/drivers/acpi/numa.c
index e2c1a16078c9..13d6d5bdea26 100644
--- a/drivers/acpi/numa.c
+++ b/drivers/acpi/numa.c
@@ -254,5 +254,18 @@ int acpi_get_pxm(acpi_handle h)
 	} while (ACPI_SUCCESS(status));
 	return -1;
 }
-
 EXPORT_SYMBOL(acpi_get_pxm);
+
+int acpi_get_node(acpi_handle *handle)
+{
+	int pxm, node = -1;
+
+	ACPI_FUNCTION_TRACE("acpi_get_node");
+
+	pxm = acpi_get_pxm(handle);
+	if (pxm >= 0)
+		node = acpi_map_pxm_to_node(pxm);
+
+	return_VALUE(node);
+}
+EXPORT_SYMBOL(acpi_get_node);
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 90d6df1551ed..88b5dfd8ee12 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -528,12 +528,18 @@ static inline void acpi_set_cstate_limit(unsigned int new_limit) { return; }
 
 #ifdef CONFIG_ACPI_NUMA
 int acpi_get_pxm(acpi_handle handle);
+int acpi_get_node(acpi_handle *handle);
 #else
 static inline int acpi_get_pxm(acpi_handle handle)
 {
 	return 0;
 }
+static inline int acpi_get_node(acpi_handle *handle)
+{
+	return 0;
+}
 #endif
+extern int acpi_paddr_to_node(u64 start_addr, u64 size);
 
 extern int pnpacpi_disabled;
 
-- 
cgit v1.2.3


From 306d6cbe86e2e6603ac3162e1294d5c75cfdeca6 Mon Sep 17 00:00:00 2001
From: Yasunori Goto <y-goto@jp.fujitsu.com>
Date: Tue, 27 Jun 2006 02:53:32 -0700
Subject: [PATCH] pgdat allocation for new node add (generic alloc node_data)

For node hotplug, basically we have to allocate new pgdat.  But, there are
several types of implementations of pgdat.

1. Allocate only pgdat.
   This style allocate only pgdat area.
   And its address is recorded in node_data[].
   It is most popular style.

2. Static array of pgdat
   In this case, all of pgdats are static array.
   Some archs use this style.

3. Allocate not only pgdat, but also per node data.
   To increase performance, each node has copy of some data as
   a per node data. So, this area must be allocated too.

   Ia64 is this style. Ia64 has the copies of node_data[] array
   on each per node data to increase performance.

In this series of patches, treat (1) as generic arch.

generic archs can use generic function. (2) and (3) should have
its own if necessary.

This patch defines pgdat allocator.
Updating NODE_DATA() macro function is in other patch.

Signed-off-by: Yasonori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: "Brown, Len" <len.brown@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/memory_hotplug.h | 55 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 55 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 29c1472efad0..c6fd2c0323fc 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -73,6 +73,61 @@ static inline int memory_add_physaddr_to_nid(u64 start)
 }
 #endif
 
+#ifdef CONFIG_HAVE_ARCH_NODEDATA_EXTENSION
+/*
+ * For supporting node-hotadd, we have to allocate a new pgdat.
+ *
+ * If an arch has generic style NODE_DATA(),
+ * node_data[nid] = kzalloc() works well. But it depends on the architecture.
+ *
+ * In general, generic_alloc_nodedata() is used.
+ * Now, arch_free_nodedata() is just defined for error path of node_hot_add.
+ *
+ */
+static inline pg_data_t *arch_alloc_nodedata(int nid)
+{
+	return NULL;
+}
+static inline void arch_free_nodedata(pg_data_t *pgdat)
+{
+}
+
+#else /* CONFIG_HAVE_ARCH_NODEDATA_EXTENSION */
+
+#define arch_alloc_nodedata(nid)	generic_alloc_nodedata(nid)
+#define arch_free_nodedata(pgdat)	generic_free_nodedata(pgdat)
+
+#ifdef CONFIG_NUMA
+/*
+ * If ARCH_HAS_NODEDATA_EXTENSION=n, this func is used to allocate pgdat.
+ * XXX: kmalloc_node() can't work well to get new node's memory at this time.
+ *	Because, pgdat for the new node is not allocated/initialized yet itself.
+ *	To use new node's memory, more consideration will be necessary.
+ */
+#define generic_alloc_nodedata(nid)				\
+({								\
+	kzalloc(sizeof(pg_data_t), GFP_KERNEL);			\
+})
+/*
+ * This definition is just for error path in node hotadd.
+ * For node hotremove, we have to replace this.
+ */
+#define generic_free_nodedata(pgdat)	kfree(pgdat)
+
+#else /* !CONFIG_NUMA */
+
+/* never called */
+static inline pg_data_t *generic_alloc_nodedata(int nid)
+{
+	BUG();
+	return NULL;
+}
+static inline void generic_free_nodedata(pg_data_t *pgdat)
+{
+}
+#endif /* CONFIG_NUMA */
+#endif /* CONFIG_HAVE_ARCH_NODEDATA_EXTENSION */
+
 #else /* ! CONFIG_MEMORY_HOTPLUG */
 /*
  * Stub functions for when hotplug is off
-- 
cgit v1.2.3


From 10ad400b49aca15ecf83b0fde7e35e4064b15c85 Mon Sep 17 00:00:00 2001
From: Yasunori Goto <y-goto@jp.fujitsu.com>
Date: Tue, 27 Jun 2006 02:53:33 -0700
Subject: [PATCH] pgdat allocation for new node add (refresh node_data[])

Refresh NODE_DATA() for generic archs.  In this case, NODE_DATA(nid) ==
node_data[nid].  node_data[] is array of address of pgdat.  So, refresh is
quite simple.

Signed-off-by: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: "Brown, Len" <len.brown@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/ia64/Kconfig              |  4 ++++
 include/linux/memory_hotplug.h | 12 ++++++++++++
 2 files changed, 16 insertions(+)

(limited to 'include/linux')

diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index 18318749884b..a56df7bf022d 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -374,6 +374,10 @@ config HAVE_ARCH_EARLY_PFN_TO_NID
 	def_bool y
 	depends on NEED_MULTIPLE_NODES
 
+config HAVE_ARCH_NODEDATA_EXTENSION
+	def_bool y
+	depends on NUMA
+
 config IA32_SUPPORT
 	bool "Support for Linux/x86 binaries"
 	help
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index c6fd2c0323fc..569b1f6c27d1 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -91,6 +91,9 @@ static inline pg_data_t *arch_alloc_nodedata(int nid)
 static inline void arch_free_nodedata(pg_data_t *pgdat)
 {
 }
+static inline void arch_refresh_nodedata(int nid, pg_data_t *pgdat)
+{
+}
 
 #else /* CONFIG_HAVE_ARCH_NODEDATA_EXTENSION */
 
@@ -114,6 +117,12 @@ static inline void arch_free_nodedata(pg_data_t *pgdat)
  */
 #define generic_free_nodedata(pgdat)	kfree(pgdat)
 
+extern pg_data_t *node_data[];
+static inline void arch_refresh_nodedata(int nid, pg_data_t *pgdat)
+{
+	node_data[nid] = pgdat;
+}
+
 #else /* !CONFIG_NUMA */
 
 /* never called */
@@ -125,6 +134,9 @@ static inline pg_data_t *generic_alloc_nodedata(int nid)
 static inline void generic_free_nodedata(pg_data_t *pgdat)
 {
 }
+static inline void arch_refresh_nodedata(int nid, pg_data_t *pgdat)
+{
+}
 #endif /* CONFIG_NUMA */
 #endif /* CONFIG_HAVE_ARCH_NODEDATA_EXTENSION */
 
-- 
cgit v1.2.3


From 3218ae14b1e3ee2ab81df30ed690c8e864d23316 Mon Sep 17 00:00:00 2001
From: Yasunori Goto <y-goto@jp.fujitsu.com>
Date: Tue, 27 Jun 2006 02:53:33 -0700
Subject: [PATCH] pgdat allocation for new node add (export kswapd start func)

When node is hot-added, kswapd for the node should start.  This export kswapd
start function as kswapd_run() to use at add_memory().

[akpm@osdl.org: daemonize() isn't needed when using the kthread API]
Signed-off-by: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: "Brown, Len" <len.brown@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/swap.h |  2 ++
 mm/vmscan.c          | 37 ++++++++++++++++++++++++++-----------
 2 files changed, 28 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index dc3f3aa0c83e..c41e2d6d1acc 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -199,6 +199,8 @@ static inline int zone_reclaim(struct zone *z, gfp_t mask, unsigned int order)
 }
 #endif
 
+extern int kswapd_run(int nid);
+
 #ifdef CONFIG_MMU
 /* linux/mm/shmem.c */
 extern int shmem_unuse(swp_entry_t entry, struct page *page);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 72babac71dea..f03da33d9147 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -34,6 +34,7 @@
 #include <linux/notifier.h>
 #include <linux/rwsem.h>
 #include <linux/delay.h>
+#include <linux/kthread.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -1223,7 +1224,6 @@ static int kswapd(void *p)
 	};
 	cpumask_t cpumask;
 
-	daemonize("kswapd%d", pgdat->node_id);
 	cpumask = node_to_cpumask(pgdat->node_id);
 	if (!cpus_empty(cpumask))
 		set_cpus_allowed(tsk, cpumask);
@@ -1468,20 +1468,35 @@ static int cpu_callback(struct notifier_block *nfb,
 }
 #endif /* CONFIG_HOTPLUG_CPU */
 
+/*
+ * This kswapd start function will be called by init and node-hot-add.
+ * On node-hot-add, kswapd will moved to proper cpus if cpus are hot-added.
+ */
+int kswapd_run(int nid)
+{
+	pg_data_t *pgdat = NODE_DATA(nid);
+	int ret = 0;
+
+	if (pgdat->kswapd)
+		return 0;
+
+	pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
+	if (IS_ERR(pgdat->kswapd)) {
+		/* failure at boot is fatal */
+		BUG_ON(system_state == SYSTEM_BOOTING);
+		printk("Failed to start kswapd on node %d\n",nid);
+		ret = -1;
+	}
+	return ret;
+}
+
 static int __init kswapd_init(void)
 {
-	pg_data_t *pgdat;
+	int nid;
 
 	swap_setup();
-	for_each_online_pgdat(pgdat) {
-		pid_t pid;
-
-		pid = kernel_thread(kswapd, pgdat, CLONE_KERNEL);
-		BUG_ON(pid < 0);
-		read_lock(&tasklist_lock);
-		pgdat->kswapd = find_task_by_pid(pid);
-		read_unlock(&tasklist_lock);
-	}
+	for_each_online_node(nid)
+ 		kswapd_run(nid);
 	hotcpu_notifier(cpu_callback, 0);
 	return 0;
 }
-- 
cgit v1.2.3


From 2842f11419704f8707fffc82e10d2263427fc130 Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Date: Tue, 27 Jun 2006 02:53:36 -0700
Subject: [PATCH] catch valid mem range at onlining memory

This patch allows hot-add memory which is not aligned to section.

Now, hot-added memory has to be aligned to section size.  Considering big
section sized archs, this is not useful.

When hot-added memory is registerd as iomem resoruce by iomem resource
patch, we can make use of that information to detect valid memory range.

Note: With this, not-aligned memory can be registerd. To allow hot-add
      memory with holes, we have to do more work around add_memory().
      (It doesn't allows add memory to already existing mem section.)

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/ioport.h |  3 +++
 kernel/resource.c      | 38 ++++++++++++++++++++++++++++++++++++++
 mm/memory_hotplug.c    | 28 ++++++++++++++++++++++++----
 3 files changed, 65 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ioport.h b/include/linux/ioport.h
index cd6bd001ba4e..edfc733b1575 100644
--- a/include/linux/ioport.h
+++ b/include/linux/ioport.h
@@ -105,6 +105,9 @@ extern int allocate_resource(struct resource *root, struct resource *new,
 int adjust_resource(struct resource *res, unsigned long start,
 		    unsigned long size);
 
+/* get registered SYSTEM_RAM resources in specified area */
+extern int find_next_system_ram(struct resource *res);
+
 /* Convenience shorthand with allocation */
 #define request_region(start,n,name)	__request_region(&ioport_resource, (start), (n), (name))
 #define request_mem_region(start,n,name) __request_region(&iomem_resource, (start), (n), (name))
diff --git a/kernel/resource.c b/kernel/resource.c
index e3080fcc66a3..2404f9b0bc47 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -232,6 +232,44 @@ int release_resource(struct resource *old)
 
 EXPORT_SYMBOL(release_resource);
 
+#ifdef CONFIG_MEMORY_HOTPLUG
+/*
+ * Finds the lowest memory reosurce exists within [res->start.res->end)
+ * the caller must specify res->start, res->end, res->flags.
+ * If found, returns 0, res is overwritten, if not found, returns -1.
+ */
+int find_next_system_ram(struct resource *res)
+{
+	resource_size_t start, end;
+	struct resource *p;
+
+	BUG_ON(!res);
+
+	start = res->start;
+	end = res->end;
+
+	read_lock(&resource_lock);
+	for (p = iomem_resource.child; p ; p = p->sibling) {
+		/* system ram is just marked as IORESOURCE_MEM */
+		if (p->flags != res->flags)
+			continue;
+		if (p->start > end) {
+			p = NULL;
+			break;
+		}
+		if (p->start >= start)
+			break;
+	}
+	read_unlock(&resource_lock);
+	if (!p)
+		return -1;
+	/* copy data */
+	res->start = p->start;
+	res->end = p->end;
+	return 0;
+}
+#endif
+
 /*
  * Find empty slot in the resource tree given range and alignment.
  */
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 0b11a8543441..f13783e81eb6 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -127,6 +127,9 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
 	unsigned long i;
 	unsigned long flags;
 	unsigned long onlined_pages = 0;
+	struct resource res;
+	u64 section_end;
+	unsigned long start_pfn;
 	struct zone *zone;
 	int need_zonelists_rebuild = 0;
 
@@ -149,10 +152,27 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
 	if (!populated_zone(zone))
 		need_zonelists_rebuild = 1;
 
-	for (i = 0; i < nr_pages; i++) {
-		struct page *page = pfn_to_page(pfn + i);
-		online_page(page);
-		onlined_pages++;
+	res.start = (u64)pfn << PAGE_SHIFT;
+	res.end = res.start + ((u64)nr_pages << PAGE_SHIFT) - 1;
+	res.flags = IORESOURCE_MEM; /* we just need system ram */
+	section_end = res.end;
+
+	while (find_next_system_ram(&res) >= 0) {
+		start_pfn = (unsigned long)(res.start >> PAGE_SHIFT);
+		nr_pages = (unsigned long)
+                           ((res.end + 1 - res.start) >> PAGE_SHIFT);
+
+		if (PageReserved(pfn_to_page(start_pfn))) {
+			/* this region's page is not onlined now */
+			for (i = 0; i < nr_pages; i++) {
+				struct page *page = pfn_to_page(start_pfn + i);
+				online_page(page);
+				onlined_pages++;
+			}
+		}
+
+		res.start = res.end + 1;
+		res.end = section_end;
 	}
 	zone->present_pages += onlined_pages;
 	zone->zone_pgdat->node_present_pages += onlined_pages;
-- 
cgit v1.2.3


From 0fc44159bfcb5b0afa178f9c3f50db23aebc76ff Mon Sep 17 00:00:00 2001
From: Yasunori Goto <y-goto@jp.fujitsu.com>
Date: Tue, 27 Jun 2006 02:53:38 -0700
Subject: [PATCH] Register sysfs file for hotplugged new node

When new node becomes enable by hot-add, new sysfs file must be created for
new node.  So, if new node is enabled by add_memory(), register_one_node() is
called to create it.  In addition, I386's arch_register_node() and a part of
register_nodes() of powerpc are consolidated to register_one_node() as a
generic_code().

This is tested by Tiger4(IPF) with node hot-plug emulation.

Signed-off-by: Keiichiro Tokunaga <tokuanga.keiich@jp.fujitsu.com>
Signed-off-by: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/topology.c |  9 +++------
 arch/ia64/kernel/topology.c | 15 +++------------
 arch/powerpc/kernel/sysfs.c | 15 ++-------------
 drivers/base/node.c         | 25 +++++++++++++++++++++++++
 include/asm-i386/cpu.h      |  2 --
 include/asm-i386/node.h     | 29 -----------------------------
 include/linux/node.h        |  4 ++++
 mm/memory_hotplug.c         | 12 +++++++++++-
 8 files changed, 48 insertions(+), 63 deletions(-)
 delete mode 100644 include/asm-i386/node.h

(limited to 'include/linux')

diff --git a/arch/i386/kernel/topology.c b/arch/i386/kernel/topology.c
index 296355292c7c..1eecc2e1bd4b 100644
--- a/arch/i386/kernel/topology.c
+++ b/arch/i386/kernel/topology.c
@@ -38,7 +38,7 @@ int arch_register_cpu(int num){
 #ifdef CONFIG_NUMA
 	int node = cpu_to_node(num);
 	if (node_online(node))
-		parent = &node_devices[node].node;
+		parent = &node_devices[parent_node(node)];
 #endif /* CONFIG_NUMA */
 
 	/*
@@ -61,7 +61,7 @@ void arch_unregister_cpu(int num) {
 #ifdef CONFIG_NUMA
 	int node = cpu_to_node(num);
 	if (node_online(node))
-		parent = &node_devices[node].node;
+		parent = &node_devices[parent_node(node)];
 #endif /* CONFIG_NUMA */
 
 	return unregister_cpu(&cpu_devices[num].cpu, parent);
@@ -74,16 +74,13 @@ EXPORT_SYMBOL(arch_unregister_cpu);
 
 #ifdef CONFIG_NUMA
 #include <linux/mmzone.h>
-#include <asm/node.h>
-
-struct i386_node node_devices[MAX_NUMNODES];
 
 static int __init topology_init(void)
 {
 	int i;
 
 	for_each_online_node(i)
-		arch_register_node(i);
+		register_one_node(i);
 
 	for_each_present_cpu(i)
 		arch_register_cpu(i);
diff --git a/arch/ia64/kernel/topology.c b/arch/ia64/kernel/topology.c
index 879edb51d1e0..42cb05bdc680 100644
--- a/arch/ia64/kernel/topology.c
+++ b/arch/ia64/kernel/topology.c
@@ -26,9 +26,6 @@
 #include <asm/numa.h>
 #include <asm/cpu.h>
 
-#ifdef CONFIG_NUMA
-static struct node *sysfs_nodes;
-#endif
 static struct ia64_cpu *sysfs_cpus;
 
 int arch_register_cpu(int num)
@@ -36,7 +33,7 @@ int arch_register_cpu(int num)
 	struct node *parent = NULL;
 	
 #ifdef CONFIG_NUMA
-	parent = &sysfs_nodes[cpu_to_node(num)];
+	parent = &node_devices[cpu_to_node(num)];
 #endif /* CONFIG_NUMA */
 
 #if defined (CONFIG_ACPI) && defined (CONFIG_HOTPLUG_CPU)
@@ -59,7 +56,7 @@ void arch_unregister_cpu(int num)
 
 #ifdef CONFIG_NUMA
 	int node = cpu_to_node(num);
-	parent = &sysfs_nodes[node];
+	parent = &node_devices[node];
 #endif /* CONFIG_NUMA */
 
 	return unregister_cpu(&sysfs_cpus[num].cpu, parent);
@@ -74,17 +71,11 @@ static int __init topology_init(void)
 	int i, err = 0;
 
 #ifdef CONFIG_NUMA
-	sysfs_nodes = kzalloc(sizeof(struct node) * MAX_NUMNODES, GFP_KERNEL);
-	if (!sysfs_nodes) {
-		err = -ENOMEM;
-		goto out;
-	}
-
 	/*
 	 * MCD - Do we want to register all ONLINE nodes, or all POSSIBLE nodes?
 	 */
 	for_each_online_node(i) {
-		if ((err = register_node(&sysfs_nodes[i], i, 0)))
+		if ((err = register_one_node(i)))
 			goto out;
 	}
 #endif
diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c
index 5bc2585c8036..338491d1604a 100644
--- a/arch/powerpc/kernel/sysfs.c
+++ b/arch/powerpc/kernel/sysfs.c
@@ -304,23 +304,12 @@ static struct notifier_block sysfs_cpu_nb = {
 /* NUMA stuff */
 
 #ifdef CONFIG_NUMA
-static struct node node_devices[MAX_NUMNODES];
-
 static void register_nodes(void)
 {
 	int i;
 
-	for (i = 0; i < MAX_NUMNODES; i++) {
-		if (node_online(i)) {
-			int p_node = parent_node(i);
-			struct node *parent = NULL;
-
-			if (p_node != i)
-				parent = &node_devices[p_node];
-
-			register_node(&node_devices[i], i, parent);
-		}
-	}
+	for (i = 0; i < MAX_NUMNODES; i++)
+		register_one_node(i);
 }
 
 int sysfs_add_device_to_node(struct sys_device *dev, int nid)
diff --git a/drivers/base/node.c b/drivers/base/node.c
index c80c3aeed004..cbd0f62b4870 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -190,6 +190,31 @@ void unregister_node(struct node *node)
 	sysdev_unregister(&node->sysdev);
 }
 
+struct node node_devices[MAX_NUMNODES];
+
+int register_one_node(int nid)
+{
+	int error = 0;
+
+	if (node_online(nid)) {
+		int p_node = parent_node(nid);
+		struct node *parent = NULL;
+
+		if (p_node != nid)
+			parent = &node_devices[p_node];
+
+		error = register_node(&node_devices[nid], nid, parent);
+	}
+
+	return error;
+
+}
+
+void unregister_one_node(int nid)
+{
+	unregister_node(&node_devices[nid]);
+}
+
 static int __init register_node_type(void)
 {
 	return sysdev_class_register(&node_class);
diff --git a/include/asm-i386/cpu.h b/include/asm-i386/cpu.h
index e7252c216ca8..b1bc7b1b64b0 100644
--- a/include/asm-i386/cpu.h
+++ b/include/asm-i386/cpu.h
@@ -7,8 +7,6 @@
 #include <linux/nodemask.h>
 #include <linux/percpu.h>
 
-#include <asm/node.h>
-
 struct i386_cpu {
 	struct cpu cpu;
 };
diff --git a/include/asm-i386/node.h b/include/asm-i386/node.h
deleted file mode 100644
index e13c6ffa72ae..000000000000
--- a/include/asm-i386/node.h
+++ /dev/null
@@ -1,29 +0,0 @@
-#ifndef _ASM_I386_NODE_H_
-#define _ASM_I386_NODE_H_
-
-#include <linux/device.h>
-#include <linux/mmzone.h>
-#include <linux/node.h>
-#include <linux/topology.h>
-#include <linux/nodemask.h>
-
-struct i386_node {
-	struct node node;
-};
-extern struct i386_node node_devices[MAX_NUMNODES];
-
-static inline int arch_register_node(int num){
-	int p_node;
-	struct node *parent = NULL;
-
-	if (!node_online(num))
-		return 0;
-	p_node = parent_node(num);
-
-	if (p_node != num)
-		parent = &node_devices[p_node].node;
-
-	return register_node(&node_devices[num].node, num, parent);
-}
-
-#endif /* _ASM_I386_NODE_H_ */
diff --git a/include/linux/node.h b/include/linux/node.h
index 254dc3de650b..1e5347527fa8 100644
--- a/include/linux/node.h
+++ b/include/linux/node.h
@@ -26,8 +26,12 @@ struct node {
 	struct sys_device	sysdev;
 };
 
+extern struct node node_devices[];
+
 extern int register_node(struct node *, int, struct node *);
 extern void unregister_node(struct node *node);
+extern int register_one_node(int nid);
+extern void unregister_one_node(int nid);
 
 #define to_node(sys_device) container_of(sys_device, struct node, sysdev)
 
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index f13783e81eb6..ea4038838b0a 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -256,9 +256,19 @@ int add_memory(int nid, u64 start, u64 size)
 	if (ret < 0)
 		goto error;
 
-	/* we online node here. we have no error path from here. */
+	/* we online node here. we can't roll back from here. */
 	node_set_online(nid);
 
+	if (new_pgdat) {
+		ret = register_one_node(nid);
+		/*
+		 * If sysfs file of new node can't create, cpu on the node
+		 * can't be hot-added. There is no rollback way now.
+		 * So, check by BUG_ON() to catch it reluctantly..
+		 */
+		BUG_ON(ret);
+	}
+
 	/* register this memory as resource */
 	register_memory_resource(start, size);
 
-- 
cgit v1.2.3


From 7049027c6f0098eb6b23b8f6ca65a905541faf81 Mon Sep 17 00:00:00 2001
From: Yasunori Goto <y-goto@jp.fujitsu.com>
Date: Tue, 27 Jun 2006 02:53:39 -0700
Subject: [PATCH] pgdat allocation and update for ia64 of memory hotplug:
 update pgdat address array

This is to refresh node_data[] array for ia64.  As I mentioned previous
patches, ia64 has copies of information of pgdat address array on each node as
per node data.

At v2 of node_add, this function used stop_machine_run() to update them.  (I
wished that they were copied safety as much as possible.) But, in this patch,
this arrays are just copied simply, and set node_online_map bit after
completion of pgdat initialization.

So, kernel must touch NODE_DATA() macro after checking node_online_map().
(Current code has already done it.) This is more simple way for just
hot-add.....

Note : It will be problem when hot-remove will occur,
       because, even if online_map bit is set, kernel may
       touch NODE_DATA() due to race condition. :-(

Signed-off-by: Yasunori Goto <y-goto@jp.fujitsu.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/ia64/mm/discontig.c       | 24 +++++++++++++++++++-----
 include/asm-ia64/nodedata.h    | 12 ++++++++++++
 include/linux/memory_hotplug.h |  4 +---
 3 files changed, 32 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c
index 83153ac18795..9153465d7fcc 100644
--- a/arch/ia64/mm/discontig.c
+++ b/arch/ia64/mm/discontig.c
@@ -308,6 +308,17 @@ static void __init reserve_pernode_space(void)
 	}
 }
 
+static void __meminit scatter_node_data(void)
+{
+	pg_data_t **dst;
+	int node;
+
+	for_each_online_node(node) {
+		dst = LOCAL_DATA_ADDR(pgdat_list[node])->pg_data_ptrs;
+		memcpy(dst, pgdat_list, sizeof(pgdat_list));
+	}
+}
+
 /**
  * initialize_pernode_data - fixup per-cpu & per-node pointers
  *
@@ -320,11 +331,8 @@ static void __init initialize_pernode_data(void)
 {
 	int cpu, node;
 
-	/* Copy the pg_data_t list to each node and init the node field */
-	for_each_online_node(node) {
-		memcpy(mem_data[node].node_data->pg_data_ptrs, pgdat_list,
-		       sizeof(pgdat_list));
-	}
+	scatter_node_data();
+
 #ifdef CONFIG_SMP
 	/* Set the node_data pointer for each per-cpu struct */
 	for (cpu = 0; cpu < NR_CPUS; cpu++) {
@@ -783,3 +791,9 @@ void __init paging_init(void)
 
 	zero_page_memmap_ptr = virt_to_page(ia64_imva(empty_zero_page));
 }
+
+void arch_refresh_nodedata(int update_node, pg_data_t *update_pgdat)
+{
+	pgdat_list[update_node] = update_pgdat;
+	scatter_node_data();
+}
diff --git a/include/asm-ia64/nodedata.h b/include/asm-ia64/nodedata.h
index a140310bf84d..2fb337b0e9b7 100644
--- a/include/asm-ia64/nodedata.h
+++ b/include/asm-ia64/nodedata.h
@@ -46,6 +46,18 @@ struct ia64_node_data {
  */
 #define NODE_DATA(nid)		(local_node_data->pg_data_ptrs[nid])
 
+/*
+ * LOCAL_DATA_ADDR - This is to calculate the address of other node's
+ *		     "local_node_data" at hot-plug phase. The local_node_data
+ *		     is pointed by per_cpu_page. Kernel usually use it for
+ *		     just executing cpu. However, when new node is hot-added,
+ *		     the addresses of local data for other nodes are necessary
+ *		     to update all of them.
+ */
+#define LOCAL_DATA_ADDR(pgdat)  			\
+	((struct ia64_node_data *)((u64)(pgdat) + 	\
+				   L1_CACHE_ALIGN(sizeof(struct pglist_data))))
+
 #endif /* CONFIG_NUMA */
 
 #endif /* _ASM_IA64_NODEDATA_H */
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 569b1f6c27d1..9b6260007e5e 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -91,9 +91,7 @@ static inline pg_data_t *arch_alloc_nodedata(int nid)
 static inline void arch_free_nodedata(pg_data_t *pgdat)
 {
 }
-static inline void arch_refresh_nodedata(int nid, pg_data_t *pgdat)
-{
-}
+extern void arch_refresh_nodedata(int nid, pg_data_t *pgdat);
 
 #else /* CONFIG_HAVE_ARCH_NODEDATA_EXTENSION */
 
-- 
cgit v1.2.3


From dd0932d9d4301bd58a4d5a634a3a8298c4fc5e24 Mon Sep 17 00:00:00 2001
From: Yasunori Goto <y-goto@jp.fujitsu.com>
Date: Tue, 27 Jun 2006 02:53:40 -0700
Subject: [PATCH] pgdat allocation and update for ia64 of memory hotplug:
 allocate pgdat and per node data

This is a patch to allocate pgdat and per node data area for ia64.  The size
for them can be calculated by compute_pernodesize().

Signed-off-by: Yasunori Goto <y-goto@jp.fujitsu.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/ia64/mm/discontig.c       | 16 ++++++++++++++--
 include/linux/memory_hotplug.h |  9 ++-------
 2 files changed, 16 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c
index 9153465d7fcc..525b082eb661 100644
--- a/arch/ia64/mm/discontig.c
+++ b/arch/ia64/mm/discontig.c
@@ -100,7 +100,7 @@ static int __init build_node_maps(unsigned long start, unsigned long len,
  * acpi_boot_init() (which builds the node_to_cpu_mask array) hasn't been
  * called yet.  Note that node 0 will also count all non-existent cpus.
  */
-static int __init early_nr_cpus_node(int node)
+static int __meminit early_nr_cpus_node(int node)
 {
 	int cpu, n = 0;
 
@@ -115,7 +115,7 @@ static int __init early_nr_cpus_node(int node)
  * compute_pernodesize - compute size of pernode data
  * @node: the node id.
  */
-static unsigned long __init compute_pernodesize(int node)
+static unsigned long __meminit compute_pernodesize(int node)
 {
 	unsigned long pernodesize = 0, cpus;
 
@@ -792,6 +792,18 @@ void __init paging_init(void)
 	zero_page_memmap_ptr = virt_to_page(ia64_imva(empty_zero_page));
 }
 
+pg_data_t *arch_alloc_nodedata(int nid)
+{
+	unsigned long size = compute_pernodesize(nid);
+
+	return kzalloc(size, GFP_KERNEL);
+}
+
+void arch_free_nodedata(pg_data_t *pgdat)
+{
+	kfree(pgdat);
+}
+
 void arch_refresh_nodedata(int update_node, pg_data_t *update_pgdat)
 {
 	pgdat_list[update_node] = update_pgdat;
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 9b6260007e5e..218501cfaeb9 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -84,13 +84,8 @@ static inline int memory_add_physaddr_to_nid(u64 start)
  * Now, arch_free_nodedata() is just defined for error path of node_hot_add.
  *
  */
-static inline pg_data_t *arch_alloc_nodedata(int nid)
-{
-	return NULL;
-}
-static inline void arch_free_nodedata(pg_data_t *pgdat)
-{
-}
+extern pg_data_t *arch_alloc_nodedata(int nid);
+extern void arch_free_nodedata(pg_data_t *pgdat);
 extern void arch_refresh_nodedata(int nid, pg_data_t *pgdat);
 
 #else /* CONFIG_HAVE_ARCH_NODEDATA_EXTENSION */
-- 
cgit v1.2.3


From 76b67ed9dce69a6a329cdd66f94af1787f417b62 Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Date: Tue, 27 Jun 2006 02:53:41 -0700
Subject: [PATCH] node hotplug: register cpu: remove node struct

With Goto-san's patch, we can add new pgdat/node at runtime.  I'm now
considering node-hot-add with cpu + memory on ACPI.

I found acpi container, which describes node, could evaluate cpu before
memory. This means cpu-hot-add occurs before memory hot add.

In most part, cpu-hot-add doesn't depend on node hot add.  But register_cpu(),
which creates symbolic link from node to cpu, requires that node should be
onlined before register_cpu().  When a node is onlined, its pgdat should be
there.

This patch-set holds off creating symbolic link from node to cpu
until node is onlined.

This removes node arguments from register_cpu().

Now, register_cpu() requires 'struct node' as its argument.  But the array of
struct node is now unified in driver/base/node.c now (By Goto's node hotplug
patch).  We can get struct node in generic way.  So, this argument is not
necessary now.

This patch also guarantees add cpu under node only when node is onlined.  It
is necessary for node-hot-add vs.  cpu-hot-add patch following this.

Moreover, register_cpu calculates cpu->node_id by cpu_to_node() without regard
to its 'struct node *root' argument.  This patch removes it.

Also modify callers of register_cpu()/unregister_cpu, whose args are changed
by register-cpu-remove-node-struct patch.

[Brice.Goglin@ens-lyon.org: fix it]
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Yasunori Goto <y-goto@jp.fujitsu.com>
Cc: Ashok Raj <ashok.raj@intel.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Brice Goglin <Brice.Goglin@ens-lyon.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/alpha/kernel/setup.c      |  2 +-
 arch/arm/kernel/setup.c        |  2 +-
 arch/i386/kernel/topology.c    | 23 ++++-------------------
 arch/ia64/kernel/topology.c    | 17 ++---------------
 arch/m32r/kernel/setup.c       |  2 +-
 arch/mips/kernel/smp.c         |  2 +-
 arch/parisc/kernel/topology.c  |  3 +--
 arch/powerpc/kernel/setup_32.c |  2 +-
 arch/powerpc/kernel/sysfs.c    | 12 +-----------
 arch/ppc/kernel/setup.c        |  2 +-
 arch/s390/kernel/smp.c         |  2 +-
 arch/sh/kernel/setup.c         |  2 +-
 arch/sh64/kernel/setup.c       |  2 +-
 arch/sparc64/kernel/setup.c    |  2 +-
 drivers/base/cpu.c             | 18 ++++++++----------
 drivers/base/node.c            | 36 ++++++++++++++++++++++++++++++++++++
 include/linux/cpu.h            |  4 ++--
 include/linux/node.h           | 13 +++++++++++++
 18 files changed, 77 insertions(+), 69 deletions(-)

(limited to 'include/linux')

diff --git a/arch/alpha/kernel/setup.c b/arch/alpha/kernel/setup.c
index 558b83368559..254c507a608c 100644
--- a/arch/alpha/kernel/setup.c
+++ b/arch/alpha/kernel/setup.c
@@ -481,7 +481,7 @@ register_cpus(void)
 		struct cpu *p = kzalloc(sizeof(*p), GFP_KERNEL);
 		if (!p)
 			return -ENOMEM;
-		register_cpu(p, i, NULL);
+		register_cpu(p, i);
 	}
 	return 0;
 }
diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c
index 9fc9af88c60c..093ccba0503c 100644
--- a/arch/arm/kernel/setup.c
+++ b/arch/arm/kernel/setup.c
@@ -808,7 +808,7 @@ static int __init topology_init(void)
 	int cpu;
 
 	for_each_possible_cpu(cpu)
-		register_cpu(&per_cpu(cpu_data, cpu).cpu, cpu, NULL);
+		register_cpu(&per_cpu(cpu_data, cpu).cpu, cpu);
 
 	return 0;
 }
diff --git a/arch/i386/kernel/topology.c b/arch/i386/kernel/topology.c
index 1eecc2e1bd4b..e2e281d4bcc8 100644
--- a/arch/i386/kernel/topology.c
+++ b/arch/i386/kernel/topology.c
@@ -32,15 +32,8 @@
 
 static struct i386_cpu cpu_devices[NR_CPUS];
 
-int arch_register_cpu(int num){
-	struct node *parent = NULL;
-
-#ifdef CONFIG_NUMA
-	int node = cpu_to_node(num);
-	if (node_online(node))
-		parent = &node_devices[parent_node(node)];
-#endif /* CONFIG_NUMA */
-
+int arch_register_cpu(int num)
+{
 	/*
 	 * CPU0 cannot be offlined due to several
 	 * restrictions and assumptions in kernel. This basically
@@ -50,21 +43,13 @@ int arch_register_cpu(int num){
 	if (!num)
 		cpu_devices[num].cpu.no_control = 1;
 
-	return register_cpu(&cpu_devices[num].cpu, num, parent);
+	return register_cpu(&cpu_devices[num].cpu, num);
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
 
 void arch_unregister_cpu(int num) {
-	struct node *parent = NULL;
-
-#ifdef CONFIG_NUMA
-	int node = cpu_to_node(num);
-	if (node_online(node))
-		parent = &node_devices[parent_node(node)];
-#endif /* CONFIG_NUMA */
-
-	return unregister_cpu(&cpu_devices[num].cpu, parent);
+	return unregister_cpu(&cpu_devices[num].cpu);
 }
 EXPORT_SYMBOL(arch_register_cpu);
 EXPORT_SYMBOL(arch_unregister_cpu);
diff --git a/arch/ia64/kernel/topology.c b/arch/ia64/kernel/topology.c
index 42cb05bdc680..5737c9a061ef 100644
--- a/arch/ia64/kernel/topology.c
+++ b/arch/ia64/kernel/topology.c
@@ -30,12 +30,6 @@ static struct ia64_cpu *sysfs_cpus;
 
 int arch_register_cpu(int num)
 {
-	struct node *parent = NULL;
-	
-#ifdef CONFIG_NUMA
-	parent = &node_devices[cpu_to_node(num)];
-#endif /* CONFIG_NUMA */
-
 #if defined (CONFIG_ACPI) && defined (CONFIG_HOTPLUG_CPU)
 	/*
 	 * If CPEI cannot be re-targetted, and this is
@@ -45,21 +39,14 @@ int arch_register_cpu(int num)
 		sysfs_cpus[num].cpu.no_control = 1;
 #endif
 
-	return register_cpu(&sysfs_cpus[num].cpu, num, parent);
+	return register_cpu(&sysfs_cpus[num].cpu, num);
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
 
 void arch_unregister_cpu(int num)
 {
-	struct node *parent = NULL;
-
-#ifdef CONFIG_NUMA
-	int node = cpu_to_node(num);
-	parent = &node_devices[node];
-#endif /* CONFIG_NUMA */
-
-	return unregister_cpu(&sysfs_cpus[num].cpu, parent);
+	return unregister_cpu(&sysfs_cpus[num].cpu);
 }
 EXPORT_SYMBOL(arch_register_cpu);
 EXPORT_SYMBOL(arch_unregister_cpu);
diff --git a/arch/m32r/kernel/setup.c b/arch/m32r/kernel/setup.c
index 3cd3c2988a48..1ff483c8a4c9 100644
--- a/arch/m32r/kernel/setup.c
+++ b/arch/m32r/kernel/setup.c
@@ -275,7 +275,7 @@ static int __init topology_init(void)
 	int i;
 
 	for_each_present_cpu(i)
-		register_cpu(&cpu_devices[i], i, NULL);
+		register_cpu(&cpu_devices[i], i);
 
 	return 0;
 }
diff --git a/arch/mips/kernel/smp.c b/arch/mips/kernel/smp.c
index 298f82fe8440..9096a5ea4229 100644
--- a/arch/mips/kernel/smp.c
+++ b/arch/mips/kernel/smp.c
@@ -446,7 +446,7 @@ static int __init topology_init(void)
 	int ret;
 
 	for_each_present_cpu(cpu) {
-		ret = register_cpu(&per_cpu(cpu_devices, cpu), cpu, NULL);
+		ret = register_cpu(&per_cpu(cpu_devices, cpu), cpu);
 		if (ret)
 			printk(KERN_WARNING "topology_init: register_cpu %d "
 			       "failed (%d)\n", cpu, ret);
diff --git a/arch/parisc/kernel/topology.c b/arch/parisc/kernel/topology.c
index 3ba040050e4c..068b20d822e7 100644
--- a/arch/parisc/kernel/topology.c
+++ b/arch/parisc/kernel/topology.c
@@ -26,11 +26,10 @@ static struct cpu cpu_devices[NR_CPUS] __read_mostly;
 
 static int __init topology_init(void)
 {
-	struct node *parent = NULL;
 	int num;
 
 	for_each_present_cpu(num) {
-		register_cpu(&cpu_devices[num], num, parent);
+		register_cpu(&cpu_devices[num], num);
 	}
 	return 0;
 }
diff --git a/arch/powerpc/kernel/setup_32.c b/arch/powerpc/kernel/setup_32.c
index e5a44812441a..0932a62a1c96 100644
--- a/arch/powerpc/kernel/setup_32.c
+++ b/arch/powerpc/kernel/setup_32.c
@@ -215,7 +215,7 @@ int __init ppc_init(void)
 
 	/* register CPU devices */
 	for_each_possible_cpu(i)
-		register_cpu(&cpu_devices[i], i, NULL);
+		register_cpu(&cpu_devices[i], i);
 
 	/* call platform init */
 	if (ppc_md.init != NULL) {
diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c
index 338491d1604a..412ad00e222d 100644
--- a/arch/powerpc/kernel/sysfs.c
+++ b/arch/powerpc/kernel/sysfs.c
@@ -348,23 +348,13 @@ static SYSDEV_ATTR(physical_id, 0444, show_physical_id, NULL);
 static int __init topology_init(void)
 {
 	int cpu;
-	struct node *parent = NULL;
 
 	register_nodes();
-
 	register_cpu_notifier(&sysfs_cpu_nb);
 
 	for_each_possible_cpu(cpu) {
 		struct cpu *c = &per_cpu(cpu_devices, cpu);
 
-#ifdef CONFIG_NUMA
-		/* The node to which a cpu belongs can't be known
-		 * until the cpu is made present.
-		 */
-		parent = NULL;
-		if (cpu_present(cpu))
-			parent = &node_devices[cpu_to_node(cpu)];
-#endif
 		/*
 		 * For now, we just see if the system supports making
 		 * the RTAS calls for CPU hotplug.  But, there may be a
@@ -376,7 +366,7 @@ static int __init topology_init(void)
 			c->no_control = 1;
 
 		if (cpu_online(cpu) || (c->no_control == 0)) {
-			register_cpu(c, cpu, parent);
+			register_cpu(c, cpu);
 
 			sysdev_create_file(&c->sysdev, &attr_physical_id);
 		}
diff --git a/arch/ppc/kernel/setup.c b/arch/ppc/kernel/setup.c
index 1f79e84ab464..4b4607d89bfa 100644
--- a/arch/ppc/kernel/setup.c
+++ b/arch/ppc/kernel/setup.c
@@ -475,7 +475,7 @@ int __init ppc_init(void)
 
 	/* register CPU devices */
 	for_each_possible_cpu(i)
-		register_cpu(&cpu_devices[i], i, NULL);
+		register_cpu(&cpu_devices[i], i);
 
 	/* call platform init */
 	if (ppc_md.init != NULL) {
diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c
index 343120c9223d..8e03219eea76 100644
--- a/arch/s390/kernel/smp.c
+++ b/arch/s390/kernel/smp.c
@@ -869,7 +869,7 @@ static int __init topology_init(void)
 	int ret;
 
 	for_each_possible_cpu(cpu) {
-		ret = register_cpu(&per_cpu(cpu_devices, cpu), cpu, NULL);
+		ret = register_cpu(&per_cpu(cpu_devices, cpu), cpu);
 		if (ret)
 			printk(KERN_WARNING "topology_init: register_cpu %d "
 			       "failed (%d)\n", cpu, ret);
diff --git a/arch/sh/kernel/setup.c b/arch/sh/kernel/setup.c
index bb229ef030f3..9af22116c9a2 100644
--- a/arch/sh/kernel/setup.c
+++ b/arch/sh/kernel/setup.c
@@ -402,7 +402,7 @@ static int __init topology_init(void)
 	int cpu_id;
 
 	for_each_possible_cpu(cpu_id)
-		register_cpu(&cpu[cpu_id], cpu_id, NULL);
+		register_cpu(&cpu[cpu_id], cpu_id);
 
 	return 0;
 }
diff --git a/arch/sh64/kernel/setup.c b/arch/sh64/kernel/setup.c
index d2711c9c9d13..da98d8dbcf95 100644
--- a/arch/sh64/kernel/setup.c
+++ b/arch/sh64/kernel/setup.c
@@ -309,7 +309,7 @@ static struct cpu cpu[1];
 
 static int __init topology_init(void)
 {
-	return register_cpu(cpu, 0, NULL);
+	return register_cpu(cpu, 0);
 }
 
 subsys_initcall(topology_init);
diff --git a/arch/sparc64/kernel/setup.c b/arch/sparc64/kernel/setup.c
index a6a7d8168346..116d9632002d 100644
--- a/arch/sparc64/kernel/setup.c
+++ b/arch/sparc64/kernel/setup.c
@@ -537,7 +537,7 @@ static int __init topology_init(void)
 	for_each_possible_cpu(i) {
 		struct cpu *p = kzalloc(sizeof(*p), GFP_KERNEL);
 		if (p) {
-			register_cpu(p, i, NULL);
+			register_cpu(p, i);
 			err = 0;
 		}
 	}
diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c
index dd712b24ec91..3972d8ac9786 100644
--- a/drivers/base/cpu.c
+++ b/drivers/base/cpu.c
@@ -8,6 +8,7 @@
 #include <linux/cpu.h>
 #include <linux/topology.h>
 #include <linux/device.h>
+#include <linux/node.h>
 
 #include "base.h"
 
@@ -57,13 +58,12 @@ static void __devinit register_cpu_control(struct cpu *cpu)
 {
 	sysdev_create_file(&cpu->sysdev, &attr_online);
 }
-void unregister_cpu(struct cpu *cpu, struct node *root)
+void unregister_cpu(struct cpu *cpu)
 {
 	int logical_cpu = cpu->sysdev.id;
 
-	if (root)
-		sysfs_remove_link(&root->sysdev.kobj,
-				  kobject_name(&cpu->sysdev.kobj));
+	unregister_cpu_under_node(logical_cpu, cpu_to_node(logical_cpu));
+
 	sysdev_remove_file(&cpu->sysdev, &attr_online);
 
 	sysdev_unregister(&cpu->sysdev);
@@ -109,23 +109,21 @@ static SYSDEV_ATTR(crash_notes, 0400, show_crash_notes, NULL);
  *
  * Initialize and register the CPU device.
  */
-int __devinit register_cpu(struct cpu *cpu, int num, struct node *root)
+int __devinit register_cpu(struct cpu *cpu, int num)
 {
 	int error;
-
 	cpu->node_id = cpu_to_node(num);
 	cpu->sysdev.id = num;
 	cpu->sysdev.cls = &cpu_sysdev_class;
 
 	error = sysdev_register(&cpu->sysdev);
-	if (!error && root)
-		error = sysfs_create_link(&root->sysdev.kobj,
-					  &cpu->sysdev.kobj,
-					  kobject_name(&cpu->sysdev.kobj));
+
 	if (!error && !cpu->no_control)
 		register_cpu_control(cpu);
 	if (!error)
 		cpu_sys_devices[num] = &cpu->sysdev;
+	if (!error)
+		register_cpu_under_node(num, cpu_to_node(num));
 
 #ifdef CONFIG_KEXEC
 	if (!error)
diff --git a/drivers/base/node.c b/drivers/base/node.c
index cbd0f62b4870..eae2bdc183bb 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -11,6 +11,7 @@
 #include <linux/cpumask.h>
 #include <linux/topology.h>
 #include <linux/nodemask.h>
+#include <linux/cpu.h>
 
 static struct sysdev_class node_class = {
 	set_kset_name("node"),
@@ -192,9 +193,38 @@ void unregister_node(struct node *node)
 
 struct node node_devices[MAX_NUMNODES];
 
+/*
+ * register cpu under node
+ */
+int register_cpu_under_node(unsigned int cpu, unsigned int nid)
+{
+	if (node_online(nid)) {
+		struct sys_device *obj = get_cpu_sysdev(cpu);
+		if (!obj)
+			return 0;
+		return sysfs_create_link(&node_devices[nid].sysdev.kobj,
+					 &obj->kobj,
+					 kobject_name(&obj->kobj));
+	 }
+
+	return 0;
+}
+
+int unregister_cpu_under_node(unsigned int cpu, unsigned int nid)
+{
+	if (node_online(nid)) {
+		struct sys_device *obj = get_cpu_sysdev(cpu);
+		if (obj)
+			sysfs_remove_link(&node_devices[nid].sysdev.kobj,
+					 kobject_name(&obj->kobj));
+	}
+	return 0;
+}
+
 int register_one_node(int nid)
 {
 	int error = 0;
+	int cpu;
 
 	if (node_online(nid)) {
 		int p_node = parent_node(nid);
@@ -204,6 +234,12 @@ int register_one_node(int nid)
 			parent = &node_devices[p_node];
 
 		error = register_node(&node_devices[nid], nid, parent);
+
+		/* link cpu under this node */
+		for_each_present_cpu(cpu) {
+			if (cpu_to_node(cpu) == nid)
+				register_cpu_under_node(cpu, nid);
+		}
 	}
 
 	return error;
diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index 08d50c53aab4..b23bf1c8addc 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -31,10 +31,10 @@ struct cpu {
 	struct sys_device sysdev;
 };
 
-extern int register_cpu(struct cpu *, int, struct node *);
+extern int register_cpu(struct cpu *cpu, int num);
 extern struct sys_device *get_cpu_sysdev(unsigned cpu);
 #ifdef CONFIG_HOTPLUG_CPU
-extern void unregister_cpu(struct cpu *, struct node *);
+extern void unregister_cpu(struct cpu *cpu);
 #endif
 struct notifier_block;
 
diff --git a/include/linux/node.h b/include/linux/node.h
index 1e5347527fa8..81dcec84cd8f 100644
--- a/include/linux/node.h
+++ b/include/linux/node.h
@@ -32,6 +32,19 @@ extern int register_node(struct node *, int, struct node *);
 extern void unregister_node(struct node *node);
 extern int register_one_node(int nid);
 extern void unregister_one_node(int nid);
+#ifdef CONFIG_NUMA
+extern int register_cpu_under_node(unsigned int cpu, unsigned int nid);
+extern int unregister_cpu_under_node(unsigned int cpu, unsigned int nid);
+#else
+static inline int register_cpu_under_node(unsigned int cpu, unsigned int nid)
+{
+	return 0;
+}
+static inline int unregister_cpu_under_node(unsigned int cpu, unsigned int nid)
+{
+	return 0;
+}
+#endif
 
 #define to_node(sys_device) container_of(sys_device, struct node, sysdev)
 
-- 
cgit v1.2.3


From e6e5494cb23d1933735ee47cc674ffe1c4afed6f Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 27 Jun 2006 02:53:50 -0700
Subject: [PATCH] vdso: randomize the i386 vDSO by moving it into a vma

Move the i386 VDSO down into a vma and thus randomize it.

Besides the security implications, this feature also helps debuggers, which
can COW a vma-backed VDSO just like a normal DSO and can thus do
single-stepping and other debugging features.

It's good for hypervisors (Xen, VMWare) too, which typically live in the same
high-mapped address space as the VDSO, hence whenever the VDSO is used, they
get lots of guest pagefaults and have to fix such guest accesses up - which
slows things down instead of speeding things up (the primary purpose of the
VDSO).

There's a new CONFIG_COMPAT_VDSO (default=y) option, which provides support
for older glibcs that still rely on a prelinked high-mapped VDSO.  Newer
distributions (using glibc 2.3.3 or later) can turn this option off.  Turning
it off is also recommended for security reasons: attackers cannot use the
predictable high-mapped VDSO page as syscall trampoline anymore.

There is a new vdso=[0|1] boot option as well, and a runtime
/proc/sys/vm/vdso_enabled sysctl switch, that allows the VDSO to be turned
on/off.

(This version of the VDSO-randomization patch also has working ELF
coredumping, the previous patch crashed in the coredumping code.)

This code is a combined work of the exec-shield VDSO randomization
code and Gerd Hoffmann's hypervisor-centric VDSO patch. Rusty Russell
started this patch and i completed it.

[akpm@osdl.org: cleanups]
[akpm@osdl.org: compile fix]
[akpm@osdl.org: compile fix 2]
[akpm@osdl.org: compile fix 3]
[akpm@osdl.org: revernt MAXMEM change]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Arjan van de Ven <arjan@infradead.org>
Cc: Gerd Hoffmann <kraxel@suse.de>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Zachary Amsden <zach@vmware.com>
Cc: Andi Kleen <ak@muc.de>
Cc: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/kernel-parameters.txt  |   4 ++
 arch/i386/Kconfig                    |  11 +++
 arch/i386/kernel/asm-offsets.c       |   4 +-
 arch/i386/kernel/entry.S             |   7 +-
 arch/i386/kernel/signal.c            |   4 +-
 arch/i386/kernel/sysenter.c          | 128 +++++++++++++++++++++++++++++++++--
 arch/i386/kernel/vsyscall-sysenter.S |   4 +-
 arch/i386/kernel/vsyscall.lds.S      |   4 +-
 fs/proc/task_mmu.c                   |  30 ++++----
 include/asm-i386/elf.h               |  53 +++++++++++----
 include/asm-i386/fixmap.h            |  10 +--
 include/asm-i386/mmu.h               |   1 +
 include/asm-i386/page.h              |   3 +
 include/asm-i386/thread_info.h       |   1 +
 include/asm-i386/unwind.h            |   4 +-
 include/linux/mm.h                   |   2 +
 include/linux/sysctl.h               |   1 +
 kernel/sysctl.c                      |  12 ++++
 18 files changed, 235 insertions(+), 48 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 2e352a605fcf..0d189c93eeaf 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1669,6 +1669,10 @@ running once the system is up.
 	usbhid.mousepoll=
 			[USBHID] The interval which mice are to be polled at.
 
+	vdso=		[IA-32]
+			vdso=1: enable VDSO (default)
+			vdso=0: disable VDSO mapping
+
 	video=		[FB] Frame buffer configuration
 			See Documentation/fb/modedb.txt.
 
diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig
index 6662f8c44798..3bb221db164a 100644
--- a/arch/i386/Kconfig
+++ b/arch/i386/Kconfig
@@ -780,6 +780,17 @@ config HOTPLUG_CPU
 	  enable suspend on SMP systems. CPUs can be controlled through
 	  /sys/devices/system/cpu.
 
+config COMPAT_VDSO
+	bool "Compat VDSO support"
+	default y
+	help
+	  Map the VDSO to the predictable old-style address too.
+	---help---
+	  Say N here if you are running a sufficiently recent glibc
+	  version (2.3.3 or later), to remove the high-mapped
+	  VDSO mapping and to exclusively use the randomized VDSO.
+
+	  If unsure, say Y.
 
 endmenu
 
diff --git a/arch/i386/kernel/asm-offsets.c b/arch/i386/kernel/asm-offsets.c
index 1c3a809e6421..c80271f8f084 100644
--- a/arch/i386/kernel/asm-offsets.c
+++ b/arch/i386/kernel/asm-offsets.c
@@ -14,6 +14,7 @@
 #include <asm/fixmap.h>
 #include <asm/processor.h>
 #include <asm/thread_info.h>
+#include <asm/elf.h>
 
 #define DEFINE(sym, val) \
         asm volatile("\n->" #sym " %0 " #val : : "i" (val))
@@ -54,6 +55,7 @@ void foo(void)
 	OFFSET(TI_preempt_count, thread_info, preempt_count);
 	OFFSET(TI_addr_limit, thread_info, addr_limit);
 	OFFSET(TI_restart_block, thread_info, restart_block);
+	OFFSET(TI_sysenter_return, thread_info, sysenter_return);
 	BLANK();
 
 	OFFSET(EXEC_DOMAIN_handler, exec_domain, handler);
@@ -69,7 +71,7 @@ void foo(void)
 		 sizeof(struct tss_struct));
 
 	DEFINE(PAGE_SIZE_asm, PAGE_SIZE);
-	DEFINE(VSYSCALL_BASE, __fix_to_virt(FIX_VSYSCALL));
+	DEFINE(VDSO_PRELINK, VDSO_PRELINK);
 
 	OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx);
 }
diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S
index e8d2630fd19a..fbdb933251b6 100644
--- a/arch/i386/kernel/entry.S
+++ b/arch/i386/kernel/entry.S
@@ -270,7 +270,12 @@ sysenter_past_esp:
 	pushl $(__USER_CS)
 	CFI_ADJUST_CFA_OFFSET 4
 	/*CFI_REL_OFFSET cs, 0*/
-	pushl $SYSENTER_RETURN
+	/*
+	 * Push current_thread_info()->sysenter_return to the stack.
+	 * A tiny bit of offset fixup is necessary - 4*4 means the 4 words
+	 * pushed above; +8 corresponds to copy_thread's esp0 setting.
+	 */
+	pushl (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp)
 	CFI_ADJUST_CFA_OFFSET 4
 	CFI_REL_OFFSET eip, 0
 
diff --git a/arch/i386/kernel/signal.c b/arch/i386/kernel/signal.c
index 5c352c3a9e7f..43002cfb40c4 100644
--- a/arch/i386/kernel/signal.c
+++ b/arch/i386/kernel/signal.c
@@ -351,7 +351,7 @@ static int setup_frame(int sig, struct k_sigaction *ka,
 			goto give_sigsegv;
 	}
 
-	restorer = &__kernel_sigreturn;
+	restorer = (void *)VDSO_SYM(&__kernel_sigreturn);
 	if (ka->sa.sa_flags & SA_RESTORER)
 		restorer = ka->sa.sa_restorer;
 
@@ -447,7 +447,7 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
 		goto give_sigsegv;
 
 	/* Set up to return from userspace.  */
-	restorer = &__kernel_rt_sigreturn;
+	restorer = (void *)VDSO_SYM(&__kernel_rt_sigreturn);
 	if (ka->sa.sa_flags & SA_RESTORER)
 		restorer = ka->sa.sa_restorer;
 	err |= __put_user(restorer, &frame->pretcode);
diff --git a/arch/i386/kernel/sysenter.c b/arch/i386/kernel/sysenter.c
index 0bada1870bdf..c60419dee018 100644
--- a/arch/i386/kernel/sysenter.c
+++ b/arch/i386/kernel/sysenter.c
@@ -2,6 +2,8 @@
  * linux/arch/i386/kernel/sysenter.c
  *
  * (C) Copyright 2002 Linus Torvalds
+ * Portions based on the vdso-randomization code from exec-shield:
+ * Copyright(C) 2005-2006, Red Hat, Inc., Ingo Molnar
  *
  * This file contains the needed initializations to support sysenter.
  */
@@ -13,12 +15,31 @@
 #include <linux/gfp.h>
 #include <linux/string.h>
 #include <linux/elf.h>
+#include <linux/mm.h>
+#include <linux/module.h>
 
 #include <asm/cpufeature.h>
 #include <asm/msr.h>
 #include <asm/pgtable.h>
 #include <asm/unistd.h>
 
+/*
+ * Should the kernel map a VDSO page into processes and pass its
+ * address down to glibc upon exec()?
+ */
+unsigned int __read_mostly vdso_enabled = 1;
+
+EXPORT_SYMBOL_GPL(vdso_enabled);
+
+static int __init vdso_setup(char *s)
+{
+	vdso_enabled = simple_strtoul(s, NULL, 0);
+
+	return 1;
+}
+
+__setup("vdso=", vdso_setup);
+
 extern asmlinkage void sysenter_entry(void);
 
 void enable_sep_cpu(void)
@@ -45,23 +66,122 @@ void enable_sep_cpu(void)
  */
 extern const char vsyscall_int80_start, vsyscall_int80_end;
 extern const char vsyscall_sysenter_start, vsyscall_sysenter_end;
+static void *syscall_page;
 
 int __init sysenter_setup(void)
 {
-	void *page = (void *)get_zeroed_page(GFP_ATOMIC);
+	syscall_page = (void *)get_zeroed_page(GFP_ATOMIC);
 
-	__set_fixmap(FIX_VSYSCALL, __pa(page), PAGE_READONLY_EXEC);
+#ifdef CONFIG_COMPAT_VDSO
+	__set_fixmap(FIX_VDSO, __pa(syscall_page), PAGE_READONLY);
+	printk("Compat vDSO mapped to %08lx.\n", __fix_to_virt(FIX_VDSO));
+#else
+	/*
+	 * In the non-compat case the ELF coredumping code needs the fixmap:
+	 */
+	__set_fixmap(FIX_VDSO, __pa(syscall_page), PAGE_KERNEL_RO);
+#endif
 
 	if (!boot_cpu_has(X86_FEATURE_SEP)) {
-		memcpy(page,
+		memcpy(syscall_page,
 		       &vsyscall_int80_start,
 		       &vsyscall_int80_end - &vsyscall_int80_start);
 		return 0;
 	}
 
-	memcpy(page,
+	memcpy(syscall_page,
 	       &vsyscall_sysenter_start,
 	       &vsyscall_sysenter_end - &vsyscall_sysenter_start);
 
 	return 0;
 }
+
+static struct page *syscall_nopage(struct vm_area_struct *vma,
+				unsigned long adr, int *type)
+{
+	struct page *p = virt_to_page(adr - vma->vm_start + syscall_page);
+	get_page(p);
+	return p;
+}
+
+/* Prevent VMA merging */
+static void syscall_vma_close(struct vm_area_struct *vma)
+{
+}
+
+static struct vm_operations_struct syscall_vm_ops = {
+	.close = syscall_vma_close,
+	.nopage = syscall_nopage,
+};
+
+/* Defined in vsyscall-sysenter.S */
+extern void SYSENTER_RETURN;
+
+/* Setup a VMA at program startup for the vsyscall page */
+int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack)
+{
+	struct vm_area_struct *vma;
+	struct mm_struct *mm = current->mm;
+	unsigned long addr;
+	int ret;
+
+	down_write(&mm->mmap_sem);
+	addr = get_unmapped_area(NULL, 0, PAGE_SIZE, 0, 0);
+	if (IS_ERR_VALUE(addr)) {
+		ret = addr;
+		goto up_fail;
+	}
+
+	vma = kmem_cache_zalloc(vm_area_cachep, SLAB_KERNEL);
+	if (!vma) {
+		ret = -ENOMEM;
+		goto up_fail;
+	}
+
+	vma->vm_start = addr;
+	vma->vm_end = addr + PAGE_SIZE;
+	/* MAYWRITE to allow gdb to COW and set breakpoints */
+	vma->vm_flags = VM_READ|VM_EXEC|VM_MAYREAD|VM_MAYEXEC|VM_MAYWRITE;
+	vma->vm_flags |= mm->def_flags;
+	vma->vm_page_prot = protection_map[vma->vm_flags & 7];
+	vma->vm_ops = &syscall_vm_ops;
+	vma->vm_mm = mm;
+
+	ret = insert_vm_struct(mm, vma);
+	if (ret)
+		goto free_vma;
+
+	current->mm->context.vdso = (void *)addr;
+	current_thread_info()->sysenter_return =
+				    (void *)VDSO_SYM(&SYSENTER_RETURN);
+	mm->total_vm++;
+up_fail:
+	up_write(&mm->mmap_sem);
+	return ret;
+
+free_vma:
+	kmem_cache_free(vm_area_cachep, vma);
+	return ret;
+}
+
+const char *arch_vma_name(struct vm_area_struct *vma)
+{
+	if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso)
+		return "[vdso]";
+	return NULL;
+}
+
+struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
+{
+	return NULL;
+}
+
+int in_gate_area(struct task_struct *task, unsigned long addr)
+{
+	return 0;
+}
+
+int in_gate_area_no_task(unsigned long addr)
+{
+	return 0;
+}
diff --git a/arch/i386/kernel/vsyscall-sysenter.S b/arch/i386/kernel/vsyscall-sysenter.S
index 3b62baa6a371..1a36d26e15eb 100644
--- a/arch/i386/kernel/vsyscall-sysenter.S
+++ b/arch/i386/kernel/vsyscall-sysenter.S
@@ -42,10 +42,10 @@ __kernel_vsyscall:
 	/* 7: align return point with nop's to make disassembly easier */
 	.space 7,0x90
 
-	/* 14: System call restart point is here! (SYSENTER_RETURN - 2) */
+	/* 14: System call restart point is here! (SYSENTER_RETURN-2) */
 	jmp .Lenter_kernel
 	/* 16: System call normal return point is here! */
-	.globl SYSENTER_RETURN	/* Symbol used by entry.S.  */
+	.globl SYSENTER_RETURN	/* Symbol used by sysenter.c  */
 SYSENTER_RETURN:
 	pop %ebp
 .Lpop_ebp:
diff --git a/arch/i386/kernel/vsyscall.lds.S b/arch/i386/kernel/vsyscall.lds.S
index 98699ca6e52d..e26975fc68b6 100644
--- a/arch/i386/kernel/vsyscall.lds.S
+++ b/arch/i386/kernel/vsyscall.lds.S
@@ -7,7 +7,7 @@
 
 SECTIONS
 {
-  . = VSYSCALL_BASE + SIZEOF_HEADERS;
+  . = VDSO_PRELINK + SIZEOF_HEADERS;
 
   .hash           : { *(.hash) }		:text
   .dynsym         : { *(.dynsym) }
@@ -20,7 +20,7 @@ SECTIONS
      For the layouts to match, we need to skip more than enough
      space for the dynamic symbol table et al.  If this amount
      is insufficient, ld -shared will barf.  Just increase it here.  */
-  . = VSYSCALL_BASE + 0x400;
+  . = VDSO_PRELINK + 0x400;
 
   .text           : { *(.text) }		:text =0x90909090
   .note		  : { *(.note.*) }		:text :note
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 0137ec4c1368..0a163a4f7764 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -122,6 +122,11 @@ struct mem_size_stats
 	unsigned long private_dirty;
 };
 
+__attribute__((weak)) const char *arch_vma_name(struct vm_area_struct *vma)
+{
+	return NULL;
+}
+
 static int show_map_internal(struct seq_file *m, void *v, struct mem_size_stats *mss)
 {
 	struct proc_maps_private *priv = m->private;
@@ -158,22 +163,23 @@ static int show_map_internal(struct seq_file *m, void *v, struct mem_size_stats
 		pad_len_spaces(m, len);
 		seq_path(m, file->f_vfsmnt, file->f_dentry, "\n");
 	} else {
-		if (mm) {
-			if (vma->vm_start <= mm->start_brk &&
+		const char *name = arch_vma_name(vma);
+		if (!name) {
+			if (mm) {
+				if (vma->vm_start <= mm->start_brk &&
 						vma->vm_end >= mm->brk) {
-				pad_len_spaces(m, len);
-				seq_puts(m, "[heap]");
-			} else {
-				if (vma->vm_start <= mm->start_stack &&
-					vma->vm_end >= mm->start_stack) {
-
-					pad_len_spaces(m, len);
-					seq_puts(m, "[stack]");
+					name = "[heap]";
+				} else if (vma->vm_start <= mm->start_stack &&
+					   vma->vm_end >= mm->start_stack) {
+					name = "[stack]";
 				}
+			} else {
+				name = "[vdso]";
 			}
-		} else {
+		}
+		if (name) {
 			pad_len_spaces(m, len);
-			seq_puts(m, "[vdso]");
+			seq_puts(m, name);
 		}
 	}
 	seq_putc(m, '\n');
diff --git a/include/asm-i386/elf.h b/include/asm-i386/elf.h
index 4153d80e4d2b..1eac92cb5b16 100644
--- a/include/asm-i386/elf.h
+++ b/include/asm-i386/elf.h
@@ -10,6 +10,7 @@
 #include <asm/processor.h>
 #include <asm/system.h>		/* for savesegment */
 #include <asm/auxvec.h>
+#include <asm/desc.h>
 
 #include <linux/utsname.h>
 
@@ -129,15 +130,41 @@ extern int dump_task_extended_fpu (struct task_struct *, struct user_fxsr_struct
 #define ELF_CORE_COPY_FPREGS(tsk, elf_fpregs) dump_task_fpu(tsk, elf_fpregs)
 #define ELF_CORE_COPY_XFPREGS(tsk, elf_xfpregs) dump_task_extended_fpu(tsk, elf_xfpregs)
 
-#define VSYSCALL_BASE	(__fix_to_virt(FIX_VSYSCALL))
-#define VSYSCALL_EHDR	((const struct elfhdr *) VSYSCALL_BASE)
-#define VSYSCALL_ENTRY	((unsigned long) &__kernel_vsyscall)
+#define VDSO_HIGH_BASE		(__fix_to_virt(FIX_VDSO))
+#define VDSO_BASE		((unsigned long)current->mm->context.vdso)
+
+#ifdef CONFIG_COMPAT_VDSO
+# define VDSO_COMPAT_BASE	VDSO_HIGH_BASE
+# define VDSO_PRELINK		VDSO_HIGH_BASE
+#else
+# define VDSO_COMPAT_BASE	VDSO_BASE
+# define VDSO_PRELINK		0
+#endif
+
+#define VDSO_COMPAT_SYM(x) \
+		(VDSO_COMPAT_BASE + (unsigned long)(x) - VDSO_PRELINK)
+
+#define VDSO_SYM(x) \
+		(VDSO_BASE + (unsigned long)(x) - VDSO_PRELINK)
+
+#define VDSO_HIGH_EHDR		((const struct elfhdr *) VDSO_HIGH_BASE)
+#define VDSO_EHDR		((const struct elfhdr *) VDSO_COMPAT_BASE)
+
 extern void __kernel_vsyscall;
 
+#define VDSO_ENTRY		VDSO_SYM(&__kernel_vsyscall)
+
+#define ARCH_HAS_SETUP_ADDITIONAL_PAGES
+struct linux_binprm;
+extern int arch_setup_additional_pages(struct linux_binprm *bprm,
+                                       int executable_stack);
+
+extern unsigned int vdso_enabled;
+
 #define ARCH_DLINFO						\
-do {								\
-		NEW_AUX_ENT(AT_SYSINFO,	VSYSCALL_ENTRY);	\
-		NEW_AUX_ENT(AT_SYSINFO_EHDR, VSYSCALL_BASE);	\
+do if (vdso_enabled) {						\
+		NEW_AUX_ENT(AT_SYSINFO,	VDSO_ENTRY);		\
+		NEW_AUX_ENT(AT_SYSINFO_EHDR, VDSO_COMPAT_BASE);	\
 } while (0)
 
 /*
@@ -148,15 +175,15 @@ do {								\
  * Dumping its extra ELF program headers includes all the other information
  * a debugger needs to easily find how the vsyscall DSO was being used.
  */
-#define ELF_CORE_EXTRA_PHDRS		(VSYSCALL_EHDR->e_phnum)
+#define ELF_CORE_EXTRA_PHDRS		(VDSO_HIGH_EHDR->e_phnum)
 #define ELF_CORE_WRITE_EXTRA_PHDRS					      \
 do {									      \
 	const struct elf_phdr *const vsyscall_phdrs =			      \
-		(const struct elf_phdr *) (VSYSCALL_BASE		      \
-					   + VSYSCALL_EHDR->e_phoff);	      \
+		(const struct elf_phdr *) (VDSO_HIGH_BASE		      \
+					   + VDSO_HIGH_EHDR->e_phoff);    \
 	int i;								      \
 	Elf32_Off ofs = 0;						      \
-	for (i = 0; i < VSYSCALL_EHDR->e_phnum; ++i) {			      \
+	for (i = 0; i < VDSO_HIGH_EHDR->e_phnum; ++i) {		      \
 		struct elf_phdr phdr = vsyscall_phdrs[i];		      \
 		if (phdr.p_type == PT_LOAD) {				      \
 			BUG_ON(ofs != 0);				      \
@@ -174,10 +201,10 @@ do {									      \
 #define ELF_CORE_WRITE_EXTRA_DATA					      \
 do {									      \
 	const struct elf_phdr *const vsyscall_phdrs =			      \
-		(const struct elf_phdr *) (VSYSCALL_BASE		      \
-					   + VSYSCALL_EHDR->e_phoff);	      \
+		(const struct elf_phdr *) (VDSO_HIGH_BASE		      \
+					   + VDSO_HIGH_EHDR->e_phoff);    \
 	int i;								      \
-	for (i = 0; i < VSYSCALL_EHDR->e_phnum; ++i) {			      \
+	for (i = 0; i < VDSO_HIGH_EHDR->e_phnum; ++i) {		      \
 		if (vsyscall_phdrs[i].p_type == PT_LOAD)		      \
 			DUMP_WRITE((void *) vsyscall_phdrs[i].p_vaddr,	      \
 				   PAGE_ALIGN(vsyscall_phdrs[i].p_memsz));    \
diff --git a/include/asm-i386/fixmap.h b/include/asm-i386/fixmap.h
index f7e068f4d2f9..a48cc3f7ccc6 100644
--- a/include/asm-i386/fixmap.h
+++ b/include/asm-i386/fixmap.h
@@ -51,7 +51,7 @@
  */
 enum fixed_addresses {
 	FIX_HOLE,
-	FIX_VSYSCALL,
+	FIX_VDSO,
 #ifdef CONFIG_X86_LOCAL_APIC
 	FIX_APIC_BASE,	/* local (CPU) APIC) -- required for SMP or not */
 #endif
@@ -115,14 +115,6 @@ extern void __set_fixmap (enum fixed_addresses idx,
 #define __fix_to_virt(x)	(FIXADDR_TOP - ((x) << PAGE_SHIFT))
 #define __virt_to_fix(x)	((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT)
 
-/*
- * This is the range that is readable by user mode, and things
- * acting like user mode such as get_user_pages.
- */
-#define FIXADDR_USER_START	(__fix_to_virt(FIX_VSYSCALL))
-#define FIXADDR_USER_END	(FIXADDR_USER_START + PAGE_SIZE)
-
-
 extern void __this_fixmap_does_not_exist(void);
 
 /*
diff --git a/include/asm-i386/mmu.h b/include/asm-i386/mmu.h
index f431a0b86d4c..8358dd3df7aa 100644
--- a/include/asm-i386/mmu.h
+++ b/include/asm-i386/mmu.h
@@ -12,6 +12,7 @@ typedef struct {
 	int size;
 	struct semaphore sem;
 	void *ldt;
+	void *vdso;
 } mm_context_t;
 
 #endif
diff --git a/include/asm-i386/page.h b/include/asm-i386/page.h
index e3a552fa5538..f5bf544c729a 100644
--- a/include/asm-i386/page.h
+++ b/include/asm-i386/page.h
@@ -96,6 +96,8 @@ typedef struct { unsigned long pgprot; } pgprot_t;
 
 #ifndef __ASSEMBLY__
 
+struct vm_area_struct;
+
 /*
  * This much address space is reserved for vmalloc() and iomap()
  * as well as fixmap mappings.
@@ -139,6 +141,7 @@ extern int page_is_ram(unsigned long pagenr);
 #include <asm-generic/memory_model.h>
 #include <asm-generic/page.h>
 
+#define __HAVE_ARCH_GATE_AREA 1
 #endif /* __KERNEL__ */
 
 #endif /* _I386_PAGE_H */
diff --git a/include/asm-i386/thread_info.h b/include/asm-i386/thread_info.h
index ff1e2b1a7c84..2833fa2c0dd0 100644
--- a/include/asm-i386/thread_info.h
+++ b/include/asm-i386/thread_info.h
@@ -37,6 +37,7 @@ struct thread_info {
 					 	   0-0xBFFFFFFF for user-thead
 						   0-0xFFFFFFFF for kernel-thread
 						*/
+	void			*sysenter_return;
 	struct restart_block    restart_block;
 
 	unsigned long           previous_esp;   /* ESP of the previous stack in case
diff --git a/include/asm-i386/unwind.h b/include/asm-i386/unwind.h
index d480f2e38215..69f0f1df6722 100644
--- a/include/asm-i386/unwind.h
+++ b/include/asm-i386/unwind.h
@@ -78,8 +78,8 @@ static inline int arch_unw_user_mode(const struct unwind_frame_info *info)
 	return user_mode_vm(&info->regs);
 #else
 	return info->regs.eip < PAGE_OFFSET
-	       || (info->regs.eip >= __fix_to_virt(FIX_VSYSCALL)
-	            && info->regs.eip < __fix_to_virt(FIX_VSYSCALL) + PAGE_SIZE)
+	       || (info->regs.eip >= __fix_to_virt(FIX_VDSO)
+	            && info->regs.eip < __fix_to_virt(FIX_VDSO) + PAGE_SIZE)
 	       || info->regs.esp < PAGE_OFFSET;
 #endif
 }
diff --git a/include/linux/mm.h b/include/linux/mm.h
index a929ea197e48..ff1fa87df8d0 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1065,5 +1065,7 @@ void drop_slab(void);
 extern int randomize_va_space;
 #endif
 
+const char *arch_vma_name(struct vm_area_struct *vma);
+
 #endif /* __KERNEL__ */
 #endif /* _LINUX_MM_H */
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 349ef908a222..bee12a7a0576 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -189,6 +189,7 @@ enum
 	VM_ZONE_RECLAIM_MODE=31, /* reclaim local zone memory before going off node */
 	VM_ZONE_RECLAIM_INTERVAL=32, /* time period to wait after reclaim failure */
 	VM_PANIC_ON_OOM=33,	/* panic at out-of-memory */
+	VM_VDSO_ENABLED=34,	/* map VDSO into new processes? */
 };
 
 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index f1a4eb1a655e..f54afed8426f 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -927,6 +927,18 @@ static ctl_table vm_table[] = {
 		.proc_handler	= &proc_dointvec_jiffies,
 		.strategy	= &sysctl_jiffies,
 	},
+#endif
+#ifdef CONFIG_X86_32
+	{
+		.ctl_name	= VM_VDSO_ENABLED,
+		.procname	= "vdso_enabled",
+		.data		= &vdso_enabled,
+		.maxlen		= sizeof(vdso_enabled),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &zero,
+	},
 #endif
 	{ .ctl_name = 0 }
 };
-- 
cgit v1.2.3


From c9cf55285e87ac423c45d9efca750d3f50234d10 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@xenotime.net>
Date: Tue, 27 Jun 2006 02:53:52 -0700
Subject: [PATCH] add poison.h and patch primary users

Localize poison values into one header file for better documentation and
easier/quicker debugging and so that the same values won't be used for
multiple purposes.

Use these constants in core arch., mm, driver, and fs code.

Signed-off-by: Randy Dunlap <rdunlap@xenotime.net>
Acked-by: Matt Mackall <mpm@selenic.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Andi Kleen <ak@muc.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/mm/init.c       |  3 ++-
 arch/powerpc/mm/init_64.c |  3 ++-
 arch/sparc64/mm/init.c    |  3 ++-
 arch/x86_64/mm/init.c     |  7 +++++--
 drivers/base/dmapool.c    |  3 +--
 fs/jbd/journal.c          |  3 ++-
 include/linux/list.h      |  9 +--------
 include/linux/poison.h    | 45 +++++++++++++++++++++++++++++++++++++++++++++
 mm/slab.c                 | 12 +-----------
 9 files changed, 61 insertions(+), 27 deletions(-)
 create mode 100644 include/linux/poison.h

(limited to 'include/linux')

diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c
index 468592531793..f84b16e007ff 100644
--- a/arch/i386/mm/init.c
+++ b/arch/i386/mm/init.c
@@ -23,6 +23,7 @@
 #include <linux/init.h>
 #include <linux/highmem.h>
 #include <linux/pagemap.h>
+#include <linux/poison.h>
 #include <linux/bootmem.h>
 #include <linux/slab.h>
 #include <linux/proc_fs.h>
@@ -753,7 +754,7 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end)
 	for (addr = begin; addr < end; addr += PAGE_SIZE) {
 		ClearPageReserved(virt_to_page(addr));
 		init_page_count(virt_to_page(addr));
-		memset((void *)addr, 0xcc, PAGE_SIZE);
+		memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE);
 		free_page(addr);
 		totalram_pages++;
 	}
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index 9e30f968c184..d454caada265 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -41,6 +41,7 @@
 #include <linux/idr.h>
 #include <linux/nodemask.h>
 #include <linux/module.h>
+#include <linux/poison.h>
 
 #include <asm/pgalloc.h>
 #include <asm/page.h>
@@ -90,7 +91,7 @@ void free_initmem(void)
 
 	addr = (unsigned long)__init_begin;
 	for (; addr < (unsigned long)__init_end; addr += PAGE_SIZE) {
-		memset((void *)addr, 0xcc, PAGE_SIZE);
+		memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE);
 		ClearPageReserved(virt_to_page(addr));
 		init_page_count(virt_to_page(addr));
 		free_page(addr);
diff --git a/arch/sparc64/mm/init.c b/arch/sparc64/mm/init.c
index 5c2bcf354ce6..cb75a27adb51 100644
--- a/arch/sparc64/mm/init.c
+++ b/arch/sparc64/mm/init.c
@@ -18,6 +18,7 @@
 #include <linux/initrd.h>
 #include <linux/swap.h>
 #include <linux/pagemap.h>
+#include <linux/poison.h>
 #include <linux/fs.h>
 #include <linux/seq_file.h>
 #include <linux/kprobes.h>
@@ -1520,7 +1521,7 @@ void free_initmem(void)
 		page = (addr +
 			((unsigned long) __va(kern_base)) -
 			((unsigned long) KERNBASE));
-		memset((void *)addr, 0xcc, PAGE_SIZE);
+		memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE);
 		p = virt_to_page(page);
 
 		ClearPageReserved(p);
diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c
index 51fbf3eddf13..95bd232ff0cf 100644
--- a/arch/x86_64/mm/init.c
+++ b/arch/x86_64/mm/init.c
@@ -23,6 +23,7 @@
 #include <linux/bootmem.h>
 #include <linux/proc_fs.h>
 #include <linux/pci.h>
+#include <linux/poison.h>
 #include <linux/dma-mapping.h>
 #include <linux/module.h>
 #include <linux/memory_hotplug.h>
@@ -660,7 +661,8 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end)
 	for (addr = begin; addr < end; addr += PAGE_SIZE) {
 		ClearPageReserved(virt_to_page(addr));
 		init_page_count(virt_to_page(addr));
-		memset((void *)(addr & ~(PAGE_SIZE-1)), 0xcc, PAGE_SIZE); 
+		memset((void *)(addr & ~(PAGE_SIZE-1)),
+			POISON_FREE_INITMEM, PAGE_SIZE);
 		free_page(addr);
 		totalram_pages++;
 	}
@@ -668,7 +670,8 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end)
 
 void free_initmem(void)
 {
-	memset(__initdata_begin, 0xba, __initdata_end - __initdata_begin);
+	memset(__initdata_begin, POISON_FREE_INITDATA,
+		__initdata_end - __initdata_begin);
 	free_init_pages("unused kernel memory",
 			(unsigned long)(&__init_begin),
 			(unsigned long)(&__init_end));
diff --git a/drivers/base/dmapool.c b/drivers/base/dmapool.c
index e2f64f91ed05..33c5cce1560b 100644
--- a/drivers/base/dmapool.c
+++ b/drivers/base/dmapool.c
@@ -7,6 +7,7 @@
 #include <linux/dmapool.h>
 #include <linux/slab.h>
 #include <linux/module.h>
+#include <linux/poison.h>
 
 /*
  * Pool allocator ... wraps the dma_alloc_coherent page allocator, so
@@ -35,8 +36,6 @@ struct dma_page {	/* cacheable header for 'allocation' bytes */
 };
 
 #define	POOL_TIMEOUT_JIFFIES	((100 /* msec */ * HZ) / 1000)
-#define	POOL_POISON_FREED	0xa7	/* !inuse */
-#define	POOL_POISON_ALLOCATED	0xa9	/* !initted */
 
 static DECLARE_MUTEX (pools_lock);
 
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index 7f96b5cb6781..8c9b28dff119 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -34,6 +34,7 @@
 #include <linux/suspend.h>
 #include <linux/pagemap.h>
 #include <linux/kthread.h>
+#include <linux/poison.h>
 #include <linux/proc_fs.h>
 
 #include <asm/uaccess.h>
@@ -1675,7 +1676,7 @@ static void journal_free_journal_head(struct journal_head *jh)
 {
 #ifdef CONFIG_JBD_DEBUG
 	atomic_dec(&nr_journal_heads);
-	memset(jh, 0x5b, sizeof(*jh));
+	memset(jh, JBD_POISON_FREE, sizeof(*jh));
 #endif
 	kmem_cache_free(journal_head_cache, jh);
 }
diff --git a/include/linux/list.h b/include/linux/list.h
index 37ca31b21bb7..6b74adf5297f 100644
--- a/include/linux/list.h
+++ b/include/linux/list.h
@@ -4,17 +4,10 @@
 #ifdef __KERNEL__
 
 #include <linux/stddef.h>
+#include <linux/poison.h>
 #include <linux/prefetch.h>
 #include <asm/system.h>
 
-/*
- * These are non-NULL pointers that will result in page faults
- * under normal circumstances, used to verify that nobody uses
- * non-initialized list entries.
- */
-#define LIST_POISON1  ((void *) 0x00100100)
-#define LIST_POISON2  ((void *) 0x00200200)
-
 /*
  * Simple doubly linked list implementation.
  *
diff --git a/include/linux/poison.h b/include/linux/poison.h
new file mode 100644
index 000000000000..d536de7dd825
--- /dev/null
+++ b/include/linux/poison.h
@@ -0,0 +1,45 @@
+#ifndef _LINUX_POISON_H
+#define _LINUX_POISON_H
+
+/********** include/linux/list.h **********/
+/*
+ * These are non-NULL pointers that will result in page faults
+ * under normal circumstances, used to verify that nobody uses
+ * non-initialized list entries.
+ */
+#define LIST_POISON1  ((void *) 0x00100100)
+#define LIST_POISON2  ((void *) 0x00200200)
+
+/********** mm/slab.c **********/
+/*
+ * Magic nums for obj red zoning.
+ * Placed in the first word before and the first word after an obj.
+ */
+#define	RED_INACTIVE	0x5A2CF071UL	/* when obj is inactive */
+#define	RED_ACTIVE	0x170FC2A5UL	/* when obj is active */
+
+/* ...and for poisoning */
+#define	POISON_INUSE	0x5a	/* for use-uninitialised poisoning */
+#define POISON_FREE	0x6b	/* for use-after-free poisoning */
+#define	POISON_END	0xa5	/* end-byte of poisoning */
+
+/********** arch/$ARCH/mm/init.c **********/
+#define POISON_FREE_INITMEM	0xcc
+
+/********** arch/x86_64/mm/init.c **********/
+#define	POISON_FREE_INITDATA	0xba
+
+/********** arch/ia64/hp/common/sba_iommu.c **********/
+/*
+ * arch/ia64/hp/common/sba_iommu.c uses a 16-byte poison string with a
+ * value of "SBAIOMMU POISON\0" for spill-over poisoning.
+ */
+
+/********** fs/jbd/journal.c **********/
+#define JBD_POISON_FREE	0x5b
+
+/********** drivers/base/dmapool.c **********/
+#define	POOL_POISON_FREED	0xa7	/* !inuse */
+#define	POOL_POISON_ALLOCATED	0xa9	/* !initted */
+
+#endif
diff --git a/mm/slab.c b/mm/slab.c
index 98ac20bc0de9..47982c2d9f39 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -89,6 +89,7 @@
 #include	<linux/config.h>
 #include	<linux/slab.h>
 #include	<linux/mm.h>
+#include	<linux/poison.h>
 #include	<linux/swap.h>
 #include	<linux/cache.h>
 #include	<linux/interrupt.h>
@@ -492,17 +493,6 @@ struct kmem_cache {
 #endif
 
 #if DEBUG
-/*
- * Magic nums for obj red zoning.
- * Placed in the first word before and the first word after an obj.
- */
-#define	RED_INACTIVE	0x5A2CF071UL	/* when obj is inactive */
-#define	RED_ACTIVE	0x170FC2A5UL	/* when obj is active */
-
-/* ...and for poisoning */
-#define	POISON_INUSE	0x5a	/* for use-uninitialised poisoning */
-#define POISON_FREE	0x6b	/* for use-after-free poisoning */
-#define	POISON_END	0xa5	/* end-byte of poisoning */
 
 /*
  * memory layout of objects:
-- 
cgit v1.2.3


From b3c681e09193559ba15f6c9562bd37045f120a96 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@xenotime.net>
Date: Tue, 27 Jun 2006 02:53:53 -0700
Subject: [PATCH] update two drivers for poison.h

Update two drivers to use poison.h.

Signed-off-by: Randy Dunlap <rdunlap@xenotime.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/atm/firestream.c    | 3 ++-
 include/linux/poison.h      | 6 ++++++
 sound/oss/via82cxxx_audio.c | 5 +++--
 3 files changed, 11 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/atm/firestream.c b/drivers/atm/firestream.c
index f2eeaf9dc56a..1bca86edf570 100644
--- a/drivers/atm/firestream.c
+++ b/drivers/atm/firestream.c
@@ -33,6 +33,7 @@
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/pci.h>
+#include <linux/poison.h>
 #include <linux/errno.h>
 #include <linux/atm.h>
 #include <linux/atmdev.h>
@@ -754,7 +755,7 @@ static void process_txdone_queue (struct fs_dev *dev, struct queue *q)
 			fs_kfree_skb (skb);
 
 			fs_dprintk (FS_DEBUG_ALLOC, "Free trans-d: %p\n", td); 
-			memset (td, 0x12, sizeof (struct FS_BPENTRY));
+			memset (td, ATM_POISON_FREE, sizeof(struct FS_BPENTRY));
 			kfree (td);
 			break;
 		default:
diff --git a/include/linux/poison.h b/include/linux/poison.h
index d536de7dd825..4109f37b7b66 100644
--- a/include/linux/poison.h
+++ b/include/linux/poison.h
@@ -42,4 +42,10 @@
 #define	POOL_POISON_FREED	0xa7	/* !inuse */
 #define	POOL_POISON_ALLOCATED	0xa9	/* !initted */
 
+/********** drivers/atm/ **********/
+#define ATM_POISON_FREE		0x12
+
+/********** sound/oss/ **********/
+#define OSS_POISON_FREE		0xAB
+
 #endif
diff --git a/sound/oss/via82cxxx_audio.c b/sound/oss/via82cxxx_audio.c
index 1a921ee71aba..2343dedd44ae 100644
--- a/sound/oss/via82cxxx_audio.c
+++ b/sound/oss/via82cxxx_audio.c
@@ -24,6 +24,7 @@
 #include <linux/fs.h>
 #include <linux/mm.h>
 #include <linux/pci.h>
+#include <linux/poison.h>
 #include <linux/init.h>
 #include <linux/interrupt.h>
 #include <linux/proc_fs.h>
@@ -3522,7 +3523,7 @@ err_out_have_mixer:
 
 err_out_kfree:
 #ifndef VIA_NDEBUG
-	memset (card, 0xAB, sizeof (*card)); /* poison memory */
+	memset (card, OSS_POISON_FREE, sizeof (*card)); /* poison memory */
 #endif
 	kfree (card);
 
@@ -3559,7 +3560,7 @@ static void __devexit via_remove_one (struct pci_dev *pdev)
 	via_ac97_cleanup (card);
 
 #ifndef VIA_NDEBUG
-	memset (card, 0xAB, sizeof (*card)); /* poison memory */
+	memset (card, OSS_POISON_FREE, sizeof (*card)); /* poison memory */
 #endif
 	kfree (card);
 
-- 
cgit v1.2.3


From a7807a32bbb027ab9955b96734fdc7f1e6497a9f Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@xenotime.net>
Date: Tue, 27 Jun 2006 02:53:54 -0700
Subject: [PATCH] poison: add & use more constants

Add more poison values to include/linux/poison.h.  It's not clear to me
whether some others should be added or not, so I haven't added any of
these:

./include/linux/libata.h:#define ATA_TAG_POISON		0xfafbfcfdU
./arch/ppc/8260_io/fcc_enet.c:1918:	memset((char *)(&(immap->im_dprambase[(mem_addr+64)])), 0x88, 32);
./drivers/usb/mon/mon_text.c:429:	memset(mem, 0xe5, sizeof(struct mon_event_text));
./drivers/char/ftape/lowlevel/ftape-ctl.c:738:		memset(ft_buffer[i]->address, 0xAA, FT_BUFF_SIZE);
./drivers/block/sx8.c:/* 0xf is just arbitrary, non-zero noise; this is sorta like poisoning */

Signed-off-by: Randy Dunlap <rdunlap@xenotime.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/poison.h | 7 +++++++
 kernel/mutex-debug.c   | 5 +++--
 security/keys/key.c    | 3 ++-
 3 files changed, 12 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/poison.h b/include/linux/poison.h
index 4109f37b7b66..a5347c02432e 100644
--- a/include/linux/poison.h
+++ b/include/linux/poison.h
@@ -45,6 +45,13 @@
 /********** drivers/atm/ **********/
 #define ATM_POISON_FREE		0x12
 
+/********** kernel/mutexes **********/
+#define MUTEX_DEBUG_INIT	0x11
+#define MUTEX_DEBUG_FREE	0x22
+
+/********** security/ **********/
+#define KEY_DESTROY		0xbd
+
 /********** sound/oss/ **********/
 #define OSS_POISON_FREE		0xAB
 
diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c
index 036b6285b15c..e38e4bac97ca 100644
--- a/kernel/mutex-debug.c
+++ b/kernel/mutex-debug.c
@@ -16,6 +16,7 @@
 #include <linux/sched.h>
 #include <linux/delay.h>
 #include <linux/module.h>
+#include <linux/poison.h>
 #include <linux/spinlock.h>
 #include <linux/kallsyms.h>
 #include <linux/interrupt.h>
@@ -381,7 +382,7 @@ void debug_mutex_set_owner(struct mutex *lock,
 
 void debug_mutex_init_waiter(struct mutex_waiter *waiter)
 {
-	memset(waiter, 0x11, sizeof(*waiter));
+	memset(waiter, MUTEX_DEBUG_INIT, sizeof(*waiter));
 	waiter->magic = waiter;
 	INIT_LIST_HEAD(&waiter->list);
 }
@@ -397,7 +398,7 @@ void debug_mutex_wake_waiter(struct mutex *lock, struct mutex_waiter *waiter)
 void debug_mutex_free_waiter(struct mutex_waiter *waiter)
 {
 	DEBUG_WARN_ON(!list_empty(&waiter->list));
-	memset(waiter, 0x22, sizeof(*waiter));
+	memset(waiter, MUTEX_DEBUG_FREE, sizeof(*waiter));
 }
 
 void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter,
diff --git a/security/keys/key.c b/security/keys/key.c
index 43295ca37b5d..80de8c3e9cc3 100644
--- a/security/keys/key.c
+++ b/security/keys/key.c
@@ -11,6 +11,7 @@
 
 #include <linux/module.h>
 #include <linux/init.h>
+#include <linux/poison.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/security.h>
@@ -988,7 +989,7 @@ void unregister_key_type(struct key_type *ktype)
 		if (key->type == ktype) {
 			if (ktype->destroy)
 				ktype->destroy(key);
-			memset(&key->payload, 0xbd, sizeof(key->payload));
+			memset(&key->payload, KEY_DESTROY, sizeof(key->payload));
 		}
 	}
 
-- 
cgit v1.2.3


From b6cd0b772dcc5dc9b4c03d53946474dee399fa72 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Tue, 27 Jun 2006 02:53:54 -0700
Subject: [PATCH] fs/buffer.c: cleanups

- add a proper prototype for the following global function:
  - buffer_init()

- make the following needlessly global function static:
  - end_buffer_async_write()

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/buffer.c                 | 3 +--
 include/linux/buffer_head.h | 2 +-
 init/main.c                 | 2 +-
 3 files changed, 3 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/fs/buffer.c b/fs/buffer.c
index 373bb6292bdc..f23bb647db47 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -564,7 +564,7 @@ still_busy:
  * Completion handler for block_write_full_page() - pages which are unlocked
  * during I/O, and which have PageWriteback cleared upon I/O completion.
  */
-void end_buffer_async_write(struct buffer_head *bh, int uptodate)
+static void end_buffer_async_write(struct buffer_head *bh, int uptodate)
 {
 	char b[BDEVNAME_SIZE];
 	unsigned long flags;
@@ -3166,7 +3166,6 @@ EXPORT_SYMBOL(block_sync_page);
 EXPORT_SYMBOL(block_truncate_page);
 EXPORT_SYMBOL(block_write_full_page);
 EXPORT_SYMBOL(cont_prepare_write);
-EXPORT_SYMBOL(end_buffer_async_write);
 EXPORT_SYMBOL(end_buffer_read_sync);
 EXPORT_SYMBOL(end_buffer_write_sync);
 EXPORT_SYMBOL(file_fsync);
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index fb7e9b7ccbe3..737e407d0cd1 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -149,7 +149,6 @@ void create_empty_buffers(struct page *, unsigned long,
 			unsigned long b_state);
 void end_buffer_read_sync(struct buffer_head *bh, int uptodate);
 void end_buffer_write_sync(struct buffer_head *bh, int uptodate);
-void end_buffer_async_write(struct buffer_head *bh, int uptodate);
 
 /* Things to do with buffers at mapping->private_list */
 void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode);
@@ -214,6 +213,7 @@ int nobh_truncate_page(struct address_space *, loff_t);
 int nobh_writepage(struct page *page, get_block_t *get_block,
                         struct writeback_control *wbc);
 
+void buffer_init(void);
 
 /*
  * inline definitions
diff --git a/init/main.c b/init/main.c
index 80af1a52485f..0d57f6ccb63a 100644
--- a/init/main.c
+++ b/init/main.c
@@ -48,6 +48,7 @@
 #include <linux/mempolicy.h>
 #include <linux/key.h>
 #include <linux/unwind.h>
+#include <linux/buffer_head.h>
 
 #include <asm/io.h>
 #include <asm/bugs.h>
@@ -80,7 +81,6 @@ extern void mca_init(void);
 extern void sbus_init(void);
 extern void sysctl_init(void);
 extern void signals_init(void);
-extern void buffer_init(void);
 extern void pidhash_init(void);
 extern void pidmap_init(void);
 extern void prio_tree_init(void);
-- 
cgit v1.2.3


From 1c0f16e5cdff59f3b132a1b0c0d44a941f8813d2 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Tue, 27 Jun 2006 02:53:56 -0700
Subject: [PATCH] Remove gratuitous inclusion of <linux/config.h> from
 <linux/dmaengine.h>

We include config.h on the compiler command line. There's no need for it
to be included again.

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/dmaengine.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h
index 78b236ca04f8..272010a6078a 100644
--- a/include/linux/dmaengine.h
+++ b/include/linux/dmaengine.h
@@ -20,7 +20,7 @@
  */
 #ifndef DMAENGINE_H
 #define DMAENGINE_H
-#include <linux/config.h>
+
 #ifdef CONFIG_DMA_ENGINE
 
 #include <linux/device.h>
-- 
cgit v1.2.3


From c32e066057fe0914da262c94e52cefb142f965b4 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@us.ibm.com>
Date: Tue, 27 Jun 2006 02:54:04 -0700
Subject: [PATCH] rcutorture: add call_rcu_bh() operations

Add operations for the call_rcu_bh() variant of RCU.  Also add an
rcu_batches_completed_bh() function, which is needed by rcutorture.

Signed-off-by: Paul E. McKenney <paulmck@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/rcupdate.h |  1 +
 kernel/rcupdate.c        | 10 ++++++++++
 kernel/rcutorture.c      | 40 ++++++++++++++++++++++++++++++++++++++--
 3 files changed, 49 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 6312758393b6..48dfe00070c7 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -258,6 +258,7 @@ extern void rcu_init(void);
 extern void rcu_check_callbacks(int cpu, int user);
 extern void rcu_restart_cpu(int cpu);
 extern long rcu_batches_completed(void);
+extern long rcu_batches_completed_bh(void);
 
 /* Exported interfaces */
 extern void FASTCALL(call_rcu(struct rcu_head *head, 
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 20e9710fc21c..c0e1cb95dd4f 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -182,6 +182,15 @@ long rcu_batches_completed(void)
 	return rcu_ctrlblk.completed;
 }
 
+/*
+ * Return the number of RCU batches processed thus far.  Useful
+ * for debug and statistics.
+ */
+long rcu_batches_completed_bh(void)
+{
+	return rcu_bh_ctrlblk.completed;
+}
+
 static void rcu_barrier_callback(struct rcu_head *notused)
 {
 	if (atomic_dec_and_test(&rcu_barrier_cpu_count))
@@ -619,6 +628,7 @@ module_param(qlowmark, int, 0);
 module_param(rsinterval, int, 0);
 #endif
 EXPORT_SYMBOL_GPL(rcu_batches_completed);
+EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
 EXPORT_SYMBOL_GPL(call_rcu);
 EXPORT_SYMBOL_GPL(call_rcu_bh);
 EXPORT_SYMBOL_GPL(synchronize_rcu);
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index c96b5edd6ed1..4d1c3d247127 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -66,7 +66,7 @@ MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs");
 module_param(shuffle_interval, int, 0);
 MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles");
 module_param(torture_type, charp, 0);
-MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu)");
+MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh)");
 
 #define TORTURE_FLAG "-torture:"
 #define PRINTK_STRING(s) \
@@ -246,8 +246,44 @@ static struct rcu_torture_ops rcu_ops = {
 	.name = "rcu"
 };
 
+/*
+ * Definitions for rcu_bh torture testing.
+ */
+
+static int rcu_bh_torture_read_lock(void)
+{
+	rcu_read_lock_bh();
+	return 0;
+}
+
+static void rcu_bh_torture_read_unlock(int idx)
+{
+	rcu_read_unlock_bh();
+}
+
+static int rcu_bh_torture_completed(void)
+{
+	return rcu_batches_completed_bh();
+}
+
+static void rcu_bh_torture_deferred_free(struct rcu_torture *p)
+{
+	call_rcu_bh(&p->rtort_rcu, rcu_torture_cb);
+}
+
+static struct rcu_torture_ops rcu_bh_ops = {
+	.init = NULL,
+	.cleanup = NULL,
+	.readlock = rcu_bh_torture_read_lock,
+	.readunlock = rcu_bh_torture_read_unlock,
+	.completed = rcu_bh_torture_completed,
+	.deferredfree = rcu_bh_torture_deferred_free,
+	.stats = NULL,
+	.name = "rcu_bh"
+};
+
 static struct rcu_torture_ops *torture_ops[] =
-	{ &rcu_ops, NULL };
+	{ &rcu_ops, &rcu_bh_ops, NULL };
 
 /*
  * RCU torture writer kthread.  Repeatedly substitutes a new structure
-- 
cgit v1.2.3


From 65edc68c345cbe21d0b0375c3452a3ed5e322868 Mon Sep 17 00:00:00 2001
From: Chandra Seetharaman <sekharan@us.ibm.com>
Date: Tue, 27 Jun 2006 02:54:08 -0700
Subject: [PATCH] cpu hotplug: make [un]register_cpu_notifier init time only

CPUs come online only at init time (unless CONFIG_HOTPLUG_CPU is defined).
So, cpu_notifier functionality need to be available only at init time.

This patch makes register_cpu_notifier() available only at init time, unless
CONFIG_HOTPLUG_CPU is defined.

This patch exports register_cpu_notifier() and unregister_cpu_notifier() only
if CONFIG_HOTPLUG_CPU is defined.

Signed-off-by: Chandra Seetharaman <sekharan@us.ibm.com>
Cc: Ashok Raj <ashok.raj@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/cpufreq/cpufreq.c       | 8 +++++---
 drivers/cpufreq/cpufreq_stats.c | 4 ++--
 include/linux/cpu.h             | 6 ++++++
 kernel/cpu.c                    | 8 +++++---
 4 files changed, 18 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 486ef6664708..3533e26f837d 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -1497,7 +1497,8 @@ int cpufreq_update_policy(unsigned int cpu)
 }
 EXPORT_SYMBOL(cpufreq_update_policy);
 
-static int __cpuinit cpufreq_cpu_callback(struct notifier_block *nfb,
+#ifdef CONFIG_HOTPLUG_CPU
+static int cpufreq_cpu_callback(struct notifier_block *nfb,
 					unsigned long action, void *hcpu)
 {
 	unsigned int cpu = (unsigned long)hcpu;
@@ -1536,6 +1537,7 @@ static struct notifier_block cpufreq_cpu_notifier =
 {
     .notifier_call = cpufreq_cpu_callback,
 };
+#endif /* CONFIG_HOTPLUG_CPU */
 
 /*********************************************************************
  *               REGISTER / UNREGISTER CPUFREQ DRIVER                *
@@ -1596,7 +1598,7 @@ int cpufreq_register_driver(struct cpufreq_driver *driver_data)
 	}
 
 	if (!ret) {
-		register_cpu_notifier(&cpufreq_cpu_notifier);
+		register_hotcpu_notifier(&cpufreq_cpu_notifier);
 		dprintk("driver %s up and running\n", driver_data->name);
 		cpufreq_debug_enable_ratelimit();
 	}
@@ -1628,7 +1630,7 @@ int cpufreq_unregister_driver(struct cpufreq_driver *driver)
 	dprintk("unregistering driver %s\n", driver->name);
 
 	sysdev_driver_unregister(&cpu_sysdev_class, &cpufreq_sysdev_driver);
-	unregister_cpu_notifier(&cpufreq_cpu_notifier);
+	unregister_hotcpu_notifier(&cpufreq_cpu_notifier);
 
 	spin_lock_irqsave(&cpufreq_driver_lock, flags);
 	cpufreq_driver = NULL;
diff --git a/drivers/cpufreq/cpufreq_stats.c b/drivers/cpufreq/cpufreq_stats.c
index c576c0b3f452..145061b8472a 100644
--- a/drivers/cpufreq/cpufreq_stats.c
+++ b/drivers/cpufreq/cpufreq_stats.c
@@ -350,7 +350,7 @@ __init cpufreq_stats_init(void)
 		return ret;
 	}
 
-	register_cpu_notifier(&cpufreq_stat_cpu_notifier);
+	register_hotcpu_notifier(&cpufreq_stat_cpu_notifier);
 	lock_cpu_hotplug();
 	for_each_online_cpu(cpu) {
 		cpufreq_stat_cpu_callback(&cpufreq_stat_cpu_notifier, CPU_ONLINE,
@@ -368,7 +368,7 @@ __exit cpufreq_stats_exit(void)
 			CPUFREQ_POLICY_NOTIFIER);
 	cpufreq_unregister_notifier(&notifier_trans_block,
 			CPUFREQ_TRANSITION_NOTIFIER);
-	unregister_cpu_notifier(&cpufreq_stat_cpu_notifier);
+	unregister_hotcpu_notifier(&cpufreq_stat_cpu_notifier);
 	lock_cpu_hotplug();
 	for_each_online_cpu(cpu) {
 		cpufreq_stat_cpu_callback(&cpufreq_stat_cpu_notifier, CPU_DEAD,
diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index b23bf1c8addc..cdfe471a70a1 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -41,7 +41,13 @@ struct notifier_block;
 #ifdef CONFIG_SMP
 /* Need to know about CPUs going up/down? */
 extern int register_cpu_notifier(struct notifier_block *nb);
+#ifdef CONFIG_HOTPLUG_CPU
 extern void unregister_cpu_notifier(struct notifier_block *nb);
+#else
+static inline void unregister_cpu_notifier(struct notifier_block *nb)
+{
+}
+#endif
 extern int current_in_cpu_hotplug(void);
 
 int cpu_up(unsigned int cpu);
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 03dcd981846a..70fbf2e83766 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -18,7 +18,7 @@
 /* This protects CPUs going up and down... */
 static DEFINE_MUTEX(cpucontrol);
 
-static BLOCKING_NOTIFIER_HEAD(cpu_chain);
+static __cpuinitdata BLOCKING_NOTIFIER_HEAD(cpu_chain);
 
 #ifdef CONFIG_HOTPLUG_CPU
 static struct task_struct *lock_cpu_hotplug_owner;
@@ -69,10 +69,13 @@ EXPORT_SYMBOL_GPL(lock_cpu_hotplug_interruptible);
 #endif	/* CONFIG_HOTPLUG_CPU */
 
 /* Need to know about CPUs going up/down? */
-int register_cpu_notifier(struct notifier_block *nb)
+int __cpuinit register_cpu_notifier(struct notifier_block *nb)
 {
 	return blocking_notifier_chain_register(&cpu_chain, nb);
 }
+
+#ifdef CONFIG_HOTPLUG_CPU
+
 EXPORT_SYMBOL(register_cpu_notifier);
 
 void unregister_cpu_notifier(struct notifier_block *nb)
@@ -81,7 +84,6 @@ void unregister_cpu_notifier(struct notifier_block *nb)
 }
 EXPORT_SYMBOL(unregister_cpu_notifier);
 
-#ifdef CONFIG_HOTPLUG_CPU
 static inline void check_for_tasks(int cpu)
 {
 	struct task_struct *p;
-- 
cgit v1.2.3


From 39f4885c56073ecafd482d7e10dd7b17900fa312 Mon Sep 17 00:00:00 2001
From: Chandra Seetharaman <sekharan@us.ibm.com>
Date: Tue, 27 Jun 2006 02:54:10 -0700
Subject: [PATCH] cpu hotplug: add hotplug versions of cpu_notifier

Define new macros register_hotcpu_notifier() and unregister_hotcpu_notifier()
that redefines register_cpu_notifier() and unregister_cpu_notifier() for use
only when HOTPLUG_CPU is defined.

Signed-off-by: Chandra Seetharaman <sekharan@us.ibm.com>
Cc: Ashok Raj <ashok.raj@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/cpu.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index cdfe471a70a1..a3caf6866bae 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -79,6 +79,8 @@ extern int lock_cpu_hotplug_interruptible(void);
 		{ .notifier_call = fn, .priority = pri };	\
 	register_cpu_notifier(&fn##_nb);			\
 }
+#define register_hotcpu_notifier(nb)	register_cpu_notifier(nb)
+#define unregister_hotcpu_notifier(nb)	unregister_cpu_notifier(nb)
 int cpu_down(unsigned int cpu);
 #define cpu_is_offline(cpu) unlikely(!cpu_online(cpu))
 #else
@@ -86,6 +88,8 @@ int cpu_down(unsigned int cpu);
 #define unlock_cpu_hotplug()	do { } while (0)
 #define lock_cpu_hotplug_interruptible() 0
 #define hotcpu_notifier(fn, pri)
+#define register_hotcpu_notifier(nb)
+#define unregister_hotcpu_notifier(nb)
 
 /* CPUs don't go offline once they're online w/o CONFIG_HOTPLUG_CPU */
 static inline int cpu_is_offline(int cpu) { return 0; }
-- 
cgit v1.2.3


From 62c83cde9282a9580994a12b3063e677181b5ebe Mon Sep 17 00:00:00 2001
From: Jim Cromie <jim.cromie@gmail.com>
Date: Tue, 27 Jun 2006 02:54:13 -0700
Subject: [PATCH] chardev: GPIO for SCx200 & PC-8736x: whitespace pre-clean

GPIO SUPPORT FOR SCx200 & PC8736x

The patch-set reworks the 2.4 vintage scx200_gpio driver for modern 2.6, and
refactors GPIO support to reuse it in a new driver for the GPIO on PC-8736x
chips.  Its handy for the Soekris.com net-4801, which has both chips.

These patches have been seen recently on Kernel-Mentors, and then
Kernel-Newbies ML, where Jesper Juhl kindly reviewed it.  His feedback has
been incorporated.  Thanks Jesper !

Its also gone to soekris-tech@soekris.com for possible testing by linux folks,
I've gotten 1 promise so far.  Theyre mostly BSD folk over there, but we'll
see..

Device-file & Sysfs

The driver preserves the existing device-file interface, including the
write/cmd set, but adds v to 'view' the pin-settings & configs by inducing,
via gpio_dump(), a dev_info() call.  Its a fairly crappy way to get status,
but it sticks to the syslog approach, conservatively.

Allowing users to voluntarily trigger logging is good, it gives them a
familiar way to confirm their app's control & use of the pins, and I've thus
reduced the pin-mode-updates from dev_info to dev_dbg.

I've recently bolted on a proto sysfs interface for both new drivers.  Im not
including those patches here; they (the patch + doc-pre-patch) are still quite
raw (and unreviewed on KNML), and since they 'invent' a convention for GPIO, a
proper vetting is needed.  Since this patchset is much bigger than my previous
ones, Id like to keep things simpler, and address it 1st, before bolting on
more stuff.

The driver-split

The Geode CPU and the PC-87366 Super-IO chip have GPIO units which share a
common pin-architecture (same pin features, with same bits controlling), but
with different addressing mechanics and port organizations.

The vintage driver expresses the pin capabilities with pin-mode commands
[OoPpTt],etc that change the pin configurations, and since the 2 chips share
pin-arch, we can reuse the read(), write() commands, once the implementation
is suitably adjusted.

The patchset adds a vtable: struct nsc_gpio_ops, to abstract the existing gpio
operations, then adjusts fileops.write() code to invoke operations via that
vtable.  Driver specific open()s set private_data to the vtable so its
available for use by write().

The vtable gets the gpio_dump() too, since its user-friendly, and (could be
construed as) part of the current device-file interface.  To support use of
dev_dbg() in write() & _dump(), the vtable gets a dev ptr too, set by both
scx200 & pc8736x _gpio drivers.

heres how the pins are presented in syslog:

[ 1890.176223]  scx200_gpio.0: io00: 0x0044 TS OD PUE  EDGE LO DEBOUNCE
[ 1890.287223]  scx200_gpio.0: io01: 0x0003 OE PP PUD  EDGE LO

nsc_gpio.c: new file is new home of several file-ops methods, which are
modified to get their vtable from filp->private_data, and use it where needed.

scx200_gpio.c: keeps some of its existing gpio routines, but now wires them up
via the vtable (they're invoked by nsc_gpio.c:nsc_gpio_write() thru this
vtable).  A driver-spcific open() initializes filp->private_data with the
vtable.

Once the split is clean, and the scx200_gpio driver is working, we copy and
modify the function and variable names, and rework the access-method bodies
for the different addressing scheme.

Heres a working overview of the patchset:

# series file for GPIO

# Spring Cleaning
gpio-scx/patch.preclean        # scripts/Lindent fixes, editor-ctrl comments

# API Modernization

gpio-scx/patch.api26        # what I learned from LDD3
gpio-scx/patch.platform-dev-2    # get pdev, support for dev_dbg()
gpio-scx/patch.unsigned-minor    # fix to match std practice

# Debuggability

gpio-scx/patch.dump-diet    # shrink gpio_dump()
gpio-scx/patch.viewpins        # add new 'command' to call dump()
gpio-scx/patch.init-refactor    # pull shadow-register init to sub

# Access-Abstraction (add vtable)

gpio-scx/patch.access-vtable    # introduce nsg_gpio_ops vtable, w dump
gpio-scx/patch.vtable-calls    # add & use the vtable in scx200_gpio
gpio-scx/patch.nscgpio-shell    # add empty driver for common-fops

# move code under abstraction
gpio-scx/patch.migrate-fops    # move file-ops methods from scx200_gpio
gpio-scx/patch.common-dump    # mv scx200.c:scx200_gpio_dump() to nsc_gpio.c
gpio-scx/patch.add-pc8736x-gpio    # add new driver, like old, w chip adapt
# gpio-scx/patch.add-DEBUG    # enable all dev_dbg()s

# Cleanups

# finish printk -> dev_dbg() etc
gpio-scx/patch.pdev-pc8736x    # new drvr needs pdev too,
gpio-scx/patch.devdbg-nscgpio    # add device to 'vtable', use in dev_dbg()

# gpio-scx/patch.pin-config-view    # another 'c' 'command'
# gpio-scx/quiet-getset        # take out excess dbg stuff (pretty quiet
now)
gpio-scx/patch.shadow-current    # imitate scx200_gpio's shadow regs in
pc87*

# post KMentors-post patches ..

gpio-scx/patch.mutexes        # use mutexes for config-locks
gpio-scx/patch.viewpins-values    # extend dump to obsolete separate 'c' cmd

gpio-scx/patch.kconfig        # add stuff for kbuild

# TBC
# combine api26 with pdev, which is just one step.
# merge c&v commands to single do-all-fn
# delay viewpins, dump-diet should also un-ifdef it too.

diff.sys-gpio-rollup-1

This patch:

Removed editor format-control comments, and used scripts/Lindent to clean up
whitespace, then deleted the bogus chunks :-(

Signed-off-by: Jim Cromie <jim.cromie@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/scx200.c   |  7 -------
 drivers/char/scx200_gpio.c  | 26 +++++++++-----------------
 include/linux/scx200.h      |  7 -------
 include/linux/scx200_gpio.h |  7 -------
 4 files changed, 9 insertions(+), 38 deletions(-)

(limited to 'include/linux')

diff --git a/arch/i386/kernel/scx200.c b/arch/i386/kernel/scx200.c
index 321f5fd26e75..4d3d8e811fe1 100644
--- a/arch/i386/kernel/scx200.c
+++ b/arch/i386/kernel/scx200.c
@@ -159,10 +159,3 @@ EXPORT_SYMBOL(scx200_gpio_base);
 EXPORT_SYMBOL(scx200_gpio_shadow);
 EXPORT_SYMBOL(scx200_gpio_configure);
 EXPORT_SYMBOL(scx200_cb_base);
-
-/*
-    Local variables:
-        compile-command: "make -k -C ../../.. SUBDIRS=arch/i386/kernel modules"
-        c-basic-offset: 8
-    End:
-*/
diff --git a/drivers/char/scx200_gpio.c b/drivers/char/scx200_gpio.c
index 664a6e97eb1a..dd051ea6755e 100644
--- a/drivers/char/scx200_gpio.c
+++ b/drivers/char/scx200_gpio.c
@@ -1,4 +1,4 @@
-/* linux/drivers/char/scx200_gpio.c 
+/* linux/drivers/char/scx200_gpio.c
 
    National Semiconductor SCx200 GPIO driver.  Allows a user space
    process to play with the GPIO pins.
@@ -26,7 +26,7 @@ static int major = 0;		/* default to dynamic major */
 module_param(major, int, 0);
 MODULE_PARM_DESC(major, "Major device number");
 
-static ssize_t scx200_gpio_write(struct file *file, const char __user *data, 
+static ssize_t scx200_gpio_write(struct file *file, const char __user *data,
 				 size_t len, loff_t *ppos)
 {
 	unsigned m = iminor(file->f_dentry->d_inode);
@@ -34,15 +34,14 @@ static ssize_t scx200_gpio_write(struct file *file, const char __user *data,
 
 	for (i = 0; i < len; ++i) {
 		char c;
-		if (get_user(c, data+i))
+		if (get_user(c, data + i))
 			return -EFAULT;
-		switch (c)
-		{
-		case '0': 
-			scx200_gpio_set(m, 0); 
+		switch (c) {
+		case '0':
+			scx200_gpio_set(m, 0);
 			break;
-		case '1': 
-			scx200_gpio_set(m, 1); 
+		case '1':
+			scx200_gpio_set(m, 1);
 			break;
 		case 'O':
 			printk(KERN_INFO NAME ": GPIO%d output enabled\n", m);
@@ -83,7 +82,7 @@ static ssize_t scx200_gpio_read(struct file *file, char __user *buf,
 	value = scx200_gpio_get(m);
 	if (put_user(value ? '1' : '0', buf))
 		return -EFAULT;
-	
+
 	return 1;
 }
 
@@ -140,10 +139,3 @@ static void __exit scx200_gpio_cleanup(void)
 
 module_init(scx200_gpio_init);
 module_exit(scx200_gpio_cleanup);
-
-/*
-    Local variables:
-        compile-command: "make -k -C ../.. SUBDIRS=drivers/char modules"
-        c-basic-offset: 8
-    End:
-*/
diff --git a/include/linux/scx200.h b/include/linux/scx200.h
index a22f9e173ad2..693c0557e70b 100644
--- a/include/linux/scx200.h
+++ b/include/linux/scx200.h
@@ -49,10 +49,3 @@ extern unsigned scx200_cb_base;
 #define SCx200_REV 0x3d		/* Revision Register */
 #define SCx200_CBA 0x3e		/* Configuration Base Address Register */
 #define SCx200_CBA_SCRATCH 0x64	/* Configuration Base Address Scratchpad */
-
-/*
-    Local variables:
-        compile-command: "make -C ../.. bzImage modules"
-        c-basic-offset: 8
-    End:
-*/
diff --git a/include/linux/scx200_gpio.h b/include/linux/scx200_gpio.h
index 30cdd648ba79..83ab8a1521f5 100644
--- a/include/linux/scx200_gpio.h
+++ b/include/linux/scx200_gpio.h
@@ -87,10 +87,3 @@ static inline void scx200_gpio_change(int index) {
 #undef __SCx200_GPIO_SHADOW
 #undef __SCx200_GPIO_INDEX
 #undef __SCx200_GPIO_OUT
-
-/*
-    Local variables:
-        compile-command: "make -C ../.. bzImage modules"
-        c-basic-offset: 8
-    End:
-*/
-- 
cgit v1.2.3


From 55b8c0455b8aeb80f94183fa3aa42e3fa62b1705 Mon Sep 17 00:00:00 2001
From: Jim Cromie <jim.cromie@gmail.com>
Date: Tue, 27 Jun 2006 02:54:15 -0700
Subject: [PATCH] chardev: GPIO for SCx200 & PC-8736x: device minor numbers are
 unsigned ints

Per kernel headers, device minor numbers are unsigned ints.  Do the same in
this driver.

Signed-off-by: Jim Cromie <jim.cromie@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/scx200.c   |  2 +-
 include/linux/scx200_gpio.h | 14 +++++++-------
 2 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/arch/i386/kernel/scx200.c b/arch/i386/kernel/scx200.c
index 4d3d8e811fe1..18f895ce2488 100644
--- a/arch/i386/kernel/scx200.c
+++ b/arch/i386/kernel/scx200.c
@@ -87,7 +87,7 @@ static int __devinit scx200_probe(struct pci_dev *pdev, const struct pci_device_
 	return 0;
 }
 
-u32 scx200_gpio_configure(int index, u32 mask, u32 bits)
+u32 scx200_gpio_configure(unsigned index, u32 mask, u32 bits)
 {
 	u32 config, new_config;
 	unsigned long flags;
diff --git a/include/linux/scx200_gpio.h b/include/linux/scx200_gpio.h
index 83ab8a1521f5..90dd069cc145 100644
--- a/include/linux/scx200_gpio.h
+++ b/include/linux/scx200_gpio.h
@@ -1,6 +1,6 @@
 #include <linux/spinlock.h>
 
-u32 scx200_gpio_configure(int index, u32 set, u32 clear);
+u32 scx200_gpio_configure(unsigned index, u32 set, u32 clear);
 
 extern unsigned scx200_gpio_base;
 extern long scx200_gpio_shadow[2];
@@ -17,7 +17,7 @@ extern long scx200_gpio_shadow[2];
 
 /* returns the value of the GPIO pin */
 
-static inline int scx200_gpio_get(int index) {
+static inline int scx200_gpio_get(unsigned index) {
 	__SCx200_GPIO_BANK;
 	__SCx200_GPIO_IOADDR + 0x04;
 	__SCx200_GPIO_INDEX;
@@ -29,7 +29,7 @@ static inline int scx200_gpio_get(int index) {
    driven if the GPIO is configured as an output, it might not be the
    state of the GPIO right now if the GPIO is configured as an input) */
 
-static inline int scx200_gpio_current(int index) {
+static inline int scx200_gpio_current(unsigned index) {
         __SCx200_GPIO_BANK;
 	__SCx200_GPIO_INDEX;
 		
@@ -38,7 +38,7 @@ static inline int scx200_gpio_current(int index) {
 
 /* drive the GPIO signal high */
 
-static inline void scx200_gpio_set_high(int index) {
+static inline void scx200_gpio_set_high(unsigned index) {
 	__SCx200_GPIO_BANK;
 	__SCx200_GPIO_IOADDR;
 	__SCx200_GPIO_SHADOW;
@@ -49,7 +49,7 @@ static inline void scx200_gpio_set_high(int index) {
 
 /* drive the GPIO signal low */
 
-static inline void scx200_gpio_set_low(int index) {
+static inline void scx200_gpio_set_low(unsigned index) {
 	__SCx200_GPIO_BANK;
 	__SCx200_GPIO_IOADDR;
 	__SCx200_GPIO_SHADOW;
@@ -60,7 +60,7 @@ static inline void scx200_gpio_set_low(int index) {
 
 /* drive the GPIO signal to state */
 
-static inline void scx200_gpio_set(int index, int state) {
+static inline void scx200_gpio_set(unsigned index, int state) {
 	__SCx200_GPIO_BANK;
 	__SCx200_GPIO_IOADDR;
 	__SCx200_GPIO_SHADOW;
@@ -73,7 +73,7 @@ static inline void scx200_gpio_set(int index, int state) {
 }
 
 /* toggle the GPIO signal */
-static inline void scx200_gpio_change(int index) {
+static inline void scx200_gpio_change(unsigned index) {
 	__SCx200_GPIO_BANK;
 	__SCx200_GPIO_IOADDR;
 	__SCx200_GPIO_SHADOW;
-- 
cgit v1.2.3


From fe3a168a2ce1c93837cdf8fe27a3e67795155f90 Mon Sep 17 00:00:00 2001
From: Jim Cromie <jim.cromie@gmail.com>
Date: Tue, 27 Jun 2006 02:54:18 -0700
Subject: [PATCH] chardev: GPIO for SCx200 & PC-8736x: add gpio-ops vtable

Abstract the gpio operations into a new nsc_gpio_ops vtable.

Signed-off-by: Jim Cromie <jim.cromie@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/char/scx200_gpio.c | 13 +++++++++++++
 include/linux/nsc_gpio.h   | 33 +++++++++++++++++++++++++++++++++
 2 files changed, 46 insertions(+)
 create mode 100644 include/linux/nsc_gpio.h

(limited to 'include/linux')

diff --git a/drivers/char/scx200_gpio.c b/drivers/char/scx200_gpio.c
index a1a56c5c8a84..f9994ed01816 100644
--- a/drivers/char/scx200_gpio.c
+++ b/drivers/char/scx200_gpio.c
@@ -20,6 +20,7 @@
 #include <linux/cdev.h>
 
 #include <linux/scx200_gpio.h>
+#include <linux/nsc_gpio.h>
 
 #define NAME "scx200_gpio"
 #define DEVNAME NAME
@@ -36,6 +37,18 @@ MODULE_PARM_DESC(major, "Major device number");
 
 extern void scx200_gpio_dump(unsigned index);
 
+struct nsc_gpio_ops scx200_access = {
+	.owner		= THIS_MODULE,
+	.gpio_config	= scx200_gpio_configure,
+	.gpio_dump	= scx200_gpio_dump,
+	.gpio_get	= scx200_gpio_get,
+	.gpio_set	= scx200_gpio_set,
+	.gpio_set_high	= scx200_gpio_set_high,
+	.gpio_set_low	= scx200_gpio_set_low,
+	.gpio_change	= scx200_gpio_change,
+	.gpio_current	= scx200_gpio_current
+};
+
 static ssize_t scx200_gpio_write(struct file *file, const char __user *data,
 				 size_t len, loff_t *ppos)
 {
diff --git a/include/linux/nsc_gpio.h b/include/linux/nsc_gpio.h
new file mode 100644
index 000000000000..3ad8ae9dcb5a
--- /dev/null
+++ b/include/linux/nsc_gpio.h
@@ -0,0 +1,33 @@
+/**
+   nsc_gpio.c
+
+   National Semiconductor GPIO common access methods.
+
+   struct nsc_gpio_ops abstracts the low-level access
+   operations for the GPIO units on 2 NSC chip families; the GEODE
+   integrated CPU, and the PC-8736[03456] integrated PC-peripheral
+   chips.
+
+   The GPIO units on these chips have the same pin architecture, but
+   the access methods differ.  Thus, scx200_gpio and pc8736x_gpio
+   implement their own versions of these routines; and use the common
+   file-operations routines implemented in nsc_gpio module.
+
+   Copyright (c) 2005 Jim Cromie <jim.cromie@gmail.com>
+
+   NB: this work was tested on the Geode SC-1100 and PC-87366 chips.
+   NSC sold the GEODE line to AMD, and the PC-8736x line to Winbond.
+*/
+
+struct nsc_gpio_ops {
+	struct module*	owner;
+	u32	(*gpio_config)	(unsigned iminor, u32 mask, u32 bits);
+	void	(*gpio_dump)	(unsigned iminor);
+	int	(*gpio_get)	(unsigned iminor);
+	void	(*gpio_set)	(unsigned iminor, int state);
+	void	(*gpio_set_high)(unsigned iminor);
+	void	(*gpio_set_low)	(unsigned iminor);
+	void	(*gpio_change)	(unsigned iminor);
+	int	(*gpio_current)	(unsigned iminor);
+};
+
-- 
cgit v1.2.3


From 1a66fdf083bf2b60c4d12feb970bc7373b59e33a Mon Sep 17 00:00:00 2001
From: Jim Cromie <jim.cromie@gmail.com>
Date: Tue, 27 Jun 2006 02:54:20 -0700
Subject: [PATCH] chardev: GPIO for SCx200 & PC-8736x: migrate file-ops to
 common module

Now that the read(), write() file-ops are dispatching gpio-ops via the vtable,
they are generic, and can be moved 'verbatim' to the nsc_gpio common-support
module.  After the move, various symbols are renamed to update 'scx200_' to
'nsc_', and headers are adjusted accordingly.

Signed-off-by: Jim Cromie <jim.cromie@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/char/nsc_gpio.c    | 97 +++++++++++++++++++++++++++++++++++++++++-----
 drivers/char/scx200_gpio.c | 88 ++++-------------------------------------
 include/linux/nsc_gpio.h   |  5 +++
 3 files changed, 100 insertions(+), 90 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/char/nsc_gpio.c b/drivers/char/nsc_gpio.c
index 42d95904967c..3842c2727118 100644
--- a/drivers/char/nsc_gpio.c
+++ b/drivers/char/nsc_gpio.c
@@ -19,9 +19,89 @@
 
 #define NAME "nsc_gpio"
 
-MODULE_AUTHOR("Jim Cromie <jim.cromie@gmail.com>");
-MODULE_DESCRIPTION("NatSemi GPIO Common Methods");
-MODULE_LICENSE("GPL");
+ssize_t nsc_gpio_write(struct file *file, const char __user *data,
+		       size_t len, loff_t *ppos)
+{
+	unsigned m = iminor(file->f_dentry->d_inode);
+	struct nsc_gpio_ops *amp = file->private_data;
+	size_t i;
+	int err = 0;
+
+	for (i = 0; i < len; ++i) {
+		char c;
+		if (get_user(c, data + i))
+			return -EFAULT;
+		switch (c) {
+		case '0':
+			amp->gpio_set(m, 0);
+			break;
+		case '1':
+			amp->gpio_set(m, 1);
+			break;
+		case 'O':
+			printk(KERN_INFO NAME ": GPIO%d output enabled\n", m);
+			amp->gpio_config(m, ~1, 1);
+			break;
+		case 'o':
+			printk(KERN_INFO NAME ": GPIO%d output disabled\n", m);
+			amp->gpio_config(m, ~1, 0);
+			break;
+		case 'T':
+			printk(KERN_INFO NAME ": GPIO%d output is push pull\n",
+			       m);
+			amp->gpio_config(m, ~2, 2);
+			break;
+		case 't':
+			printk(KERN_INFO NAME ": GPIO%d output is open drain\n",
+			       m);
+			amp->gpio_config(m, ~2, 0);
+			break;
+		case 'P':
+			printk(KERN_INFO NAME ": GPIO%d pull up enabled\n", m);
+			amp->gpio_config(m, ~4, 4);
+			break;
+		case 'p':
+			printk(KERN_INFO NAME ": GPIO%d pull up disabled\n", m);
+			amp->gpio_config(m, ~4, 0);
+			break;
+
+		case 'v':
+			/* View Current pin settings */
+			amp->gpio_dump(m);
+			break;
+		case '\n':
+			/* end of settings string, do nothing */
+			break;
+		default:
+			printk(KERN_ERR NAME
+			       ": GPIO-%2d bad setting: chr<0x%2x>\n", m,
+			       (int)c);
+			err++;
+		}
+	}
+	if (err)
+		return -EINVAL;	/* full string handled, report error */
+
+	return len;
+}
+
+ssize_t nsc_gpio_read(struct file *file, char __user * buf,
+		      size_t len, loff_t * ppos)
+{
+	unsigned m = iminor(file->f_dentry->d_inode);
+	int value;
+	struct nsc_gpio_ops *amp = file->private_data;
+
+	value = amp->gpio_get(m);
+	if (put_user(value ? '1' : '0', buf))
+		return -EFAULT;
+
+	return 1;
+}
+
+/* common routines for both scx200_gpio and pc87360_gpio */
+EXPORT_SYMBOL(nsc_gpio_write);
+EXPORT_SYMBOL(nsc_gpio_read);
 
 static int __init nsc_gpio_init(void)
 {
@@ -34,12 +114,9 @@ static void __exit nsc_gpio_cleanup(void)
 	printk(KERN_DEBUG NAME " cleanup\n");
 }
 
-/* prepare for
-   common routines for both scx200_gpio and pc87360_gpio
-EXPORT_SYMBOL(scx200_gpio_write);
-EXPORT_SYMBOL(scx200_gpio_read);
-EXPORT_SYMBOL(scx200_gpio_release);
-*/
-
 module_init(nsc_gpio_init);
 module_exit(nsc_gpio_cleanup);
+
+MODULE_AUTHOR("Jim Cromie <jim.cromie@gmail.com>");
+MODULE_DESCRIPTION("NatSemi GPIO Common Methods");
+MODULE_LICENSE("GPL");
diff --git a/drivers/char/scx200_gpio.c b/drivers/char/scx200_gpio.c
index 15dfb95ebc7e..eb9a84777598 100644
--- a/drivers/char/scx200_gpio.c
+++ b/drivers/char/scx200_gpio.c
@@ -37,6 +37,12 @@ MODULE_PARM_DESC(major, "Major device number");
 
 extern void scx200_gpio_dump(unsigned index);
 
+extern ssize_t nsc_gpio_write(struct file *file, const char __user *data,
+			      size_t len, loff_t *ppos);
+
+extern ssize_t nsc_gpio_read(struct file *file, char __user *buf,
+			     size_t len, loff_t *ppos);
+
 struct nsc_gpio_ops scx200_access = {
 	.owner		= THIS_MODULE,
 	.gpio_config	= scx200_gpio_configure,
@@ -49,84 +55,6 @@ struct nsc_gpio_ops scx200_access = {
 	.gpio_current	= scx200_gpio_current
 };
 
-static ssize_t scx200_gpio_write(struct file *file, const char __user *data,
-				 size_t len, loff_t *ppos)
-{
-	unsigned m = iminor(file->f_dentry->d_inode);
-	struct nsc_gpio_ops *amp = file->private_data;
-	size_t i;
-	int err = 0;
-
-	for (i = 0; i < len; ++i) {
-		char c;
-		if (get_user(c, data + i))
-			return -EFAULT;
-		switch (c) {
-		case '0':
-			amp->gpio_set(m, 0);
-			break;
-		case '1':
-			amp->gpio_set(m, 1);
-			break;
-		case 'O':
-			printk(KERN_INFO NAME ": GPIO%d output enabled\n", m);
-			amp->gpio_config(m, ~1, 1);
-			break;
-		case 'o':
-			printk(KERN_INFO NAME ": GPIO%d output disabled\n", m);
-			amp->gpio_config(m, ~1, 0);
-			break;
-		case 'T':
-			printk(KERN_INFO NAME ": GPIO%d output is push pull\n", m);
-			amp->gpio_config(m, ~2, 2);
-			break;
-		case 't':
-			printk(KERN_INFO NAME ": GPIO%d output is open drain\n", m);
-			amp->gpio_config(m, ~2, 0);
-			break;
-		case 'P':
-			printk(KERN_INFO NAME ": GPIO%d pull up enabled\n", m);
-			amp->gpio_config(m, ~4, 4);
-			break;
-		case 'p':
-			printk(KERN_INFO NAME ": GPIO%d pull up disabled\n", m);
-			amp->gpio_config(m, ~4, 0);
-			break;
-
-		case 'v':
-			/* View Current pin settings */
-			amp->gpio_dump(m);
-			break;
-		case '\n':
-			/* end of settings string, do nothing */
-			break;
-		default:
-			printk(KERN_ERR NAME
-			       ": GPIO-%2d bad setting: chr<0x%2x>\n", m,
-			       (int)c);
-			err++;
-		}
-	}
-	if (err)
-		return -EINVAL;	/* full string handled, report error */
-
-	return len;
-}
-
-static ssize_t scx200_gpio_read(struct file *file, char __user *buf,
-				size_t len, loff_t *ppos)
-{
-	unsigned m = iminor(file->f_dentry->d_inode);
-	int value;
-	struct nsc_gpio_ops *amp = file->private_data;
-
-	value = amp->gpio_get(m);
-	if (put_user(value ? '1' : '0', buf))
-		return -EFAULT;
-
-	return 1;
-}
-
 static int scx200_gpio_open(struct inode *inode, struct file *file)
 {
 	unsigned m = iminor(inode);
@@ -145,8 +73,8 @@ static int scx200_gpio_release(struct inode *inode, struct file *file)
 
 static struct file_operations scx200_gpio_fops = {
 	.owner   = THIS_MODULE,
-	.write   = scx200_gpio_write,
-	.read    = scx200_gpio_read,
+	.write   = nsc_gpio_write,
+	.read    = nsc_gpio_read,
 	.open    = scx200_gpio_open,
 	.release = scx200_gpio_release,
 };
diff --git a/include/linux/nsc_gpio.h b/include/linux/nsc_gpio.h
index 3ad8ae9dcb5a..27bf66f73868 100644
--- a/include/linux/nsc_gpio.h
+++ b/include/linux/nsc_gpio.h
@@ -31,3 +31,8 @@ struct nsc_gpio_ops {
 	int	(*gpio_current)	(unsigned iminor);
 };
 
+extern ssize_t nsc_gpio_write(struct file *file, const char __user *data,
+			      size_t len, loff_t *ppos);
+
+extern ssize_t nsc_gpio_read(struct file *file, char __user *buf,
+			     size_t len, loff_t *ppos);
-- 
cgit v1.2.3


From f31000e573da052b6b8bcc21faff520b4e2eda7a Mon Sep 17 00:00:00 2001
From: Jim Cromie <jim.cromie@gmail.com>
Date: Tue, 27 Jun 2006 02:54:23 -0700
Subject: [PATCH] chardev: GPIO for SCx200 & PC-8736x: use dev_dbg in common
 module

Use of dev_dbg() and friends is considered good practice.  dev_dbg() needs a
struct device *devp, but nsc_gpio is only a helper module, so it doesnt
have/need its own.  To provide devp to the user-modules (scx200 & pc8736x
_gpio), we add it to the vtable, and set it during init.

Also squeeze nsc_gpio_dump()'s format a little.

[  199.259879]  pc8736x_gpio.0: io09: 0x0044 TS OD PUE  EDGE LO DEBOUNCE

Signed-off-by: Jim Cromie <jim.cromie@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/char/nsc_gpio.c     | 45 +++++++++++++++++++++++++--------------------
 drivers/char/pc8736x_gpio.c |  3 +--
 drivers/char/scx200_gpio.c  | 11 +++--------
 include/linux/nsc_gpio.h    |  6 +++++-
 4 files changed, 34 insertions(+), 31 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/char/nsc_gpio.c b/drivers/char/nsc_gpio.c
index 929d68486c1a..d0b5d65a73fb 100644
--- a/drivers/char/nsc_gpio.c
+++ b/drivers/char/nsc_gpio.c
@@ -14,22 +14,27 @@
 #include <linux/kernel.h>
 #include <linux/init.h>
 #include <linux/nsc_gpio.h>
+#include <linux/platform_device.h>
 #include <asm/uaccess.h>
 #include <asm/io.h>
 
 #define NAME "nsc_gpio"
 
-void nsc_gpio_dump(unsigned index, u32 config)
+void nsc_gpio_dump(struct nsc_gpio_ops *amp, unsigned index)
 {
-	printk(KERN_INFO NAME ": GPIO-%02u: 0x%08lx %s %s %s %s %s %s %s\n",
-	       index, (unsigned long)config,
-	       (config & 1) ? "OE" : "TS",      /* output-enabled/tristate */
-	       (config & 2) ? "PP" : "OD",      /* push pull / open drain */
-	       (config & 4) ? "PUE" : "PUD",    /* pull up enabled/disabled */
-	       (config & 8) ? "LOCKED" : "",    /* locked / unlocked */
-	       (config & 16) ? "LEVEL" : "EDGE",/* level/edge input */
-	       (config & 32) ? "HI" : "LO",     /* trigger on rise/fall edge */
-	       (config & 64) ? "DEBOUNCE" : "");        /* debounce */
+	/* retrieve current config w/o changing it */
+	u32 config = amp->gpio_config(index, ~0, 0);
+
+	/* user requested via 'v' command, so its INFO */
+	dev_info(amp->dev, "io%02u: 0x%04x %s %s %s %s %s %s %s\n",
+		 index, config,
+		 (config & 1) ? "OE" : "TS",      /* output-enabled/tristate */
+		 (config & 2) ? "PP" : "OD",      /* push pull / open drain */
+		 (config & 4) ? "PUE" : "PUD",    /* pull up enabled/disabled */
+		 (config & 8) ? "LOCKED" : "",    /* locked / unlocked */
+		 (config & 16) ? "LEVEL" : "EDGE",/* level/edge input */
+		 (config & 32) ? "HI" : "LO",     /* trigger on rise/fall edge */
+		 (config & 64) ? "DEBOUNCE" : "");        /* debounce */
 }
 
 ssize_t nsc_gpio_write(struct file *file, const char __user *data,
@@ -37,6 +42,7 @@ ssize_t nsc_gpio_write(struct file *file, const char __user *data,
 {
 	unsigned m = iminor(file->f_dentry->d_inode);
 	struct nsc_gpio_ops *amp = file->private_data;
+	struct device *dev = amp->dev;
 	size_t i;
 	int err = 0;
 
@@ -52,42 +58,41 @@ ssize_t nsc_gpio_write(struct file *file, const char __user *data,
 			amp->gpio_set(m, 1);
 			break;
 		case 'O':
-			printk(KERN_INFO NAME ": GPIO%d output enabled\n", m);
+			dev_dbg(dev, "GPIO%d output enabled\n", m);
 			amp->gpio_config(m, ~1, 1);
 			break;
 		case 'o':
-			printk(KERN_INFO NAME ": GPIO%d output disabled\n", m);
+			dev_dbg(dev, "GPIO%d output disabled\n", m);
 			amp->gpio_config(m, ~1, 0);
 			break;
 		case 'T':
-			printk(KERN_INFO NAME ": GPIO%d output is push pull\n",
+			dev_dbg(dev, "GPIO%d output is push pull\n",
 			       m);
 			amp->gpio_config(m, ~2, 2);
 			break;
 		case 't':
-			printk(KERN_INFO NAME ": GPIO%d output is open drain\n",
+			dev_dbg(dev, "GPIO%d output is open drain\n",
 			       m);
 			amp->gpio_config(m, ~2, 0);
 			break;
 		case 'P':
-			printk(KERN_INFO NAME ": GPIO%d pull up enabled\n", m);
+			dev_dbg(dev, "GPIO%d pull up enabled\n", m);
 			amp->gpio_config(m, ~4, 4);
 			break;
 		case 'p':
-			printk(KERN_INFO NAME ": GPIO%d pull up disabled\n", m);
+			dev_dbg(dev, "GPIO%d pull up disabled\n", m);
 			amp->gpio_config(m, ~4, 0);
 			break;
 		case 'v':
 			/* View Current pin settings */
-			amp->gpio_dump(m);
+			amp->gpio_dump(amp, m);
 			break;
 		case '\n':
 			/* end of settings string, do nothing */
 			break;
 		default:
-			printk(KERN_ERR NAME
-			       ": GPIO-%2d bad setting: chr<0x%2x>\n", m,
-			       (int)c);
+			dev_err(dev, "io%2d bad setting: chr<0x%2x>\n",
+				m, (int)c);
 			err++;
 		}
 	}
diff --git a/drivers/char/pc8736x_gpio.c b/drivers/char/pc8736x_gpio.c
index b16fbef816c2..48ff1fc8b06d 100644
--- a/drivers/char/pc8736x_gpio.c
+++ b/drivers/char/pc8736x_gpio.c
@@ -207,8 +207,6 @@ static void pc8736x_gpio_change(unsigned index)
 	pc8736x_gpio_set(index, !pc8736x_gpio_get(index));
 }
 
-extern void nsc_gpio_dump(unsigned iminor);
-
 static struct nsc_gpio_ops pc8736x_access = {
 	.owner		= THIS_MODULE,
 	.gpio_config	= pc8736x_gpio_configure,
@@ -260,6 +258,7 @@ static int __init pc8736x_gpio_init(void)
 		dev_err(&pdev->dev, "no device found\n");
 		goto undo_platform_dev_add;
 	}
+	pc8736x_access.dev = &pdev->dev;
 
 	/* Verify that chip and it's GPIO unit are both enabled.
 	   My BIOS does this, so I take minimum action here
diff --git a/drivers/char/scx200_gpio.c b/drivers/char/scx200_gpio.c
index 442367b3f5dc..5a280a330401 100644
--- a/drivers/char/scx200_gpio.c
+++ b/drivers/char/scx200_gpio.c
@@ -35,14 +35,6 @@ static int major = 0;		/* default to dynamic major */
 module_param(major, int, 0);
 MODULE_PARM_DESC(major, "Major device number");
 
-extern void nsc_gpio_dump(unsigned index);
-
-extern ssize_t nsc_gpio_write(struct file *file, const char __user *data,
-			      size_t len, loff_t *ppos);
-
-extern ssize_t nsc_gpio_read(struct file *file, char __user *buf,
-			     size_t len, loff_t *ppos);
-
 struct nsc_gpio_ops scx200_access = {
 	.owner		= THIS_MODULE,
 	.gpio_config	= scx200_gpio_configure,
@@ -101,6 +93,9 @@ static int __init scx200_gpio_init(void)
 	if (rc)
 		goto undo_malloc;
 
+	/* nsc_gpio uses dev_dbg(), so needs this */
+	scx200_access.dev = &pdev->dev;
+
 	if (major)
 		rc = register_chrdev_region(dev, num_pins, "scx200_gpio");
 	else {
diff --git a/include/linux/nsc_gpio.h b/include/linux/nsc_gpio.h
index 27bf66f73868..135742cfada5 100644
--- a/include/linux/nsc_gpio.h
+++ b/include/linux/nsc_gpio.h
@@ -22,13 +22,14 @@
 struct nsc_gpio_ops {
 	struct module*	owner;
 	u32	(*gpio_config)	(unsigned iminor, u32 mask, u32 bits);
-	void	(*gpio_dump)	(unsigned iminor);
+	void	(*gpio_dump)	(struct nsc_gpio_ops *amp, unsigned iminor);
 	int	(*gpio_get)	(unsigned iminor);
 	void	(*gpio_set)	(unsigned iminor, int state);
 	void	(*gpio_set_high)(unsigned iminor);
 	void	(*gpio_set_low)	(unsigned iminor);
 	void	(*gpio_change)	(unsigned iminor);
 	int	(*gpio_current)	(unsigned iminor);
+	struct device*	dev;	/* for dev_dbg() support, set in init  */
 };
 
 extern ssize_t nsc_gpio_write(struct file *file, const char __user *data,
@@ -36,3 +37,6 @@ extern ssize_t nsc_gpio_write(struct file *file, const char __user *data,
 
 extern ssize_t nsc_gpio_read(struct file *file, char __user *buf,
 			     size_t len, loff_t *ppos);
+
+extern void nsc_gpio_dump(struct nsc_gpio_ops *amp, unsigned index);
+
-- 
cgit v1.2.3


From 2dd73a4f09beacadde827a032cf15fd8b1fa3d48 Mon Sep 17 00:00:00 2001
From: Peter Williams <pwil3058@bigpond.net.au>
Date: Tue, 27 Jun 2006 02:54:34 -0700
Subject: [PATCH] sched: implement smpnice

Problem:

The introduction of separate run queues per CPU has brought with it "nice"
enforcement problems that are best described by a simple example.

For the sake of argument suppose that on a single CPU machine with a
nice==19 hard spinner and a nice==0 hard spinner running that the nice==0
task gets 95% of the CPU and the nice==19 task gets 5% of the CPU.  Now
suppose that there is a system with 2 CPUs and 2 nice==19 hard spinners and
2 nice==0 hard spinners running.  The user of this system would be entitled
to expect that the nice==0 tasks each get 95% of a CPU and the nice==19
tasks only get 5% each.  However, whether this expectation is met is pretty
much down to luck as there are four equally likely distributions of the
tasks to the CPUs that the load balancing code will consider to be balanced
with loads of 2.0 for each CPU.  Two of these distributions involve one
nice==0 and one nice==19 task per CPU and in these circumstances the users
expectations will be met.  The other two distributions both involve both
nice==0 tasks being on one CPU and both nice==19 being on the other CPU and
each task will get 50% of a CPU and the user's expectations will not be
met.

Solution:

The solution to this problem that is implemented in the attached patch is
to use weighted loads when determining if the system is balanced and, when
an imbalance is detected, to move an amount of weighted load between run
queues (as opposed to a number of tasks) to restore the balance.  Once
again, the easiest way to explain why both of these measures are necessary
is to use a simple example.  Suppose that (in a slight variation of the
above example) that we have a two CPU system with 4 nice==0 and 4 nice=19
hard spinning tasks running and that the 4 nice==0 tasks are on one CPU and
the 4 nice==19 tasks are on the other CPU.  The weighted loads for the two
CPUs would be 4.0 and 0.2 respectively and the load balancing code would
move 2 tasks resulting in one CPU with a load of 2.0 and the other with
load of 2.2.  If this was considered to be a big enough imbalance to
justify moving a task and that task was moved using the current
move_tasks() then it would move the highest priority task that it found and
this would result in one CPU with a load of 3.0 and the other with a load
of 1.2 which would result in the movement of a task in the opposite
direction and so on -- infinite loop.  If, on the other hand, an amount of
load to be moved is calculated from the imbalance (in this case 0.1) and
move_tasks() skips tasks until it find ones whose contributions to the
weighted load are less than this amount it would move two of the nice==19
tasks resulting in a system with 2 nice==0 and 2 nice=19 on each CPU with
loads of 2.1 for each CPU.

One of the advantages of this mechanism is that on a system where all tasks
have nice==0 the load balancing calculations would be mathematically
identical to the current load balancing code.

Notes:

struct task_struct:

has a new field load_weight which (in a trade off of space for speed)
stores the contribution that this task makes to a CPU's weighted load when
it is runnable.

struct runqueue:

has a new field raw_weighted_load which is the sum of the load_weight
values for the currently runnable tasks on this run queue.  This field
always needs to be updated when nr_running is updated so two new inline
functions inc_nr_running() and dec_nr_running() have been created to make
sure that this happens.  This also offers a convenient way to optimize away
this part of the smpnice mechanism when CONFIG_SMP is not defined.

int try_to_wake_up():

in this function the value SCHED_LOAD_BALANCE is used to represent the load
contribution of a single task in various calculations in the code that
decides which CPU to put the waking task on.  While this would be a valid
on a system where the nice values for the runnable tasks were distributed
evenly around zero it will lead to anomalous load balancing if the
distribution is skewed in either direction.  To overcome this problem
SCHED_LOAD_SCALE has been replaced by the load_weight for the relevant task
or by the average load_weight per task for the queue in question (as
appropriate).

int move_tasks():

The modifications to this function were complicated by the fact that
active_load_balance() uses it to move exactly one task without checking
whether an imbalance actually exists.  This precluded the simple
overloading of max_nr_move with max_load_move and necessitated the addition
of the latter as an extra argument to the function.  The internal
implementation is then modified to move up to max_nr_move tasks and
max_load_move of weighted load.  This slightly complicates the code where
move_tasks() is called and if ever active_load_balance() is changed to not
use move_tasks() the implementation of move_tasks() should be simplified
accordingly.

struct sched_group *find_busiest_group():

Similar to try_to_wake_up(), there are places in this function where
SCHED_LOAD_SCALE is used to represent the load contribution of a single
task and the same issues are created.  A similar solution is adopted except
that it is now the average per task contribution to a group's load (as
opposed to a run queue) that is required.  As this value is not directly
available from the group it is calculated on the fly as the queues in the
groups are visited when determining the busiest group.

A key change to this function is that it is no longer to scale down
*imbalance on exit as move_tasks() uses the load in its scaled form.

void set_user_nice():

has been modified to update the task's load_weight field when it's nice
value and also to ensure that its run queue's raw_weighted_load field is
updated if it was runnable.

From: "Siddha, Suresh B" <suresh.b.siddha@intel.com>

With smpnice, sched groups with highest priority tasks can mask the imbalance
between the other sched groups with in the same domain.  This patch fixes some
of the listed down scenarios by not considering the sched groups which are
lightly loaded.

a) on a simple 4-way MP system, if we have one high priority and 4 normal
   priority tasks, with smpnice we would like to see the high priority task
   scheduled on one cpu, two other cpus getting one normal task each and the
   fourth cpu getting the remaining two normal tasks.  but with current
   smpnice extra normal priority task keeps jumping from one cpu to another
   cpu having the normal priority task.  This is because of the
   busiest_has_loaded_cpus, nr_loaded_cpus logic..  We are not including the
   cpu with high priority task in max_load calculations but including that in
   total and avg_load calcuations..  leading to max_load < avg_load and load
   balance between cpus running normal priority tasks(2 Vs 1) will always show
   imbalanace as one normal priority and the extra normal priority task will
   keep moving from one cpu to another cpu having normal priority task..

b) 4-way system with HT (8 logical processors).  Package-P0 T0 has a
   highest priority task, T1 is idle.  Package-P1 Both T0 and T1 have 1 normal
   priority task each..  P2 and P3 are idle.  With this patch, one of the
   normal priority tasks on P1 will be moved to P2 or P3..

c) With the current weighted smp nice calculations, it doesn't always make
   sense to look at the highest weighted runqueue in the busy group..
   Consider a load balance scenario on a DP with HT system, with Package-0
   containing one high priority and one low priority, Package-1 containing one
   low priority(with other thread being idle)..  Package-1 thinks that it need
   to take the low priority thread from Package-0.  And find_busiest_queue()
   returns the cpu thread with highest priority task..  And ultimately(with
   help of active load balance) we move high priority task to Package-1.  And
   same continues with Package-0 now, moving high priority task from package-1
   to package-0..  Even without the presence of active load balance, load
   balance will fail to balance the above scenario..  Fix find_busiest_queue
   to use "imbalance" when it is lightly loaded.

[kernel@kolivas.org: sched: store weighted load on up]
[kernel@kolivas.org: sched: add discrete weighted cpu load function]
[suresh.b.siddha@intel.com: sched: remove dead code]
Signed-off-by: Peter Williams <pwil3058@bigpond.com.au>
Cc: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
Cc: "Chen, Kenneth W" <kenneth.w.chen@intel.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Con Kolivas <kernel@kolivas.org>
Cc: John Hawkes <hawkes@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/sched.h |   8 +-
 kernel/sched.c        | 284 ++++++++++++++++++++++++++++++++++++++------------
 2 files changed, 225 insertions(+), 67 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 122a25c1b997..74a1e39e0d3d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -123,6 +123,7 @@ extern unsigned long nr_running(void);
 extern unsigned long nr_uninterruptible(void);
 extern unsigned long nr_active(void);
 extern unsigned long nr_iowait(void);
+extern unsigned long weighted_cpuload(const int cpu);
 
 
 /*
@@ -558,9 +559,9 @@ enum idle_type
 /*
  * sched-domains (multiprocessor balancing) declarations:
  */
-#ifdef CONFIG_SMP
 #define SCHED_LOAD_SCALE	128UL	/* increase resolution of load */
 
+#ifdef CONFIG_SMP
 #define SD_LOAD_BALANCE		1	/* Do load balancing on this domain. */
 #define SD_BALANCE_NEWIDLE	2	/* Balance when about to become idle */
 #define SD_BALANCE_EXEC		4	/* Balance on exec */
@@ -713,9 +714,12 @@ struct task_struct {
 
 	int lock_depth;		/* BKL lock depth */
 
-#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
+#ifdef CONFIG_SMP
+#ifdef __ARCH_WANT_UNLOCKED_CTXSW
 	int oncpu;
 #endif
+#endif
+	int load_weight;	/* for niceness load balancing purposes */
 	int prio, static_prio;
 	struct list_head run_list;
 	prio_array_t *array;
diff --git a/kernel/sched.c b/kernel/sched.c
index 678335a8b390..1847a4456a2d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -168,15 +168,21 @@
  */
 
 #define SCALE_PRIO(x, prio) \
-	max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO/2), MIN_TIMESLICE)
+	max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)
 
-static unsigned int task_timeslice(task_t *p)
+static unsigned int static_prio_timeslice(int static_prio)
 {
-	if (p->static_prio < NICE_TO_PRIO(0))
-		return SCALE_PRIO(DEF_TIMESLICE*4, p->static_prio);
+	if (static_prio < NICE_TO_PRIO(0))
+		return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio);
 	else
-		return SCALE_PRIO(DEF_TIMESLICE, p->static_prio);
+		return SCALE_PRIO(DEF_TIMESLICE, static_prio);
 }
+
+static inline unsigned int task_timeslice(task_t *p)
+{
+	return static_prio_timeslice(p->static_prio);
+}
+
 #define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran)	\
 				< (long long) (sd)->cache_hot_time)
 
@@ -207,6 +213,7 @@ struct runqueue {
 	 * remote CPUs use both these fields when doing load calculation.
 	 */
 	unsigned long nr_running;
+	unsigned long raw_weighted_load;
 #ifdef CONFIG_SMP
 	unsigned long cpu_load[3];
 #endif
@@ -661,6 +668,68 @@ static int effective_prio(task_t *p)
 	return prio;
 }
 
+/*
+ * To aid in avoiding the subversion of "niceness" due to uneven distribution
+ * of tasks with abnormal "nice" values across CPUs the contribution that
+ * each task makes to its run queue's load is weighted according to its
+ * scheduling class and "nice" value.  For SCHED_NORMAL tasks this is just a
+ * scaled version of the new time slice allocation that they receive on time
+ * slice expiry etc.
+ */
+
+/*
+ * Assume: static_prio_timeslice(NICE_TO_PRIO(0)) == DEF_TIMESLICE
+ * If static_prio_timeslice() is ever changed to break this assumption then
+ * this code will need modification
+ */
+#define TIME_SLICE_NICE_ZERO DEF_TIMESLICE
+#define LOAD_WEIGHT(lp) \
+	(((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO)
+#define PRIO_TO_LOAD_WEIGHT(prio) \
+	LOAD_WEIGHT(static_prio_timeslice(prio))
+#define RTPRIO_TO_LOAD_WEIGHT(rp) \
+	(PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + LOAD_WEIGHT(rp))
+
+static void set_load_weight(task_t *p)
+{
+	if (rt_task(p)) {
+#ifdef CONFIG_SMP
+		if (p == task_rq(p)->migration_thread)
+			/*
+			 * The migration thread does the actual balancing.
+			 * Giving its load any weight will skew balancing
+			 * adversely.
+			 */
+			p->load_weight = 0;
+		else
+#endif
+			p->load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority);
+	} else
+		p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio);
+}
+
+static inline void inc_raw_weighted_load(runqueue_t *rq, const task_t *p)
+{
+	rq->raw_weighted_load += p->load_weight;
+}
+
+static inline void dec_raw_weighted_load(runqueue_t *rq, const task_t *p)
+{
+	rq->raw_weighted_load -= p->load_weight;
+}
+
+static inline void inc_nr_running(task_t *p, runqueue_t *rq)
+{
+	rq->nr_running++;
+	inc_raw_weighted_load(rq, p);
+}
+
+static inline void dec_nr_running(task_t *p, runqueue_t *rq)
+{
+	rq->nr_running--;
+	dec_raw_weighted_load(rq, p);
+}
+
 /*
  * __activate_task - move a task to the runqueue.
  */
@@ -671,7 +740,7 @@ static void __activate_task(task_t *p, runqueue_t *rq)
 	if (batch_task(p))
 		target = rq->expired;
 	enqueue_task(p, target);
-	rq->nr_running++;
+	inc_nr_running(p, rq);
 }
 
 /*
@@ -680,7 +749,7 @@ static void __activate_task(task_t *p, runqueue_t *rq)
 static inline void __activate_idle_task(task_t *p, runqueue_t *rq)
 {
 	enqueue_task_head(p, rq->active);
-	rq->nr_running++;
+	inc_nr_running(p, rq);
 }
 
 static int recalc_task_prio(task_t *p, unsigned long long now)
@@ -804,7 +873,7 @@ static void activate_task(task_t *p, runqueue_t *rq, int local)
  */
 static void deactivate_task(struct task_struct *p, runqueue_t *rq)
 {
-	rq->nr_running--;
+	dec_nr_running(p, rq);
 	dequeue_task(p, p->array);
 	p->array = NULL;
 }
@@ -859,6 +928,12 @@ inline int task_curr(const task_t *p)
 	return cpu_curr(task_cpu(p)) == p;
 }
 
+/* Used instead of source_load when we know the type == 0 */
+unsigned long weighted_cpuload(const int cpu)
+{
+	return cpu_rq(cpu)->raw_weighted_load;
+}
+
 #ifdef CONFIG_SMP
 typedef struct {
 	struct list_head list;
@@ -948,7 +1023,8 @@ void kick_process(task_t *p)
 }
 
 /*
- * Return a low guess at the load of a migration-source cpu.
+ * Return a low guess at the load of a migration-source cpu weighted
+ * according to the scheduling class and "nice" value.
  *
  * We want to under-estimate the load of migration sources, to
  * balance conservatively.
@@ -956,24 +1032,36 @@ void kick_process(task_t *p)
 static inline unsigned long source_load(int cpu, int type)
 {
 	runqueue_t *rq = cpu_rq(cpu);
-	unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
+
 	if (type == 0)
-		return load_now;
+		return rq->raw_weighted_load;
 
-	return min(rq->cpu_load[type-1], load_now);
+	return min(rq->cpu_load[type-1], rq->raw_weighted_load);
 }
 
 /*
- * Return a high guess at the load of a migration-target cpu
+ * Return a high guess at the load of a migration-target cpu weighted
+ * according to the scheduling class and "nice" value.
  */
 static inline unsigned long target_load(int cpu, int type)
 {
 	runqueue_t *rq = cpu_rq(cpu);
-	unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
+
 	if (type == 0)
-		return load_now;
+		return rq->raw_weighted_load;
 
-	return max(rq->cpu_load[type-1], load_now);
+	return max(rq->cpu_load[type-1], rq->raw_weighted_load);
+}
+
+/*
+ * Return the average load per task on the cpu's run queue
+ */
+static inline unsigned long cpu_avg_load_per_task(int cpu)
+{
+	runqueue_t *rq = cpu_rq(cpu);
+	unsigned long n = rq->nr_running;
+
+	return n ?  rq->raw_weighted_load / n : SCHED_LOAD_SCALE;
 }
 
 /*
@@ -1046,7 +1134,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
 	cpus_and(tmp, group->cpumask, p->cpus_allowed);
 
 	for_each_cpu_mask(i, tmp) {
-		load = source_load(i, 0);
+		load = weighted_cpuload(i);
 
 		if (load < min_load || (load == min_load && i == this_cpu)) {
 			min_load = load;
@@ -1226,17 +1314,19 @@ static int try_to_wake_up(task_t *p, unsigned int state, int sync)
 
 		if (this_sd->flags & SD_WAKE_AFFINE) {
 			unsigned long tl = this_load;
+			unsigned long tl_per_task = cpu_avg_load_per_task(this_cpu);
+
 			/*
 			 * If sync wakeup then subtract the (maximum possible)
 			 * effect of the currently running task from the load
 			 * of the current CPU:
 			 */
 			if (sync)
-				tl -= SCHED_LOAD_SCALE;
+				tl -= current->load_weight;
 
 			if ((tl <= load &&
-				tl + target_load(cpu, idx) <= SCHED_LOAD_SCALE) ||
-				100*(tl + SCHED_LOAD_SCALE) <= imbalance*load) {
+				tl + target_load(cpu, idx) <= tl_per_task) ||
+				100*(tl + p->load_weight) <= imbalance*load) {
 				/*
 				 * This domain has SD_WAKE_AFFINE and
 				 * p is cache cold in this domain, and
@@ -1435,7 +1525,7 @@ void fastcall wake_up_new_task(task_t *p, unsigned long clone_flags)
 				list_add_tail(&p->run_list, &current->run_list);
 				p->array = current->array;
 				p->array->nr_active++;
-				rq->nr_running++;
+				inc_nr_running(p, rq);
 			}
 			set_need_resched();
 		} else
@@ -1802,9 +1892,9 @@ void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p,
 	       runqueue_t *this_rq, prio_array_t *this_array, int this_cpu)
 {
 	dequeue_task(p, src_array);
-	src_rq->nr_running--;
+	dec_nr_running(p, src_rq);
 	set_task_cpu(p, this_cpu);
-	this_rq->nr_running++;
+	inc_nr_running(p, this_rq);
 	enqueue_task(p, this_array);
 	p->timestamp = (p->timestamp - src_rq->timestamp_last_tick)
 				+ this_rq->timestamp_last_tick;
@@ -1852,24 +1942,27 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
 }
 
 /*
- * move_tasks tries to move up to max_nr_move tasks from busiest to this_rq,
- * as part of a balancing operation within "domain". Returns the number of
- * tasks moved.
+ * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted
+ * load from busiest to this_rq, as part of a balancing operation within
+ * "domain". Returns the number of tasks moved.
  *
  * Called with both runqueues locked.
  */
 static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest,
-		      unsigned long max_nr_move, struct sched_domain *sd,
-		      enum idle_type idle, int *all_pinned)
+		      unsigned long max_nr_move, unsigned long max_load_move,
+		      struct sched_domain *sd, enum idle_type idle,
+		      int *all_pinned)
 {
 	prio_array_t *array, *dst_array;
 	struct list_head *head, *curr;
 	int idx, pulled = 0, pinned = 0;
+	long rem_load_move;
 	task_t *tmp;
 
-	if (max_nr_move == 0)
+	if (max_nr_move == 0 || max_load_move == 0)
 		goto out;
 
+	rem_load_move = max_load_move;
 	pinned = 1;
 
 	/*
@@ -1910,7 +2003,8 @@ skip_queue:
 
 	curr = curr->prev;
 
-	if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) {
+	if (tmp->load_weight > rem_load_move ||
+	    !can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) {
 		if (curr != head)
 			goto skip_queue;
 		idx++;
@@ -1924,9 +2018,13 @@ skip_queue:
 
 	pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu);
 	pulled++;
+	rem_load_move -= tmp->load_weight;
 
-	/* We only want to steal up to the prescribed number of tasks. */
-	if (pulled < max_nr_move) {
+	/*
+	 * We only want to steal up to the prescribed number of tasks
+	 * and the prescribed amount of weighted load.
+	 */
+	if (pulled < max_nr_move && rem_load_move > 0) {
 		if (curr != head)
 			goto skip_queue;
 		idx++;
@@ -1947,7 +2045,7 @@ out:
 
 /*
  * find_busiest_group finds and returns the busiest CPU group within the
- * domain. It calculates and returns the number of tasks which should be
+ * domain. It calculates and returns the amount of weighted load which should be
  * moved to restore balance via the imbalance parameter.
  */
 static struct sched_group *
@@ -1957,9 +2055,13 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 	struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
 	unsigned long max_load, avg_load, total_load, this_load, total_pwr;
 	unsigned long max_pull;
+	unsigned long busiest_load_per_task, busiest_nr_running;
+	unsigned long this_load_per_task, this_nr_running;
 	int load_idx;
 
 	max_load = this_load = total_load = total_pwr = 0;
+	busiest_load_per_task = busiest_nr_running = 0;
+	this_load_per_task = this_nr_running = 0;
 	if (idle == NOT_IDLE)
 		load_idx = sd->busy_idx;
 	else if (idle == NEWLY_IDLE)
@@ -1971,13 +2073,16 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 		unsigned long load;
 		int local_group;
 		int i;
+		unsigned long sum_nr_running, sum_weighted_load;
 
 		local_group = cpu_isset(this_cpu, group->cpumask);
 
 		/* Tally up the load of all CPUs in the group */
-		avg_load = 0;
+		sum_weighted_load = sum_nr_running = avg_load = 0;
 
 		for_each_cpu_mask(i, group->cpumask) {
+			runqueue_t *rq = cpu_rq(i);
+
 			if (*sd_idle && !idle_cpu(i))
 				*sd_idle = 0;
 
@@ -1988,6 +2093,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 				load = source_load(i, load_idx);
 
 			avg_load += load;
+			sum_nr_running += rq->nr_running;
+			sum_weighted_load += rq->raw_weighted_load;
 		}
 
 		total_load += avg_load;
@@ -1999,14 +2106,19 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 		if (local_group) {
 			this_load = avg_load;
 			this = group;
-		} else if (avg_load > max_load) {
+			this_nr_running = sum_nr_running;
+			this_load_per_task = sum_weighted_load;
+		} else if (avg_load > max_load &&
+			   sum_nr_running > group->cpu_power / SCHED_LOAD_SCALE) {
 			max_load = avg_load;
 			busiest = group;
+			busiest_nr_running = sum_nr_running;
+			busiest_load_per_task = sum_weighted_load;
 		}
 		group = group->next;
 	} while (group != sd->groups);
 
-	if (!busiest || this_load >= max_load || max_load <= SCHED_LOAD_SCALE)
+	if (!busiest || this_load >= max_load || busiest_nr_running == 0)
 		goto out_balanced;
 
 	avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
@@ -2015,6 +2127,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 			100*max_load <= sd->imbalance_pct*this_load)
 		goto out_balanced;
 
+	busiest_load_per_task /= busiest_nr_running;
 	/*
 	 * We're trying to get all the cpus to the average_load, so we don't
 	 * want to push ourselves above the average load, nor do we wish to
@@ -2026,21 +2139,50 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 	 * by pulling tasks to us.  Be careful of negative numbers as they'll
 	 * appear as very large values with unsigned longs.
 	 */
+	if (max_load <= busiest_load_per_task)
+		goto out_balanced;
+
+	/*
+	 * In the presence of smp nice balancing, certain scenarios can have
+	 * max load less than avg load(as we skip the groups at or below
+	 * its cpu_power, while calculating max_load..)
+	 */
+	if (max_load < avg_load) {
+		*imbalance = 0;
+		goto small_imbalance;
+	}
 
 	/* Don't want to pull so many tasks that a group would go idle */
-	max_pull = min(max_load - avg_load, max_load - SCHED_LOAD_SCALE);
+	max_pull = min(max_load - avg_load, max_load - busiest_load_per_task);
 
 	/* How much load to actually move to equalise the imbalance */
 	*imbalance = min(max_pull * busiest->cpu_power,
 				(avg_load - this_load) * this->cpu_power)
 			/ SCHED_LOAD_SCALE;
 
-	if (*imbalance < SCHED_LOAD_SCALE) {
-		unsigned long pwr_now = 0, pwr_move = 0;
+	/*
+	 * if *imbalance is less than the average load per runnable task
+	 * there is no gaurantee that any tasks will be moved so we'll have
+	 * a think about bumping its value to force at least one task to be
+	 * moved
+	 */
+	if (*imbalance < busiest_load_per_task) {
+		unsigned long pwr_now, pwr_move;
 		unsigned long tmp;
+		unsigned int imbn;
+
+small_imbalance:
+		pwr_move = pwr_now = 0;
+		imbn = 2;
+		if (this_nr_running) {
+			this_load_per_task /= this_nr_running;
+			if (busiest_load_per_task > this_load_per_task)
+				imbn = 1;
+		} else
+			this_load_per_task = SCHED_LOAD_SCALE;
 
-		if (max_load - this_load >= SCHED_LOAD_SCALE*2) {
-			*imbalance = 1;
+		if (max_load - this_load >= busiest_load_per_task * imbn) {
+			*imbalance = busiest_load_per_task;
 			return busiest;
 		}
 
@@ -2050,35 +2192,34 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 		 * moving them.
 		 */
 
-		pwr_now += busiest->cpu_power*min(SCHED_LOAD_SCALE, max_load);
-		pwr_now += this->cpu_power*min(SCHED_LOAD_SCALE, this_load);
+		pwr_now += busiest->cpu_power *
+			min(busiest_load_per_task, max_load);
+		pwr_now += this->cpu_power *
+			min(this_load_per_task, this_load);
 		pwr_now /= SCHED_LOAD_SCALE;
 
 		/* Amount of load we'd subtract */
-		tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/busiest->cpu_power;
+		tmp = busiest_load_per_task*SCHED_LOAD_SCALE/busiest->cpu_power;
 		if (max_load > tmp)
-			pwr_move += busiest->cpu_power*min(SCHED_LOAD_SCALE,
-							max_load - tmp);
+			pwr_move += busiest->cpu_power *
+				min(busiest_load_per_task, max_load - tmp);
 
 		/* Amount of load we'd add */
 		if (max_load*busiest->cpu_power <
-				SCHED_LOAD_SCALE*SCHED_LOAD_SCALE)
+				busiest_load_per_task*SCHED_LOAD_SCALE)
 			tmp = max_load*busiest->cpu_power/this->cpu_power;
 		else
-			tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/this->cpu_power;
-		pwr_move += this->cpu_power*min(SCHED_LOAD_SCALE, this_load + tmp);
+			tmp = busiest_load_per_task*SCHED_LOAD_SCALE/this->cpu_power;
+		pwr_move += this->cpu_power*min(this_load_per_task, this_load + tmp);
 		pwr_move /= SCHED_LOAD_SCALE;
 
 		/* Move if we gain throughput */
 		if (pwr_move <= pwr_now)
 			goto out_balanced;
 
-		*imbalance = 1;
-		return busiest;
+		*imbalance = busiest_load_per_task;
 	}
 
-	/* Get rid of the scaling factor, rounding down as we divide */
-	*imbalance = *imbalance / SCHED_LOAD_SCALE;
 	return busiest;
 
 out_balanced:
@@ -2091,18 +2232,21 @@ out_balanced:
  * find_busiest_queue - find the busiest runqueue among the cpus in group.
  */
 static runqueue_t *find_busiest_queue(struct sched_group *group,
-	enum idle_type idle)
+	enum idle_type idle, unsigned long imbalance)
 {
-	unsigned long load, max_load = 0;
-	runqueue_t *busiest = NULL;
+	unsigned long max_load = 0;
+	runqueue_t *busiest = NULL, *rqi;
 	int i;
 
 	for_each_cpu_mask(i, group->cpumask) {
-		load = source_load(i, 0);
+		rqi = cpu_rq(i);
+
+		if (rqi->nr_running == 1 && rqi->raw_weighted_load > imbalance)
+			continue;
 
-		if (load > max_load) {
-			max_load = load;
-			busiest = cpu_rq(i);
+		if (rqi->raw_weighted_load > max_load) {
+			max_load = rqi->raw_weighted_load;
+			busiest = rqi;
 		}
 	}
 
@@ -2115,6 +2259,7 @@ static runqueue_t *find_busiest_queue(struct sched_group *group,
  */
 #define MAX_PINNED_INTERVAL	512
 
+#define minus_1_or_zero(n) ((n) > 0 ? (n) - 1 : 0)
 /*
  * Check this_cpu to ensure it is balanced within domain. Attempt to move
  * tasks if there is an imbalance.
@@ -2142,7 +2287,7 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
 		goto out_balanced;
 	}
 
-	busiest = find_busiest_queue(group, idle);
+	busiest = find_busiest_queue(group, idle, imbalance);
 	if (!busiest) {
 		schedstat_inc(sd, lb_nobusyq[idle]);
 		goto out_balanced;
@@ -2162,6 +2307,7 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
 		 */
 		double_rq_lock(this_rq, busiest);
 		nr_moved = move_tasks(this_rq, this_cpu, busiest,
+					minus_1_or_zero(busiest->nr_running),
 					imbalance, sd, idle, &all_pinned);
 		double_rq_unlock(this_rq, busiest);
 
@@ -2265,7 +2411,7 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
 		goto out_balanced;
 	}
 
-	busiest = find_busiest_queue(group, NEWLY_IDLE);
+	busiest = find_busiest_queue(group, NEWLY_IDLE, imbalance);
 	if (!busiest) {
 		schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]);
 		goto out_balanced;
@@ -2280,6 +2426,7 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
 		/* Attempt to move tasks */
 		double_lock_balance(this_rq, busiest);
 		nr_moved = move_tasks(this_rq, this_cpu, busiest,
+					minus_1_or_zero(busiest->nr_running),
 					imbalance, sd, NEWLY_IDLE, NULL);
 		spin_unlock(&busiest->lock);
 	}
@@ -2361,7 +2508,8 @@ static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu)
 
 	schedstat_inc(sd, alb_cnt);
 
-	if (move_tasks(target_rq, target_cpu, busiest_rq, 1, sd, SCHED_IDLE, NULL))
+	if (move_tasks(target_rq, target_cpu, busiest_rq, 1,
+			RTPRIO_TO_LOAD_WEIGHT(100), sd, SCHED_IDLE, NULL))
 		schedstat_inc(sd, alb_pushed);
 	else
 		schedstat_inc(sd, alb_failed);
@@ -2389,7 +2537,7 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq,
 	struct sched_domain *sd;
 	int i;
 
-	this_load = this_rq->nr_running * SCHED_LOAD_SCALE;
+	this_load = this_rq->raw_weighted_load;
 	/* Update our load */
 	for (i = 0; i < 3; i++) {
 		unsigned long new_load = this_load;
@@ -3441,17 +3589,21 @@ void set_user_nice(task_t *p, long nice)
 		goto out_unlock;
 	}
 	array = p->array;
-	if (array)
+	if (array) {
 		dequeue_task(p, array);
+		dec_raw_weighted_load(rq, p);
+	}
 
 	old_prio = p->prio;
 	new_prio = NICE_TO_PRIO(nice);
 	delta = new_prio - old_prio;
 	p->static_prio = NICE_TO_PRIO(nice);
+	set_load_weight(p);
 	p->prio += delta;
 
 	if (array) {
 		enqueue_task(p, array);
+		inc_raw_weighted_load(rq, p);
 		/*
 		 * If the task increased its priority or is running and
 		 * lowered its priority, then reschedule its CPU:
@@ -3587,6 +3739,7 @@ static void __setscheduler(struct task_struct *p, int policy, int prio)
 		if (policy == SCHED_BATCH)
 			p->sleep_avg = 0;
 	}
+	set_load_weight(p);
 }
 
 /**
@@ -6106,6 +6259,7 @@ void __init sched_init(void)
 		}
 	}
 
+	set_load_weight(&init_task);
 	/*
 	 * The boot idle thread does lazy MMU switching as well:
 	 */
-- 
cgit v1.2.3


From 51888ca25a03125e742ef84d4ddfd74e139707a0 Mon Sep 17 00:00:00 2001
From: Srivatsa Vaddagiri <vatsa@in.ibm.com>
Date: Tue, 27 Jun 2006 02:54:38 -0700
Subject: [PATCH] sched_domain: handle kmalloc failure

Try to handle mem allocation failures in build_sched_domains by bailing out
and cleaning up thus-far allocated memory.  The patch has a direct consequence
that we disable load balancing completely (even at sibling level) upon *any*
memory allocation failure.

[Lee.Schermerhorn@hp.com: bugfix]
Signed-off-by: Srivatsa Vaddagir <vatsa@in.ibm.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/sched.h |   2 +-
 kernel/sched.c        | 139 ++++++++++++++++++++++++++++----------------------
 2 files changed, 79 insertions(+), 62 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 74a1e39e0d3d..ab8ffc54423a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -639,7 +639,7 @@ struct sched_domain {
 #endif
 };
 
-extern void partition_sched_domains(cpumask_t *partition1,
+extern int partition_sched_domains(cpumask_t *partition1,
 				    cpumask_t *partition2);
 
 /*
diff --git a/kernel/sched.c b/kernel/sched.c
index 0ec84f57695d..77a2ec55ef7d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5820,11 +5820,56 @@ next_sg:
 }
 #endif
 
+/* Free memory allocated for various sched_group structures */
+static void free_sched_groups(const cpumask_t *cpu_map)
+{
+#ifdef CONFIG_NUMA
+	int i;
+	int cpu;
+
+	for_each_cpu_mask(cpu, *cpu_map) {
+		struct sched_group *sched_group_allnodes
+			= sched_group_allnodes_bycpu[cpu];
+		struct sched_group **sched_group_nodes
+			= sched_group_nodes_bycpu[cpu];
+
+		if (sched_group_allnodes) {
+			kfree(sched_group_allnodes);
+			sched_group_allnodes_bycpu[cpu] = NULL;
+		}
+
+		if (!sched_group_nodes)
+			continue;
+
+		for (i = 0; i < MAX_NUMNODES; i++) {
+			cpumask_t nodemask = node_to_cpumask(i);
+			struct sched_group *oldsg, *sg = sched_group_nodes[i];
+
+			cpus_and(nodemask, nodemask, *cpu_map);
+			if (cpus_empty(nodemask))
+				continue;
+
+			if (sg == NULL)
+				continue;
+			sg = sg->next;
+next_sg:
+			oldsg = sg;
+			sg = sg->next;
+			kfree(oldsg);
+			if (oldsg != sched_group_nodes[i])
+				goto next_sg;
+		}
+		kfree(sched_group_nodes);
+		sched_group_nodes_bycpu[cpu] = NULL;
+	}
+#endif
+}
+
 /*
  * Build sched domains for a given set of cpus and attach the sched domains
  * to the individual cpus
  */
-void build_sched_domains(const cpumask_t *cpu_map)
+static int build_sched_domains(const cpumask_t *cpu_map)
 {
 	int i;
 #ifdef CONFIG_NUMA
@@ -5834,11 +5879,11 @@ void build_sched_domains(const cpumask_t *cpu_map)
 	/*
 	 * Allocate the per-node list of sched groups
 	 */
-	sched_group_nodes = kmalloc(sizeof(struct sched_group*)*MAX_NUMNODES,
+	sched_group_nodes = kzalloc(sizeof(struct sched_group*)*MAX_NUMNODES,
 					   GFP_ATOMIC);
 	if (!sched_group_nodes) {
 		printk(KERN_WARNING "Can not alloc sched group node list\n");
-		return;
+		return -ENOMEM;
 	}
 	sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
 #endif
@@ -5864,7 +5909,7 @@ void build_sched_domains(const cpumask_t *cpu_map)
 				if (!sched_group_allnodes) {
 					printk(KERN_WARNING
 					"Can not alloc allnodes sched group\n");
-					break;
+					goto error;
 				}
 				sched_group_allnodes_bycpu[i]
 						= sched_group_allnodes;
@@ -5978,23 +6023,20 @@ void build_sched_domains(const cpumask_t *cpu_map)
 		cpus_and(domainspan, domainspan, *cpu_map);
 
 		sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL);
+		if (!sg) {
+			printk(KERN_WARNING "Can not alloc domain group for "
+				"node %d\n", i);
+			goto error;
+		}
 		sched_group_nodes[i] = sg;
 		for_each_cpu_mask(j, nodemask) {
 			struct sched_domain *sd;
 			sd = &per_cpu(node_domains, j);
 			sd->groups = sg;
-			if (sd->groups == NULL) {
-				/* Turn off balancing if we have no groups */
-				sd->flags = 0;
-			}
-		}
-		if (!sg) {
-			printk(KERN_WARNING
-			"Can not alloc domain group for node %d\n", i);
-			continue;
 		}
 		sg->cpu_power = 0;
 		sg->cpumask = nodemask;
+		sg->next = sg;
 		cpus_or(covered, covered, nodemask);
 		prev = sg;
 
@@ -6017,15 +6059,15 @@ void build_sched_domains(const cpumask_t *cpu_map)
 			if (!sg) {
 				printk(KERN_WARNING
 				"Can not alloc domain group for node %d\n", j);
-				break;
+				goto error;
 			}
 			sg->cpu_power = 0;
 			sg->cpumask = tmp;
+			sg->next = prev->next;
 			cpus_or(covered, covered, tmp);
 			prev->next = sg;
 			prev = sg;
 		}
-		prev->next = sched_group_nodes[i];
 	}
 #endif
 
@@ -6088,13 +6130,22 @@ void build_sched_domains(const cpumask_t *cpu_map)
 	 * Tune cache-hot values:
 	 */
 	calibrate_migration_costs(cpu_map);
+
+	return 0;
+
+#ifdef CONFIG_NUMA
+error:
+	free_sched_groups(cpu_map);
+	return -ENOMEM;
+#endif
 }
 /*
  * Set up scheduler domains and groups.  Callers must hold the hotplug lock.
  */
-static void arch_init_sched_domains(const cpumask_t *cpu_map)
+static int arch_init_sched_domains(const cpumask_t *cpu_map)
 {
 	cpumask_t cpu_default_map;
+	int err;
 
 	/*
 	 * Setup mask for cpus without special case scheduling requirements.
@@ -6103,51 +6154,14 @@ static void arch_init_sched_domains(const cpumask_t *cpu_map)
 	 */
 	cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map);
 
-	build_sched_domains(&cpu_default_map);
+	err = build_sched_domains(&cpu_default_map);
+
+	return err;
 }
 
 static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
 {
-#ifdef CONFIG_NUMA
-	int i;
-	int cpu;
-
-	for_each_cpu_mask(cpu, *cpu_map) {
-		struct sched_group *sched_group_allnodes
-			= sched_group_allnodes_bycpu[cpu];
-		struct sched_group **sched_group_nodes
-			= sched_group_nodes_bycpu[cpu];
-
-		if (sched_group_allnodes) {
-			kfree(sched_group_allnodes);
-			sched_group_allnodes_bycpu[cpu] = NULL;
-		}
-
-		if (!sched_group_nodes)
-			continue;
-
-		for (i = 0; i < MAX_NUMNODES; i++) {
-			cpumask_t nodemask = node_to_cpumask(i);
-			struct sched_group *oldsg, *sg = sched_group_nodes[i];
-
-			cpus_and(nodemask, nodemask, *cpu_map);
-			if (cpus_empty(nodemask))
-				continue;
-
-			if (sg == NULL)
-				continue;
-			sg = sg->next;
-next_sg:
-			oldsg = sg;
-			sg = sg->next;
-			kfree(oldsg);
-			if (oldsg != sched_group_nodes[i])
-				goto next_sg;
-		}
-		kfree(sched_group_nodes);
-		sched_group_nodes_bycpu[cpu] = NULL;
-	}
-#endif
+	free_sched_groups(cpu_map);
 }
 
 /*
@@ -6172,9 +6186,10 @@ static void detach_destroy_domains(const cpumask_t *cpu_map)
  * correct sched domains
  * Call with hotplug lock held
  */
-void partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2)
+int partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2)
 {
 	cpumask_t change_map;
+	int err = 0;
 
 	cpus_and(*partition1, *partition1, cpu_online_map);
 	cpus_and(*partition2, *partition2, cpu_online_map);
@@ -6183,9 +6198,11 @@ void partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2)
 	/* Detach sched domains from all of the affected cpus */
 	detach_destroy_domains(&change_map);
 	if (!cpus_empty(*partition1))
-		build_sched_domains(partition1);
-	if (!cpus_empty(*partition2))
-		build_sched_domains(partition2);
+		err = build_sched_domains(partition1);
+	if (!err && !cpus_empty(*partition2))
+		err = build_sched_domains(partition2);
+
+	return err;
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
-- 
cgit v1.2.3


From 5c45bf279d378d436ce45825c0f136696c7b6109 Mon Sep 17 00:00:00 2001
From: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
Date: Tue, 27 Jun 2006 02:54:42 -0700
Subject: [PATCH] sched: mc/smt power savings sched policy

sysfs entries 'sched_mc_power_savings' and 'sched_smt_power_savings' in
/sys/devices/system/cpu/ control the MC/SMT power savings policy for the
scheduler.

Based on the values (1-enable, 0-disable) for these controls, sched groups
cpu power will be determined for different domains.  When power savings
policy is enabled and under light load conditions, scheduler will minimize
the physical packages/cpu cores carrying the load and thus conserving
power(with a perf impact based on the workload characteristics...  see OLS
2005 CMP kernel scheduler paper for more details..)

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Con Kolivas <kernel@kolivas.org>
Cc: "Chen, Kenneth W" <kenneth.w.chen@intel.com>
Cc: "David S. Miller" <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/smpboot.c     |   8 +-
 arch/x86_64/kernel/smpboot.c   |   8 +-
 drivers/base/cpu.c             |  10 +-
 include/asm-i386/topology.h    |   5 +
 include/asm-ia64/topology.h    |   1 +
 include/asm-powerpc/topology.h |   5 +
 include/asm-sparc64/topology.h |   3 +
 include/asm-x86_64/topology.h  |   2 +
 include/linux/sched.h          |  10 ++
 include/linux/topology.h       |   3 +-
 kernel/sched.c                 | 240 ++++++++++++++++++++++++++++++++++++-----
 11 files changed, 262 insertions(+), 33 deletions(-)

(limited to 'include/linux')

diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c
index ab5275beddf7..89e7315e539c 100644
--- a/arch/i386/kernel/smpboot.c
+++ b/arch/i386/kernel/smpboot.c
@@ -448,10 +448,12 @@ cpumask_t cpu_coregroup_map(int cpu)
 	struct cpuinfo_x86 *c = cpu_data + cpu;
 	/*
 	 * For perf, we return last level cache shared map.
-	 * TBD: when power saving sched policy is added, we will return
-	 *      cpu_core_map when power saving policy is enabled
+	 * And for power savings, we return cpu_core_map
 	 */
-	return c->llc_shared_map;
+	if (sched_mc_power_savings || sched_smt_power_savings)
+		return cpu_core_map[cpu];
+	else
+		return c->llc_shared_map;
 }
 
 /* representing cpus for which sibling maps can be computed */
diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c
index 4e9755179ecf..540c0ccbcccc 100644
--- a/arch/x86_64/kernel/smpboot.c
+++ b/arch/x86_64/kernel/smpboot.c
@@ -455,10 +455,12 @@ cpumask_t cpu_coregroup_map(int cpu)
 	struct cpuinfo_x86 *c = cpu_data + cpu;
 	/*
 	 * For perf, we return last level cache shared map.
-	 * TBD: when power saving sched policy is added, we will return
-	 *      cpu_core_map when power saving policy is enabled
+	 * And for power savings, we return cpu_core_map
 	 */
-	return c->llc_shared_map;
+	if (sched_mc_power_savings || sched_smt_power_savings)
+		return cpu_core_map[cpu];
+	else
+		return c->llc_shared_map;
 }
 
 /* representing cpus for which sibling maps can be computed */
diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c
index 3972d8ac9786..4bef76a2f3f2 100644
--- a/drivers/base/cpu.c
+++ b/drivers/base/cpu.c
@@ -143,5 +143,13 @@ EXPORT_SYMBOL_GPL(get_cpu_sysdev);
 
 int __init cpu_dev_init(void)
 {
-	return sysdev_class_register(&cpu_sysdev_class);
+	int err;
+
+	err = sysdev_class_register(&cpu_sysdev_class);
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+	if (!err)
+		err = sched_create_sysfs_power_savings_entries(&cpu_sysdev_class);
+#endif
+
+	return err;
 }
diff --git a/include/asm-i386/topology.h b/include/asm-i386/topology.h
index aa4185ee81fb..6adbd9b1ae88 100644
--- a/include/asm-i386/topology.h
+++ b/include/asm-i386/topology.h
@@ -112,4 +112,9 @@ extern unsigned long node_remap_size[];
 
 extern cpumask_t cpu_coregroup_map(int cpu);
 
+#ifdef CONFIG_SMP
+#define mc_capable()	(boot_cpu_data.x86_max_cores > 1)
+#define smt_capable()	(smp_num_siblings > 1)
+#endif
+
 #endif /* _ASM_I386_TOPOLOGY_H */
diff --git a/include/asm-ia64/topology.h b/include/asm-ia64/topology.h
index 616b5ed2aa72..937c21257523 100644
--- a/include/asm-ia64/topology.h
+++ b/include/asm-ia64/topology.h
@@ -112,6 +112,7 @@ void build_cpu_to_node_map(void);
 #define topology_core_id(cpu)			(cpu_data(cpu)->core_id)
 #define topology_core_siblings(cpu)		(cpu_core_map[cpu])
 #define topology_thread_siblings(cpu)		(cpu_sibling_map[cpu])
+#define smt_capable() 				(smp_num_siblings > 1)
 #endif
 
 #include <asm-generic/topology.h>
diff --git a/include/asm-powerpc/topology.h b/include/asm-powerpc/topology.h
index 92f3e5507d22..bbc3844b086f 100644
--- a/include/asm-powerpc/topology.h
+++ b/include/asm-powerpc/topology.h
@@ -93,5 +93,10 @@ static inline void sysfs_remove_device_from_node(struct sys_device *dev,
 
 #endif /* CONFIG_NUMA */
 
+#ifdef CONFIG_SMP
+#include <asm/cputable.h>
+#define smt_capable() 		(cpu_has_feature(CPU_FTR_SMT))
+#endif
+
 #endif /* __KERNEL__ */
 #endif	/* _ASM_POWERPC_TOPOLOGY_H */
diff --git a/include/asm-sparc64/topology.h b/include/asm-sparc64/topology.h
index 0e234e201bd6..98a6c613589d 100644
--- a/include/asm-sparc64/topology.h
+++ b/include/asm-sparc64/topology.h
@@ -1,6 +1,9 @@
 #ifndef _ASM_SPARC64_TOPOLOGY_H
 #define _ASM_SPARC64_TOPOLOGY_H
 
+#include <asm/spitfire.h>
+#define smt_capable()	(tlb_type == hypervisor)
+
 #include <asm-generic/topology.h>
 
 #endif /* _ASM_SPARC64_TOPOLOGY_H */
diff --git a/include/asm-x86_64/topology.h b/include/asm-x86_64/topology.h
index c4e46e7fa7ba..6e7a2e976b04 100644
--- a/include/asm-x86_64/topology.h
+++ b/include/asm-x86_64/topology.h
@@ -59,6 +59,8 @@ extern int __node_distance(int, int);
 #define topology_core_id(cpu)			(cpu_data[cpu].cpu_core_id)
 #define topology_core_siblings(cpu)		(cpu_core_map[cpu])
 #define topology_thread_siblings(cpu)		(cpu_sibling_map[cpu])
+#define mc_capable()			(boot_cpu_data.x86_max_cores > 1)
+#define smt_capable() 			(smp_num_siblings > 1)
 #endif
 
 #include <asm-generic/topology.h>
diff --git a/include/linux/sched.h b/include/linux/sched.h
index ab8ffc54423a..0bc81a151e50 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -570,6 +570,11 @@ enum idle_type
 #define SD_WAKE_AFFINE		32	/* Wake task to waking CPU */
 #define SD_WAKE_BALANCE		64	/* Perform balancing at task wakeup */
 #define SD_SHARE_CPUPOWER	128	/* Domain members share cpu power */
+#define SD_POWERSAVINGS_BALANCE	256	/* Balance for power savings */
+
+#define BALANCE_FOR_POWER	((sched_mc_power_savings || sched_smt_power_savings) \
+				 ? SD_POWERSAVINGS_BALANCE : 0)
+
 
 struct sched_group {
 	struct sched_group *next;	/* Must be a circular list */
@@ -1412,6 +1417,11 @@ static inline void arch_pick_mmap_layout(struct mm_struct *mm)
 extern long sched_setaffinity(pid_t pid, cpumask_t new_mask);
 extern long sched_getaffinity(pid_t pid, cpumask_t *mask);
 
+#include <linux/sysdev.h>
+extern int sched_mc_power_savings, sched_smt_power_savings;
+extern struct sysdev_attribute attr_sched_mc_power_savings, attr_sched_smt_power_savings;
+extern int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls);
+
 extern void normalize_rt_tasks(void);
 
 #ifdef CONFIG_PM
diff --git a/include/linux/topology.h b/include/linux/topology.h
index a305ae2e44b6..ec1eca85290a 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -134,7 +134,8 @@
 	.flags			= SD_LOAD_BALANCE	\
 				| SD_BALANCE_NEWIDLE	\
 				| SD_BALANCE_EXEC	\
-				| SD_WAKE_AFFINE,	\
+				| SD_WAKE_AFFINE	\
+				| BALANCE_FOR_POWER,	\
 	.last_balance		= jiffies,		\
 	.balance_interval	= 1,			\
 	.nr_balance_failed	= 0,			\
diff --git a/kernel/sched.c b/kernel/sched.c
index 122b75584a13..54fa282657cc 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1162,6 +1162,11 @@ static int sched_balance_self(int cpu, int flag)
 	struct sched_domain *tmp, *sd = NULL;
 
 	for_each_domain(cpu, tmp) {
+ 		/*
+ 	 	 * If power savings logic is enabled for a domain, stop there.
+ 	 	 */
+		if (tmp->flags & SD_POWERSAVINGS_BALANCE)
+			break;
 		if (tmp->flags & flag)
 			sd = tmp;
 	}
@@ -2082,6 +2087,12 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 	unsigned long busiest_load_per_task, busiest_nr_running;
 	unsigned long this_load_per_task, this_nr_running;
 	int load_idx;
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+	int power_savings_balance = 1;
+	unsigned long leader_nr_running = 0, min_load_per_task = 0;
+	unsigned long min_nr_running = ULONG_MAX;
+	struct sched_group *group_min = NULL, *group_leader = NULL;
+#endif
 
 	max_load = this_load = total_load = total_pwr = 0;
 	busiest_load_per_task = busiest_nr_running = 0;
@@ -2094,7 +2105,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 		load_idx = sd->idle_idx;
 
 	do {
-		unsigned long load;
+		unsigned long load, group_capacity;
 		int local_group;
 		int i;
 		unsigned long sum_nr_running, sum_weighted_load;
@@ -2127,18 +2138,76 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 		/* Adjust by relative CPU power of the group */
 		avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
 
+		group_capacity = group->cpu_power / SCHED_LOAD_SCALE;
+
 		if (local_group) {
 			this_load = avg_load;
 			this = group;
 			this_nr_running = sum_nr_running;
 			this_load_per_task = sum_weighted_load;
 		} else if (avg_load > max_load &&
-			   sum_nr_running > group->cpu_power / SCHED_LOAD_SCALE) {
+			   sum_nr_running > group_capacity) {
 			max_load = avg_load;
 			busiest = group;
 			busiest_nr_running = sum_nr_running;
 			busiest_load_per_task = sum_weighted_load;
 		}
+
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+		/*
+		 * Busy processors will not participate in power savings
+		 * balance.
+		 */
+ 		if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
+ 			goto group_next;
+
+		/*
+		 * If the local group is idle or completely loaded
+		 * no need to do power savings balance at this domain
+		 */
+		if (local_group && (this_nr_running >= group_capacity ||
+				    !this_nr_running))
+			power_savings_balance = 0;
+
+ 		/*
+		 * If a group is already running at full capacity or idle,
+		 * don't include that group in power savings calculations
+ 		 */
+ 		if (!power_savings_balance || sum_nr_running >= group_capacity
+		    || !sum_nr_running)
+ 			goto group_next;
+
+ 		/*
+		 * Calculate the group which has the least non-idle load.
+ 		 * This is the group from where we need to pick up the load
+ 		 * for saving power
+ 		 */
+ 		if ((sum_nr_running < min_nr_running) ||
+ 		    (sum_nr_running == min_nr_running &&
+		     first_cpu(group->cpumask) <
+		     first_cpu(group_min->cpumask))) {
+ 			group_min = group;
+ 			min_nr_running = sum_nr_running;
+			min_load_per_task = sum_weighted_load /
+						sum_nr_running;
+ 		}
+
+ 		/*
+		 * Calculate the group which is almost near its
+ 		 * capacity but still has some space to pick up some load
+ 		 * from other group and save more power
+ 		 */
+ 		if (sum_nr_running <= group_capacity - 1)
+ 			if (sum_nr_running > leader_nr_running ||
+ 			    (sum_nr_running == leader_nr_running &&
+ 			     first_cpu(group->cpumask) >
+ 			      first_cpu(group_leader->cpumask))) {
+ 				group_leader = group;
+ 				leader_nr_running = sum_nr_running;
+ 			}
+
+group_next:
+#endif
 		group = group->next;
 	} while (group != sd->groups);
 
@@ -2247,7 +2316,16 @@ small_imbalance:
 	return busiest;
 
 out_balanced:
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+	if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
+		goto ret;
 
+	if (this == group_leader && group_leader != group_min) {
+		*imbalance = min_load_per_task;
+		return group_min;
+	}
+ret:
+#endif
 	*imbalance = 0;
 	return NULL;
 }
@@ -2300,7 +2378,8 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
 	int active_balance = 0;
 	int sd_idle = 0;
 
-	if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER)
+	if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
+	    !sched_smt_power_savings)
 		sd_idle = 1;
 
 	schedstat_inc(sd, lb_cnt[idle]);
@@ -2389,7 +2468,8 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
 			sd->balance_interval *= 2;
 	}
 
-	if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER)
+	if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
+	    !sched_smt_power_savings)
 		return -1;
 	return nr_moved;
 
@@ -2404,7 +2484,7 @@ out_one_pinned:
 			(sd->balance_interval < sd->max_interval))
 		sd->balance_interval *= 2;
 
-	if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER)
+	if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings)
 		return -1;
 	return 0;
 }
@@ -2425,7 +2505,7 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
 	int nr_moved = 0;
 	int sd_idle = 0;
 
-	if (sd->flags & SD_SHARE_CPUPOWER)
+	if (sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings)
 		sd_idle = 1;
 
 	schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
@@ -2466,7 +2546,7 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
 
 out_balanced:
 	schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
-	if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER)
+	if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings)
 		return -1;
 	sd->nr_balance_failed = 0;
 	return 0;
@@ -5732,6 +5812,7 @@ static cpumask_t sched_domain_node_span(int node)
 }
 #endif
 
+int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
 /*
  * At the moment, CONFIG_SCHED_SMT is never defined, but leave it in so we
  * can switch it on easily if needed.
@@ -6113,37 +6194,72 @@ static int build_sched_domains(const cpumask_t *cpu_map)
 #endif
 
 	/* Calculate CPU power for physical packages and nodes */
+#ifdef CONFIG_SCHED_SMT
 	for_each_cpu_mask(i, *cpu_map) {
-		int power;
 		struct sched_domain *sd;
-#ifdef CONFIG_SCHED_SMT
 		sd = &per_cpu(cpu_domains, i);
-		power = SCHED_LOAD_SCALE;
-		sd->groups->cpu_power = power;
+		sd->groups->cpu_power = SCHED_LOAD_SCALE;
+	}
 #endif
 #ifdef CONFIG_SCHED_MC
+	for_each_cpu_mask(i, *cpu_map) {
+		int power;
+		struct sched_domain *sd;
 		sd = &per_cpu(core_domains, i);
-		power = SCHED_LOAD_SCALE + (cpus_weight(sd->groups->cpumask)-1)
+		if (sched_smt_power_savings)
+			power = SCHED_LOAD_SCALE * cpus_weight(sd->groups->cpumask);
+		else
+			power = SCHED_LOAD_SCALE + (cpus_weight(sd->groups->cpumask)-1)
 					    * SCHED_LOAD_SCALE / 10;
 		sd->groups->cpu_power = power;
+	}
+#endif
 
+	for_each_cpu_mask(i, *cpu_map) {
+		struct sched_domain *sd;
+#ifdef CONFIG_SCHED_MC
 		sd = &per_cpu(phys_domains, i);
+		if (i != first_cpu(sd->groups->cpumask))
+			continue;
 
- 		/*
- 		 * This has to be < 2 * SCHED_LOAD_SCALE
- 		 * Lets keep it SCHED_LOAD_SCALE, so that
- 		 * while calculating NUMA group's cpu_power
- 		 * we can simply do
- 		 *  numa_group->cpu_power += phys_group->cpu_power;
- 		 *
- 		 * See "only add power once for each physical pkg"
- 		 * comment below
- 		 */
- 		sd->groups->cpu_power = SCHED_LOAD_SCALE;
+		sd->groups->cpu_power = 0;
+		if (sched_mc_power_savings || sched_smt_power_savings) {
+			int j;
+
+ 			for_each_cpu_mask(j, sd->groups->cpumask) {
+				struct sched_domain *sd1;
+ 				sd1 = &per_cpu(core_domains, j);
+ 				/*
+ 			 	 * for each core we will add once
+ 				 * to the group in physical domain
+ 			 	 */
+  	 			if (j != first_cpu(sd1->groups->cpumask))
+ 					continue;
+
+ 				if (sched_smt_power_savings)
+   					sd->groups->cpu_power += sd1->groups->cpu_power;
+ 				else
+   					sd->groups->cpu_power += SCHED_LOAD_SCALE;
+   			}
+ 		} else
+ 			/*
+ 			 * This has to be < 2 * SCHED_LOAD_SCALE
+ 			 * Lets keep it SCHED_LOAD_SCALE, so that
+ 			 * while calculating NUMA group's cpu_power
+ 			 * we can simply do
+ 			 *  numa_group->cpu_power += phys_group->cpu_power;
+ 			 *
+ 			 * See "only add power once for each physical pkg"
+ 			 * comment below
+ 			 */
+ 			sd->groups->cpu_power = SCHED_LOAD_SCALE;
 #else
+		int power;
 		sd = &per_cpu(phys_domains, i);
-		power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
-				(cpus_weight(sd->groups->cpumask)-1) / 10;
+		if (sched_smt_power_savings)
+			power = SCHED_LOAD_SCALE * cpus_weight(sd->groups->cpumask);
+		else
+			power = SCHED_LOAD_SCALE;
 		sd->groups->cpu_power = power;
 #endif
 	}
@@ -6244,6 +6360,80 @@ int partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2)
 	return err;
 }
 
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+int arch_reinit_sched_domains(void)
+{
+	int err;
+
+	lock_cpu_hotplug();
+	detach_destroy_domains(&cpu_online_map);
+	err = arch_init_sched_domains(&cpu_online_map);
+	unlock_cpu_hotplug();
+
+	return err;
+}
+
+static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
+{
+	int ret;
+
+	if (buf[0] != '0' && buf[0] != '1')
+		return -EINVAL;
+
+	if (smt)
+		sched_smt_power_savings = (buf[0] == '1');
+	else
+		sched_mc_power_savings = (buf[0] == '1');
+
+	ret = arch_reinit_sched_domains();
+
+	return ret ? ret : count;
+}
+
+int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
+{
+	int err = 0;
+#ifdef CONFIG_SCHED_SMT
+	if (smt_capable())
+		err = sysfs_create_file(&cls->kset.kobj,
+					&attr_sched_smt_power_savings.attr);
+#endif
+#ifdef CONFIG_SCHED_MC
+	if (!err && mc_capable())
+		err = sysfs_create_file(&cls->kset.kobj,
+					&attr_sched_mc_power_savings.attr);
+#endif
+	return err;
+}
+#endif
+
+#ifdef CONFIG_SCHED_MC
+static ssize_t sched_mc_power_savings_show(struct sys_device *dev, char *page)
+{
+	return sprintf(page, "%u\n", sched_mc_power_savings);
+}
+static ssize_t sched_mc_power_savings_store(struct sys_device *dev, const char *buf, size_t count)
+{
+	return sched_power_savings_store(buf, count, 0);
+}
+SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show,
+	    sched_mc_power_savings_store);
+#endif
+
+#ifdef CONFIG_SCHED_SMT
+static ssize_t sched_smt_power_savings_show(struct sys_device *dev, char *page)
+{
+	return sprintf(page, "%u\n", sched_smt_power_savings);
+}
+static ssize_t sched_smt_power_savings_store(struct sys_device *dev, const char *buf, size_t count)
+{
+	return sched_power_savings_store(buf, count, 1);
+}
+SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show,
+	    sched_smt_power_savings_store);
+#endif
+
+
 #ifdef CONFIG_HOTPLUG_CPU
 /*
  * Force a reinitialization of the sched domains hierarchy.  The domains
-- 
cgit v1.2.3


From e2970f2fb6950183a34e8545faa093eb49d186e1 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 27 Jun 2006 02:54:47 -0700
Subject: [PATCH] pi-futex: futex code cleanups

We are pleased to announce "lightweight userspace priority inheritance" (PI)
support for futexes.  The following patchset and glibc patch implements it,
ontop of the robust-futexes patchset which is included in 2.6.16-mm1.

We are calling it lightweight for 3 reasons:

 - in the user-space fastpath a PI-enabled futex involves no kernel work
   (or any other PI complexity) at all.  No registration, no extra kernel
   calls - just pure fast atomic ops in userspace.

 - in the slowpath (in the lock-contention case), the system call and
   scheduling pattern is in fact better than that of normal futexes, due to
   the 'integrated' nature of FUTEX_LOCK_PI.  [more about that further down]

 - the in-kernel PI implementation is streamlined around the mutex
   abstraction, with strict rules that keep the implementation relatively
   simple: only a single owner may own a lock (i.e.  no read-write lock
   support), only the owner may unlock a lock, no recursive locking, etc.

  Priority Inheritance - why, oh why???
  -------------------------------------

Many of you heard the horror stories about the evil PI code circling Linux for
years, which makes no real sense at all and is only used by buggy applications
and which has horrible overhead.  Some of you have dreaded this very moment,
when someone actually submits working PI code ;-)

So why would we like to see PI support for futexes?

We'd like to see it done purely for technological reasons.  We dont think it's
a buggy concept, we think it's useful functionality to offer to applications,
which functionality cannot be achieved in other ways.  We also think it's the
right thing to do, and we think we've got the right arguments and the right
numbers to prove that.  We also believe that we can address all the
counter-arguments as well.  For these reasons (and the reasons outlined below)
we are submitting this patch-set for upstream kernel inclusion.

What are the benefits of PI?

  The short reply:
  ----------------

User-space PI helps achieving/improving determinism for user-space
applications.  In the best-case, it can help achieve determinism and
well-bound latencies.  Even in the worst-case, PI will improve the statistical
distribution of locking related application delays.

  The longer reply:
  -----------------

Firstly, sharing locks between multiple tasks is a common programming
technique that often cannot be replaced with lockless algorithms.  As we can
see it in the kernel [which is a quite complex program in itself], lockless
structures are rather the exception than the norm - the current ratio of
lockless vs.  locky code for shared data structures is somewhere between 1:10
and 1:100.  Lockless is hard, and the complexity of lockless algorithms often
endangers to ability to do robust reviews of said code.  I.e.  critical RT
apps often choose lock structures to protect critical data structures, instead
of lockless algorithms.  Furthermore, there are cases (like shared hardware,
or other resource limits) where lockless access is mathematically impossible.

Media players (such as Jack) are an example of reasonable application design
with multiple tasks (with multiple priority levels) sharing short-held locks:
for example, a highprio audio playback thread is combined with medium-prio
construct-audio-data threads and low-prio display-colory-stuff threads.  Add
video and decoding to the mix and we've got even more priority levels.

So once we accept that synchronization objects (locks) are an unavoidable fact
of life, and once we accept that multi-task userspace apps have a very fair
expectation of being able to use locks, we've got to think about how to offer
the option of a deterministic locking implementation to user-space.

Most of the technical counter-arguments against doing priority inheritance
only apply to kernel-space locks.  But user-space locks are different, there
we cannot disable interrupts or make the task non-preemptible in a critical
section, so the 'use spinlocks' argument does not apply (user-space spinlocks
have the same priority inversion problems as other user-space locking
constructs).  Fact is, pretty much the only technique that currently enables
good determinism for userspace locks (such as futex-based pthread mutexes) is
priority inheritance:

Currently (without PI), if a high-prio and a low-prio task shares a lock [this
is a quite common scenario for most non-trivial RT applications], even if all
critical sections are coded carefully to be deterministic (i.e.  all critical
sections are short in duration and only execute a limited number of
instructions), the kernel cannot guarantee any deterministic execution of the
high-prio task: any medium-priority task could preempt the low-prio task while
it holds the shared lock and executes the critical section, and could delay it
indefinitely.

  Implementation:
  ---------------

As mentioned before, the userspace fastpath of PI-enabled pthread mutexes
involves no kernel work at all - they behave quite similarly to normal
futex-based locks: a 0 value means unlocked, and a value==TID means locked.
(This is the same method as used by list-based robust futexes.) Userspace uses
atomic ops to lock/unlock these mutexes without entering the kernel.

To handle the slowpath, we have added two new futex ops:

  FUTEX_LOCK_PI
  FUTEX_UNLOCK_PI

If the lock-acquire fastpath fails, [i.e.  an atomic transition from 0 to TID
fails], then FUTEX_LOCK_PI is called.  The kernel does all the remaining work:
if there is no futex-queue attached to the futex address yet then the code
looks up the task that owns the futex [it has put its own TID into the futex
value], and attaches a 'PI state' structure to the futex-queue.  The pi_state
includes an rt-mutex, which is a PI-aware, kernel-based synchronization
object.  The 'other' task is made the owner of the rt-mutex, and the
FUTEX_WAITERS bit is atomically set in the futex value.  Then this task tries
to lock the rt-mutex, on which it blocks.  Once it returns, it has the mutex
acquired, and it sets the futex value to its own TID and returns.  Userspace
has no other work to perform - it now owns the lock, and futex value contains
FUTEX_WAITERS|TID.

If the unlock side fastpath succeeds, [i.e.  userspace manages to do a TID ->
0 atomic transition of the futex value], then no kernel work is triggered.

If the unlock fastpath fails (because the FUTEX_WAITERS bit is set), then
FUTEX_UNLOCK_PI is called, and the kernel unlocks the futex on the behalf of
userspace - and it also unlocks the attached pi_state->rt_mutex and thus wakes
up any potential waiters.

Note that under this approach, contrary to other PI-futex approaches, there is
no prior 'registration' of a PI-futex.  [which is not quite possible anyway,
due to existing ABI properties of pthread mutexes.]

Also, under this scheme, 'robustness' and 'PI' are two orthogonal properties
of futexes, and all four combinations are possible: futex, robust-futex,
PI-futex, robust+PI-futex.

  glibc support:
  --------------

Ulrich Drepper and Jakub Jelinek have written glibc support for PI-futexes
(and robust futexes), enabling robust and PI (PTHREAD_PRIO_INHERIT) POSIX
mutexes.  (PTHREAD_PRIO_PROTECT support will be added later on too, no
additional kernel changes are needed for that).  [NOTE: The glibc patch is
obviously inofficial and unsupported without matching upstream kernel
functionality.]

the patch-queue and the glibc patch can also be downloaded from:

  http://redhat.com/~mingo/PI-futex-patches/

Many thanks go to the people who helped us create this kernel feature: Steven
Rostedt, Esben Nielsen, Benedikt Spranger, Daniel Walker, John Cooper, Arjan
van de Ven, Oleg Nesterov and others.  Credits for related prior projects goes
to Dirk Grambow, Inaky Perez-Gonzalez, Bill Huey and many others.

Clean up the futex code, before adding more features to it:

 - use u32 as the futex field type - that's the ABI
 - use __user and pointers to u32 instead of unsigned long
 - code style / comment style cleanups
 - rename hash-bucket name from 'bh' to 'hb'.

I checked the pre and post futex.o object files to make sure this
patch has no code effects.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Cc: Ulrich Drepper <drepper@redhat.com>
Cc: Jakub Jelinek <jakub@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/futex.h    |   5 +-
 include/linux/syscalls.h |   4 +-
 kernel/futex.c           | 245 +++++++++++++++++++++++++----------------------
 kernel/futex_compat.c    |   3 +-
 4 files changed, 135 insertions(+), 122 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/futex.h b/include/linux/futex.h
index 966a5b3da439..f05a3f469322 100644
--- a/include/linux/futex.h
+++ b/include/linux/futex.h
@@ -90,9 +90,8 @@ struct robust_list_head {
  */
 #define ROBUST_LIST_LIMIT	2048
 
-long do_futex(unsigned long uaddr, int op, int val,
-		unsigned long timeout, unsigned long uaddr2, int val2,
-		int val3);
+long do_futex(u32 __user *uaddr, int op, u32 val, unsigned long timeout,
+	      u32 __user *uaddr2, u32 val2, u32 val3);
 
 extern int handle_futex_death(u32 __user *uaddr, struct task_struct *curr);
 
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 33785b79d548..008f04c56737 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -174,9 +174,9 @@ asmlinkage long sys_waitid(int which, pid_t pid,
 			   int options, struct rusage __user *ru);
 asmlinkage long sys_waitpid(pid_t pid, int __user *stat_addr, int options);
 asmlinkage long sys_set_tid_address(int __user *tidptr);
-asmlinkage long sys_futex(u32 __user *uaddr, int op, int val,
+asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val,
 			struct timespec __user *utime, u32 __user *uaddr2,
-			int val3);
+			u32 val3);
 
 asmlinkage long sys_init_module(void __user *umod, unsigned long len,
 				const char __user *uargs);
diff --git a/kernel/futex.c b/kernel/futex.c
index e1a380c77a5a..50356fb5d726 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -63,7 +63,7 @@ union futex_key {
 		int offset;
 	} shared;
 	struct {
-		unsigned long uaddr;
+		unsigned long address;
 		struct mm_struct *mm;
 		int offset;
 	} private;
@@ -87,13 +87,13 @@ struct futex_q {
 	struct list_head list;
 	wait_queue_head_t waiters;
 
-	/* Which hash list lock to use. */
+	/* Which hash list lock to use: */
 	spinlock_t *lock_ptr;
 
-	/* Key which the futex is hashed on. */
+	/* Key which the futex is hashed on: */
 	union futex_key key;
 
-	/* For fd, sigio sent using these. */
+	/* For fd, sigio sent using these: */
 	int fd;
 	struct file *filp;
 };
@@ -144,8 +144,9 @@ static inline int match_futex(union futex_key *key1, union futex_key *key2)
  *
  * Should be called with &current->mm->mmap_sem but NOT any spinlocks.
  */
-static int get_futex_key(unsigned long uaddr, union futex_key *key)
+static int get_futex_key(u32 __user *uaddr, union futex_key *key)
 {
+	unsigned long address = (unsigned long)uaddr;
 	struct mm_struct *mm = current->mm;
 	struct vm_area_struct *vma;
 	struct page *page;
@@ -154,16 +155,16 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key)
 	/*
 	 * The futex address must be "naturally" aligned.
 	 */
-	key->both.offset = uaddr % PAGE_SIZE;
+	key->both.offset = address % PAGE_SIZE;
 	if (unlikely((key->both.offset % sizeof(u32)) != 0))
 		return -EINVAL;
-	uaddr -= key->both.offset;
+	address -= key->both.offset;
 
 	/*
 	 * The futex is hashed differently depending on whether
 	 * it's in a shared or private mapping.  So check vma first.
 	 */
-	vma = find_extend_vma(mm, uaddr);
+	vma = find_extend_vma(mm, address);
 	if (unlikely(!vma))
 		return -EFAULT;
 
@@ -184,7 +185,7 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key)
 	 */
 	if (likely(!(vma->vm_flags & VM_MAYSHARE))) {
 		key->private.mm = mm;
-		key->private.uaddr = uaddr;
+		key->private.address = address;
 		return 0;
 	}
 
@@ -194,7 +195,7 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key)
 	key->shared.inode = vma->vm_file->f_dentry->d_inode;
 	key->both.offset++; /* Bit 0 of offset indicates inode-based key. */
 	if (likely(!(vma->vm_flags & VM_NONLINEAR))) {
-		key->shared.pgoff = (((uaddr - vma->vm_start) >> PAGE_SHIFT)
+		key->shared.pgoff = (((address - vma->vm_start) >> PAGE_SHIFT)
 				     + vma->vm_pgoff);
 		return 0;
 	}
@@ -205,7 +206,7 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key)
 	 * from swap.  But that's a lot of code to duplicate here
 	 * for a rare case, so we simply fetch the page.
 	 */
-	err = get_user_pages(current, mm, uaddr, 1, 0, 0, &page, NULL);
+	err = get_user_pages(current, mm, address, 1, 0, 0, &page, NULL);
 	if (err >= 0) {
 		key->shared.pgoff =
 			page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
@@ -246,12 +247,12 @@ static void drop_key_refs(union futex_key *key)
 	}
 }
 
-static inline int get_futex_value_locked(int *dest, int __user *from)
+static inline int get_futex_value_locked(u32 *dest, u32 __user *from)
 {
 	int ret;
 
 	inc_preempt_count();
-	ret = __copy_from_user_inatomic(dest, from, sizeof(int));
+	ret = __copy_from_user_inatomic(dest, from, sizeof(u32));
 	dec_preempt_count();
 
 	return ret ? -EFAULT : 0;
@@ -288,12 +289,12 @@ static void wake_futex(struct futex_q *q)
  * Wake up all waiters hashed on the physical page that is mapped
  * to this virtual address:
  */
-static int futex_wake(unsigned long uaddr, int nr_wake)
+static int futex_wake(u32 __user *uaddr, int nr_wake)
 {
-	union futex_key key;
-	struct futex_hash_bucket *bh;
-	struct list_head *head;
+	struct futex_hash_bucket *hb;
 	struct futex_q *this, *next;
+	struct list_head *head;
+	union futex_key key;
 	int ret;
 
 	down_read(&current->mm->mmap_sem);
@@ -302,9 +303,9 @@ static int futex_wake(unsigned long uaddr, int nr_wake)
 	if (unlikely(ret != 0))
 		goto out;
 
-	bh = hash_futex(&key);
-	spin_lock(&bh->lock);
-	head = &bh->chain;
+	hb = hash_futex(&key);
+	spin_lock(&hb->lock);
+	head = &hb->chain;
 
 	list_for_each_entry_safe(this, next, head, list) {
 		if (match_futex (&this->key, &key)) {
@@ -314,7 +315,7 @@ static int futex_wake(unsigned long uaddr, int nr_wake)
 		}
 	}
 
-	spin_unlock(&bh->lock);
+	spin_unlock(&hb->lock);
 out:
 	up_read(&current->mm->mmap_sem);
 	return ret;
@@ -324,10 +325,12 @@ out:
  * Wake up all waiters hashed on the physical page that is mapped
  * to this virtual address:
  */
-static int futex_wake_op(unsigned long uaddr1, unsigned long uaddr2, int nr_wake, int nr_wake2, int op)
+static int
+futex_wake_op(u32 __user *uaddr1, u32 __user *uaddr2,
+	      int nr_wake, int nr_wake2, int op)
 {
 	union futex_key key1, key2;
-	struct futex_hash_bucket *bh1, *bh2;
+	struct futex_hash_bucket *hb1, *hb2;
 	struct list_head *head;
 	struct futex_q *this, *next;
 	int ret, op_ret, attempt = 0;
@@ -342,27 +345,29 @@ retryfull:
 	if (unlikely(ret != 0))
 		goto out;
 
-	bh1 = hash_futex(&key1);
-	bh2 = hash_futex(&key2);
+	hb1 = hash_futex(&key1);
+	hb2 = hash_futex(&key2);
 
 retry:
-	if (bh1 < bh2)
-		spin_lock(&bh1->lock);
-	spin_lock(&bh2->lock);
-	if (bh1 > bh2)
-		spin_lock(&bh1->lock);
+	if (hb1 < hb2)
+		spin_lock(&hb1->lock);
+	spin_lock(&hb2->lock);
+	if (hb1 > hb2)
+		spin_lock(&hb1->lock);
 
-	op_ret = futex_atomic_op_inuser(op, (int __user *)uaddr2);
+	op_ret = futex_atomic_op_inuser(op, uaddr2);
 	if (unlikely(op_ret < 0)) {
-		int dummy;
+		u32 dummy;
 
-		spin_unlock(&bh1->lock);
-		if (bh1 != bh2)
-			spin_unlock(&bh2->lock);
+		spin_unlock(&hb1->lock);
+		if (hb1 != hb2)
+			spin_unlock(&hb2->lock);
 
 #ifndef CONFIG_MMU
-		/* we don't get EFAULT from MMU faults if we don't have an MMU,
-		 * but we might get them from range checking */
+		/*
+		 * we don't get EFAULT from MMU faults if we don't have an MMU,
+		 * but we might get them from range checking
+		 */
 		ret = op_ret;
 		goto out;
 #endif
@@ -372,23 +377,26 @@ retry:
 			goto out;
 		}
 
-		/* futex_atomic_op_inuser needs to both read and write
+		/*
+		 * futex_atomic_op_inuser needs to both read and write
 		 * *(int __user *)uaddr2, but we can't modify it
 		 * non-atomically.  Therefore, if get_user below is not
 		 * enough, we need to handle the fault ourselves, while
-		 * still holding the mmap_sem.  */
+		 * still holding the mmap_sem.
+		 */
 		if (attempt++) {
 			struct vm_area_struct * vma;
 			struct mm_struct *mm = current->mm;
+			unsigned long address = (unsigned long)uaddr2;
 
 			ret = -EFAULT;
 			if (attempt >= 2 ||
-			    !(vma = find_vma(mm, uaddr2)) ||
-			    vma->vm_start > uaddr2 ||
+			    !(vma = find_vma(mm, address)) ||
+			    vma->vm_start > address ||
 			    !(vma->vm_flags & VM_WRITE))
 				goto out;
 
-			switch (handle_mm_fault(mm, vma, uaddr2, 1)) {
+			switch (handle_mm_fault(mm, vma, address, 1)) {
 			case VM_FAULT_MINOR:
 				current->min_flt++;
 				break;
@@ -401,18 +409,20 @@ retry:
 			goto retry;
 		}
 
-		/* If we would have faulted, release mmap_sem,
-		 * fault it in and start all over again.  */
+		/*
+		 * If we would have faulted, release mmap_sem,
+		 * fault it in and start all over again.
+		 */
 		up_read(&current->mm->mmap_sem);
 
-		ret = get_user(dummy, (int __user *)uaddr2);
+		ret = get_user(dummy, uaddr2);
 		if (ret)
 			return ret;
 
 		goto retryfull;
 	}
 
-	head = &bh1->chain;
+	head = &hb1->chain;
 
 	list_for_each_entry_safe(this, next, head, list) {
 		if (match_futex (&this->key, &key1)) {
@@ -423,7 +433,7 @@ retry:
 	}
 
 	if (op_ret > 0) {
-		head = &bh2->chain;
+		head = &hb2->chain;
 
 		op_ret = 0;
 		list_for_each_entry_safe(this, next, head, list) {
@@ -436,9 +446,9 @@ retry:
 		ret += op_ret;
 	}
 
-	spin_unlock(&bh1->lock);
-	if (bh1 != bh2)
-		spin_unlock(&bh2->lock);
+	spin_unlock(&hb1->lock);
+	if (hb1 != hb2)
+		spin_unlock(&hb2->lock);
 out:
 	up_read(&current->mm->mmap_sem);
 	return ret;
@@ -448,11 +458,11 @@ out:
  * Requeue all waiters hashed on one physical page to another
  * physical page.
  */
-static int futex_requeue(unsigned long uaddr1, unsigned long uaddr2,
-			 int nr_wake, int nr_requeue, int *valp)
+static int futex_requeue(u32 __user *uaddr1, u32 __user *uaddr2,
+			 int nr_wake, int nr_requeue, u32 *cmpval)
 {
 	union futex_key key1, key2;
-	struct futex_hash_bucket *bh1, *bh2;
+	struct futex_hash_bucket *hb1, *hb2;
 	struct list_head *head1;
 	struct futex_q *this, *next;
 	int ret, drop_count = 0;
@@ -467,68 +477,69 @@ static int futex_requeue(unsigned long uaddr1, unsigned long uaddr2,
 	if (unlikely(ret != 0))
 		goto out;
 
-	bh1 = hash_futex(&key1);
-	bh2 = hash_futex(&key2);
+	hb1 = hash_futex(&key1);
+	hb2 = hash_futex(&key2);
 
-	if (bh1 < bh2)
-		spin_lock(&bh1->lock);
-	spin_lock(&bh2->lock);
-	if (bh1 > bh2)
-		spin_lock(&bh1->lock);
+	if (hb1 < hb2)
+		spin_lock(&hb1->lock);
+	spin_lock(&hb2->lock);
+	if (hb1 > hb2)
+		spin_lock(&hb1->lock);
 
-	if (likely(valp != NULL)) {
-		int curval;
+	if (likely(cmpval != NULL)) {
+		u32 curval;
 
-		ret = get_futex_value_locked(&curval, (int __user *)uaddr1);
+		ret = get_futex_value_locked(&curval, uaddr1);
 
 		if (unlikely(ret)) {
-			spin_unlock(&bh1->lock);
-			if (bh1 != bh2)
-				spin_unlock(&bh2->lock);
+			spin_unlock(&hb1->lock);
+			if (hb1 != hb2)
+				spin_unlock(&hb2->lock);
 
-			/* If we would have faulted, release mmap_sem, fault
+			/*
+			 * If we would have faulted, release mmap_sem, fault
 			 * it in and start all over again.
 			 */
 			up_read(&current->mm->mmap_sem);
 
-			ret = get_user(curval, (int __user *)uaddr1);
+			ret = get_user(curval, uaddr1);
 
 			if (!ret)
 				goto retry;
 
 			return ret;
 		}
-		if (curval != *valp) {
+		if (curval != *cmpval) {
 			ret = -EAGAIN;
 			goto out_unlock;
 		}
 	}
 
-	head1 = &bh1->chain;
+	head1 = &hb1->chain;
 	list_for_each_entry_safe(this, next, head1, list) {
 		if (!match_futex (&this->key, &key1))
 			continue;
 		if (++ret <= nr_wake) {
 			wake_futex(this);
 		} else {
-			list_move_tail(&this->list, &bh2->chain);
-			this->lock_ptr = &bh2->lock;
+			list_move_tail(&this->list, &hb2->chain);
+			this->lock_ptr = &hb2->lock;
 			this->key = key2;
 			get_key_refs(&key2);
 			drop_count++;
 
 			if (ret - nr_wake >= nr_requeue)
 				break;
-			/* Make sure to stop if key1 == key2 */
-			if (head1 == &bh2->chain && head1 != &next->list)
+			/* Make sure to stop if key1 == key2: */
+			if (head1 == &hb2->chain && head1 != &next->list)
 				head1 = &this->list;
 		}
 	}
 
 out_unlock:
-	spin_unlock(&bh1->lock);
-	if (bh1 != bh2)
-		spin_unlock(&bh2->lock);
+	spin_unlock(&hb1->lock);
+	if (hb1 != hb2)
+		spin_unlock(&hb2->lock);
 
 	/* drop_key_refs() must be called outside the spinlocks. */
 	while (--drop_count >= 0)
@@ -543,7 +554,7 @@ out:
 static inline struct futex_hash_bucket *
 queue_lock(struct futex_q *q, int fd, struct file *filp)
 {
-	struct futex_hash_bucket *bh;
+	struct futex_hash_bucket *hb;
 
 	q->fd = fd;
 	q->filp = filp;
@@ -551,23 +562,23 @@ queue_lock(struct futex_q *q, int fd, struct file *filp)
 	init_waitqueue_head(&q->waiters);
 
 	get_key_refs(&q->key);
-	bh = hash_futex(&q->key);
-	q->lock_ptr = &bh->lock;
+	hb = hash_futex(&q->key);
+	q->lock_ptr = &hb->lock;
 
-	spin_lock(&bh->lock);
-	return bh;
+	spin_lock(&hb->lock);
+	return hb;
 }
 
-static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *bh)
+static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
 {
-	list_add_tail(&q->list, &bh->chain);
-	spin_unlock(&bh->lock);
+	list_add_tail(&q->list, &hb->chain);
+	spin_unlock(&hb->lock);
 }
 
 static inline void
-queue_unlock(struct futex_q *q, struct futex_hash_bucket *bh)
+queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb)
 {
-	spin_unlock(&bh->lock);
+	spin_unlock(&hb->lock);
 	drop_key_refs(&q->key);
 }
 
@@ -579,16 +590,17 @@ queue_unlock(struct futex_q *q, struct futex_hash_bucket *bh)
 /* The key must be already stored in q->key. */
 static void queue_me(struct futex_q *q, int fd, struct file *filp)
 {
-	struct futex_hash_bucket *bh;
-	bh = queue_lock(q, fd, filp);
-	__queue_me(q, bh);
+	struct futex_hash_bucket *hb;
+
+	hb = queue_lock(q, fd, filp);
+	__queue_me(q, hb);
 }
 
 /* Return 1 if we were still queued (ie. 0 means we were woken) */
 static int unqueue_me(struct futex_q *q)
 {
-	int ret = 0;
 	spinlock_t *lock_ptr;
+	int ret = 0;
 
 	/* In the common case we don't take the spinlock, which is nice. */
  retry:
@@ -622,12 +634,13 @@ static int unqueue_me(struct futex_q *q)
 	return ret;
 }
 
-static int futex_wait(unsigned long uaddr, int val, unsigned long time)
+static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time)
 {
 	DECLARE_WAITQUEUE(wait, current);
-	int ret, curval;
+	struct futex_hash_bucket *hb;
 	struct futex_q q;
-	struct futex_hash_bucket *bh;
+	u32 uval;
+	int ret;
 
  retry:
 	down_read(&current->mm->mmap_sem);
@@ -636,7 +649,7 @@ static int futex_wait(unsigned long uaddr, int val, unsigned long time)
 	if (unlikely(ret != 0))
 		goto out_release_sem;
 
-	bh = queue_lock(&q, -1, NULL);
+	hb = queue_lock(&q, -1, NULL);
 
 	/*
 	 * Access the page AFTER the futex is queued.
@@ -658,31 +671,31 @@ static int futex_wait(unsigned long uaddr, int val, unsigned long time)
 	 * We hold the mmap semaphore, so the mapping cannot have changed
 	 * since we looked it up in get_futex_key.
 	 */
-
-	ret = get_futex_value_locked(&curval, (int __user *)uaddr);
+	ret = get_futex_value_locked(&uval, uaddr);
 
 	if (unlikely(ret)) {
-		queue_unlock(&q, bh);
+		queue_unlock(&q, hb);
 
-		/* If we would have faulted, release mmap_sem, fault it in and
+		/*
+		 * If we would have faulted, release mmap_sem, fault it in and
 		 * start all over again.
 		 */
 		up_read(&current->mm->mmap_sem);
 
-		ret = get_user(curval, (int __user *)uaddr);
+		ret = get_user(uval, uaddr);
 
 		if (!ret)
 			goto retry;
 		return ret;
 	}
-	if (curval != val) {
+	if (uval != val) {
 		ret = -EWOULDBLOCK;
-		queue_unlock(&q, bh);
+		queue_unlock(&q, hb);
 		goto out_release_sem;
 	}
 
 	/* Only actually queue if *uaddr contained val.  */
-	__queue_me(&q, bh);
+	__queue_me(&q, hb);
 
 	/*
 	 * Now the futex is queued and we have checked the data, we
@@ -720,8 +733,10 @@ static int futex_wait(unsigned long uaddr, int val, unsigned long time)
 		return 0;
 	if (time == 0)
 		return -ETIMEDOUT;
-	/* We expect signal_pending(current), but another thread may
-	 * have handled it for us already. */
+	/*
+	 * We expect signal_pending(current), but another thread may
+	 * have handled it for us already.
+	 */
 	return -EINTR;
 
  out_release_sem:
@@ -735,6 +750,7 @@ static int futex_close(struct inode *inode, struct file *filp)
 
 	unqueue_me(q);
 	kfree(q);
+
 	return 0;
 }
 
@@ -766,7 +782,7 @@ static struct file_operations futex_fops = {
  * Signal allows caller to avoid the race which would occur if they
  * set the sigio stuff up afterwards.
  */
-static int futex_fd(unsigned long uaddr, int signal)
+static int futex_fd(u32 __user *uaddr, int signal)
 {
 	struct futex_q *q;
 	struct file *filp;
@@ -937,7 +953,7 @@ retry:
 			goto retry;
 
 		if (uval & FUTEX_WAITERS)
-			futex_wake((unsigned long)uaddr, 1);
+			futex_wake(uaddr, 1);
 	}
 	return 0;
 }
@@ -999,8 +1015,8 @@ void exit_robust_list(struct task_struct *curr)
 	}
 }
 
-long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout,
-		unsigned long uaddr2, int val2, int val3)
+long do_futex(u32 __user *uaddr, int op, u32 val, unsigned long timeout,
+		u32 __user *uaddr2, u32 val2, u32 val3)
 {
 	int ret;
 
@@ -1031,13 +1047,13 @@ long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout,
 }
 
 
-asmlinkage long sys_futex(u32 __user *uaddr, int op, int val,
+asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val,
 			  struct timespec __user *utime, u32 __user *uaddr2,
-			  int val3)
+			  u32 val3)
 {
 	struct timespec t;
 	unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
-	int val2 = 0;
+	u32 val2 = 0;
 
 	if (utime && (op == FUTEX_WAIT)) {
 		if (copy_from_user(&t, utime, sizeof(t)) != 0)
@@ -1050,10 +1066,9 @@ asmlinkage long sys_futex(u32 __user *uaddr, int op, int val,
 	 * requeue parameter in 'utime' if op == FUTEX_REQUEUE.
 	 */
 	if (op >= FUTEX_REQUEUE)
-		val2 = (int) (unsigned long) utime;
+		val2 = (u32) (unsigned long) utime;
 
-	return do_futex((unsigned long)uaddr, op, val, timeout,
-			(unsigned long)uaddr2, val2, val3);
+	return do_futex(uaddr, op, val, timeout, uaddr2, val2, val3);
 }
 
 static int futexfs_get_sb(struct file_system_type *fs_type,
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index 1ab6a0ea3d14..7e57c31670a3 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -139,6 +139,5 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val,
 	if (op >= FUTEX_REQUEUE)
 		val2 = (int) (unsigned long) utime;
 
-	return do_futex((unsigned long)uaddr, op, val, timeout,
-			(unsigned long)uaddr2, val2, val3);
+	return do_futex(uaddr, op, val, timeout, uaddr2, val2, val3);
 }
-- 
cgit v1.2.3


From f9b8404cf8f8456dfa83459510762b700dc00385 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 27 Jun 2006 02:54:49 -0700
Subject: [PATCH] pi-futex: introduce debug_check_no_locks_freed()

Add debug_check_no_locks_freed(), as a central inline to add
bad-lock-free-debugging functionality to.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/mm/pageattr.c |  4 ++--
 include/linux/mm.h      | 10 ++++++++--
 mm/page_alloc.c         |  4 ++--
 mm/slab.c               |  2 +-
 4 files changed, 13 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/arch/i386/mm/pageattr.c b/arch/i386/mm/pageattr.c
index 0887b34bc59b..353a836ed63c 100644
--- a/arch/i386/mm/pageattr.c
+++ b/arch/i386/mm/pageattr.c
@@ -229,8 +229,8 @@ void kernel_map_pages(struct page *page, int numpages, int enable)
 	if (PageHighMem(page))
 		return;
 	if (!enable)
-		mutex_debug_check_no_locks_freed(page_address(page),
-						 numpages * PAGE_SIZE);
+		debug_check_no_locks_freed(page_address(page),
+					   numpages * PAGE_SIZE);
 
 	/* the return value is ignored - the calls cannot fail,
 	 * large pages are disabled at boot time.
diff --git a/include/linux/mm.h b/include/linux/mm.h
index ff1fa87df8d0..0ac255720f34 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1030,13 +1030,19 @@ static inline void vm_stat_account(struct mm_struct *mm,
 }
 #endif /* CONFIG_PROC_FS */
 
+static inline void
+debug_check_no_locks_freed(const void *from, unsigned long len)
+{
+	mutex_debug_check_no_locks_freed(from, len);
+}
+
 #ifndef CONFIG_DEBUG_PAGEALLOC
 static inline void
 kernel_map_pages(struct page *page, int numpages, int enable)
 {
 	if (!PageHighMem(page) && !enable)
-		mutex_debug_check_no_locks_freed(page_address(page),
-						 numpages * PAGE_SIZE);
+		debug_check_no_locks_freed(page_address(page),
+					   numpages * PAGE_SIZE);
 }
 #endif
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index dafd12ec7a0c..084a2de7e52a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -446,8 +446,8 @@ static void __free_pages_ok(struct page *page, unsigned int order)
 
 	arch_free_page(page, order);
 	if (!PageHighMem(page))
-		mutex_debug_check_no_locks_freed(page_address(page),
-						 PAGE_SIZE<<order);
+		debug_check_no_locks_freed(page_address(page),
+					   PAGE_SIZE<<order);
 
 	for (i = 0 ; i < (1 << order) ; ++i)
 		reserved += free_pages_check(page + i);
diff --git a/mm/slab.c b/mm/slab.c
index d1d55279202e..f378d027c684 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3397,7 +3397,7 @@ void kfree(const void *objp)
 	local_irq_save(flags);
 	kfree_debugcheck(objp);
 	c = virt_to_cache(objp);
-	mutex_debug_check_no_locks_freed(objp, obj_size(c));
+	debug_check_no_locks_freed(objp, obj_size(c));
 	__cache_free(c, (void *)objp);
 	local_irq_restore(flags);
 }
-- 
cgit v1.2.3


From 77ba89c5cf28d5d98a3cae17f67a3e42b102cc25 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 27 Jun 2006 02:54:51 -0700
Subject: [PATCH] pi-futex: add plist implementation

Add the priority-sorted list (plist) implementation.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/plist.h | 247 ++++++++++++++++++++++++++++++++++++++++++++++++++
 lib/Kconfig           |   6 ++
 lib/Makefile          |   1 +
 lib/plist.c           | 118 ++++++++++++++++++++++++
 4 files changed, 372 insertions(+)
 create mode 100644 include/linux/plist.h
 create mode 100644 lib/plist.c

(limited to 'include/linux')

diff --git a/include/linux/plist.h b/include/linux/plist.h
new file mode 100644
index 000000000000..3404faef542c
--- /dev/null
+++ b/include/linux/plist.h
@@ -0,0 +1,247 @@
+/*
+ * Descending-priority-sorted double-linked list
+ *
+ * (C) 2002-2003 Intel Corp
+ * Inaky Perez-Gonzalez <inaky.perez-gonzalez@intel.com>.
+ *
+ * 2001-2005 (c) MontaVista Software, Inc.
+ * Daniel Walker <dwalker@mvista.com>
+ *
+ * (C) 2005 Thomas Gleixner <tglx@linutronix.de>
+ *
+ * Simplifications of the original code by
+ * Oleg Nesterov <oleg@tv-sign.ru>
+ *
+ * Licensed under the FSF's GNU Public License v2 or later.
+ *
+ * Based on simple lists (include/linux/list.h).
+ *
+ * This is a priority-sorted list of nodes; each node has a
+ * priority from INT_MIN (highest) to INT_MAX (lowest).
+ *
+ * Addition is O(K), removal is O(1), change of priority of a node is
+ * O(K) and K is the number of RT priority levels used in the system.
+ * (1 <= K <= 99)
+ *
+ * This list is really a list of lists:
+ *
+ *  - The tier 1 list is the prio_list, different priority nodes.
+ *
+ *  - The tier 2 list is the node_list, serialized nodes.
+ *
+ * Simple ASCII art explanation:
+ *
+ * |HEAD          |
+ * |              |
+ * |prio_list.prev|<------------------------------------|
+ * |prio_list.next|<->|pl|<->|pl|<--------------->|pl|<-|
+ * |10            |   |10|   |21|   |21|   |21|   |40|   (prio)
+ * |              |   |  |   |  |   |  |   |  |   |  |
+ * |              |   |  |   |  |   |  |   |  |   |  |
+ * |node_list.next|<->|nl|<->|nl|<->|nl|<->|nl|<->|nl|<-|
+ * |node_list.prev|<------------------------------------|
+ *
+ * The nodes on the prio_list list are sorted by priority to simplify
+ * the insertion of new nodes. There are no nodes with duplicate
+ * priorites on the list.
+ *
+ * The nodes on the node_list is ordered by priority and can contain
+ * entries which have the same priority. Those entries are ordered
+ * FIFO
+ *
+ * Addition means: look for the prio_list node in the prio_list
+ * for the priority of the node and insert it before the node_list
+ * entry of the next prio_list node. If it is the first node of
+ * that priority, add it to the prio_list in the right position and
+ * insert it into the serialized node_list list
+ *
+ * Removal means remove it from the node_list and remove it from
+ * the prio_list if the node_list list_head is non empty. In case
+ * of removal from the prio_list it must be checked whether other
+ * entries of the same priority are on the list or not. If there
+ * is another entry of the same priority then this entry has to
+ * replace the removed entry on the prio_list. If the entry which
+ * is removed is the only entry of this priority then a simple
+ * remove from both list is sufficient.
+ *
+ * INT_MIN is the highest priority, 0 is the medium highest, INT_MAX
+ * is lowest priority.
+ *
+ * No locking is done, up to the caller.
+ *
+ */
+#ifndef _LINUX_PLIST_H_
+#define _LINUX_PLIST_H_
+
+#include <linux/list.h>
+#include <linux/spinlock_types.h>
+
+struct plist_head {
+	struct list_head prio_list;
+	struct list_head node_list;
+#ifdef CONFIG_DEBUG_PI_LIST
+	spinlock_t *lock;
+#endif
+};
+
+struct plist_node {
+	int			prio;
+	struct plist_head	plist;
+};
+
+#ifdef CONFIG_DEBUG_PI_LIST
+# define PLIST_HEAD_LOCK_INIT(_lock)	.lock = _lock
+#else
+# define PLIST_HEAD_LOCK_INIT(_lock)
+#endif
+
+/**
+ * #PLIST_HEAD_INIT - static struct plist_head initializer
+ *
+ * @head:	struct plist_head variable name
+ */
+#define PLIST_HEAD_INIT(head, _lock)			\
+{							\
+	.prio_list = LIST_HEAD_INIT((head).prio_list),	\
+	.node_list = LIST_HEAD_INIT((head).node_list),	\
+	PLIST_HEAD_LOCK_INIT(&(_lock))			\
+}
+
+/**
+ * #PLIST_NODE_INIT - static struct plist_node initializer
+ *
+ * @node:	struct plist_node variable name
+ * @__prio:	initial node priority
+ */
+#define PLIST_NODE_INIT(node, __prio)			\
+{							\
+	.prio  = (__prio),				\
+	.plist = PLIST_HEAD_INIT((node).plist, NULL),	\
+}
+
+/**
+ * plist_head_init - dynamic struct plist_head initializer
+ *
+ * @head:	&struct plist_head pointer
+ */
+static inline void
+plist_head_init(struct plist_head *head, spinlock_t *lock)
+{
+	INIT_LIST_HEAD(&head->prio_list);
+	INIT_LIST_HEAD(&head->node_list);
+#ifdef CONFIG_DEBUG_PI_LIST
+	head->lock = lock;
+#endif
+}
+
+/**
+ * plist_node_init - Dynamic struct plist_node initializer
+ *
+ * @node:	&struct plist_node pointer
+ * @prio:	initial node priority
+ */
+static inline void plist_node_init(struct plist_node *node, int prio)
+{
+	node->prio = prio;
+	plist_head_init(&node->plist, NULL);
+}
+
+extern void plist_add(struct plist_node *node, struct plist_head *head);
+extern void plist_del(struct plist_node *node, struct plist_head *head);
+
+/**
+ * plist_for_each - iterate over the plist
+ *
+ * @pos1:	the type * to use as a loop counter.
+ * @head:	the head for your list.
+ */
+#define plist_for_each(pos, head)	\
+	 list_for_each_entry(pos, &(head)->node_list, plist.node_list)
+
+/**
+ * plist_for_each_entry_safe - iterate over a plist of given type safe
+ * against removal of list entry
+ *
+ * @pos1:	the type * to use as a loop counter.
+ * @n1:	another type * to use as temporary storage
+ * @head:	the head for your list.
+ */
+#define plist_for_each_safe(pos, n, head)	\
+	 list_for_each_entry_safe(pos, n, &(head)->node_list, plist.node_list)
+
+/**
+ * plist_for_each_entry	- iterate over list of given type
+ *
+ * @pos:	the type * to use as a loop counter.
+ * @head:	the head for your list.
+ * @member:	the name of the list_struct within the struct.
+ */
+#define plist_for_each_entry(pos, head, mem)	\
+	 list_for_each_entry(pos, &(head)->node_list, mem.plist.node_list)
+
+/**
+ * plist_for_each_entry_safe - iterate over list of given type safe against
+ * removal of list entry
+ *
+ * @pos:	the type * to use as a loop counter.
+ * @n:		another type * to use as temporary storage
+ * @head:	the head for your list.
+ * @m:		the name of the list_struct within the struct.
+ */
+#define plist_for_each_entry_safe(pos, n, head, m)	\
+	list_for_each_entry_safe(pos, n, &(head)->node_list, m.plist.node_list)
+
+/**
+ * plist_head_empty - return !0 if a plist_head is empty
+ *
+ * @head:	&struct plist_head pointer
+ */
+static inline int plist_head_empty(const struct plist_head *head)
+{
+	return list_empty(&head->node_list);
+}
+
+/**
+ * plist_node_empty - return !0 if plist_node is not on a list
+ *
+ * @node:	&struct plist_node pointer
+ */
+static inline int plist_node_empty(const struct plist_node *node)
+{
+	return plist_head_empty(&node->plist);
+}
+
+/* All functions below assume the plist_head is not empty. */
+
+/**
+ * plist_first_entry - get the struct for the first entry
+ *
+ * @ptr:	the &struct plist_head pointer.
+ * @type:	the type of the struct this is embedded in.
+ * @member:	the name of the list_struct within the struct.
+ */
+#ifdef CONFIG_DEBUG_PI_LIST
+# define plist_first_entry(head, type, member)	\
+({ \
+	WARN_ON(plist_head_empty(head)); \
+	container_of(plist_first(head), type, member); \
+})
+#else
+# define plist_first_entry(head, type, member)	\
+	container_of(plist_first(head), type, member)
+#endif
+
+/**
+ * plist_first - return the first node (and thus, highest priority)
+ *
+ * @head:	the &struct plist_head pointer
+ *
+ * Assumes the plist is _not_ empty.
+ */
+static inline struct plist_node* plist_first(const struct plist_head *head)
+{
+	return list_entry(head->node_list.next,
+			  struct plist_node, plist.node_list);
+}
+
+#endif
diff --git a/lib/Kconfig b/lib/Kconfig
index 3de93357f5ab..f6299342b882 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -86,4 +86,10 @@ config TEXTSEARCH_BM
 config TEXTSEARCH_FSM
 	tristate
 
+#
+# plist support is select#ed if needed
+#
+config PLIST
+	boolean
+
 endmenu
diff --git a/lib/Makefile b/lib/Makefile
index 79358ad1f113..10c13c9d7824 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -25,6 +25,7 @@ lib-$(CONFIG_SEMAPHORE_SLEEPERS) += semaphore-sleepers.o
 lib-$(CONFIG_GENERIC_FIND_NEXT_BIT) += find_next_bit.o
 lib-$(CONFIG_GENERIC_HWEIGHT) += hweight.o
 obj-$(CONFIG_LOCK_KERNEL) += kernel_lock.o
+obj-$(CONFIG_PLIST) += plist.o
 obj-$(CONFIG_DEBUG_PREEMPT) += smp_processor_id.o
 
 ifneq ($(CONFIG_HAVE_DEC_LOCK),y)
diff --git a/lib/plist.c b/lib/plist.c
new file mode 100644
index 000000000000..3074a02272f3
--- /dev/null
+++ b/lib/plist.c
@@ -0,0 +1,118 @@
+/*
+ * lib/plist.c
+ *
+ * Descending-priority-sorted double-linked list
+ *
+ * (C) 2002-2003 Intel Corp
+ * Inaky Perez-Gonzalez <inaky.perez-gonzalez@intel.com>.
+ *
+ * 2001-2005 (c) MontaVista Software, Inc.
+ * Daniel Walker <dwalker@mvista.com>
+ *
+ * (C) 2005 Thomas Gleixner <tglx@linutronix.de>
+ *
+ * Simplifications of the original code by
+ * Oleg Nesterov <oleg@tv-sign.ru>
+ *
+ * Licensed under the FSF's GNU Public License v2 or later.
+ *
+ * Based on simple lists (include/linux/list.h).
+ *
+ * This file contains the add / del functions which are considered to
+ * be too large to inline. See include/linux/plist.h for further
+ * information.
+ */
+
+#include <linux/plist.h>
+#include <linux/spinlock.h>
+
+#ifdef CONFIG_DEBUG_PI_LIST
+
+static void plist_check_prev_next(struct list_head *t, struct list_head *p,
+				  struct list_head *n)
+{
+	if (n->prev != p || p->next != n) {
+		printk("top: %p, n: %p, p: %p\n", t, t->next, t->prev);
+		printk("prev: %p, n: %p, p: %p\n", p, p->next, p->prev);
+		printk("next: %p, n: %p, p: %p\n", n, n->next, n->prev);
+		WARN_ON(1);
+	}
+}
+
+static void plist_check_list(struct list_head *top)
+{
+	struct list_head *prev = top, *next = top->next;
+
+	plist_check_prev_next(top, prev, next);
+	while (next != top) {
+		prev = next;
+		next = prev->next;
+		plist_check_prev_next(top, prev, next);
+	}
+}
+
+static void plist_check_head(struct plist_head *head)
+{
+	WARN_ON(!head->lock);
+	if (head->lock)
+		WARN_ON_SMP(!spin_is_locked(head->lock));
+	plist_check_list(&head->prio_list);
+	plist_check_list(&head->node_list);
+}
+
+#else
+# define plist_check_head(h)	do { } while (0)
+#endif
+
+/**
+ * plist_add - add @node to @head
+ *
+ * @node:	&struct plist_node pointer
+ * @head:	&struct plist_head pointer
+ */
+void plist_add(struct plist_node *node, struct plist_head *head)
+{
+	struct plist_node *iter;
+
+	plist_check_head(head);
+	WARN_ON(!plist_node_empty(node));
+
+	list_for_each_entry(iter, &head->prio_list, plist.prio_list) {
+		if (node->prio < iter->prio)
+			goto lt_prio;
+		else if (node->prio == iter->prio) {
+			iter = list_entry(iter->plist.prio_list.next,
+					struct plist_node, plist.prio_list);
+			goto eq_prio;
+		}
+	}
+
+lt_prio:
+	list_add_tail(&node->plist.prio_list, &iter->plist.prio_list);
+eq_prio:
+	list_add_tail(&node->plist.node_list, &iter->plist.node_list);
+
+	plist_check_head(head);
+}
+
+/**
+ * plist_del - Remove a @node from plist.
+ *
+ * @node:	&struct plist_node pointer - entry to be removed
+ * @head:	&struct plist_head pointer - list head
+ */
+void plist_del(struct plist_node *node, struct plist_head *head)
+{
+	plist_check_head(head);
+
+	if (!list_empty(&node->plist.prio_list)) {
+		struct plist_node *next = plist_first(&node->plist);
+
+		list_move_tail(&next->plist.prio_list, &node->plist.prio_list);
+		list_del_init(&node->plist.prio_list);
+	}
+
+	list_del_init(&node->plist.node_list);
+
+	plist_check_head(head);
+}
-- 
cgit v1.2.3


From b29739f902ee76a05493fb7d2303490fc75364f4 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 27 Jun 2006 02:54:51 -0700
Subject: [PATCH] pi-futex: scheduler support for pi

Add framework to boost/unboost the priority of RT tasks.

This consists of:

 - caching the 'normal' priority in ->normal_prio
 - providing a functions to set/get the priority of the task
 - make sched_setscheduler() aware of boosting

The effective_prio() cleanups also fix a priority-calculation bug pointed out
by Andrey Gelman, in set_user_nice().

has_rt_policy() fix: Peter Williams <pwil3058@bigpond.net.au>

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Cc: Andrey Gelman <agelman@012.net.il>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/init_task.h |   2 +
 include/linux/sched.h     |  21 +++++-
 kernel/sched.c            | 189 +++++++++++++++++++++++++++++++++++++++-------
 3 files changed, 181 insertions(+), 31 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index e127ef7e8da8..678c1a90380d 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -87,6 +87,7 @@ extern struct group_info init_groups;
 	.lock_depth	= -1,						\
 	.prio		= MAX_PRIO-20,					\
 	.static_prio	= MAX_PRIO-20,					\
+	.normal_prio	= MAX_PRIO-20,					\
 	.policy		= SCHED_NORMAL,					\
 	.cpus_allowed	= CPU_MASK_ALL,					\
 	.mm		= NULL,						\
@@ -122,6 +123,7 @@ extern struct group_info init_groups;
 	.journal_info	= NULL,						\
 	.cpu_timers	= INIT_CPU_TIMERS(tsk.cpu_timers),		\
 	.fs_excl	= ATOMIC_INIT(0),				\
+	.pi_lock	= SPIN_LOCK_UNLOCKED,				\
 }
 
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 0bc81a151e50..6f167645e7e2 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -495,8 +495,11 @@ struct signal_struct {
 
 #define MAX_PRIO		(MAX_RT_PRIO + 40)
 
-#define rt_task(p)		(unlikely((p)->prio < MAX_RT_PRIO))
+#define rt_prio(prio)		unlikely((prio) < MAX_RT_PRIO)
+#define rt_task(p)		rt_prio((p)->prio)
 #define batch_task(p)		(unlikely((p)->policy == SCHED_BATCH))
+#define has_rt_policy(p) \
+	unlikely((p)->policy != SCHED_NORMAL && (p)->policy != SCHED_BATCH)
 
 /*
  * Some day this will be a full-fledged user tracking system..
@@ -725,7 +728,7 @@ struct task_struct {
 #endif
 #endif
 	int load_weight;	/* for niceness load balancing purposes */
-	int prio, static_prio;
+	int prio, static_prio, normal_prio;
 	struct list_head run_list;
 	prio_array_t *array;
 
@@ -852,6 +855,9 @@ struct task_struct {
 /* Protection of (de-)allocation: mm, files, fs, tty, keyrings */
 	spinlock_t alloc_lock;
 
+	/* Protection of the PI data structures: */
+	spinlock_t pi_lock;
+
 #ifdef CONFIG_DEBUG_MUTEXES
 	/* mutex deadlock detection */
 	struct mutex_waiter *blocked_on;
@@ -1018,6 +1024,17 @@ static inline void idle_task_exit(void) {}
 #endif
 
 extern void sched_idle_next(void);
+
+#ifdef CONFIG_RT_MUTEXES
+extern int rt_mutex_getprio(task_t *p);
+extern void rt_mutex_setprio(task_t *p, int prio);
+#else
+static inline int rt_mutex_getprio(task_t *p)
+{
+	return p->normal_prio;
+}
+#endif
+
 extern void set_user_nice(task_t *p, long nice);
 extern int task_prio(const task_t *p);
 extern int task_nice(const task_t *p);
diff --git a/kernel/sched.c b/kernel/sched.c
index 15abf0833245..08431f07a999 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -354,6 +354,25 @@ static inline void finish_lock_switch(runqueue_t *rq, task_t *prev)
 }
 #endif /* __ARCH_WANT_UNLOCKED_CTXSW */
 
+/*
+ * __task_rq_lock - lock the runqueue a given task resides on.
+ * Must be called interrupts disabled.
+ */
+static inline runqueue_t *__task_rq_lock(task_t *p)
+	__acquires(rq->lock)
+{
+	struct runqueue *rq;
+
+repeat_lock_task:
+	rq = task_rq(p);
+	spin_lock(&rq->lock);
+	if (unlikely(rq != task_rq(p))) {
+		spin_unlock(&rq->lock);
+		goto repeat_lock_task;
+	}
+	return rq;
+}
+
 /*
  * task_rq_lock - lock the runqueue a given task resides on and disable
  * interrupts.  Note the ordering: we can safely lookup the task_rq without
@@ -375,6 +394,12 @@ repeat_lock_task:
 	return rq;
 }
 
+static inline void __task_rq_unlock(runqueue_t *rq)
+	__releases(rq->lock)
+{
+	spin_unlock(&rq->lock);
+}
+
 static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags)
 	__releases(rq->lock)
 {
@@ -638,7 +663,7 @@ static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array)
 }
 
 /*
- * effective_prio - return the priority that is based on the static
+ * __normal_prio - return the priority that is based on the static
  * priority but is modified by bonuses/penalties.
  *
  * We scale the actual sleep average [0 .... MAX_SLEEP_AVG]
@@ -651,13 +676,11 @@ static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array)
  *
  * Both properties are important to certain workloads.
  */
-static int effective_prio(task_t *p)
+
+static inline int __normal_prio(task_t *p)
 {
 	int bonus, prio;
 
-	if (rt_task(p))
-		return p->prio;
-
 	bonus = CURRENT_BONUS(p) - MAX_BONUS / 2;
 
 	prio = p->static_prio - bonus;
@@ -692,7 +715,7 @@ static int effective_prio(task_t *p)
 
 static void set_load_weight(task_t *p)
 {
-	if (rt_task(p)) {
+	if (has_rt_policy(p)) {
 #ifdef CONFIG_SMP
 		if (p == task_rq(p)->migration_thread)
 			/*
@@ -730,6 +753,44 @@ static inline void dec_nr_running(task_t *p, runqueue_t *rq)
 	dec_raw_weighted_load(rq, p);
 }
 
+/*
+ * Calculate the expected normal priority: i.e. priority
+ * without taking RT-inheritance into account. Might be
+ * boosted by interactivity modifiers. Changes upon fork,
+ * setprio syscalls, and whenever the interactivity
+ * estimator recalculates.
+ */
+static inline int normal_prio(task_t *p)
+{
+	int prio;
+
+	if (has_rt_policy(p))
+		prio = MAX_RT_PRIO-1 - p->rt_priority;
+	else
+		prio = __normal_prio(p);
+	return prio;
+}
+
+/*
+ * Calculate the current priority, i.e. the priority
+ * taken into account by the scheduler. This value might
+ * be boosted by RT tasks, or might be boosted by
+ * interactivity modifiers. Will be RT if the task got
+ * RT-boosted. If not then it returns p->normal_prio.
+ */
+static int effective_prio(task_t *p)
+{
+	p->normal_prio = normal_prio(p);
+	/*
+	 * If we are RT tasks or we were boosted to RT priority,
+	 * keep the priority unchanged. Otherwise, update priority
+	 * to the normal priority:
+	 */
+	if (!rt_prio(p->prio))
+		return p->normal_prio;
+	return p->prio;
+}
+
 /*
  * __activate_task - move a task to the runqueue.
  */
@@ -752,6 +813,10 @@ static inline void __activate_idle_task(task_t *p, runqueue_t *rq)
 	inc_nr_running(p, rq);
 }
 
+/*
+ * Recalculate p->normal_prio and p->prio after having slept,
+ * updating the sleep-average too:
+ */
 static int recalc_task_prio(task_t *p, unsigned long long now)
 {
 	/* Caller must always ensure 'now >= p->timestamp' */
@@ -1448,6 +1513,12 @@ void fastcall sched_fork(task_t *p, int clone_flags)
 	 * event cannot wake it up and insert it on the runqueue either.
 	 */
 	p->state = TASK_RUNNING;
+
+	/*
+	 * Make sure we do not leak PI boosting priority to the child:
+	 */
+	p->prio = current->normal_prio;
+
 	INIT_LIST_HEAD(&p->run_list);
 	p->array = NULL;
 #ifdef CONFIG_SCHEDSTATS
@@ -1527,6 +1598,7 @@ void fastcall wake_up_new_task(task_t *p, unsigned long clone_flags)
 				__activate_task(p, rq);
 			else {
 				p->prio = current->prio;
+				p->normal_prio = current->normal_prio;
 				list_add_tail(&p->run_list, &current->run_list);
 				p->array = current->array;
 				p->array->nr_active++;
@@ -3668,12 +3740,65 @@ long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
 
 EXPORT_SYMBOL(sleep_on_timeout);
 
+#ifdef CONFIG_RT_MUTEXES
+
+/*
+ * rt_mutex_setprio - set the current priority of a task
+ * @p: task
+ * @prio: prio value (kernel-internal form)
+ *
+ * This function changes the 'effective' priority of a task. It does
+ * not touch ->normal_prio like __setscheduler().
+ *
+ * Used by the rt_mutex code to implement priority inheritance logic.
+ */
+void rt_mutex_setprio(task_t *p, int prio)
+{
+	unsigned long flags;
+	prio_array_t *array;
+	runqueue_t *rq;
+	int oldprio;
+
+	BUG_ON(prio < 0 || prio > MAX_PRIO);
+
+	rq = task_rq_lock(p, &flags);
+
+	oldprio = p->prio;
+	array = p->array;
+	if (array)
+		dequeue_task(p, array);
+	p->prio = prio;
+
+	if (array) {
+		/*
+		 * If changing to an RT priority then queue it
+		 * in the active array!
+		 */
+		if (rt_task(p))
+			array = rq->active;
+		enqueue_task(p, array);
+		/*
+		 * Reschedule if we are currently running on this runqueue and
+		 * our priority decreased, or if we are not currently running on
+		 * this runqueue and our priority is higher than the current's
+		 */
+		if (task_running(rq, p)) {
+			if (p->prio > oldprio)
+				resched_task(rq->curr);
+		} else if (TASK_PREEMPTS_CURR(p, rq))
+			resched_task(rq->curr);
+	}
+	task_rq_unlock(rq, &flags);
+}
+
+#endif
+
 void set_user_nice(task_t *p, long nice)
 {
 	unsigned long flags;
 	prio_array_t *array;
 	runqueue_t *rq;
-	int old_prio, new_prio, delta;
+	int old_prio, delta;
 
 	if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
 		return;
@@ -3688,7 +3813,7 @@ void set_user_nice(task_t *p, long nice)
 	 * it wont have any effect on scheduling until the task is
 	 * not SCHED_NORMAL/SCHED_BATCH:
 	 */
-	if (rt_task(p)) {
+	if (has_rt_policy(p)) {
 		p->static_prio = NICE_TO_PRIO(nice);
 		goto out_unlock;
 	}
@@ -3698,12 +3823,11 @@ void set_user_nice(task_t *p, long nice)
 		dec_raw_weighted_load(rq, p);
 	}
 
-	old_prio = p->prio;
-	new_prio = NICE_TO_PRIO(nice);
-	delta = new_prio - old_prio;
 	p->static_prio = NICE_TO_PRIO(nice);
 	set_load_weight(p);
-	p->prio += delta;
+	old_prio = p->prio;
+	p->prio = effective_prio(p);
+	delta = p->prio - old_prio;
 
 	if (array) {
 		enqueue_task(p, array);
@@ -3718,7 +3842,6 @@ void set_user_nice(task_t *p, long nice)
 out_unlock:
 	task_rq_unlock(rq, &flags);
 }
-
 EXPORT_SYMBOL(set_user_nice);
 
 /*
@@ -3833,16 +3956,14 @@ static void __setscheduler(struct task_struct *p, int policy, int prio)
 	BUG_ON(p->array);
 	p->policy = policy;
 	p->rt_priority = prio;
-	if (policy != SCHED_NORMAL && policy != SCHED_BATCH) {
-		p->prio = MAX_RT_PRIO-1 - p->rt_priority;
-	} else {
-		p->prio = p->static_prio;
-		/*
-		 * SCHED_BATCH tasks are treated as perpetual CPU hogs:
-		 */
-		if (policy == SCHED_BATCH)
-			p->sleep_avg = 0;
-	}
+	p->normal_prio = normal_prio(p);
+	/* we are holding p->pi_lock already */
+	p->prio = rt_mutex_getprio(p);
+	/*
+	 * SCHED_BATCH tasks are treated as perpetual CPU hogs:
+	 */
+	if (policy == SCHED_BATCH)
+		p->sleep_avg = 0;
 	set_load_weight(p);
 }
 
@@ -3911,15 +4032,21 @@ recheck:
 	retval = security_task_setscheduler(p, policy, param);
 	if (retval)
 		return retval;
+	/*
+	 * make sure no PI-waiters arrive (or leave) while we are
+	 * changing the priority of the task:
+	 */
+	spin_lock_irqsave(&p->pi_lock, flags);
 	/*
 	 * To be able to change p->policy safely, the apropriate
 	 * runqueue lock must be held.
 	 */
-	rq = task_rq_lock(p, &flags);
+	rq = __task_rq_lock(p);
 	/* recheck policy now with rq lock held */
 	if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
 		policy = oldpolicy = -1;
-		task_rq_unlock(rq, &flags);
+		__task_rq_unlock(rq);
+		spin_unlock_irqrestore(&p->pi_lock, flags);
 		goto recheck;
 	}
 	array = p->array;
@@ -3940,7 +4067,9 @@ recheck:
 		} else if (TASK_PREEMPTS_CURR(p, rq))
 			resched_task(rq->curr);
 	}
-	task_rq_unlock(rq, &flags);
+	__task_rq_unlock(rq);
+	spin_unlock_irqrestore(&p->pi_lock, flags);
+
 	return 0;
 }
 EXPORT_SYMBOL_GPL(sched_setscheduler);
@@ -4575,7 +4704,7 @@ void __devinit init_idle(task_t *idle, int cpu)
 	idle->timestamp = sched_clock();
 	idle->sleep_avg = 0;
 	idle->array = NULL;
-	idle->prio = MAX_PRIO;
+	idle->prio = idle->normal_prio = MAX_PRIO;
 	idle->state = TASK_RUNNING;
 	idle->cpus_allowed = cpumask_of_cpu(cpu);
 	set_task_cpu(idle, cpu);
@@ -6582,7 +6711,8 @@ void normalize_rt_tasks(void)
 		if (!rt_task(p))
 			continue;
 
-		rq = task_rq_lock(p, &flags);
+		spin_lock_irqsave(&p->pi_lock, flags);
+		rq = __task_rq_lock(p);
 
 		array = p->array;
 		if (array)
@@ -6593,7 +6723,8 @@ void normalize_rt_tasks(void)
 			resched_task(rq->curr);
 		}
 
-		task_rq_unlock(rq, &flags);
+		__task_rq_unlock(rq);
+		spin_unlock_irqrestore(&p->pi_lock, flags);
 	}
 	read_unlock_irq(&tasklist_lock);
 }
-- 
cgit v1.2.3


From 23f78d4a03c53cbd75d87a795378ea540aa08c86 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 27 Jun 2006 02:54:53 -0700
Subject: [PATCH] pi-futex: rt mutex core

Core functions for the rt-mutex subsystem.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/init_task.h |   1 +
 include/linux/rtmutex.h   | 104 ++++++
 include/linux/sched.h     |  12 +
 include/linux/sysctl.h    |   1 +
 init/Kconfig              |   5 +
 kernel/Makefile           |   1 +
 kernel/fork.c             |  16 +
 kernel/rtmutex.c          | 904 ++++++++++++++++++++++++++++++++++++++++++++++
 kernel/rtmutex.h          |  29 ++
 kernel/rtmutex_common.h   |  93 +++++
 kernel/sysctl.c           |  15 +
 11 files changed, 1181 insertions(+)
 create mode 100644 include/linux/rtmutex.h
 create mode 100644 kernel/rtmutex.c
 create mode 100644 kernel/rtmutex.h
 create mode 100644 kernel/rtmutex_common.h

(limited to 'include/linux')

diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 678c1a90380d..3a256957fb56 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -124,6 +124,7 @@ extern struct group_info init_groups;
 	.cpu_timers	= INIT_CPU_TIMERS(tsk.cpu_timers),		\
 	.fs_excl	= ATOMIC_INIT(0),				\
 	.pi_lock	= SPIN_LOCK_UNLOCKED,				\
+	INIT_RT_MUTEXES(tsk)						\
 }
 
 
diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h
new file mode 100644
index 000000000000..12309c916c68
--- /dev/null
+++ b/include/linux/rtmutex.h
@@ -0,0 +1,104 @@
+/*
+ * RT Mutexes: blocking mutual exclusion locks with PI support
+ *
+ * started by Ingo Molnar and Thomas Gleixner:
+ *
+ *  Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ *  Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
+ *
+ * This file contains the public data structure and API definitions.
+ */
+
+#ifndef __LINUX_RT_MUTEX_H
+#define __LINUX_RT_MUTEX_H
+
+#include <linux/linkage.h>
+#include <linux/plist.h>
+#include <linux/spinlock_types.h>
+
+/*
+ * The rt_mutex structure
+ *
+ * @wait_lock:	spinlock to protect the structure
+ * @wait_list:	pilist head to enqueue waiters in priority order
+ * @owner:	the mutex owner
+ */
+struct rt_mutex {
+	spinlock_t		wait_lock;
+	struct plist_head	wait_list;
+	struct task_struct	*owner;
+#ifdef CONFIG_DEBUG_RT_MUTEXES
+	int			save_state;
+	struct list_head	held_list_entry;
+	unsigned long		acquire_ip;
+	const char 		*name, *file;
+	int			line;
+	void			*magic;
+#endif
+};
+
+struct rt_mutex_waiter;
+struct hrtimer_sleeper;
+
+#ifdef CONFIG_DEBUG_RT_MUTEXES
+# define __DEBUG_RT_MUTEX_INITIALIZER(mutexname) \
+	, .name = #mutexname, .file = __FILE__, .line = __LINE__
+# define rt_mutex_init(mutex)			__rt_mutex_init(mutex, __FUNCTION__)
+ extern void rt_mutex_debug_task_free(struct task_struct *tsk);
+#else
+# define __DEBUG_RT_MUTEX_INITIALIZER(mutexname)
+# define rt_mutex_init(mutex)			__rt_mutex_init(mutex, NULL)
+# define rt_mutex_debug_task_free(t)		do { } while (0)
+#endif
+
+#define __RT_MUTEX_INITIALIZER(mutexname) \
+	{ .wait_lock = SPIN_LOCK_UNLOCKED \
+	, .wait_list = PLIST_HEAD_INIT(mutexname.wait_list, mutexname.wait_lock) \
+	, .owner = NULL \
+	__DEBUG_RT_MUTEX_INITIALIZER(mutexname)}
+
+#define DEFINE_RT_MUTEX(mutexname) \
+	struct rt_mutex mutexname = __RT_MUTEX_INITIALIZER(mutexname)
+
+/***
+ * rt_mutex_is_locked - is the mutex locked
+ * @lock: the mutex to be queried
+ *
+ * Returns 1 if the mutex is locked, 0 if unlocked.
+ */
+static inline int rt_mutex_is_locked(struct rt_mutex *lock)
+{
+	return lock->owner != NULL;
+}
+
+extern void __rt_mutex_init(struct rt_mutex *lock, const char *name);
+extern void rt_mutex_destroy(struct rt_mutex *lock);
+
+extern void rt_mutex_lock(struct rt_mutex *lock);
+extern int rt_mutex_lock_interruptible(struct rt_mutex *lock,
+						int detect_deadlock);
+extern int rt_mutex_timed_lock(struct rt_mutex *lock,
+					struct hrtimer_sleeper *timeout,
+					int detect_deadlock);
+
+extern int rt_mutex_trylock(struct rt_mutex *lock);
+
+extern void rt_mutex_unlock(struct rt_mutex *lock);
+
+#ifdef CONFIG_DEBUG_RT_MUTEXES
+# define INIT_RT_MUTEX_DEBUG(tsk)					\
+	.held_list_head	= LIST_HEAD_INIT(tsk.held_list_head),		\
+	.held_list_lock	= SPIN_LOCK_UNLOCKED
+#else
+# define INIT_RT_MUTEX_DEBUG(tsk)
+#endif
+
+#ifdef CONFIG_RT_MUTEXES
+# define INIT_RT_MUTEXES(tsk)						\
+	.pi_waiters	= PLIST_HEAD_INIT(tsk.pi_waiters, tsk.pi_lock),	\
+	INIT_RT_MUTEX_DEBUG(tsk)
+#else
+# define INIT_RT_MUTEXES(tsk)
+#endif
+
+#endif
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6f167645e7e2..6ea23c9af413 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -73,6 +73,7 @@ struct sched_param {
 #include <linux/seccomp.h>
 #include <linux/rcupdate.h>
 #include <linux/futex.h>
+#include <linux/rtmutex.h>
 
 #include <linux/time.h>
 #include <linux/param.h>
@@ -858,6 +859,17 @@ struct task_struct {
 	/* Protection of the PI data structures: */
 	spinlock_t pi_lock;
 
+#ifdef CONFIG_RT_MUTEXES
+	/* PI waiters blocked on a rt_mutex held by this task */
+	struct plist_head pi_waiters;
+	/* Deadlock detection and priority inheritance handling */
+	struct rt_mutex_waiter *pi_blocked_on;
+# ifdef CONFIG_DEBUG_RT_MUTEXES
+	spinlock_t held_list_lock;
+	struct list_head held_list_head;
+# endif
+#endif
+
 #ifdef CONFIG_DEBUG_MUTEXES
 	/* mutex deadlock detection */
 	struct mutex_waiter *blocked_on;
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index bee12a7a0576..46e4d8f2771f 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -149,6 +149,7 @@ enum
 	KERN_ACPI_VIDEO_FLAGS=71, /* int: flags for setting up video after ACPI sleep */
 	KERN_IA64_UNALIGNED=72, /* int: ia64 unaligned userland trap enable */
 	KERN_COMPAT_LOG=73,	/* int: print compat layer  messages */
+	KERN_MAX_LOCK_DEPTH=74,
 };
 
 
diff --git a/init/Kconfig b/init/Kconfig
index df55b3665601..f70f2fd273c2 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -339,9 +339,14 @@ config BASE_FULL
 	  kernel data structures. This saves memory on small machines,
 	  but may reduce performance.
 
+config RT_MUTEXES
+	boolean
+	select PLIST
+
 config FUTEX
 	bool "Enable futex support" if EMBEDDED
 	default y
+	select RT_MUTEXES
 	help
 	  Disabling this option will cause the kernel to be built without
 	  support for "fast userspace mutexes".  The resulting kernel may not
diff --git a/kernel/Makefile b/kernel/Makefile
index 752bd7d383af..21df9a338ff0 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -16,6 +16,7 @@ obj-$(CONFIG_FUTEX) += futex.o
 ifeq ($(CONFIG_COMPAT),y)
 obj-$(CONFIG_FUTEX) += futex_compat.o
 endif
+obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
 obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
 obj-$(CONFIG_SMP) += cpu.o spinlock.o
 obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
diff --git a/kernel/fork.c b/kernel/fork.c
index 9b4e54ef0225..b664a081fffa 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -104,6 +104,7 @@ static kmem_cache_t *mm_cachep;
 void free_task(struct task_struct *tsk)
 {
 	free_thread_info(tsk->thread_info);
+	rt_mutex_debug_task_free(tsk);
 	free_task_struct(tsk);
 }
 EXPORT_SYMBOL(free_task);
@@ -913,6 +914,19 @@ asmlinkage long sys_set_tid_address(int __user *tidptr)
 	return current->pid;
 }
 
+static inline void rt_mutex_init_task(struct task_struct *p)
+{
+#ifdef CONFIG_RT_MUTEXES
+	spin_lock_init(&p->pi_lock);
+	plist_head_init(&p->pi_waiters, &p->pi_lock);
+	p->pi_blocked_on = NULL;
+# ifdef CONFIG_DEBUG_RT_MUTEXES
+	spin_lock_init(&p->held_list_lock);
+	INIT_LIST_HEAD(&p->held_list_head);
+# endif
+#endif
+}
+
 /*
  * This creates a new process as a copy of the old one,
  * but does not actually start it yet.
@@ -1034,6 +1048,8 @@ static task_t *copy_process(unsigned long clone_flags,
 	mpol_fix_fork_child_flag(p);
 #endif
 
+	rt_mutex_init_task(p);
+
 #ifdef CONFIG_DEBUG_MUTEXES
 	p->blocked_on = NULL; /* not blocked yet */
 #endif
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
new file mode 100644
index 000000000000..937a474fae94
--- /dev/null
+++ b/kernel/rtmutex.c
@@ -0,0 +1,904 @@
+/*
+ * RT-Mutexes: simple blocking mutual exclusion locks with PI support
+ *
+ * started by Ingo Molnar and Thomas Gleixner.
+ *
+ *  Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ *  Copyright (C) 2005-2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
+ *  Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt
+ *  Copyright (C) 2006 Esben Nielsen
+ */
+#include <linux/spinlock.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/timer.h>
+
+#include "rtmutex_common.h"
+
+#ifdef CONFIG_DEBUG_RT_MUTEXES
+# include "rtmutex-debug.h"
+#else
+# include "rtmutex.h"
+#endif
+
+/*
+ * lock->owner state tracking:
+ *
+ * lock->owner holds the task_struct pointer of the owner. Bit 0 and 1
+ * are used to keep track of the "owner is pending" and "lock has
+ * waiters" state.
+ *
+ * owner	bit1	bit0
+ * NULL		0	0	lock is free (fast acquire possible)
+ * NULL		0	1	invalid state
+ * NULL		1	0	Transitional State*
+ * NULL		1	1	invalid state
+ * taskpointer	0	0	lock is held (fast release possible)
+ * taskpointer	0	1	task is pending owner
+ * taskpointer	1	0	lock is held and has waiters
+ * taskpointer	1	1	task is pending owner and lock has more waiters
+ *
+ * Pending ownership is assigned to the top (highest priority)
+ * waiter of the lock, when the lock is released. The thread is woken
+ * up and can now take the lock. Until the lock is taken (bit 0
+ * cleared) a competing higher priority thread can steal the lock
+ * which puts the woken up thread back on the waiters list.
+ *
+ * The fast atomic compare exchange based acquire and release is only
+ * possible when bit 0 and 1 of lock->owner are 0.
+ *
+ * (*) There's a small time where the owner can be NULL and the
+ * "lock has waiters" bit is set.  This can happen when grabbing the lock.
+ * To prevent a cmpxchg of the owner releasing the lock, we need to set this
+ * bit before looking at the lock, hence the reason this is a transitional
+ * state.
+ */
+
+static void
+rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner,
+		   unsigned long mask)
+{
+	unsigned long val = (unsigned long)owner | mask;
+
+	if (rt_mutex_has_waiters(lock))
+		val |= RT_MUTEX_HAS_WAITERS;
+
+	lock->owner = (struct task_struct *)val;
+}
+
+static inline void clear_rt_mutex_waiters(struct rt_mutex *lock)
+{
+	lock->owner = (struct task_struct *)
+			((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS);
+}
+
+static void fixup_rt_mutex_waiters(struct rt_mutex *lock)
+{
+	if (!rt_mutex_has_waiters(lock))
+		clear_rt_mutex_waiters(lock);
+}
+
+/*
+ * We can speed up the acquire/release, if the architecture
+ * supports cmpxchg and if there's no debugging state to be set up
+ */
+#if defined(__HAVE_ARCH_CMPXCHG) && !defined(CONFIG_DEBUG_RT_MUTEXES)
+# define rt_mutex_cmpxchg(l,c,n)	(cmpxchg(&l->owner, c, n) == c)
+static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
+{
+	unsigned long owner, *p = (unsigned long *) &lock->owner;
+
+	do {
+		owner = *p;
+	} while (cmpxchg(p, owner, owner | RT_MUTEX_HAS_WAITERS) != owner);
+}
+#else
+# define rt_mutex_cmpxchg(l,c,n)	(0)
+static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
+{
+	lock->owner = (struct task_struct *)
+			((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS);
+}
+#endif
+
+/*
+ * Calculate task priority from the waiter list priority
+ *
+ * Return task->normal_prio when the waiter list is empty or when
+ * the waiter is not allowed to do priority boosting
+ */
+int rt_mutex_getprio(struct task_struct *task)
+{
+	if (likely(!task_has_pi_waiters(task)))
+		return task->normal_prio;
+
+	return min(task_top_pi_waiter(task)->pi_list_entry.prio,
+		   task->normal_prio);
+}
+
+/*
+ * Adjust the priority of a task, after its pi_waiters got modified.
+ *
+ * This can be both boosting and unboosting. task->pi_lock must be held.
+ */
+static void __rt_mutex_adjust_prio(struct task_struct *task)
+{
+	int prio = rt_mutex_getprio(task);
+
+	if (task->prio != prio)
+		rt_mutex_setprio(task, prio);
+}
+
+/*
+ * Adjust task priority (undo boosting). Called from the exit path of
+ * rt_mutex_slowunlock() and rt_mutex_slowlock().
+ *
+ * (Note: We do this outside of the protection of lock->wait_lock to
+ * allow the lock to be taken while or before we readjust the priority
+ * of task. We do not use the spin_xx_mutex() variants here as we are
+ * outside of the debug path.)
+ */
+static void rt_mutex_adjust_prio(struct task_struct *task)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&task->pi_lock, flags);
+	__rt_mutex_adjust_prio(task);
+	spin_unlock_irqrestore(&task->pi_lock, flags);
+}
+
+/*
+ * Max number of times we'll walk the boosting chain:
+ */
+int max_lock_depth = 1024;
+
+/*
+ * Adjust the priority chain. Also used for deadlock detection.
+ * Decreases task's usage by one - may thus free the task.
+ * Returns 0 or -EDEADLK.
+ */
+static int rt_mutex_adjust_prio_chain(task_t *task,
+				      int deadlock_detect,
+				      struct rt_mutex *orig_lock,
+				      struct rt_mutex_waiter *orig_waiter
+				      __IP_DECL__)
+{
+	struct rt_mutex *lock;
+	struct rt_mutex_waiter *waiter, *top_waiter = orig_waiter;
+	int detect_deadlock, ret = 0, depth = 0;
+	unsigned long flags;
+
+	detect_deadlock = debug_rt_mutex_detect_deadlock(orig_waiter,
+							 deadlock_detect);
+
+	/*
+	 * The (de)boosting is a step by step approach with a lot of
+	 * pitfalls. We want this to be preemptible and we want hold a
+	 * maximum of two locks per step. So we have to check
+	 * carefully whether things change under us.
+	 */
+ again:
+	if (++depth > max_lock_depth) {
+		static int prev_max;
+
+		/*
+		 * Print this only once. If the admin changes the limit,
+		 * print a new message when reaching the limit again.
+		 */
+		if (prev_max != max_lock_depth) {
+			prev_max = max_lock_depth;
+			printk(KERN_WARNING "Maximum lock depth %d reached "
+			       "task: %s (%d)\n", max_lock_depth,
+			       current->comm, current->pid);
+		}
+		put_task_struct(task);
+
+		return deadlock_detect ? -EDEADLK : 0;
+	}
+ retry:
+	/*
+	 * Task can not go away as we did a get_task() before !
+	 */
+	spin_lock_irqsave(&task->pi_lock, flags);
+
+	waiter = task->pi_blocked_on;
+	/*
+	 * Check whether the end of the boosting chain has been
+	 * reached or the state of the chain has changed while we
+	 * dropped the locks.
+	 */
+	if (!waiter || !waiter->task)
+		goto out_unlock_pi;
+
+	if (top_waiter && (!task_has_pi_waiters(task) ||
+			   top_waiter != task_top_pi_waiter(task)))
+		goto out_unlock_pi;
+
+	/*
+	 * When deadlock detection is off then we check, if further
+	 * priority adjustment is necessary.
+	 */
+	if (!detect_deadlock && waiter->list_entry.prio == task->prio)
+		goto out_unlock_pi;
+
+	lock = waiter->lock;
+	if (!spin_trylock(&lock->wait_lock)) {
+		spin_unlock_irqrestore(&task->pi_lock, flags);
+		cpu_relax();
+		goto retry;
+	}
+
+	/* Deadlock detection */
+	if (lock == orig_lock || rt_mutex_owner(lock) == current) {
+		debug_rt_mutex_deadlock(deadlock_detect, orig_waiter, lock);
+		spin_unlock(&lock->wait_lock);
+		ret = deadlock_detect ? -EDEADLK : 0;
+		goto out_unlock_pi;
+	}
+
+	top_waiter = rt_mutex_top_waiter(lock);
+
+	/* Requeue the waiter */
+	plist_del(&waiter->list_entry, &lock->wait_list);
+	waiter->list_entry.prio = task->prio;
+	plist_add(&waiter->list_entry, &lock->wait_list);
+
+	/* Release the task */
+	spin_unlock_irqrestore(&task->pi_lock, flags);
+	put_task_struct(task);
+
+	/* Grab the next task */
+	task = rt_mutex_owner(lock);
+	spin_lock_irqsave(&task->pi_lock, flags);
+
+	if (waiter == rt_mutex_top_waiter(lock)) {
+		/* Boost the owner */
+		plist_del(&top_waiter->pi_list_entry, &task->pi_waiters);
+		waiter->pi_list_entry.prio = waiter->list_entry.prio;
+		plist_add(&waiter->pi_list_entry, &task->pi_waiters);
+		__rt_mutex_adjust_prio(task);
+
+	} else if (top_waiter == waiter) {
+		/* Deboost the owner */
+		plist_del(&waiter->pi_list_entry, &task->pi_waiters);
+		waiter = rt_mutex_top_waiter(lock);
+		waiter->pi_list_entry.prio = waiter->list_entry.prio;
+		plist_add(&waiter->pi_list_entry, &task->pi_waiters);
+		__rt_mutex_adjust_prio(task);
+	}
+
+	get_task_struct(task);
+	spin_unlock_irqrestore(&task->pi_lock, flags);
+
+	top_waiter = rt_mutex_top_waiter(lock);
+	spin_unlock(&lock->wait_lock);
+
+	if (!detect_deadlock && waiter != top_waiter)
+		goto out_put_task;
+
+	goto again;
+
+ out_unlock_pi:
+	spin_unlock_irqrestore(&task->pi_lock, flags);
+ out_put_task:
+	put_task_struct(task);
+	return ret;
+}
+
+/*
+ * Optimization: check if we can steal the lock from the
+ * assigned pending owner [which might not have taken the
+ * lock yet]:
+ */
+static inline int try_to_steal_lock(struct rt_mutex *lock)
+{
+	struct task_struct *pendowner = rt_mutex_owner(lock);
+	struct rt_mutex_waiter *next;
+	unsigned long flags;
+
+	if (!rt_mutex_owner_pending(lock))
+		return 0;
+
+	if (pendowner == current)
+		return 1;
+
+	spin_lock_irqsave(&pendowner->pi_lock, flags);
+	if (current->prio >= pendowner->prio) {
+		spin_unlock_irqrestore(&pendowner->pi_lock, flags);
+		return 0;
+	}
+
+	/*
+	 * Check if a waiter is enqueued on the pending owners
+	 * pi_waiters list. Remove it and readjust pending owners
+	 * priority.
+	 */
+	if (likely(!rt_mutex_has_waiters(lock))) {
+		spin_unlock_irqrestore(&pendowner->pi_lock, flags);
+		return 1;
+	}
+
+	/* No chain handling, pending owner is not blocked on anything: */
+	next = rt_mutex_top_waiter(lock);
+	plist_del(&next->pi_list_entry, &pendowner->pi_waiters);
+	__rt_mutex_adjust_prio(pendowner);
+	spin_unlock_irqrestore(&pendowner->pi_lock, flags);
+
+	/*
+	 * We are going to steal the lock and a waiter was
+	 * enqueued on the pending owners pi_waiters queue. So
+	 * we have to enqueue this waiter into
+	 * current->pi_waiters list. This covers the case,
+	 * where current is boosted because it holds another
+	 * lock and gets unboosted because the booster is
+	 * interrupted, so we would delay a waiter with higher
+	 * priority as current->normal_prio.
+	 *
+	 * Note: in the rare case of a SCHED_OTHER task changing
+	 * its priority and thus stealing the lock, next->task
+	 * might be current:
+	 */
+	if (likely(next->task != current)) {
+		spin_lock_irqsave(&current->pi_lock, flags);
+		plist_add(&next->pi_list_entry, &current->pi_waiters);
+		__rt_mutex_adjust_prio(current);
+		spin_unlock_irqrestore(&current->pi_lock, flags);
+	}
+	return 1;
+}
+
+/*
+ * Try to take an rt-mutex
+ *
+ * This fails
+ * - when the lock has a real owner
+ * - when a different pending owner exists and has higher priority than current
+ *
+ * Must be called with lock->wait_lock held.
+ */
+static int try_to_take_rt_mutex(struct rt_mutex *lock __IP_DECL__)
+{
+	/*
+	 * We have to be careful here if the atomic speedups are
+	 * enabled, such that, when
+	 *  - no other waiter is on the lock
+	 *  - the lock has been released since we did the cmpxchg
+	 * the lock can be released or taken while we are doing the
+	 * checks and marking the lock with RT_MUTEX_HAS_WAITERS.
+	 *
+	 * The atomic acquire/release aware variant of
+	 * mark_rt_mutex_waiters uses a cmpxchg loop. After setting
+	 * the WAITERS bit, the atomic release / acquire can not
+	 * happen anymore and lock->wait_lock protects us from the
+	 * non-atomic case.
+	 *
+	 * Note, that this might set lock->owner =
+	 * RT_MUTEX_HAS_WAITERS in the case the lock is not contended
+	 * any more. This is fixed up when we take the ownership.
+	 * This is the transitional state explained at the top of this file.
+	 */
+	mark_rt_mutex_waiters(lock);
+
+	if (rt_mutex_owner(lock) && !try_to_steal_lock(lock))
+		return 0;
+
+	/* We got the lock. */
+	debug_rt_mutex_lock(lock __IP__);
+
+	rt_mutex_set_owner(lock, current, 0);
+
+	rt_mutex_deadlock_account_lock(lock, current);
+
+	return 1;
+}
+
+/*
+ * Task blocks on lock.
+ *
+ * Prepare waiter and propagate pi chain
+ *
+ * This must be called with lock->wait_lock held.
+ */
+static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
+				   struct rt_mutex_waiter *waiter,
+				   int detect_deadlock
+				   __IP_DECL__)
+{
+	struct rt_mutex_waiter *top_waiter = waiter;
+	task_t *owner = rt_mutex_owner(lock);
+	int boost = 0, res;
+	unsigned long flags;
+
+	spin_lock_irqsave(&current->pi_lock, flags);
+	__rt_mutex_adjust_prio(current);
+	waiter->task = current;
+	waiter->lock = lock;
+	plist_node_init(&waiter->list_entry, current->prio);
+	plist_node_init(&waiter->pi_list_entry, current->prio);
+
+	/* Get the top priority waiter on the lock */
+	if (rt_mutex_has_waiters(lock))
+		top_waiter = rt_mutex_top_waiter(lock);
+	plist_add(&waiter->list_entry, &lock->wait_list);
+
+	current->pi_blocked_on = waiter;
+
+	spin_unlock_irqrestore(&current->pi_lock, flags);
+
+	if (waiter == rt_mutex_top_waiter(lock)) {
+		spin_lock_irqsave(&owner->pi_lock, flags);
+		plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters);
+		plist_add(&waiter->pi_list_entry, &owner->pi_waiters);
+
+		__rt_mutex_adjust_prio(owner);
+		if (owner->pi_blocked_on) {
+			boost = 1;
+			get_task_struct(owner);
+		}
+		spin_unlock_irqrestore(&owner->pi_lock, flags);
+	}
+	else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock)) {
+		spin_lock_irqsave(&owner->pi_lock, flags);
+		if (owner->pi_blocked_on) {
+			boost = 1;
+			get_task_struct(owner);
+		}
+		spin_unlock_irqrestore(&owner->pi_lock, flags);
+	}
+	if (!boost)
+		return 0;
+
+	spin_unlock(&lock->wait_lock);
+
+	res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock,
+					 waiter __IP__);
+
+	spin_lock(&lock->wait_lock);
+
+	return res;
+}
+
+/*
+ * Wake up the next waiter on the lock.
+ *
+ * Remove the top waiter from the current tasks waiter list and from
+ * the lock waiter list. Set it as pending owner. Then wake it up.
+ *
+ * Called with lock->wait_lock held.
+ */
+static void wakeup_next_waiter(struct rt_mutex *lock)
+{
+	struct rt_mutex_waiter *waiter;
+	struct task_struct *pendowner;
+	unsigned long flags;
+
+	spin_lock_irqsave(&current->pi_lock, flags);
+
+	waiter = rt_mutex_top_waiter(lock);
+	plist_del(&waiter->list_entry, &lock->wait_list);
+
+	/*
+	 * Remove it from current->pi_waiters. We do not adjust a
+	 * possible priority boost right now. We execute wakeup in the
+	 * boosted mode and go back to normal after releasing
+	 * lock->wait_lock.
+	 */
+	plist_del(&waiter->pi_list_entry, &current->pi_waiters);
+	pendowner = waiter->task;
+	waiter->task = NULL;
+
+	rt_mutex_set_owner(lock, pendowner, RT_MUTEX_OWNER_PENDING);
+
+	spin_unlock_irqrestore(&current->pi_lock, flags);
+
+	/*
+	 * Clear the pi_blocked_on variable and enqueue a possible
+	 * waiter into the pi_waiters list of the pending owner. This
+	 * prevents that in case the pending owner gets unboosted a
+	 * waiter with higher priority than pending-owner->normal_prio
+	 * is blocked on the unboosted (pending) owner.
+	 */
+	spin_lock_irqsave(&pendowner->pi_lock, flags);
+
+	WARN_ON(!pendowner->pi_blocked_on);
+	WARN_ON(pendowner->pi_blocked_on != waiter);
+	WARN_ON(pendowner->pi_blocked_on->lock != lock);
+
+	pendowner->pi_blocked_on = NULL;
+
+	if (rt_mutex_has_waiters(lock)) {
+		struct rt_mutex_waiter *next;
+
+		next = rt_mutex_top_waiter(lock);
+		plist_add(&next->pi_list_entry, &pendowner->pi_waiters);
+	}
+	spin_unlock_irqrestore(&pendowner->pi_lock, flags);
+
+	wake_up_process(pendowner);
+}
+
+/*
+ * Remove a waiter from a lock
+ *
+ * Must be called with lock->wait_lock held
+ */
+static void remove_waiter(struct rt_mutex *lock,
+			  struct rt_mutex_waiter *waiter  __IP_DECL__)
+{
+	int first = (waiter == rt_mutex_top_waiter(lock));
+	int boost = 0;
+	task_t *owner = rt_mutex_owner(lock);
+	unsigned long flags;
+
+	spin_lock_irqsave(&current->pi_lock, flags);
+	plist_del(&waiter->list_entry, &lock->wait_list);
+	waiter->task = NULL;
+	current->pi_blocked_on = NULL;
+	spin_unlock_irqrestore(&current->pi_lock, flags);
+
+	if (first && owner != current) {
+
+		spin_lock_irqsave(&owner->pi_lock, flags);
+
+		plist_del(&waiter->pi_list_entry, &owner->pi_waiters);
+
+		if (rt_mutex_has_waiters(lock)) {
+			struct rt_mutex_waiter *next;
+
+			next = rt_mutex_top_waiter(lock);
+			plist_add(&next->pi_list_entry, &owner->pi_waiters);
+		}
+		__rt_mutex_adjust_prio(owner);
+
+		if (owner->pi_blocked_on) {
+			boost = 1;
+			get_task_struct(owner);
+		}
+		spin_unlock_irqrestore(&owner->pi_lock, flags);
+	}
+
+	WARN_ON(!plist_node_empty(&waiter->pi_list_entry));
+
+	if (!boost)
+		return;
+
+	spin_unlock(&lock->wait_lock);
+
+	rt_mutex_adjust_prio_chain(owner, 0, lock, NULL __IP__);
+
+	spin_lock(&lock->wait_lock);
+}
+
+/*
+ * Slow path lock function:
+ */
+static int __sched
+rt_mutex_slowlock(struct rt_mutex *lock, int state,
+		  struct hrtimer_sleeper *timeout,
+		  int detect_deadlock __IP_DECL__)
+{
+	struct rt_mutex_waiter waiter;
+	int ret = 0;
+
+	debug_rt_mutex_init_waiter(&waiter);
+	waiter.task = NULL;
+
+	spin_lock(&lock->wait_lock);
+
+	/* Try to acquire the lock again: */
+	if (try_to_take_rt_mutex(lock __IP__)) {
+		spin_unlock(&lock->wait_lock);
+		return 0;
+	}
+
+	set_current_state(state);
+
+	/* Setup the timer, when timeout != NULL */
+	if (unlikely(timeout))
+		hrtimer_start(&timeout->timer, timeout->timer.expires,
+			      HRTIMER_ABS);
+
+	for (;;) {
+		/* Try to acquire the lock: */
+		if (try_to_take_rt_mutex(lock __IP__))
+			break;
+
+		/*
+		 * TASK_INTERRUPTIBLE checks for signals and
+		 * timeout. Ignored otherwise.
+		 */
+		if (unlikely(state == TASK_INTERRUPTIBLE)) {
+			/* Signal pending? */
+			if (signal_pending(current))
+				ret = -EINTR;
+			if (timeout && !timeout->task)
+				ret = -ETIMEDOUT;
+			if (ret)
+				break;
+		}
+
+		/*
+		 * waiter.task is NULL the first time we come here and
+		 * when we have been woken up by the previous owner
+		 * but the lock got stolen by a higher prio task.
+		 */
+		if (!waiter.task) {
+			ret = task_blocks_on_rt_mutex(lock, &waiter,
+						      detect_deadlock __IP__);
+			/*
+			 * If we got woken up by the owner then start loop
+			 * all over without going into schedule to try
+			 * to get the lock now:
+			 */
+			if (unlikely(!waiter.task))
+				continue;
+
+			if (unlikely(ret))
+				break;
+		}
+		spin_unlock(&lock->wait_lock);
+
+		debug_rt_mutex_print_deadlock(&waiter);
+
+		schedule();
+
+		spin_lock(&lock->wait_lock);
+		set_current_state(state);
+	}
+
+	set_current_state(TASK_RUNNING);
+
+	if (unlikely(waiter.task))
+		remove_waiter(lock, &waiter __IP__);
+
+	/*
+	 * try_to_take_rt_mutex() sets the waiter bit
+	 * unconditionally. We might have to fix that up.
+	 */
+	fixup_rt_mutex_waiters(lock);
+
+	spin_unlock(&lock->wait_lock);
+
+	/* Remove pending timer: */
+	if (unlikely(timeout))
+		hrtimer_cancel(&timeout->timer);
+
+	/*
+	 * Readjust priority, when we did not get the lock. We might
+	 * have been the pending owner and boosted. Since we did not
+	 * take the lock, the PI boost has to go.
+	 */
+	if (unlikely(ret))
+		rt_mutex_adjust_prio(current);
+
+	debug_rt_mutex_free_waiter(&waiter);
+
+	return ret;
+}
+
+/*
+ * Slow path try-lock function:
+ */
+static inline int
+rt_mutex_slowtrylock(struct rt_mutex *lock __IP_DECL__)
+{
+	int ret = 0;
+
+	spin_lock(&lock->wait_lock);
+
+	if (likely(rt_mutex_owner(lock) != current)) {
+
+		ret = try_to_take_rt_mutex(lock __IP__);
+		/*
+		 * try_to_take_rt_mutex() sets the lock waiters
+		 * bit unconditionally. Clean this up.
+		 */
+		fixup_rt_mutex_waiters(lock);
+	}
+
+	spin_unlock(&lock->wait_lock);
+
+	return ret;
+}
+
+/*
+ * Slow path to release a rt-mutex:
+ */
+static void __sched
+rt_mutex_slowunlock(struct rt_mutex *lock)
+{
+	spin_lock(&lock->wait_lock);
+
+	debug_rt_mutex_unlock(lock);
+
+	rt_mutex_deadlock_account_unlock(current);
+
+	if (!rt_mutex_has_waiters(lock)) {
+		lock->owner = NULL;
+		spin_unlock(&lock->wait_lock);
+		return;
+	}
+
+	wakeup_next_waiter(lock);
+
+	spin_unlock(&lock->wait_lock);
+
+	/* Undo pi boosting if necessary: */
+	rt_mutex_adjust_prio(current);
+}
+
+/*
+ * debug aware fast / slowpath lock,trylock,unlock
+ *
+ * The atomic acquire/release ops are compiled away, when either the
+ * architecture does not support cmpxchg or when debugging is enabled.
+ */
+static inline int
+rt_mutex_fastlock(struct rt_mutex *lock, int state,
+		  int detect_deadlock,
+		  int (*slowfn)(struct rt_mutex *lock, int state,
+				struct hrtimer_sleeper *timeout,
+				int detect_deadlock __IP_DECL__))
+{
+	if (!detect_deadlock && likely(rt_mutex_cmpxchg(lock, NULL, current))) {
+		rt_mutex_deadlock_account_lock(lock, current);
+		return 0;
+	} else
+		return slowfn(lock, state, NULL, detect_deadlock __RET_IP__);
+}
+
+static inline int
+rt_mutex_timed_fastlock(struct rt_mutex *lock, int state,
+			struct hrtimer_sleeper *timeout, int detect_deadlock,
+			int (*slowfn)(struct rt_mutex *lock, int state,
+				      struct hrtimer_sleeper *timeout,
+				      int detect_deadlock __IP_DECL__))
+{
+	if (!detect_deadlock && likely(rt_mutex_cmpxchg(lock, NULL, current))) {
+		rt_mutex_deadlock_account_lock(lock, current);
+		return 0;
+	} else
+		return slowfn(lock, state, timeout, detect_deadlock __RET_IP__);
+}
+
+static inline int
+rt_mutex_fasttrylock(struct rt_mutex *lock,
+		     int (*slowfn)(struct rt_mutex *lock __IP_DECL__))
+{
+	if (likely(rt_mutex_cmpxchg(lock, NULL, current))) {
+		rt_mutex_deadlock_account_lock(lock, current);
+		return 1;
+	}
+	return slowfn(lock __RET_IP__);
+}
+
+static inline void
+rt_mutex_fastunlock(struct rt_mutex *lock,
+		    void (*slowfn)(struct rt_mutex *lock))
+{
+	if (likely(rt_mutex_cmpxchg(lock, current, NULL)))
+		rt_mutex_deadlock_account_unlock(current);
+	else
+		slowfn(lock);
+}
+
+/**
+ * rt_mutex_lock - lock a rt_mutex
+ *
+ * @lock: the rt_mutex to be locked
+ */
+void __sched rt_mutex_lock(struct rt_mutex *lock)
+{
+	might_sleep();
+
+	rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, 0, rt_mutex_slowlock);
+}
+EXPORT_SYMBOL_GPL(rt_mutex_lock);
+
+/**
+ * rt_mutex_lock_interruptible - lock a rt_mutex interruptible
+ *
+ * @lock: 		the rt_mutex to be locked
+ * @detect_deadlock:	deadlock detection on/off
+ *
+ * Returns:
+ *  0 		on success
+ * -EINTR 	when interrupted by a signal
+ * -EDEADLK	when the lock would deadlock (when deadlock detection is on)
+ */
+int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock,
+						 int detect_deadlock)
+{
+	might_sleep();
+
+	return rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE,
+				 detect_deadlock, rt_mutex_slowlock);
+}
+EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
+
+/**
+ * rt_mutex_lock_interruptible_ktime - lock a rt_mutex interruptible
+ *				       the timeout structure is provided
+ *				       by the caller
+ *
+ * @lock: 		the rt_mutex to be locked
+ * @timeout:		timeout structure or NULL (no timeout)
+ * @detect_deadlock:	deadlock detection on/off
+ *
+ * Returns:
+ *  0 		on success
+ * -EINTR 	when interrupted by a signal
+ * -ETIMEOUT	when the timeout expired
+ * -EDEADLK	when the lock would deadlock (when deadlock detection is on)
+ */
+int
+rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout,
+		    int detect_deadlock)
+{
+	might_sleep();
+
+	return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
+				       detect_deadlock, rt_mutex_slowlock);
+}
+EXPORT_SYMBOL_GPL(rt_mutex_timed_lock);
+
+/**
+ * rt_mutex_trylock - try to lock a rt_mutex
+ *
+ * @lock:	the rt_mutex to be locked
+ *
+ * Returns 1 on success and 0 on contention
+ */
+int __sched rt_mutex_trylock(struct rt_mutex *lock)
+{
+	return rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock);
+}
+EXPORT_SYMBOL_GPL(rt_mutex_trylock);
+
+/**
+ * rt_mutex_unlock - unlock a rt_mutex
+ *
+ * @lock: the rt_mutex to be unlocked
+ */
+void __sched rt_mutex_unlock(struct rt_mutex *lock)
+{
+	rt_mutex_fastunlock(lock, rt_mutex_slowunlock);
+}
+EXPORT_SYMBOL_GPL(rt_mutex_unlock);
+
+/***
+ * rt_mutex_destroy - mark a mutex unusable
+ * @lock: the mutex to be destroyed
+ *
+ * This function marks the mutex uninitialized, and any subsequent
+ * use of the mutex is forbidden. The mutex must not be locked when
+ * this function is called.
+ */
+void rt_mutex_destroy(struct rt_mutex *lock)
+{
+	WARN_ON(rt_mutex_is_locked(lock));
+#ifdef CONFIG_DEBUG_RT_MUTEXES
+	lock->magic = NULL;
+#endif
+}
+
+EXPORT_SYMBOL_GPL(rt_mutex_destroy);
+
+/**
+ * __rt_mutex_init - initialize the rt lock
+ *
+ * @lock: the rt lock to be initialized
+ *
+ * Initialize the rt lock to unlocked state.
+ *
+ * Initializing of a locked rt lock is not allowed
+ */
+void __rt_mutex_init(struct rt_mutex *lock, const char *name)
+{
+	lock->owner = NULL;
+	spin_lock_init(&lock->wait_lock);
+	plist_head_init(&lock->wait_list, &lock->wait_lock);
+
+	debug_rt_mutex_init(lock, name);
+}
+EXPORT_SYMBOL_GPL(__rt_mutex_init);
diff --git a/kernel/rtmutex.h b/kernel/rtmutex.h
new file mode 100644
index 000000000000..1e0fca13ff72
--- /dev/null
+++ b/kernel/rtmutex.h
@@ -0,0 +1,29 @@
+/*
+ * RT-Mutexes: blocking mutual exclusion locks with PI support
+ *
+ * started by Ingo Molnar and Thomas Gleixner:
+ *
+ *  Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ *  Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
+ *
+ * This file contains macros used solely by rtmutex.c.
+ * Non-debug version.
+ */
+
+#define __IP_DECL__
+#define __IP__
+#define __RET_IP__
+#define rt_mutex_deadlock_check(l)			(0)
+#define rt_mutex_deadlock_account_lock(m, t)		do { } while (0)
+#define rt_mutex_deadlock_account_unlock(l)		do { } while (0)
+#define debug_rt_mutex_init_waiter(w)			do { } while (0)
+#define debug_rt_mutex_free_waiter(w)			do { } while (0)
+#define debug_rt_mutex_lock(l)				do { } while (0)
+#define debug_rt_mutex_proxy_lock(l,p)			do { } while (0)
+#define debug_rt_mutex_proxy_unlock(l)			do { } while (0)
+#define debug_rt_mutex_unlock(l)			do { } while (0)
+#define debug_rt_mutex_init(m, n)			do { } while (0)
+#define debug_rt_mutex_deadlock(d, a ,l)		do { } while (0)
+#define debug_rt_mutex_print_deadlock(w)		do { } while (0)
+#define debug_rt_mutex_detect_deadlock(w,d)		(d)
+#define debug_rt_mutex_reset_waiter(w)			do { } while (0)
diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h
new file mode 100644
index 000000000000..50eed60eb085
--- /dev/null
+++ b/kernel/rtmutex_common.h
@@ -0,0 +1,93 @@
+/*
+ * RT Mutexes: blocking mutual exclusion locks with PI support
+ *
+ * started by Ingo Molnar and Thomas Gleixner:
+ *
+ *  Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ *  Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
+ *
+ * This file contains the private data structure and API definitions.
+ */
+
+#ifndef __KERNEL_RTMUTEX_COMMON_H
+#define __KERNEL_RTMUTEX_COMMON_H
+
+#include <linux/rtmutex.h>
+
+/*
+ * This is the control structure for tasks blocked on a rt_mutex,
+ * which is allocated on the kernel stack on of the blocked task.
+ *
+ * @list_entry:		pi node to enqueue into the mutex waiters list
+ * @pi_list_entry:	pi node to enqueue into the mutex owner waiters list
+ * @task:		task reference to the blocked task
+ */
+struct rt_mutex_waiter {
+	struct plist_node	list_entry;
+	struct plist_node	pi_list_entry;
+	struct task_struct	*task;
+	struct rt_mutex		*lock;
+#ifdef CONFIG_DEBUG_RT_MUTEXES
+	unsigned long		ip;
+	pid_t			deadlock_task_pid;
+	struct rt_mutex		*deadlock_lock;
+#endif
+};
+
+/*
+ * Various helpers to access the waiters-plist:
+ */
+static inline int rt_mutex_has_waiters(struct rt_mutex *lock)
+{
+	return !plist_head_empty(&lock->wait_list);
+}
+
+static inline struct rt_mutex_waiter *
+rt_mutex_top_waiter(struct rt_mutex *lock)
+{
+	struct rt_mutex_waiter *w;
+
+	w = plist_first_entry(&lock->wait_list, struct rt_mutex_waiter,
+			       list_entry);
+	BUG_ON(w->lock != lock);
+
+	return w;
+}
+
+static inline int task_has_pi_waiters(struct task_struct *p)
+{
+	return !plist_head_empty(&p->pi_waiters);
+}
+
+static inline struct rt_mutex_waiter *
+task_top_pi_waiter(struct task_struct *p)
+{
+	return plist_first_entry(&p->pi_waiters, struct rt_mutex_waiter,
+				  pi_list_entry);
+}
+
+/*
+ * lock->owner state tracking:
+ */
+#define RT_MUTEX_OWNER_PENDING	1UL
+#define RT_MUTEX_HAS_WAITERS	2UL
+#define RT_MUTEX_OWNER_MASKALL	3UL
+
+static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock)
+{
+	return (struct task_struct *)
+		((unsigned long)lock->owner & ~RT_MUTEX_OWNER_MASKALL);
+}
+
+static inline struct task_struct *rt_mutex_real_owner(struct rt_mutex *lock)
+{
+ 	return (struct task_struct *)
+		((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS);
+}
+
+static inline unsigned long rt_mutex_owner_pending(struct rt_mutex *lock)
+{
+	return (unsigned long)lock->owner & RT_MUTEX_OWNER_PENDING;
+}
+
+#endif
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index f54afed8426f..93a2c5398648 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -133,6 +133,10 @@ extern int acct_parm[];
 extern int no_unaligned_warning;
 #endif
 
+#ifdef CONFIG_RT_MUTEXES
+extern int max_lock_depth;
+#endif
+
 static int parse_table(int __user *, int, void __user *, size_t __user *, void __user *, size_t,
 		       ctl_table *, void **);
 static int proc_doutsstring(ctl_table *table, int write, struct file *filp,
@@ -688,6 +692,17 @@ static ctl_table kern_table[] = {
 		.proc_handler	= &proc_dointvec,
 	},
 #endif
+#ifdef CONFIG_RT_MUTEXES
+	{
+		.ctl_name	= KERN_MAX_LOCK_DEPTH,
+		.procname	= "max_lock_depth",
+		.data		= &max_lock_depth,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+#endif
+
 	{ .ctl_name = 0 }
 };
 
-- 
cgit v1.2.3


From e7eebaf6a81b956c989f184ee4b27277c88f8afe Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 27 Jun 2006 02:54:55 -0700
Subject: [PATCH] pi-futex: rt mutex debug

Runtime debugging functionality for rt-mutexes.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mm.h      |   1 +
 include/linux/rtmutex.h |  15 +-
 kernel/Makefile         |   1 +
 kernel/exit.c           |   1 +
 kernel/rtmutex-debug.c  | 513 ++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/rtmutex-debug.h  |  37 ++++
 lib/Kconfig.debug       |  13 ++
 mm/slab.c               |   1 +
 8 files changed, 581 insertions(+), 1 deletion(-)
 create mode 100644 kernel/rtmutex-debug.c
 create mode 100644 kernel/rtmutex-debug.h

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 0ac255720f34..c41a1299b8cf 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1034,6 +1034,7 @@ static inline void
 debug_check_no_locks_freed(const void *from, unsigned long len)
 {
 	mutex_debug_check_no_locks_freed(from, len);
+	rt_mutex_debug_check_no_locks_freed(from, len);
 }
 
 #ifndef CONFIG_DEBUG_PAGEALLOC
diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h
index 12309c916c68..fa4a3b82ba70 100644
--- a/include/linux/rtmutex.h
+++ b/include/linux/rtmutex.h
@@ -40,6 +40,19 @@ struct rt_mutex {
 struct rt_mutex_waiter;
 struct hrtimer_sleeper;
 
+#ifdef CONFIG_DEBUG_RT_MUTEXES
+ extern int rt_mutex_debug_check_no_locks_freed(const void *from,
+						unsigned long len);
+ extern void rt_mutex_debug_check_no_locks_held(struct task_struct *task);
+#else
+ static inline int rt_mutex_debug_check_no_locks_freed(const void *from,
+						       unsigned long len)
+ {
+	return 0;
+ }
+# define rt_mutex_debug_check_no_locks_held(task)	do { } while (0)
+#endif
+
 #ifdef CONFIG_DEBUG_RT_MUTEXES
 # define __DEBUG_RT_MUTEX_INITIALIZER(mutexname) \
 	, .name = #mutexname, .file = __FILE__, .line = __LINE__
@@ -48,7 +61,7 @@ struct hrtimer_sleeper;
 #else
 # define __DEBUG_RT_MUTEX_INITIALIZER(mutexname)
 # define rt_mutex_init(mutex)			__rt_mutex_init(mutex, NULL)
-# define rt_mutex_debug_task_free(t)		do { } while (0)
+# define rt_mutex_debug_task_free(t)			do { } while (0)
 #endif
 
 #define __RT_MUTEX_INITIALIZER(mutexname) \
diff --git a/kernel/Makefile b/kernel/Makefile
index 21df9a338ff0..f9c92d34cde5 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -17,6 +17,7 @@ ifeq ($(CONFIG_COMPAT),y)
 obj-$(CONFIG_FUTEX) += futex_compat.o
 endif
 obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
+obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
 obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
 obj-$(CONFIG_SMP) += cpu.o spinlock.o
 obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
diff --git a/kernel/exit.c b/kernel/exit.c
index 304ef637be6c..3e8a0282e9a5 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -929,6 +929,7 @@ fastcall NORET_TYPE void do_exit(long code)
 	 * If DEBUG_MUTEXES is on, make sure we are holding no locks:
 	 */
 	mutex_debug_check_no_locks_held(tsk);
+	rt_mutex_debug_check_no_locks_held(tsk);
 
 	if (tsk->io_context)
 		exit_io_context();
diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c
new file mode 100644
index 000000000000..4aa8a2c9f453
--- /dev/null
+++ b/kernel/rtmutex-debug.c
@@ -0,0 +1,513 @@
+/*
+ * RT-Mutexes: blocking mutual exclusion locks with PI support
+ *
+ * started by Ingo Molnar and Thomas Gleixner:
+ *
+ *  Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ *  Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
+ *
+ * This code is based on the rt.c implementation in the preempt-rt tree.
+ * Portions of said code are
+ *
+ *  Copyright (C) 2004  LynuxWorks, Inc., Igor Manyilov, Bill Huey
+ *  Copyright (C) 2006  Esben Nielsen
+ *  Copyright (C) 2006  Kihon Technologies Inc.,
+ *			Steven Rostedt <rostedt@goodmis.org>
+ *
+ * See rt.c in preempt-rt for proper credits and further information
+ */
+#include <linux/config.h>
+#include <linux/sched.h>
+#include <linux/delay.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/kallsyms.h>
+#include <linux/syscalls.h>
+#include <linux/interrupt.h>
+#include <linux/plist.h>
+#include <linux/fs.h>
+
+#include "rtmutex_common.h"
+
+#ifdef CONFIG_DEBUG_RT_MUTEXES
+# include "rtmutex-debug.h"
+#else
+# include "rtmutex.h"
+#endif
+
+# define TRACE_WARN_ON(x)			WARN_ON(x)
+# define TRACE_BUG_ON(x)			BUG_ON(x)
+
+# define TRACE_OFF()						\
+do {								\
+	if (rt_trace_on) {					\
+		rt_trace_on = 0;				\
+		console_verbose();				\
+		if (spin_is_locked(&current->pi_lock))		\
+			spin_unlock(&current->pi_lock);		\
+		if (spin_is_locked(&current->held_list_lock))	\
+			spin_unlock(&current->held_list_lock);	\
+	}							\
+} while (0)
+
+# define TRACE_OFF_NOLOCK()					\
+do {								\
+	if (rt_trace_on) {					\
+		rt_trace_on = 0;				\
+		console_verbose();				\
+	}							\
+} while (0)
+
+# define TRACE_BUG_LOCKED()			\
+do {						\
+	TRACE_OFF();				\
+	BUG();					\
+} while (0)
+
+# define TRACE_WARN_ON_LOCKED(c)		\
+do {						\
+	if (unlikely(c)) {			\
+		TRACE_OFF();			\
+		WARN_ON(1);			\
+	}					\
+} while (0)
+
+# define TRACE_BUG_ON_LOCKED(c)			\
+do {						\
+	if (unlikely(c))			\
+		TRACE_BUG_LOCKED();		\
+} while (0)
+
+#ifdef CONFIG_SMP
+# define SMP_TRACE_BUG_ON_LOCKED(c)	TRACE_BUG_ON_LOCKED(c)
+#else
+# define SMP_TRACE_BUG_ON_LOCKED(c)	do { } while (0)
+#endif
+
+/*
+ * deadlock detection flag. We turn it off when we detect
+ * the first problem because we dont want to recurse back
+ * into the tracing code when doing error printk or
+ * executing a BUG():
+ */
+int rt_trace_on = 1;
+
+void deadlock_trace_off(void)
+{
+	rt_trace_on = 0;
+}
+
+static void printk_task(task_t *p)
+{
+	if (p)
+		printk("%16s:%5d [%p, %3d]", p->comm, p->pid, p, p->prio);
+	else
+		printk("<none>");
+}
+
+static void printk_task_short(task_t *p)
+{
+	if (p)
+		printk("%s/%d [%p, %3d]", p->comm, p->pid, p, p->prio);
+	else
+		printk("<none>");
+}
+
+static void printk_lock(struct rt_mutex *lock, int print_owner)
+{
+	if (lock->name)
+		printk(" [%p] {%s}\n",
+			lock, lock->name);
+	else
+		printk(" [%p] {%s:%d}\n",
+			lock, lock->file, lock->line);
+
+	if (print_owner && rt_mutex_owner(lock)) {
+		printk(".. ->owner: %p\n", lock->owner);
+		printk(".. held by:  ");
+		printk_task(rt_mutex_owner(lock));
+		printk("\n");
+	}
+	if (rt_mutex_owner(lock)) {
+		printk("... acquired at:               ");
+		print_symbol("%s\n", lock->acquire_ip);
+	}
+}
+
+static void printk_waiter(struct rt_mutex_waiter *w)
+{
+	printk("-------------------------\n");
+	printk("| waiter struct %p:\n", w);
+	printk("| w->list_entry: [DP:%p/%p|SP:%p/%p|PRI:%d]\n",
+	       w->list_entry.plist.prio_list.prev, w->list_entry.plist.prio_list.next,
+	       w->list_entry.plist.node_list.prev, w->list_entry.plist.node_list.next,
+	       w->list_entry.prio);
+	printk("| w->pi_list_entry: [DP:%p/%p|SP:%p/%p|PRI:%d]\n",
+	       w->pi_list_entry.plist.prio_list.prev, w->pi_list_entry.plist.prio_list.next,
+	       w->pi_list_entry.plist.node_list.prev, w->pi_list_entry.plist.node_list.next,
+	       w->pi_list_entry.prio);
+	printk("\n| lock:\n");
+	printk_lock(w->lock, 1);
+	printk("| w->ti->task:\n");
+	printk_task(w->task);
+	printk("| blocked at:  ");
+	print_symbol("%s\n", w->ip);
+	printk("-------------------------\n");
+}
+
+static void show_task_locks(task_t *p)
+{
+	switch (p->state) {
+	case TASK_RUNNING:		printk("R"); break;
+	case TASK_INTERRUPTIBLE:	printk("S"); break;
+	case TASK_UNINTERRUPTIBLE:	printk("D"); break;
+	case TASK_STOPPED:		printk("T"); break;
+	case EXIT_ZOMBIE:		printk("Z"); break;
+	case EXIT_DEAD:			printk("X"); break;
+	default:			printk("?"); break;
+	}
+	printk_task(p);
+	if (p->pi_blocked_on) {
+		struct rt_mutex *lock = p->pi_blocked_on->lock;
+
+		printk(" blocked on:");
+		printk_lock(lock, 1);
+	} else
+		printk(" (not blocked)\n");
+}
+
+void rt_mutex_show_held_locks(task_t *task, int verbose)
+{
+	struct list_head *curr, *cursor = NULL;
+	struct rt_mutex *lock;
+	task_t *t;
+	unsigned long flags;
+	int count = 0;
+
+	if (!rt_trace_on)
+		return;
+
+	if (verbose) {
+		printk("------------------------------\n");
+		printk("| showing all locks held by: |  (");
+		printk_task_short(task);
+		printk("):\n");
+		printk("------------------------------\n");
+	}
+
+next:
+	spin_lock_irqsave(&task->held_list_lock, flags);
+	list_for_each(curr, &task->held_list_head) {
+		if (cursor && curr != cursor)
+			continue;
+		lock = list_entry(curr, struct rt_mutex, held_list_entry);
+		t = rt_mutex_owner(lock);
+		WARN_ON(t != task);
+		count++;
+		cursor = curr->next;
+		spin_unlock_irqrestore(&task->held_list_lock, flags);
+
+		printk("\n#%03d:            ", count);
+		printk_lock(lock, 0);
+		goto next;
+	}
+	spin_unlock_irqrestore(&task->held_list_lock, flags);
+
+	printk("\n");
+}
+
+void rt_mutex_show_all_locks(void)
+{
+	task_t *g, *p;
+	int count = 10;
+	int unlock = 1;
+
+	printk("\n");
+	printk("----------------------\n");
+	printk("| showing all tasks: |\n");
+	printk("----------------------\n");
+
+	/*
+	 * Here we try to get the tasklist_lock as hard as possible,
+	 * if not successful after 2 seconds we ignore it (but keep
+	 * trying). This is to enable a debug printout even if a
+	 * tasklist_lock-holding task deadlocks or crashes.
+	 */
+retry:
+	if (!read_trylock(&tasklist_lock)) {
+		if (count == 10)
+			printk("hm, tasklist_lock locked, retrying... ");
+		if (count) {
+			count--;
+			printk(" #%d", 10-count);
+			mdelay(200);
+			goto retry;
+		}
+		printk(" ignoring it.\n");
+		unlock = 0;
+	}
+	if (count != 10)
+		printk(" locked it.\n");
+
+	do_each_thread(g, p) {
+		show_task_locks(p);
+		if (!unlock)
+			if (read_trylock(&tasklist_lock))
+				unlock = 1;
+	} while_each_thread(g, p);
+
+	printk("\n");
+
+	printk("-----------------------------------------\n");
+	printk("| showing all locks held in the system: |\n");
+	printk("-----------------------------------------\n");
+
+	do_each_thread(g, p) {
+		rt_mutex_show_held_locks(p, 0);
+		if (!unlock)
+			if (read_trylock(&tasklist_lock))
+				unlock = 1;
+	} while_each_thread(g, p);
+
+
+	printk("=============================================\n\n");
+
+	if (unlock)
+		read_unlock(&tasklist_lock);
+}
+
+void rt_mutex_debug_check_no_locks_held(task_t *task)
+{
+	struct rt_mutex_waiter *w;
+	struct list_head *curr;
+	struct rt_mutex *lock;
+
+	if (!rt_trace_on)
+		return;
+	if (!rt_prio(task->normal_prio) && rt_prio(task->prio)) {
+		printk("BUG: PI priority boost leaked!\n");
+		printk_task(task);
+		printk("\n");
+	}
+	if (list_empty(&task->held_list_head))
+		return;
+
+	spin_lock(&task->pi_lock);
+	plist_for_each_entry(w, &task->pi_waiters, pi_list_entry) {
+		TRACE_OFF();
+
+		printk("hm, PI interest held at exit time? Task:\n");
+		printk_task(task);
+		printk_waiter(w);
+		return;
+	}
+	spin_unlock(&task->pi_lock);
+
+	list_for_each(curr, &task->held_list_head) {
+		lock = list_entry(curr, struct rt_mutex, held_list_entry);
+
+		printk("BUG: %s/%d, lock held at task exit time!\n",
+		       task->comm, task->pid);
+		printk_lock(lock, 1);
+		if (rt_mutex_owner(lock) != task)
+			printk("exiting task is not even the owner??\n");
+	}
+}
+
+int rt_mutex_debug_check_no_locks_freed(const void *from, unsigned long len)
+{
+	const void *to = from + len;
+	struct list_head *curr;
+	struct rt_mutex *lock;
+	unsigned long flags;
+	void *lock_addr;
+
+	if (!rt_trace_on)
+		return 0;
+
+	spin_lock_irqsave(&current->held_list_lock, flags);
+	list_for_each(curr, &current->held_list_head) {
+		lock = list_entry(curr, struct rt_mutex, held_list_entry);
+		lock_addr = lock;
+		if (lock_addr < from || lock_addr >= to)
+			continue;
+		TRACE_OFF();
+
+		printk("BUG: %s/%d, active lock [%p(%p-%p)] freed!\n",
+			current->comm, current->pid, lock, from, to);
+		dump_stack();
+		printk_lock(lock, 1);
+		if (rt_mutex_owner(lock) != current)
+			printk("freeing task is not even the owner??\n");
+		return 1;
+	}
+	spin_unlock_irqrestore(&current->held_list_lock, flags);
+
+	return 0;
+}
+
+void rt_mutex_debug_task_free(struct task_struct *task)
+{
+	WARN_ON(!plist_head_empty(&task->pi_waiters));
+	WARN_ON(task->pi_blocked_on);
+}
+
+/*
+ * We fill out the fields in the waiter to store the information about
+ * the deadlock. We print when we return. act_waiter can be NULL in
+ * case of a remove waiter operation.
+ */
+void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *act_waiter,
+			     struct rt_mutex *lock)
+{
+	struct task_struct *task;
+
+	if (!rt_trace_on || detect || !act_waiter)
+		return;
+
+	task = rt_mutex_owner(act_waiter->lock);
+	if (task && task != current) {
+		act_waiter->deadlock_task_pid = task->pid;
+		act_waiter->deadlock_lock = lock;
+	}
+}
+
+void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter)
+{
+	struct task_struct *task;
+
+	if (!waiter->deadlock_lock || !rt_trace_on)
+		return;
+
+	task = find_task_by_pid(waiter->deadlock_task_pid);
+	if (!task)
+		return;
+
+	TRACE_OFF_NOLOCK();
+
+	printk("\n============================================\n");
+	printk(  "[ BUG: circular locking deadlock detected! ]\n");
+	printk(  "--------------------------------------------\n");
+	printk("%s/%d is deadlocking current task %s/%d\n\n",
+	       task->comm, task->pid, current->comm, current->pid);
+
+	printk("\n1) %s/%d is trying to acquire this lock:\n",
+	       current->comm, current->pid);
+	printk_lock(waiter->lock, 1);
+
+	printk("... trying at:                 ");
+	print_symbol("%s\n", waiter->ip);
+
+	printk("\n2) %s/%d is blocked on this lock:\n", task->comm, task->pid);
+	printk_lock(waiter->deadlock_lock, 1);
+
+	rt_mutex_show_held_locks(current, 1);
+	rt_mutex_show_held_locks(task, 1);
+
+	printk("\n%s/%d's [blocked] stackdump:\n\n", task->comm, task->pid);
+	show_stack(task, NULL);
+	printk("\n%s/%d's [current] stackdump:\n\n",
+	       current->comm, current->pid);
+	dump_stack();
+	rt_mutex_show_all_locks();
+	printk("[ turning off deadlock detection."
+	       "Please report this trace. ]\n\n");
+	local_irq_disable();
+}
+
+void debug_rt_mutex_lock(struct rt_mutex *lock __IP_DECL__)
+{
+	unsigned long flags;
+
+	if (rt_trace_on) {
+		TRACE_WARN_ON_LOCKED(!list_empty(&lock->held_list_entry));
+
+		spin_lock_irqsave(&current->held_list_lock, flags);
+		list_add_tail(&lock->held_list_entry, &current->held_list_head);
+		spin_unlock_irqrestore(&current->held_list_lock, flags);
+
+		lock->acquire_ip = ip;
+	}
+}
+
+void debug_rt_mutex_unlock(struct rt_mutex *lock)
+{
+	unsigned long flags;
+
+	if (rt_trace_on) {
+		TRACE_WARN_ON_LOCKED(rt_mutex_owner(lock) != current);
+		TRACE_WARN_ON_LOCKED(list_empty(&lock->held_list_entry));
+
+		spin_lock_irqsave(&current->held_list_lock, flags);
+		list_del_init(&lock->held_list_entry);
+		spin_unlock_irqrestore(&current->held_list_lock, flags);
+	}
+}
+
+void debug_rt_mutex_proxy_lock(struct rt_mutex *lock,
+			       struct task_struct *powner __IP_DECL__)
+{
+	unsigned long flags;
+
+	if (rt_trace_on) {
+		TRACE_WARN_ON_LOCKED(!list_empty(&lock->held_list_entry));
+
+		spin_lock_irqsave(&powner->held_list_lock, flags);
+		list_add_tail(&lock->held_list_entry, &powner->held_list_head);
+		spin_unlock_irqrestore(&powner->held_list_lock, flags);
+
+		lock->acquire_ip = ip;
+	}
+}
+
+void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock)
+{
+	unsigned long flags;
+
+	if (rt_trace_on) {
+		struct task_struct *owner = rt_mutex_owner(lock);
+
+		TRACE_WARN_ON_LOCKED(!owner);
+		TRACE_WARN_ON_LOCKED(list_empty(&lock->held_list_entry));
+
+		spin_lock_irqsave(&owner->held_list_lock, flags);
+		list_del_init(&lock->held_list_entry);
+		spin_unlock_irqrestore(&owner->held_list_lock, flags);
+	}
+}
+
+void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
+{
+	memset(waiter, 0x11, sizeof(*waiter));
+	plist_node_init(&waiter->list_entry, MAX_PRIO);
+	plist_node_init(&waiter->pi_list_entry, MAX_PRIO);
+}
+
+void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter)
+{
+	TRACE_WARN_ON(!plist_node_empty(&waiter->list_entry));
+	TRACE_WARN_ON(!plist_node_empty(&waiter->pi_list_entry));
+	TRACE_WARN_ON(waiter->task);
+	memset(waiter, 0x22, sizeof(*waiter));
+}
+
+void debug_rt_mutex_init(struct rt_mutex *lock, const char *name)
+{
+	void *addr = lock;
+
+	if (rt_trace_on) {
+		rt_mutex_debug_check_no_locks_freed(addr,
+						    sizeof(struct rt_mutex));
+		INIT_LIST_HEAD(&lock->held_list_entry);
+		lock->name = name;
+	}
+}
+
+void rt_mutex_deadlock_account_lock(struct rt_mutex *lock, task_t *task)
+{
+}
+
+void rt_mutex_deadlock_account_unlock(struct task_struct *task)
+{
+}
+
diff --git a/kernel/rtmutex-debug.h b/kernel/rtmutex-debug.h
new file mode 100644
index 000000000000..7612fbc62d70
--- /dev/null
+++ b/kernel/rtmutex-debug.h
@@ -0,0 +1,37 @@
+/*
+ * RT-Mutexes: blocking mutual exclusion locks with PI support
+ *
+ * started by Ingo Molnar and Thomas Gleixner:
+ *
+ *  Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ *  Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
+ *
+ * This file contains macros used solely by rtmutex.c. Debug version.
+ */
+
+#define __IP_DECL__		, unsigned long ip
+#define __IP__			, ip
+#define __RET_IP__		, (unsigned long)__builtin_return_address(0)
+
+extern void
+rt_mutex_deadlock_account_lock(struct rt_mutex *lock, struct task_struct *task);
+extern void rt_mutex_deadlock_account_unlock(struct task_struct *task);
+extern void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter);
+extern void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter);
+extern void debug_rt_mutex_init(struct rt_mutex *lock, const char *name);
+extern void debug_rt_mutex_lock(struct rt_mutex *lock __IP_DECL__);
+extern void debug_rt_mutex_unlock(struct rt_mutex *lock);
+extern void debug_rt_mutex_proxy_lock(struct rt_mutex *lock,
+				      struct task_struct *powner __IP_DECL__);
+extern void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock);
+extern void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *waiter,
+				    struct rt_mutex *lock);
+extern void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter);
+# define debug_rt_mutex_reset_waiter(w)			\
+	do { (w)->deadlock_lock = NULL; } while (0)
+
+static inline int debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *waiter,
+						 int detect)
+{
+	return (waiter != NULL);
+}
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 8bab0102ac73..06d3ea1133c9 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -107,6 +107,19 @@ config DEBUG_MUTEXES
 	 This allows mutex semantics violations and mutex related deadlocks
 	 (lockups) to be detected and reported automatically.
 
+config DEBUG_RT_MUTEXES
+	bool "RT Mutex debugging, deadlock detection"
+	default y
+	depends on DEBUG_KERNEL && RT_MUTEXES
+	help
+	 This allows rt mutex semantics violations and rt mutex related
+	 deadlocks (lockups) to be detected and reported automatically.
+
+config DEBUG_PI_LIST
+	bool
+	default y
+	depends on DEBUG_RT_MUTEXES
+
 config DEBUG_SPINLOCK
 	bool "Spinlock debugging"
 	depends on DEBUG_KERNEL
diff --git a/mm/slab.c b/mm/slab.c
index f378d027c684..233e39d14caf 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -107,6 +107,7 @@
 #include	<linux/nodemask.h>
 #include	<linux/mempolicy.h>
 #include	<linux/mutex.h>
+#include	<linux/rtmutex.h>
 
 #include	<asm/uaccess.h>
 #include	<asm/cacheflush.h>
-- 
cgit v1.2.3


From 61a87122869b6340a63b6f9f84097d3688604b90 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 27 Jun 2006 02:54:56 -0700
Subject: [PATCH] pi-futex: rt mutex tester

RT-mutex tester: scriptable tester for rt mutexes, which allows userspace
scripting of mutex unit-tests (and dynamic tests as well), using the actual
rt-mutex implementation of the kernel.

[akpm@osdl.org: fixlet]
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/sched.h                        |   1 +
 kernel/Makefile                              |   1 +
 kernel/rtmutex-tester.c                      | 436 +++++++++++++++++++++++++++
 kernel/rtmutex.c                             |   3 +-
 kernel/rtmutex_common.h                      |  22 ++
 lib/Kconfig.debug                            |   7 +
 scripts/rt-tester/check-all.sh               |  21 ++
 scripts/rt-tester/rt-tester.py               | 222 ++++++++++++++
 scripts/rt-tester/t2-l1-2rt-sameprio.tst     | 101 +++++++
 scripts/rt-tester/t2-l1-pi.tst               |  84 ++++++
 scripts/rt-tester/t2-l1-signal.tst           |  79 +++++
 scripts/rt-tester/t2-l2-2rt-deadlock.tst     |  91 ++++++
 scripts/rt-tester/t3-l1-pi-1rt.tst           |  95 ++++++
 scripts/rt-tester/t3-l1-pi-2rt.tst           |  96 ++++++
 scripts/rt-tester/t3-l1-pi-3rt.tst           |  95 ++++++
 scripts/rt-tester/t3-l1-pi-signal.tst        | 101 +++++++
 scripts/rt-tester/t3-l1-pi-steal.tst         |  99 ++++++
 scripts/rt-tester/t3-l2-pi.tst               |  95 ++++++
 scripts/rt-tester/t4-l2-pi-deboost.tst       | 127 ++++++++
 scripts/rt-tester/t5-l4-pi-boost-deboost.tst | 148 +++++++++
 20 files changed, 1923 insertions(+), 1 deletion(-)
 create mode 100644 kernel/rtmutex-tester.c
 create mode 100644 scripts/rt-tester/check-all.sh
 create mode 100644 scripts/rt-tester/rt-tester.py
 create mode 100644 scripts/rt-tester/t2-l1-2rt-sameprio.tst
 create mode 100644 scripts/rt-tester/t2-l1-pi.tst
 create mode 100644 scripts/rt-tester/t2-l1-signal.tst
 create mode 100644 scripts/rt-tester/t2-l2-2rt-deadlock.tst
 create mode 100644 scripts/rt-tester/t3-l1-pi-1rt.tst
 create mode 100644 scripts/rt-tester/t3-l1-pi-2rt.tst
 create mode 100644 scripts/rt-tester/t3-l1-pi-3rt.tst
 create mode 100644 scripts/rt-tester/t3-l1-pi-signal.tst
 create mode 100644 scripts/rt-tester/t3-l1-pi-steal.tst
 create mode 100644 scripts/rt-tester/t3-l2-pi.tst
 create mode 100644 scripts/rt-tester/t4-l2-pi-deboost.tst
 create mode 100644 scripts/rt-tester/t5-l4-pi-boost-deboost.tst

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6ea23c9af413..edadd13cf53f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -982,6 +982,7 @@ static inline void put_task_struct(struct task_struct *t)
 #define PF_SPREAD_PAGE	0x01000000	/* Spread page cache over cpuset */
 #define PF_SPREAD_SLAB	0x02000000	/* Spread some slab caches over cpuset */
 #define PF_MEMPOLICY	0x10000000	/* Non-default NUMA mempolicy */
+#define PF_MUTEX_TESTER	0x20000000	/* Thread belongs to the rt mutex tester */
 
 /*
  * Only the _current_ task can read/write to tsk->flags, but other
diff --git a/kernel/Makefile b/kernel/Makefile
index f9c92d34cde5..82fb182f6f61 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -18,6 +18,7 @@ obj-$(CONFIG_FUTEX) += futex_compat.o
 endif
 obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
 obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
+obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
 obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
 obj-$(CONFIG_SMP) += cpu.o spinlock.o
 obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c
new file mode 100644
index 000000000000..fe211ba3a5b5
--- /dev/null
+++ b/kernel/rtmutex-tester.c
@@ -0,0 +1,436 @@
+/*
+ * RT-Mutex-tester: scriptable tester for rt mutexes
+ *
+ * started by Thomas Gleixner:
+ *
+ *  Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
+ *
+ */
+#include <linux/config.h>
+#include <linux/kthread.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/smp_lock.h>
+#include <linux/spinlock.h>
+#include <linux/sysdev.h>
+#include <linux/timer.h>
+
+#include "rtmutex.h"
+
+#define MAX_RT_TEST_THREADS	8
+#define MAX_RT_TEST_MUTEXES	8
+
+static spinlock_t rttest_lock;
+static atomic_t rttest_event;
+
+struct test_thread_data {
+	int			opcode;
+	int			opdata;
+	int			mutexes[MAX_RT_TEST_MUTEXES];
+	int			bkl;
+	int			event;
+	struct sys_device	sysdev;
+};
+
+static struct test_thread_data thread_data[MAX_RT_TEST_THREADS];
+static task_t *threads[MAX_RT_TEST_THREADS];
+static struct rt_mutex mutexes[MAX_RT_TEST_MUTEXES];
+
+enum test_opcodes {
+	RTTEST_NOP = 0,
+	RTTEST_SCHEDOT,		/* 1 Sched other, data = nice */
+	RTTEST_SCHEDRT,		/* 2 Sched fifo, data = prio */
+	RTTEST_LOCK,		/* 3 Lock uninterruptible, data = lockindex */
+	RTTEST_LOCKNOWAIT,	/* 4 Lock uninterruptible no wait in wakeup, data = lockindex */
+	RTTEST_LOCKINT,		/* 5 Lock interruptible, data = lockindex */
+	RTTEST_LOCKINTNOWAIT,	/* 6 Lock interruptible no wait in wakeup, data = lockindex */
+	RTTEST_LOCKCONT,	/* 7 Continue locking after the wakeup delay */
+	RTTEST_UNLOCK,		/* 8 Unlock, data = lockindex */
+	RTTEST_LOCKBKL, 	/* 9 Lock BKL */
+	RTTEST_UNLOCKBKL,	/* 10 Unlock BKL */
+	RTTEST_SIGNAL,		/* 11 Signal other test thread, data = thread id */
+	RTTEST_RESETEVENT = 98,	/* 98 Reset event counter */
+	RTTEST_RESET = 99,	/* 99 Reset all pending operations */
+};
+
+static int handle_op(struct test_thread_data *td, int lockwakeup)
+{
+	struct sched_param schedpar;
+	int i, id, ret = -EINVAL;
+
+	switch(td->opcode) {
+
+	case RTTEST_NOP:
+		return 0;
+
+	case RTTEST_SCHEDOT:
+		schedpar.sched_priority = 0;
+		ret = sched_setscheduler(current, SCHED_NORMAL, &schedpar);
+		if (!ret)
+			set_user_nice(current, 0);
+		return ret;
+
+	case RTTEST_SCHEDRT:
+		schedpar.sched_priority = td->opdata;
+		return sched_setscheduler(current, SCHED_FIFO, &schedpar);
+
+	case RTTEST_LOCKCONT:
+		td->mutexes[td->opdata] = 1;
+		td->event = atomic_add_return(1, &rttest_event);
+		return 0;
+
+	case RTTEST_RESET:
+		for (i = 0; i < MAX_RT_TEST_MUTEXES; i++) {
+			if (td->mutexes[i] == 4) {
+				rt_mutex_unlock(&mutexes[i]);
+				td->mutexes[i] = 0;
+			}
+		}
+
+		if (!lockwakeup && td->bkl == 4) {
+			unlock_kernel();
+			td->bkl = 0;
+		}
+		return 0;
+
+	case RTTEST_RESETEVENT:
+		atomic_set(&rttest_event, 0);
+		return 0;
+
+	default:
+		if (lockwakeup)
+			return ret;
+	}
+
+	switch(td->opcode) {
+
+	case RTTEST_LOCK:
+	case RTTEST_LOCKNOWAIT:
+		id = td->opdata;
+		if (id < 0 || id >= MAX_RT_TEST_MUTEXES)
+			return ret;
+
+		td->mutexes[id] = 1;
+		td->event = atomic_add_return(1, &rttest_event);
+		rt_mutex_lock(&mutexes[id]);
+		td->event = atomic_add_return(1, &rttest_event);
+		td->mutexes[id] = 4;
+		return 0;
+
+	case RTTEST_LOCKINT:
+	case RTTEST_LOCKINTNOWAIT:
+		id = td->opdata;
+		if (id < 0 || id >= MAX_RT_TEST_MUTEXES)
+			return ret;
+
+		td->mutexes[id] = 1;
+		td->event = atomic_add_return(1, &rttest_event);
+		ret = rt_mutex_lock_interruptible(&mutexes[id], 0);
+		td->event = atomic_add_return(1, &rttest_event);
+		td->mutexes[id] = ret ? 0 : 4;
+		return ret ? -EINTR : 0;
+
+	case RTTEST_UNLOCK:
+		id = td->opdata;
+		if (id < 0 || id >= MAX_RT_TEST_MUTEXES || td->mutexes[id] != 4)
+			return ret;
+
+		td->event = atomic_add_return(1, &rttest_event);
+		rt_mutex_unlock(&mutexes[id]);
+		td->event = atomic_add_return(1, &rttest_event);
+		td->mutexes[id] = 0;
+		return 0;
+
+	case RTTEST_LOCKBKL:
+		if (td->bkl)
+			return 0;
+		td->bkl = 1;
+		lock_kernel();
+		td->bkl = 4;
+		return 0;
+
+	case RTTEST_UNLOCKBKL:
+		if (td->bkl != 4)
+			break;
+		unlock_kernel();
+		td->bkl = 0;
+		return 0;
+
+	default:
+		break;
+	}
+	return ret;
+}
+
+/*
+ * Schedule replacement for rtsem_down(). Only called for threads with
+ * PF_MUTEX_TESTER set.
+ *
+ * This allows us to have finegrained control over the event flow.
+ *
+ */
+void schedule_rt_mutex_test(struct rt_mutex *mutex)
+{
+	int tid, op, dat;
+	struct test_thread_data *td;
+
+	/* We have to lookup the task */
+	for (tid = 0; tid < MAX_RT_TEST_THREADS; tid++) {
+		if (threads[tid] == current)
+			break;
+	}
+
+	BUG_ON(tid == MAX_RT_TEST_THREADS);
+
+	td = &thread_data[tid];
+
+	op = td->opcode;
+	dat = td->opdata;
+
+	switch (op) {
+	case RTTEST_LOCK:
+	case RTTEST_LOCKINT:
+	case RTTEST_LOCKNOWAIT:
+	case RTTEST_LOCKINTNOWAIT:
+		if (mutex != &mutexes[dat])
+			break;
+
+		if (td->mutexes[dat] != 1)
+			break;
+
+		td->mutexes[dat] = 2;
+		td->event = atomic_add_return(1, &rttest_event);
+		break;
+
+	case RTTEST_LOCKBKL:
+	default:
+		break;
+	}
+
+	schedule();
+
+
+	switch (op) {
+	case RTTEST_LOCK:
+	case RTTEST_LOCKINT:
+		if (mutex != &mutexes[dat])
+			return;
+
+		if (td->mutexes[dat] != 2)
+			return;
+
+		td->mutexes[dat] = 3;
+		td->event = atomic_add_return(1, &rttest_event);
+		break;
+
+	case RTTEST_LOCKNOWAIT:
+	case RTTEST_LOCKINTNOWAIT:
+		if (mutex != &mutexes[dat])
+			return;
+
+		if (td->mutexes[dat] != 2)
+			return;
+
+		td->mutexes[dat] = 1;
+		td->event = atomic_add_return(1, &rttest_event);
+		return;
+
+	case RTTEST_LOCKBKL:
+		return;
+	default:
+		return;
+	}
+
+	td->opcode = 0;
+
+	for (;;) {
+		set_current_state(TASK_INTERRUPTIBLE);
+
+		if (td->opcode > 0) {
+			int ret;
+
+			set_current_state(TASK_RUNNING);
+			ret = handle_op(td, 1);
+			set_current_state(TASK_INTERRUPTIBLE);
+			if (td->opcode == RTTEST_LOCKCONT)
+				break;
+			td->opcode = ret;
+		}
+
+		/* Wait for the next command to be executed */
+		schedule();
+	}
+
+	/* Restore previous command and data */
+	td->opcode = op;
+	td->opdata = dat;
+}
+
+static int test_func(void *data)
+{
+	struct test_thread_data *td = data;
+	int ret;
+
+	current->flags |= PF_MUTEX_TESTER;
+	allow_signal(SIGHUP);
+
+	for(;;) {
+
+		set_current_state(TASK_INTERRUPTIBLE);
+
+		if (td->opcode > 0) {
+			set_current_state(TASK_RUNNING);
+			ret = handle_op(td, 0);
+			set_current_state(TASK_INTERRUPTIBLE);
+			td->opcode = ret;
+		}
+
+		/* Wait for the next command to be executed */
+		schedule();
+
+		if (signal_pending(current))
+			flush_signals(current);
+
+		if(kthread_should_stop())
+			break;
+	}
+	return 0;
+}
+
+/**
+ * sysfs_test_command - interface for test commands
+ * @dev:	thread reference
+ * @buf:	command for actual step
+ * @count:	length of buffer
+ *
+ * command syntax:
+ *
+ * opcode:data
+ */
+static ssize_t sysfs_test_command(struct sys_device *dev, const char *buf,
+				  size_t count)
+{
+	struct test_thread_data *td;
+	char cmdbuf[32];
+	int op, dat, tid;
+
+	td = container_of(dev, struct test_thread_data, sysdev);
+	tid = td->sysdev.id;
+
+	/* strings from sysfs write are not 0 terminated! */
+	if (count >= sizeof(cmdbuf))
+		return -EINVAL;
+
+	/* strip of \n: */
+	if (buf[count-1] == '\n')
+		count--;
+	if (count < 1)
+		return -EINVAL;
+
+	memcpy(cmdbuf, buf, count);
+	cmdbuf[count] = 0;
+
+	if (sscanf(cmdbuf, "%d:%d", &op, &dat) != 2)
+		return -EINVAL;
+
+	switch (op) {
+	case RTTEST_SIGNAL:
+		send_sig(SIGHUP, threads[tid], 0);
+		break;
+
+	default:
+		if (td->opcode > 0)
+			return -EBUSY;
+		td->opdata = dat;
+		td->opcode = op;
+		wake_up_process(threads[tid]);
+	}
+
+	return count;
+}
+
+/**
+ * sysfs_test_status - sysfs interface for rt tester
+ * @dev:	thread to query
+ * @buf:	char buffer to be filled with thread status info
+ */
+static ssize_t sysfs_test_status(struct sys_device *dev, char *buf)
+{
+	struct test_thread_data *td;
+	char *curr = buf;
+	task_t *tsk;
+	int i;
+
+	td = container_of(dev, struct test_thread_data, sysdev);
+	tsk = threads[td->sysdev.id];
+
+	spin_lock(&rttest_lock);
+
+	curr += sprintf(curr,
+		"O: %4d, E:%8d, S: 0x%08lx, P: %4d, N: %4d, B: %p, K: %d, M:",
+		td->opcode, td->event, tsk->state,
+			(MAX_RT_PRIO - 1) - tsk->prio,
+			(MAX_RT_PRIO - 1) - tsk->normal_prio,
+		tsk->pi_blocked_on, td->bkl);
+
+	for (i = MAX_RT_TEST_MUTEXES - 1; i >=0 ; i--)
+		curr += sprintf(curr, "%d", td->mutexes[i]);
+
+	spin_unlock(&rttest_lock);
+
+	curr += sprintf(curr, ", T: %p, R: %p\n", tsk,
+			mutexes[td->sysdev.id].owner);
+
+	return curr - buf;
+}
+
+static SYSDEV_ATTR(status, 0600, sysfs_test_status, NULL);
+static SYSDEV_ATTR(command, 0600, NULL, sysfs_test_command);
+
+static struct sysdev_class rttest_sysclass = {
+	set_kset_name("rttest"),
+};
+
+static int init_test_thread(int id)
+{
+	thread_data[id].sysdev.cls = &rttest_sysclass;
+	thread_data[id].sysdev.id = id;
+
+	threads[id] = kthread_run(test_func, &thread_data[id], "rt-test-%d", id);
+	if (IS_ERR(threads[id]))
+		return PTR_ERR(threads[id]);
+
+	return sysdev_register(&thread_data[id].sysdev);
+}
+
+static int init_rttest(void)
+{
+	int ret, i;
+
+	spin_lock_init(&rttest_lock);
+
+	for (i = 0; i < MAX_RT_TEST_MUTEXES; i++)
+		rt_mutex_init(&mutexes[i]);
+
+	ret = sysdev_class_register(&rttest_sysclass);
+	if (ret)
+		return ret;
+
+	for (i = 0; i < MAX_RT_TEST_THREADS; i++) {
+		ret = init_test_thread(i);
+		if (ret)
+			break;
+		ret = sysdev_create_file(&thread_data[i].sysdev, &attr_status);
+		if (ret)
+			break;
+		ret = sysdev_create_file(&thread_data[i].sysdev, &attr_command);
+		if (ret)
+			break;
+	}
+
+	printk("Initializing RT-Tester: %s\n", ret ? "Failed" : "OK" );
+
+	return ret;
+}
+
+device_initcall(init_rttest);
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index 937a474fae94..39c8ca0cf526 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -640,7 +640,8 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
 
 		debug_rt_mutex_print_deadlock(&waiter);
 
-		schedule();
+		if (waiter.task)
+			schedule_rt_mutex(lock);
 
 		spin_lock(&lock->wait_lock);
 		set_current_state(state);
diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h
index 50eed60eb085..e068024eeffc 100644
--- a/kernel/rtmutex_common.h
+++ b/kernel/rtmutex_common.h
@@ -14,6 +14,28 @@
 
 #include <linux/rtmutex.h>
 
+/*
+ * The rtmutex in kernel tester is independent of rtmutex debugging. We
+ * call schedule_rt_mutex_test() instead of schedule() for the tasks which
+ * belong to the tester. That way we can delay the wakeup path of those
+ * threads to provoke lock stealing and testing of  complex boosting scenarios.
+ */
+#ifdef CONFIG_RT_MUTEX_TESTER
+
+extern void schedule_rt_mutex_test(struct rt_mutex *lock);
+
+#define schedule_rt_mutex(_lock)				\
+  do {								\
+	if (!(current->flags & PF_MUTEX_TESTER))		\
+		schedule();					\
+	else							\
+		schedule_rt_mutex_test(_lock);			\
+  } while (0)
+
+#else
+# define schedule_rt_mutex(_lock)			schedule()
+#endif
+
 /*
  * This is the control structure for tasks blocked on a rt_mutex,
  * which is allocated on the kernel stack on of the blocked task.
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 06d3ea1133c9..6afe0af0a19b 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -120,6 +120,13 @@ config DEBUG_PI_LIST
 	default y
 	depends on DEBUG_RT_MUTEXES
 
+config RT_MUTEX_TESTER
+	bool "Built-in scriptable tester for rt-mutexes"
+	depends on RT_MUTEXES
+	default n
+	help
+	  This option enables a rt-mutex tester.
+
 config DEBUG_SPINLOCK
 	bool "Spinlock debugging"
 	depends on DEBUG_KERNEL
diff --git a/scripts/rt-tester/check-all.sh b/scripts/rt-tester/check-all.sh
new file mode 100644
index 000000000000..ac45c3e65a35
--- /dev/null
+++ b/scripts/rt-tester/check-all.sh
@@ -0,0 +1,21 @@
+
+
+function testit ()
+{
+ printf "%-30s: " $1
+ ./rt-tester.py $1 | grep Pass
+}
+
+testit t2-l1-2rt-sameprio.tst
+testit t2-l1-pi.tst
+testit t2-l1-signal.tst
+#testit t2-l2-2rt-deadlock.tst
+testit t3-l1-pi-1rt.tst
+testit t3-l1-pi-2rt.tst
+testit t3-l1-pi-3rt.tst
+testit t3-l1-pi-signal.tst
+testit t3-l1-pi-steal.tst
+testit t3-l2-pi.tst
+testit t4-l2-pi-deboost.tst
+testit t5-l4-pi-boost-deboost.tst
+
diff --git a/scripts/rt-tester/rt-tester.py b/scripts/rt-tester/rt-tester.py
new file mode 100644
index 000000000000..4c79660793cf
--- /dev/null
+++ b/scripts/rt-tester/rt-tester.py
@@ -0,0 +1,222 @@
+#!/usr/bin/env python
+#
+# rt-mutex tester
+#
+# (C) 2006 Thomas Gleixner <tglx@linutronix.de>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+#
+import os
+import sys
+import getopt
+import shutil
+import string
+
+# Globals
+quiet = 0
+test = 0
+comments = 0
+
+sysfsprefix = "/sys/devices/system/rttest/rttest"
+statusfile = "/status"
+commandfile = "/command"
+
+# Command opcodes
+cmd_opcodes = {
+    "schedother"    : "1",
+    "schedfifo"     : "2",
+    "lock"          : "3",
+    "locknowait"    : "4",
+    "lockint"       : "5",
+    "lockintnowait" : "6",
+    "lockcont"      : "7",
+    "unlock"        : "8",
+    "lockbkl"       : "9",
+    "unlockbkl"     : "10",
+    "signal"        : "11",
+    "resetevent"    : "98",
+    "reset"         : "99",
+    }
+
+test_opcodes = {
+    "prioeq"        : ["P" , "eq" , None],
+    "priolt"        : ["P" , "lt" , None],
+    "priogt"        : ["P" , "gt" , None],
+    "nprioeq"       : ["N" , "eq" , None],
+    "npriolt"       : ["N" , "lt" , None],
+    "npriogt"       : ["N" , "gt" , None],
+    "unlocked"      : ["M" , "eq" , 0],
+    "trylock"       : ["M" , "eq" , 1],
+    "blocked"       : ["M" , "eq" , 2],
+    "blockedwake"   : ["M" , "eq" , 3],
+    "locked"        : ["M" , "eq" , 4],
+    "opcodeeq"      : ["O" , "eq" , None],
+    "opcodelt"      : ["O" , "lt" , None],
+    "opcodegt"      : ["O" , "gt" , None],
+    "eventeq"       : ["E" , "eq" , None],
+    "eventlt"       : ["E" , "lt" , None],
+    "eventgt"       : ["E" , "gt" , None],
+    }
+
+# Print usage information
+def usage():
+    print "rt-tester.py <-c -h -q -t> <testfile>"
+    print " -c    display comments after first command"
+    print " -h    help"
+    print " -q    quiet mode"
+    print " -t    test mode (syntax check)"
+    print " testfile: read test specification from testfile"
+    print " otherwise from stdin"
+    return
+
+# Print progress when not in quiet mode
+def progress(str):
+    if not quiet:
+        print str
+
+# Analyse a status value
+def analyse(val, top, arg):
+
+    intval = int(val)
+
+    if top[0] == "M":
+        intval = intval / (10 ** int(arg))
+	intval = intval % 10
+        argval = top[2]
+    elif top[0] == "O":
+        argval = int(cmd_opcodes.get(arg, arg))
+    else:
+        argval = int(arg)
+
+    # progress("%d %s %d" %(intval, top[1], argval))
+
+    if top[1] == "eq" and intval == argval:
+	return 1
+    if top[1] == "lt" and intval < argval:
+        return 1
+    if top[1] == "gt" and intval > argval:
+	return 1
+    return 0
+
+# Parse the commandline
+try:
+    (options, arguments) = getopt.getopt(sys.argv[1:],'chqt')
+except getopt.GetoptError, ex:
+    usage()
+    sys.exit(1)
+
+# Parse commandline options
+for option, value in options:
+    if option == "-c":
+        comments = 1
+    elif option == "-q":
+        quiet = 1
+    elif option == "-t":
+        test = 1
+    elif option == '-h':
+        usage()
+        sys.exit(0)
+
+# Select the input source
+if arguments:
+    try:
+        fd = open(arguments[0])
+    except Exception,ex:
+        sys.stderr.write("File not found %s\n" %(arguments[0]))
+        sys.exit(1)
+else:
+    fd = sys.stdin
+
+linenr = 0
+
+# Read the test patterns
+while 1:
+
+    linenr = linenr + 1
+    line = fd.readline()
+    if not len(line):
+        break
+
+    line = line.strip()
+    parts = line.split(":")
+
+    if not parts or len(parts) < 1:
+        continue
+
+    if len(parts[0]) == 0:
+        continue
+
+    if parts[0].startswith("#"):
+	if comments > 1:
+	    progress(line)
+	continue
+
+    if comments == 1:
+	comments = 2
+
+    progress(line)
+
+    cmd = parts[0].strip().lower()
+    opc = parts[1].strip().lower()
+    tid = parts[2].strip()
+    dat = parts[3].strip()
+
+    try:
+        # Test or wait for a status value
+        if cmd == "t" or cmd == "w":
+            testop = test_opcodes[opc]
+
+            fname = "%s%s%s" %(sysfsprefix, tid, statusfile)
+            if test:
+		print fname
+                continue
+
+            while 1:
+                query = 1
+                fsta = open(fname, 'r')
+                status = fsta.readline().strip()
+                fsta.close()
+                stat = status.split(",")
+                for s in stat:
+		    s = s.strip()
+                    if s.startswith(testop[0]):
+                        # Seperate status value
+                        val = s[2:].strip()
+                        query = analyse(val, testop, dat)
+                        break
+                if query or cmd == "t":
+                    break
+
+            progress("   " + status)
+
+            if not query:
+                sys.stderr.write("Test failed in line %d\n" %(linenr))
+		sys.exit(1)
+
+        # Issue a command to the tester
+        elif cmd == "c":
+            cmdnr = cmd_opcodes[opc]
+            # Build command string and sys filename
+            cmdstr = "%s:%s" %(cmdnr, dat)
+            fname = "%s%s%s" %(sysfsprefix, tid, commandfile)
+            if test:
+		print fname
+                continue
+            fcmd = open(fname, 'w')
+            fcmd.write(cmdstr)
+            fcmd.close()
+
+    except Exception,ex:
+    	sys.stderr.write(str(ex))
+        sys.stderr.write("\nSyntax error in line %d\n" %(linenr))
+        if not test:
+            fd.close()
+            sys.exit(1)
+
+# Normal exit pass
+print "Pass"
+sys.exit(0)
+
+
diff --git a/scripts/rt-tester/t2-l1-2rt-sameprio.tst b/scripts/rt-tester/t2-l1-2rt-sameprio.tst
new file mode 100644
index 000000000000..a2b6f2aae755
--- /dev/null
+++ b/scripts/rt-tester/t2-l1-2rt-sameprio.tst
@@ -0,0 +1,101 @@
+#
+# RT-Mutex test
+#
+# Op: C(ommand)/T(est)/W(ait)
+# |  opcode
+# |  |     threadid: 0-7
+# |  |     |  opcode argument
+# |  |     |  |
+# C: lock: 0: 0
+#
+# Commands
+#
+# opcode	opcode argument
+# schedother	nice value
+# schedfifo	priority
+# lock		lock nr (0-7)
+# locknowait	lock nr (0-7)
+# lockint	lock nr (0-7)
+# lockintnowait	lock nr (0-7)
+# lockcont	lock nr (0-7)
+# unlock	lock nr (0-7)
+# lockbkl	lock nr (0-7)
+# unlockbkl	lock nr (0-7)
+# signal	0
+# reset		0
+# resetevent	0
+#
+# Tests / Wait
+#
+# opcode	opcode argument
+#
+# prioeq	priority
+# priolt	priority
+# priogt	priority
+# nprioeq	normal priority
+# npriolt	normal priority
+# npriogt	normal priority
+# locked	lock nr (0-7)
+# blocked	lock nr (0-7)
+# blockedwake	lock nr (0-7)
+# unlocked	lock nr (0-7)
+# lockedbkl	dont care
+# blockedbkl	dont care
+# unlockedbkl	dont care
+# opcodeeq	command opcode or number
+# opcodelt	number
+# opcodegt	number
+# eventeq	number
+# eventgt	number
+# eventlt	number
+
+#
+# 2 threads 1 lock
+#
+C: resetevent:		0: 	0
+W: opcodeeq:		0: 	0
+
+# Set schedulers
+C: schedfifo:		0: 	80
+W: opcodeeq:		0: 	0
+C: schedfifo:		1: 	80
+W: opcodeeq:		1: 	0
+
+# T0 lock L0
+C: locknowait:		0: 	0
+C: locknowait:		1:	0
+W: locked:		0: 	0
+W: blocked:		1: 	0
+T: prioeq:		0: 	80
+
+# T0 unlock L0
+C: unlock:		0: 	0
+W: locked:		1: 	0
+
+# Verify T0
+W: unlocked:		0: 	0
+T: prioeq:		0: 	80
+
+# Unlock
+C: unlock:		1: 	0
+W: unlocked:		1: 	0
+
+# T1,T0 lock L0
+C: locknowait:		1: 	0
+C: locknowait:		0:	0
+W: locked:		1: 	0
+W: blocked:		0: 	0
+T: prioeq:		1: 	80
+
+# T1 unlock L0
+C: unlock:		1: 	0
+W: locked:		0: 	0
+
+# Verify T1
+W: unlocked:		1: 	0
+T: prioeq:		1: 	80
+
+# Unlock and exit
+C: unlock:		0: 	0
+W: unlocked:		0: 	0
+
diff --git a/scripts/rt-tester/t2-l1-pi.tst b/scripts/rt-tester/t2-l1-pi.tst
new file mode 100644
index 000000000000..aa4c8940acd6
--- /dev/null
+++ b/scripts/rt-tester/t2-l1-pi.tst
@@ -0,0 +1,84 @@
+#
+# RT-Mutex test
+#
+# Op: C(ommand)/T(est)/W(ait)
+# |  opcode
+# |  |     threadid: 0-7
+# |  |     |  opcode argument
+# |  |     |  |
+# C: lock: 0: 0
+#
+# Commands
+#
+# opcode	opcode argument
+# schedother	nice value
+# schedfifo	priority
+# lock		lock nr (0-7)
+# locknowait	lock nr (0-7)
+# lockint	lock nr (0-7)
+# lockintnowait	lock nr (0-7)
+# lockcont	lock nr (0-7)
+# unlock	lock nr (0-7)
+# lockbkl	lock nr (0-7)
+# unlockbkl	lock nr (0-7)
+# signal	0
+# reset		0
+# resetevent	0
+#
+# Tests / Wait
+#
+# opcode	opcode argument
+#
+# prioeq	priority
+# priolt	priority
+# priogt	priority
+# nprioeq	normal priority
+# npriolt	normal priority
+# npriogt	normal priority
+# locked	lock nr (0-7)
+# blocked	lock nr (0-7)
+# blockedwake	lock nr (0-7)
+# unlocked	lock nr (0-7)
+# lockedbkl	dont care
+# blockedbkl	dont care
+# unlockedbkl	dont care
+# opcodeeq	command opcode or number
+# opcodelt	number
+# opcodegt	number
+# eventeq	number
+# eventgt	number
+# eventlt	number
+
+#
+# 2 threads 1 lock with priority inversion
+#
+C: resetevent:		0: 	0
+W: opcodeeq:		0: 	0
+
+# Set schedulers
+C: schedother:		0: 	0
+W: opcodeeq:		0: 	0
+C: schedfifo:		1: 	80
+W: opcodeeq:		1: 	0
+
+# T0 lock L0
+C: locknowait:		0: 	0
+W: locked:		0: 	0
+
+# T1 lock L0
+C: locknowait:		1: 	0
+W: blocked:		1: 	0
+T: prioeq:		0: 	80
+
+# T0 unlock L0
+C: unlock:		0: 	0
+W: locked:		1: 	0
+
+# Verify T1
+W: unlocked:		0: 	0
+T: priolt:		0: 	1
+
+# Unlock and exit
+C: unlock:		1: 	0
+W: unlocked:		1: 	0
+
diff --git a/scripts/rt-tester/t2-l1-signal.tst b/scripts/rt-tester/t2-l1-signal.tst
new file mode 100644
index 000000000000..c47ba06af209
--- /dev/null
+++ b/scripts/rt-tester/t2-l1-signal.tst
@@ -0,0 +1,79 @@
+#
+# RT-Mutex test
+#
+# Op: C(ommand)/T(est)/W(ait)
+# |  opcode
+# |  |     threadid: 0-7
+# |  |     |  opcode argument
+# |  |     |  |
+# C: lock: 0: 0
+#
+# Commands
+#
+# opcode	opcode argument
+# schedother	nice value
+# schedfifo	priority
+# lock		lock nr (0-7)
+# locknowait	lock nr (0-7)
+# lockint	lock nr (0-7)
+# lockintnowait	lock nr (0-7)
+# lockcont	lock nr (0-7)
+# unlock	lock nr (0-7)
+# lockbkl	lock nr (0-7)
+# unlockbkl	lock nr (0-7)
+# signal	0
+# reset		0
+# resetevent	0
+#
+# Tests / Wait
+#
+# opcode	opcode argument
+#
+# prioeq	priority
+# priolt	priority
+# priogt	priority
+# nprioeq	normal priority
+# npriolt	normal priority
+# npriogt	normal priority
+# locked	lock nr (0-7)
+# blocked	lock nr (0-7)
+# blockedwake	lock nr (0-7)
+# unlocked	lock nr (0-7)
+# lockedbkl	dont care
+# blockedbkl	dont care
+# unlockedbkl	dont care
+# opcodeeq	command opcode or number
+# opcodelt	number
+# opcodegt	number
+# eventeq	number
+# eventgt	number
+# eventlt	number
+
+#
+# 2 threads 1 lock with priority inversion
+#
+C: resetevent:		0: 	0
+W: opcodeeq:		0: 	0
+
+# Set schedulers
+C: schedother:		0: 	0
+W: opcodeeq:		0: 	0
+C: schedother:		1: 	0
+W: opcodeeq:		1: 	0
+
+# T0 lock L0
+C: locknowait:		0: 	0
+W: locked:		0: 	0
+
+# T1 lock L0
+C: lockintnowait:	1: 	0
+W: blocked:		1: 	0
+
+# Interrupt T1
+C: signal:		1:	0
+W: unlocked:		1: 	0
+T: opcodeeq:		1:	-4
+
+# Unlock and exit
+C: unlock:		0: 	0
+W: unlocked:		0: 	0
diff --git a/scripts/rt-tester/t2-l2-2rt-deadlock.tst b/scripts/rt-tester/t2-l2-2rt-deadlock.tst
new file mode 100644
index 000000000000..0cee476b206e
--- /dev/null
+++ b/scripts/rt-tester/t2-l2-2rt-deadlock.tst
@@ -0,0 +1,91 @@
+#
+# RT-Mutex test
+#
+# Op: C(ommand)/T(est)/W(ait)
+# |  opcode
+# |  |     threadid: 0-7
+# |  |     |  opcode argument
+# |  |     |  |
+# C: lock: 0: 0
+#
+# Commands
+#
+# opcode	opcode argument
+# schedother	nice value
+# schedfifo	priority
+# lock		lock nr (0-7)
+# locknowait	lock nr (0-7)
+# lockint	lock nr (0-7)
+# lockintnowait	lock nr (0-7)
+# lockcont	lock nr (0-7)
+# unlock	lock nr (0-7)
+# lockbkl	lock nr (0-7)
+# unlockbkl	lock nr (0-7)
+# signal	0
+# reset		0
+# resetevent	0
+#
+# Tests / Wait
+#
+# opcode	opcode argument
+#
+# prioeq	priority
+# priolt	priority
+# priogt	priority
+# nprioeq	normal priority
+# npriolt	normal priority
+# npriogt	normal priority
+# locked	lock nr (0-7)
+# blocked	lock nr (0-7)
+# blockedwake	lock nr (0-7)
+# unlocked	lock nr (0-7)
+# lockedbkl	dont care
+# blockedbkl	dont care
+# unlockedbkl	dont care
+# opcodeeq	command opcode or number
+# opcodelt	number
+# opcodegt	number
+# eventeq	number
+# eventgt	number
+# eventlt	number
+
+#
+# 2 threads 2 lock
+#
+C: resetevent:		0: 	0
+W: opcodeeq:		0: 	0
+
+# Set schedulers
+C: schedfifo:		0: 	80
+W: opcodeeq:		0: 	0
+C: schedfifo:		1: 	80
+W: opcodeeq:		1: 	0
+
+# T0 lock L0
+C: locknowait:		0: 	0
+W: locked:		0: 	0
+
+# T1 lock L1
+C: locknowait:		1:	1
+W: locked:		1: 	1
+
+# T0 lock L1
+C: lockintnowait:	0: 	1
+W: blocked:		0: 	1
+
+# T1 lock L0
+C: lockintnowait:	1: 	0
+W: blocked:		1: 	0
+
+# Make deadlock go away
+C: signal:		1:	0
+W: unlocked:		1:	0
+C: signal:		0:	0
+W: unlocked:		0:	1
+
+# Unlock and exit
+C: unlock:		0: 	0
+W: unlocked:		0: 	0
+C: unlock:		1: 	1
+W: unlocked:		1: 	1
+
diff --git a/scripts/rt-tester/t3-l1-pi-1rt.tst b/scripts/rt-tester/t3-l1-pi-1rt.tst
new file mode 100644
index 000000000000..a5eaf7ed5d54
--- /dev/null
+++ b/scripts/rt-tester/t3-l1-pi-1rt.tst
@@ -0,0 +1,95 @@
+#
+# rt-mutex test
+#
+# Op: C(ommand)/T(est)/W(ait)
+# |  opcode
+# |  |     threadid: 0-7
+# |  |     |  opcode argument
+# |  |     |  |
+# C: lock: 0: 0
+#
+# Commands
+#
+# opcode	opcode argument
+# schedother	nice value
+# schedfifo	priority
+# lock		lock nr (0-7)
+# locknowait	lock nr (0-7)
+# lockint	lock nr (0-7)
+# lockintnowait	lock nr (0-7)
+# lockcont	lock nr (0-7)
+# unlock	lock nr (0-7)
+# lockbkl	lock nr (0-7)
+# unlockbkl	lock nr (0-7)
+# signal	thread to signal (0-7)
+# reset		0
+# resetevent	0
+#
+# Tests / Wait
+#
+# opcode	opcode argument
+#
+# prioeq	priority
+# priolt	priority
+# priogt	priority
+# nprioeq	normal priority
+# npriolt	normal priority
+# npriogt	normal priority
+# locked	lock nr (0-7)
+# blocked	lock nr (0-7)
+# blockedwake	lock nr (0-7)
+# unlocked	lock nr (0-7)
+# lockedbkl	dont care
+# blockedbkl	dont care
+# unlockedbkl	dont care
+# opcodeeq	command opcode or number
+# opcodelt	number
+# opcodegt	number
+# eventeq	number
+# eventgt	number
+# eventlt	number
+
+#
+# 3 threads 1 lock PI
+#
+C: resetevent:		0: 	0
+W: opcodeeq:		0: 	0
+
+# Set schedulers
+C: schedother:		0: 	0
+W: opcodeeq:		0: 	0
+C: schedother:		1: 	0
+W: opcodeeq:		1: 	0
+C: schedfifo:		2: 	82
+W: opcodeeq:		2: 	0
+
+# T0 lock L0
+C: locknowait:		0: 	0
+W: locked:		0: 	0
+
+# T1 lock L0
+C: locknowait:		1: 	0
+W: blocked:		1: 	0
+T: priolt:		0: 	1
+
+# T2 lock L0
+C: locknowait:		2: 	0
+W: blocked:		2: 	0
+T: prioeq:		0: 	82
+
+# T0 unlock L0
+C: unlock:		0: 	0
+
+# Wait until T2 got the lock
+W: locked:		2: 	0
+W: unlocked:		0:	0
+T: priolt:		0:	1
+
+# T2 unlock L0
+C: unlock:		2: 	0
+
+W: unlocked:		2: 	0
+W: locked:		1: 	0
+
+C: unlock:		1: 	0
+W: unlocked:		1: 	0
diff --git a/scripts/rt-tester/t3-l1-pi-2rt.tst b/scripts/rt-tester/t3-l1-pi-2rt.tst
new file mode 100644
index 000000000000..c622391a8afe
--- /dev/null
+++ b/scripts/rt-tester/t3-l1-pi-2rt.tst
@@ -0,0 +1,96 @@
+#
+# rt-mutex test
+#
+# Op: C(ommand)/T(est)/W(ait)
+# |  opcode
+# |  |     threadid: 0-7
+# |  |     |  opcode argument
+# |  |     |  |
+# C: lock: 0: 0
+#
+# Commands
+#
+# opcode	opcode argument
+# schedother	nice value
+# schedfifo	priority
+# lock		lock nr (0-7)
+# locknowait	lock nr (0-7)
+# lockint	lock nr (0-7)
+# lockintnowait	lock nr (0-7)
+# lockcont	lock nr (0-7)
+# unlock	lock nr (0-7)
+# lockbkl	lock nr (0-7)
+# unlockbkl	lock nr (0-7)
+# signal	thread to signal (0-7)
+# reset		0
+# resetevent	0
+#
+# Tests / Wait
+#
+# opcode	opcode argument
+#
+# prioeq	priority
+# priolt	priority
+# priogt	priority
+# nprioeq	normal priority
+# npriolt	normal priority
+# npriogt	normal priority
+# locked	lock nr (0-7)
+# blocked	lock nr (0-7)
+# blockedwake	lock nr (0-7)
+# unlocked	lock nr (0-7)
+# lockedbkl	dont care
+# blockedbkl	dont care
+# unlockedbkl	dont care
+# opcodeeq	command opcode or number
+# opcodelt	number
+# opcodegt	number
+# eventeq	number
+# eventgt	number
+# eventlt	number
+
+#
+# 3 threads 1 lock PI
+#
+C: resetevent:		0: 	0
+W: opcodeeq:		0: 	0
+
+# Set schedulers
+C: schedother:		0: 	0
+W: opcodeeq:		0: 	0
+C: schedfifo:		1: 	81
+W: opcodeeq:		1: 	0
+C: schedfifo:		2: 	82
+W: opcodeeq:		2: 	0
+
+# T0 lock L0
+C: locknowait:		0: 	0
+W: locked:		0: 	0
+
+# T1 lock L0
+C: locknowait:		1: 	0
+W: blocked:		1: 	0
+T: prioeq:		0: 	81
+
+# T2 lock L0
+C: locknowait:		2: 	0
+W: blocked:		2: 	0
+T: prioeq:		0: 	82
+T: prioeq:		1:	81
+
+# T0 unlock L0
+C: unlock:		0: 	0
+
+# Wait until T2 got the lock
+W: locked:		2: 	0
+W: unlocked:		0:	0
+T: priolt:		0:	1
+
+# T2 unlock L0
+C: unlock:		2: 	0
+
+W: unlocked:		2: 	0
+W: locked:		1: 	0
+
+C: unlock:		1: 	0
+W: unlocked:		1: 	0
diff --git a/scripts/rt-tester/t3-l1-pi-3rt.tst b/scripts/rt-tester/t3-l1-pi-3rt.tst
new file mode 100644
index 000000000000..b5057fb13e09
--- /dev/null
+++ b/scripts/rt-tester/t3-l1-pi-3rt.tst
@@ -0,0 +1,95 @@
+#
+# rt-mutex test
+#
+# Op: C(ommand)/T(est)/W(ait)
+# |  opcode
+# |  |     threadid: 0-7
+# |  |     |  opcode argument
+# |  |     |  |
+# C: lock: 0: 0
+#
+# Commands
+#
+# opcode	opcode argument
+# schedother	nice value
+# schedfifo	priority
+# lock		lock nr (0-7)
+# locknowait	lock nr (0-7)
+# lockint	lock nr (0-7)
+# lockintnowait	lock nr (0-7)
+# lockcont	lock nr (0-7)
+# unlock	lock nr (0-7)
+# lockbkl	lock nr (0-7)
+# unlockbkl	lock nr (0-7)
+# signal	thread to signal (0-7)
+# reset		0
+# resetevent	0
+#
+# Tests / Wait
+#
+# opcode	opcode argument
+#
+# prioeq	priority
+# priolt	priority
+# priogt	priority
+# nprioeq	normal priority
+# npriolt	normal priority
+# npriogt	normal priority
+# locked	lock nr (0-7)
+# blocked	lock nr (0-7)
+# blockedwake	lock nr (0-7)
+# unlocked	lock nr (0-7)
+# lockedbkl	dont care
+# blockedbkl	dont care
+# unlockedbkl	dont care
+# opcodeeq	command opcode or number
+# opcodelt	number
+# opcodegt	number
+# eventeq	number
+# eventgt	number
+# eventlt	number
+
+#
+# 3 threads 1 lock PI
+#
+C: resetevent:		0: 	0
+W: opcodeeq:		0: 	0
+
+# Set schedulers
+C: schedfifo:		0: 	80
+W: opcodeeq:		0: 	0
+C: schedfifo:		1: 	81
+W: opcodeeq:		1: 	0
+C: schedfifo:		2: 	82
+W: opcodeeq:		2: 	0
+
+# T0 lock L0
+C: locknowait:		0: 	0
+W: locked:		0: 	0
+
+# T1 lock L0
+C: locknowait:		1: 	0
+W: blocked:		1: 	0
+T: prioeq:		0: 	81
+
+# T2 lock L0
+C: locknowait:		2: 	0
+W: blocked:		2: 	0
+T: prioeq:		0: 	82
+
+# T0 unlock L0
+C: unlock:		0: 	0
+
+# Wait until T2 got the lock
+W: locked:		2: 	0
+W: unlocked:		0:	0
+T: prioeq:		0:	80
+
+# T2 unlock L0
+C: unlock:		2: 	0
+
+W: locked:		1: 	0
+W: unlocked:		2: 	0
+
+C: unlock:		1: 	0
+W: unlocked:		1: 	0
diff --git a/scripts/rt-tester/t3-l1-pi-signal.tst b/scripts/rt-tester/t3-l1-pi-signal.tst
new file mode 100644
index 000000000000..3e427daa33ff
--- /dev/null
+++ b/scripts/rt-tester/t3-l1-pi-signal.tst
@@ -0,0 +1,101 @@
+#
+# rt-mutex test
+#
+# Op: C(ommand)/T(est)/W(ait)
+# |  opcode
+# |  |     threadid: 0-7
+# |  |     |  opcode argument
+# |  |     |  |
+# C: lock: 0: 0
+#
+# Commands
+#
+# opcode	opcode argument
+# schedother	nice value
+# schedfifo	priority
+# lock		lock nr (0-7)
+# locknowait	lock nr (0-7)
+# lockint	lock nr (0-7)
+# lockintnowait	lock nr (0-7)
+# lockcont	lock nr (0-7)
+# unlock	lock nr (0-7)
+# lockbkl	lock nr (0-7)
+# unlockbkl	lock nr (0-7)
+# signal	thread to signal (0-7)
+# reset		0
+# resetevent	0
+#
+# Tests / Wait
+#
+# opcode	opcode argument
+#
+# prioeq	priority
+# priolt	priority
+# priogt	priority
+# nprioeq	normal priority
+# npriolt	normal priority
+# npriogt	normal priority
+# locked	lock nr (0-7)
+# blocked	lock nr (0-7)
+# blockedwake	lock nr (0-7)
+# unlocked	lock nr (0-7)
+# lockedbkl	dont care
+# blockedbkl	dont care
+# unlockedbkl	dont care
+# opcodeeq	command opcode or number
+# opcodelt	number
+# opcodegt	number
+# eventeq	number
+# eventgt	number
+# eventlt	number
+
+# Reset event counter
+C: resetevent:		0: 	0
+W: opcodeeq:		0: 	0
+
+# Set priorities
+C: schedother:		0: 	0
+W: opcodeeq:		0: 	0
+C: schedfifo:		1: 	80
+W: opcodeeq:		1: 	0
+C: schedfifo:		2: 	81
+W: opcodeeq:		2: 	0
+
+# T0 lock L0
+C: lock:		0:	0
+W: locked:		0: 	0
+
+# T1 lock L0, no wait in the wakeup path
+C: locknowait:		1: 	0
+W: blocked:		1: 	0
+T: prioeq:		0:	80
+T: prioeq:		1:	80
+
+# T2 lock L0 interruptible, no wait in the wakeup path
+C: lockintnowait:	2:	0
+W: blocked:		2: 	0
+T: prioeq:		0:	81
+T: prioeq:		1:	80
+
+# Interrupt T2
+C: signal:		2:	2
+W: unlocked:		2:	0
+T: prioeq:		1:	80
+T: prioeq:		0:	80
+
+T: locked:		0:	0
+T: blocked:		1:	0
+
+# T0 unlock L0
+C: unlock:		0: 	0
+
+# Wait until T1 has locked L0 and exit
+W: locked:		1:	0
+W: unlocked:		0: 	0
+T: priolt:		0:	1
+
+C: unlock:		1: 	0
+W: unlocked:		1: 	0
+
+
+
diff --git a/scripts/rt-tester/t3-l1-pi-steal.tst b/scripts/rt-tester/t3-l1-pi-steal.tst
new file mode 100644
index 000000000000..72c24a9e4be4
--- /dev/null
+++ b/scripts/rt-tester/t3-l1-pi-steal.tst
@@ -0,0 +1,99 @@
+#
+# rt-mutex test
+#
+# Op: C(ommand)/T(est)/W(ait)
+# |  opcode
+# |  |     threadid: 0-7
+# |  |     |  opcode argument
+# |  |     |  |
+# C: lock: 0: 0
+#
+# Commands
+#
+# opcode	opcode argument
+# schedother	nice value
+# schedfifo	priority
+# lock		lock nr (0-7)
+# locknowait	lock nr (0-7)
+# lockint	lock nr (0-7)
+# lockintnowait	lock nr (0-7)
+# lockcont	lock nr (0-7)
+# unlock	lock nr (0-7)
+# lockbkl	lock nr (0-7)
+# unlockbkl	lock nr (0-7)
+# signal	thread to signal (0-7)
+# reset		0
+# resetevent	0
+#
+# Tests / Wait
+#
+# opcode	opcode argument
+#
+# prioeq	priority
+# priolt	priority
+# priogt	priority
+# nprioeq	normal priority
+# npriolt	normal priority
+# npriogt	normal priority
+# locked	lock nr (0-7)
+# blocked	lock nr (0-7)
+# blockedwake	lock nr (0-7)
+# unlocked	lock nr (0-7)
+# lockedbkl	dont care
+# blockedbkl	dont care
+# unlockedbkl	dont care
+# opcodeeq	command opcode or number
+# opcodelt	number
+# opcodegt	number
+# eventeq	number
+# eventgt	number
+# eventlt	number
+
+#
+# 3 threads 1 lock PI steal pending ownership
+#
+C: resetevent:		0: 	0
+W: opcodeeq:		0: 	0
+
+# Set schedulers
+C: schedother:		0: 	0
+W: opcodeeq:		0: 	0
+C: schedfifo:		1: 	80
+W: opcodeeq:		1: 	0
+C: schedfifo:		2: 	81
+W: opcodeeq:		2: 	0
+
+# T0 lock L0
+C: lock:		0: 	0
+W: locked:		0: 	0
+
+# T1 lock L0
+C: lock:		1: 	0
+W: blocked:		1: 	0
+T: prioeq:		0: 	80
+
+# T0 unlock L0
+C: unlock:		0: 	0
+
+# Wait until T1 is in the wakeup loop
+W: blockedwake:		1: 	0
+T: priolt:		0: 	1
+
+# T2 lock L0
+C: lock:		2: 	0
+# T1 leave wakeup loop
+C: lockcont:		1: 	0
+
+# T2 must have the lock and T1 must be blocked
+W: locked:		2: 	0
+W: blocked:		1: 	0
+
+# T2 unlock L0
+C: unlock:		2: 	0
+
+# Wait until T1 is in the wakeup loop and let it run
+W: blockedwake:		1: 	0
+C: lockcont:		1: 	0
+W: locked:		1: 	0
+C: unlock:		1: 	0
+W: unlocked:		1: 	0
diff --git a/scripts/rt-tester/t3-l2-pi.tst b/scripts/rt-tester/t3-l2-pi.tst
new file mode 100644
index 000000000000..2ba0dced48fa
--- /dev/null
+++ b/scripts/rt-tester/t3-l2-pi.tst
@@ -0,0 +1,95 @@
+#
+# rt-mutex test
+#
+# Op: C(ommand)/T(est)/W(ait)
+# |  opcode
+# |  |     threadid: 0-7
+# |  |     |  opcode argument
+# |  |     |  |
+# C: lock: 0: 0
+#
+# Commands
+#
+# opcode	opcode argument
+# schedother	nice value
+# schedfifo	priority
+# lock		lock nr (0-7)
+# locknowait	lock nr (0-7)
+# lockint	lock nr (0-7)
+# lockintnowait	lock nr (0-7)
+# lockcont	lock nr (0-7)
+# unlock	lock nr (0-7)
+# lockbkl	lock nr (0-7)
+# unlockbkl	lock nr (0-7)
+# signal	thread to signal (0-7)
+# reset		0
+# resetevent	0
+#
+# Tests / Wait
+#
+# opcode	opcode argument
+#
+# prioeq	priority
+# priolt	priority
+# priogt	priority
+# nprioeq	normal priority
+# npriolt	normal priority
+# npriogt	normal priority
+# locked	lock nr (0-7)
+# blocked	lock nr (0-7)
+# blockedwake	lock nr (0-7)
+# unlocked	lock nr (0-7)
+# lockedbkl	dont care
+# blockedbkl	dont care
+# unlockedbkl	dont care
+# opcodeeq	command opcode or number
+# opcodelt	number
+# opcodegt	number
+# eventeq	number
+# eventgt	number
+# eventlt	number
+
+#
+# 3 threads 2 lock PI
+#
+C: resetevent:		0: 	0
+W: opcodeeq:		0: 	0
+
+# Set schedulers
+C: schedother:		0: 	0
+W: opcodeeq:		0: 	0
+C: schedother:		1: 	0
+W: opcodeeq:		1: 	0
+C: schedfifo:		2: 	82
+W: opcodeeq:		2: 	0
+
+# T0 lock L0
+C: locknowait:		0: 	0
+W: locked:		0: 	0
+
+# T1 lock L0
+C: locknowait:		1: 	0
+W: blocked:		1: 	0
+T: priolt:		0: 	1
+
+# T2 lock L0
+C: locknowait:		2: 	0
+W: blocked:		2: 	0
+T: prioeq:		0: 	82
+
+# T0 unlock L0
+C: unlock:		0: 	0
+
+# Wait until T2 got the lock
+W: locked:		2: 	0
+W: unlocked:		0:	0
+T: priolt:		0:	1
+
+# T2 unlock L0
+C: unlock:		2: 	0
+
+W: unlocked:		2: 	0
+W: locked:		1: 	0
+
+C: unlock:		1: 	0
+W: unlocked:		1: 	0
diff --git a/scripts/rt-tester/t4-l2-pi-deboost.tst b/scripts/rt-tester/t4-l2-pi-deboost.tst
new file mode 100644
index 000000000000..01f1a80fa02a
--- /dev/null
+++ b/scripts/rt-tester/t4-l2-pi-deboost.tst
@@ -0,0 +1,127 @@
+#
+# rt-mutex test
+#
+# Op: C(ommand)/T(est)/W(ait)
+# |  opcode
+# |  |     threadid: 0-7
+# |  |     |  opcode argument
+# |  |     |  |
+# C: lock: 0: 0
+#
+# Commands
+#
+# opcode	opcode argument
+# schedother	nice value
+# schedfifo	priority
+# lock		lock nr (0-7)
+# locknowait	lock nr (0-7)
+# lockint	lock nr (0-7)
+# lockintnowait	lock nr (0-7)
+# lockcont	lock nr (0-7)
+# unlock	lock nr (0-7)
+# lockbkl	lock nr (0-7)
+# unlockbkl	lock nr (0-7)
+# signal	thread to signal (0-7)
+# reset		0
+# resetevent	0
+#
+# Tests / Wait
+#
+# opcode	opcode argument
+#
+# prioeq	priority
+# priolt	priority
+# priogt	priority
+# nprioeq	normal priority
+# npriolt	normal priority
+# npriogt	normal priority
+# locked	lock nr (0-7)
+# blocked	lock nr (0-7)
+# blockedwake	lock nr (0-7)
+# unlocked	lock nr (0-7)
+# lockedbkl	dont care
+# blockedbkl	dont care
+# unlockedbkl	dont care
+# opcodeeq	command opcode or number
+# opcodelt	number
+# opcodegt	number
+# eventeq	number
+# eventgt	number
+# eventlt	number
+
+#
+# 4 threads 2 lock PI
+#
+C: resetevent:		0: 	0
+W: opcodeeq:		0: 	0
+
+# Set schedulers
+C: schedother:		0: 	0
+W: opcodeeq:		0: 	0
+C: schedother:		1: 	0
+W: opcodeeq:		1: 	0
+C: schedfifo:		2: 	82
+W: opcodeeq:		2: 	0
+C: schedfifo:		3: 	83
+W: opcodeeq:		3: 	0
+
+# T0 lock L0
+C: locknowait:		0: 	0
+W: locked:		0: 	0
+
+# T1 lock L1
+C: locknowait:		1: 	1
+W: locked:		1: 	1
+
+# T3 lock L0
+C: lockintnowait:	3: 	0
+W: blocked:		3: 	0
+T: prioeq:		0: 	83
+
+# T0 lock L1
+C: lock:		0: 	1
+W: blocked:		0: 	1
+T: prioeq:		1: 	83
+
+# T1 unlock L1
+C: unlock:		1:	1
+
+# Wait until T0 is in the wakeup code
+W: blockedwake:		0:	1
+
+# Verify that T1 is unboosted
+W: unlocked:		1: 	1
+T: priolt:		1: 	1
+
+# T2 lock L1 (T0 is boosted and pending owner !)
+C: locknowait:		2:	1
+W: blocked:		2: 	1
+T: prioeq:		0: 	83
+
+# Interrupt T3 and wait until T3 returned
+C: signal:		3:	0
+W: unlocked:		3:	0
+
+# Verify prio of T0 (still pending owner,
+# but T2 is enqueued due to the previous boost by T3
+T: prioeq:		0:	82
+
+# Let T0 continue
+C: lockcont:		0:	1
+W: locked:		0:	1
+
+# Unlock L1 and let T2 get L1
+C: unlock:		0:	1
+W: locked:		2:	1
+
+# Verify that T0 is unboosted
+W: unlocked:		0:	1
+T: priolt:		0:	1
+
+# Unlock everything and exit
+C: unlock:		2:	1
+W: unlocked:		2:	1
+
+C: unlock:		0:	0
+W: unlocked:		0:	0
+
diff --git a/scripts/rt-tester/t5-l4-pi-boost-deboost.tst b/scripts/rt-tester/t5-l4-pi-boost-deboost.tst
new file mode 100644
index 000000000000..efa0788c1189
--- /dev/null
+++ b/scripts/rt-tester/t5-l4-pi-boost-deboost.tst
@@ -0,0 +1,148 @@
+#
+# rt-mutex test
+#
+# Op: C(ommand)/T(est)/W(ait)
+# |  opcode
+# |  |     threadid: 0-7
+# |  |     |  opcode argument
+# |  |     |  |
+# C: lock: 0: 0
+#
+# Commands
+#
+# opcode	opcode argument
+# schedother	nice value
+# schedfifo	priority
+# lock		lock nr (0-7)
+# locknowait	lock nr (0-7)
+# lockint	lock nr (0-7)
+# lockintnowait	lock nr (0-7)
+# lockcont	lock nr (0-7)
+# unlock	lock nr (0-7)
+# lockbkl	lock nr (0-7)
+# unlockbkl	lock nr (0-7)
+# signal	thread to signal (0-7)
+# reset		0
+# resetevent	0
+#
+# Tests / Wait
+#
+# opcode	opcode argument
+#
+# prioeq	priority
+# priolt	priority
+# priogt	priority
+# nprioeq	normal priority
+# npriolt	normal priority
+# npriogt	normal priority
+# locked	lock nr (0-7)
+# blocked	lock nr (0-7)
+# blockedwake	lock nr (0-7)
+# unlocked	lock nr (0-7)
+# lockedbkl	dont care
+# blockedbkl	dont care
+# unlockedbkl	dont care
+# opcodeeq	command opcode or number
+# opcodelt	number
+# opcodegt	number
+# eventeq	number
+# eventgt	number
+# eventlt	number
+
+#
+# 5 threads 4 lock PI
+#
+C: resetevent:		0: 	0
+W: opcodeeq:		0: 	0
+
+# Set schedulers
+C: schedother:		0: 	0
+W: opcodeeq:		0: 	0
+C: schedfifo:		1: 	81
+W: opcodeeq:		1: 	0
+C: schedfifo:		2: 	82
+W: opcodeeq:		2: 	0
+C: schedfifo:		3: 	83
+W: opcodeeq:		3: 	0
+C: schedfifo:		4: 	84
+W: opcodeeq:		4: 	0
+
+# T0 lock L0
+C: locknowait:		0: 	0
+W: locked:		0: 	0
+
+# T1 lock L1
+C: locknowait:		1: 	1
+W: locked:		1: 	1
+
+# T1 lock L0
+C: lockintnowait:	1: 	0
+W: blocked:		1: 	0
+T: prioeq:		0: 	81
+
+# T2 lock L2
+C: locknowait:		2: 	2
+W: locked:		2: 	2
+
+# T2 lock L1
+C: lockintnowait:	2: 	1
+W: blocked:		2: 	1
+T: prioeq:		0: 	82
+T: prioeq:		1:	82
+
+# T3 lock L3
+C: locknowait:		3: 	3
+W: locked:		3: 	3
+
+# T3 lock L2
+C: lockintnowait:	3: 	2
+W: blocked:		3: 	2
+T: prioeq:		0: 	83
+T: prioeq:		1:	83
+T: prioeq:		2:	83
+
+# T4 lock L3
+C: lockintnowait:	4:	3
+W: blocked:		4: 	3
+T: prioeq:		0: 	84
+T: prioeq:		1:	84
+T: prioeq:		2:	84
+T: prioeq:		3:	84
+
+# Signal T4
+C: signal:		4: 	0
+W: unlocked:		4: 	3
+T: prioeq:		0: 	83
+T: prioeq:		1:	83
+T: prioeq:		2:	83
+T: prioeq:		3:	83
+
+# Signal T3
+C: signal:		3: 	0
+W: unlocked:		3: 	2
+T: prioeq:		0: 	82
+T: prioeq:		1:	82
+T: prioeq:		2:	82
+
+# Signal T2
+C: signal:		2: 	0
+W: unlocked:		2: 	1
+T: prioeq:		0: 	81
+T: prioeq:		1:	81
+
+# Signal T1
+C: signal:		1: 	0
+W: unlocked:		1: 	0
+T: priolt:		0: 	1
+
+# Unlock and exit
+C: unlock:		3:	3
+C: unlock:		2:	2
+C: unlock:		1:	1
+C: unlock:		0:	0
+
+W: unlocked:		3:	3
+W: unlocked:		2:	2
+W: unlocked:		1:	1
+W: unlocked:		0:	0
+
-- 
cgit v1.2.3


From c87e2837be82df479a6bae9f155c43516d2feebc Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 27 Jun 2006 02:54:58 -0700
Subject: [PATCH] pi-futex: futex_lock_pi/futex_unlock_pi support

This adds the actual pi-futex implementation, based on rt-mutexes.

[dino@in.ibm.com: fix an oops-causing race]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Dinakar Guniguntala <dino@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/futex.h   |   7 +
 include/linux/sched.h   |   3 +
 kernel/exit.c           |   8 +
 kernel/fork.c           |   3 +
 kernel/futex.c          | 829 +++++++++++++++++++++++++++++++++++++++++++++---
 kernel/futex_compat.c   |  11 +-
 kernel/rtmutex_common.h |   8 +
 7 files changed, 828 insertions(+), 41 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/futex.h b/include/linux/futex.h
index f05a3f469322..34c3a215f2cd 100644
--- a/include/linux/futex.h
+++ b/include/linux/futex.h
@@ -12,6 +12,9 @@
 #define FUTEX_REQUEUE		3
 #define FUTEX_CMP_REQUEUE	4
 #define FUTEX_WAKE_OP		5
+#define FUTEX_LOCK_PI		6
+#define FUTEX_UNLOCK_PI		7
+#define FUTEX_TRYLOCK_PI	8
 
 /*
  * Support for robust futexes: the kernel cleans up held futexes at
@@ -97,10 +100,14 @@ extern int handle_futex_death(u32 __user *uaddr, struct task_struct *curr);
 
 #ifdef CONFIG_FUTEX
 extern void exit_robust_list(struct task_struct *curr);
+extern void exit_pi_state_list(struct task_struct *curr);
 #else
 static inline void exit_robust_list(struct task_struct *curr)
 {
 }
+static inline void exit_pi_state_list(struct task_struct *curr)
+{
+}
 #endif
 
 #define FUTEX_OP_SET		0	/* *(int *)UADDR2 = OPARG; */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index edadd13cf53f..b4e6be7de5ad 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -84,6 +84,7 @@ struct sched_param {
 #include <asm/processor.h>
 
 struct exec_domain;
+struct futex_pi_state;
 
 /*
  * List of flags we want to share for kernel threads,
@@ -915,6 +916,8 @@ struct task_struct {
 #ifdef CONFIG_COMPAT
 	struct compat_robust_list_head __user *compat_robust_list;
 #endif
+	struct list_head pi_state_list;
+	struct futex_pi_state *pi_state_cache;
 
 	atomic_t fs_excl;	/* holding fs exclusive resources */
 	struct rcu_head rcu;
diff --git a/kernel/exit.c b/kernel/exit.c
index 3e8a0282e9a5..ab06b9f88f64 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -925,6 +925,14 @@ fastcall NORET_TYPE void do_exit(long code)
 	mpol_free(tsk->mempolicy);
 	tsk->mempolicy = NULL;
 #endif
+	/*
+	 * This must happen late, after the PID is not
+	 * hashed anymore:
+	 */
+	if (unlikely(!list_empty(&tsk->pi_state_list)))
+		exit_pi_state_list(tsk);
+	if (unlikely(current->pi_state_cache))
+		kfree(current->pi_state_cache);
 	/*
 	 * If DEBUG_MUTEXES is on, make sure we are holding no locks:
 	 */
diff --git a/kernel/fork.c b/kernel/fork.c
index b664a081fffa..628198a4f28a 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1092,6 +1092,9 @@ static task_t *copy_process(unsigned long clone_flags,
 #ifdef CONFIG_COMPAT
 	p->compat_robust_list = NULL;
 #endif
+	INIT_LIST_HEAD(&p->pi_state_list);
+	p->pi_state_cache = NULL;
+
 	/*
 	 * sigaltstack should be cleared when sharing the same VM
 	 */
diff --git a/kernel/futex.c b/kernel/futex.c
index 50356fb5d726..b305b7f8dad5 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -12,6 +12,10 @@
  *  (C) Copyright 2006 Red Hat Inc, All Rights Reserved
  *  Thanks to Thomas Gleixner for suggestions, analysis and fixes.
  *
+ *  PI-futex support started by Ingo Molnar and Thomas Gleixner
+ *  Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ *  Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
+ *
  *  Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
  *  enough at me, Linus for the original (flawed) idea, Matthew
  *  Kirkwood for proof-of-concept implementation.
@@ -46,6 +50,8 @@
 #include <linux/signal.h>
 #include <asm/futex.h>
 
+#include "rtmutex_common.h"
+
 #define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8)
 
 /*
@@ -74,6 +80,27 @@ union futex_key {
 	} both;
 };
 
+/*
+ * Priority Inheritance state:
+ */
+struct futex_pi_state {
+	/*
+	 * list of 'owned' pi_state instances - these have to be
+	 * cleaned up in do_exit() if the task exits prematurely:
+	 */
+	struct list_head list;
+
+	/*
+	 * The PI object:
+	 */
+	struct rt_mutex pi_mutex;
+
+	struct task_struct *owner;
+	atomic_t refcount;
+
+	union futex_key key;
+};
+
 /*
  * We use this hashed waitqueue instead of a normal wait_queue_t, so
  * we can wake only the relevant ones (hashed queues may be shared).
@@ -96,6 +123,10 @@ struct futex_q {
 	/* For fd, sigio sent using these: */
 	int fd;
 	struct file *filp;
+
+	/* Optional priority inheritance state: */
+	struct futex_pi_state *pi_state;
+	struct task_struct *task;
 };
 
 /*
@@ -258,6 +289,232 @@ static inline int get_futex_value_locked(u32 *dest, u32 __user *from)
 	return ret ? -EFAULT : 0;
 }
 
+/*
+ * Fault handling. Called with current->mm->mmap_sem held.
+ */
+static int futex_handle_fault(unsigned long address, int attempt)
+{
+	struct vm_area_struct * vma;
+	struct mm_struct *mm = current->mm;
+
+	if (attempt >= 2 || !(vma = find_vma(mm, address)) ||
+	    vma->vm_start > address || !(vma->vm_flags & VM_WRITE))
+		return -EFAULT;
+
+	switch (handle_mm_fault(mm, vma, address, 1)) {
+	case VM_FAULT_MINOR:
+		current->min_flt++;
+		break;
+	case VM_FAULT_MAJOR:
+		current->maj_flt++;
+		break;
+	default:
+		return -EFAULT;
+	}
+	return 0;
+}
+
+/*
+ * PI code:
+ */
+static int refill_pi_state_cache(void)
+{
+	struct futex_pi_state *pi_state;
+
+	if (likely(current->pi_state_cache))
+		return 0;
+
+	pi_state = kmalloc(sizeof(*pi_state), GFP_KERNEL);
+
+	if (!pi_state)
+		return -ENOMEM;
+
+	memset(pi_state, 0, sizeof(*pi_state));
+	INIT_LIST_HEAD(&pi_state->list);
+	/* pi_mutex gets initialized later */
+	pi_state->owner = NULL;
+	atomic_set(&pi_state->refcount, 1);
+
+	current->pi_state_cache = pi_state;
+
+	return 0;
+}
+
+static struct futex_pi_state * alloc_pi_state(void)
+{
+	struct futex_pi_state *pi_state = current->pi_state_cache;
+
+	WARN_ON(!pi_state);
+	current->pi_state_cache = NULL;
+
+	return pi_state;
+}
+
+static void free_pi_state(struct futex_pi_state *pi_state)
+{
+	if (!atomic_dec_and_test(&pi_state->refcount))
+		return;
+
+	/*
+	 * If pi_state->owner is NULL, the owner is most probably dying
+	 * and has cleaned up the pi_state already
+	 */
+	if (pi_state->owner) {
+		spin_lock_irq(&pi_state->owner->pi_lock);
+		list_del_init(&pi_state->list);
+		spin_unlock_irq(&pi_state->owner->pi_lock);
+
+		rt_mutex_proxy_unlock(&pi_state->pi_mutex, pi_state->owner);
+	}
+
+	if (current->pi_state_cache)
+		kfree(pi_state);
+	else {
+		/*
+		 * pi_state->list is already empty.
+		 * clear pi_state->owner.
+		 * refcount is at 0 - put it back to 1.
+		 */
+		pi_state->owner = NULL;
+		atomic_set(&pi_state->refcount, 1);
+		current->pi_state_cache = pi_state;
+	}
+}
+
+/*
+ * Look up the task based on what TID userspace gave us.
+ * We dont trust it.
+ */
+static struct task_struct * futex_find_get_task(pid_t pid)
+{
+	struct task_struct *p;
+
+	read_lock(&tasklist_lock);
+	p = find_task_by_pid(pid);
+	if (!p)
+		goto out_unlock;
+	if ((current->euid != p->euid) && (current->euid != p->uid)) {
+		p = NULL;
+		goto out_unlock;
+	}
+	if (p->state == EXIT_ZOMBIE || p->exit_state == EXIT_ZOMBIE) {
+		p = NULL;
+		goto out_unlock;
+	}
+	get_task_struct(p);
+out_unlock:
+	read_unlock(&tasklist_lock);
+
+	return p;
+}
+
+/*
+ * This task is holding PI mutexes at exit time => bad.
+ * Kernel cleans up PI-state, but userspace is likely hosed.
+ * (Robust-futex cleanup is separate and might save the day for userspace.)
+ */
+void exit_pi_state_list(struct task_struct *curr)
+{
+	struct futex_hash_bucket *hb;
+	struct list_head *next, *head = &curr->pi_state_list;
+	struct futex_pi_state *pi_state;
+	union futex_key key;
+
+	/*
+	 * We are a ZOMBIE and nobody can enqueue itself on
+	 * pi_state_list anymore, but we have to be careful
+	 * versus waiters unqueueing themselfs
+	 */
+	spin_lock_irq(&curr->pi_lock);
+	while (!list_empty(head)) {
+
+		next = head->next;
+		pi_state = list_entry(next, struct futex_pi_state, list);
+		key = pi_state->key;
+		spin_unlock_irq(&curr->pi_lock);
+
+		hb = hash_futex(&key);
+		spin_lock(&hb->lock);
+
+		spin_lock_irq(&curr->pi_lock);
+		if (head->next != next) {
+			spin_unlock(&hb->lock);
+			continue;
+		}
+
+		list_del_init(&pi_state->list);
+
+		WARN_ON(pi_state->owner != curr);
+
+		pi_state->owner = NULL;
+		spin_unlock_irq(&curr->pi_lock);
+
+		rt_mutex_unlock(&pi_state->pi_mutex);
+
+		spin_unlock(&hb->lock);
+
+		spin_lock_irq(&curr->pi_lock);
+	}
+	spin_unlock_irq(&curr->pi_lock);
+}
+
+static int
+lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
+{
+	struct futex_pi_state *pi_state = NULL;
+	struct futex_q *this, *next;
+	struct list_head *head;
+	struct task_struct *p;
+	pid_t pid;
+
+	head = &hb->chain;
+
+	list_for_each_entry_safe(this, next, head, list) {
+		if (match_futex (&this->key, &me->key)) {
+			/*
+			 * Another waiter already exists - bump up
+			 * the refcount and return its pi_state:
+			 */
+			pi_state = this->pi_state;
+			atomic_inc(&pi_state->refcount);
+			me->pi_state = pi_state;
+
+			return 0;
+		}
+	}
+
+	/*
+	 * We are the first waiter - try to look up the real owner and
+	 * attach the new pi_state to it:
+	 */
+	pid = uval & FUTEX_TID_MASK;
+	p = futex_find_get_task(pid);
+	if (!p)
+		return -ESRCH;
+
+	pi_state = alloc_pi_state();
+
+	/*
+	 * Initialize the pi_mutex in locked state and make 'p'
+	 * the owner of it:
+	 */
+	rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);
+
+	/* Store the key for possible exit cleanups: */
+	pi_state->key = me->key;
+
+	spin_lock_irq(&p->pi_lock);
+	list_add(&pi_state->list, &p->pi_state_list);
+	pi_state->owner = p;
+	spin_unlock_irq(&p->pi_lock);
+
+	put_task_struct(p);
+
+	me->pi_state = pi_state;
+
+	return 0;
+}
+
 /*
  * The hash bucket lock must be held when this is called.
  * Afterwards, the futex_q must not be accessed.
@@ -285,6 +542,70 @@ static void wake_futex(struct futex_q *q)
 	q->lock_ptr = NULL;
 }
 
+static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
+{
+	struct task_struct *new_owner;
+	struct futex_pi_state *pi_state = this->pi_state;
+	u32 curval, newval;
+
+	if (!pi_state)
+		return -EINVAL;
+
+	new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
+
+	/*
+	 * This happens when we have stolen the lock and the original
+	 * pending owner did not enqueue itself back on the rt_mutex.
+	 * Thats not a tragedy. We know that way, that a lock waiter
+	 * is on the fly. We make the futex_q waiter the pending owner.
+	 */
+	if (!new_owner)
+		new_owner = this->task;
+
+	/*
+	 * We pass it to the next owner. (The WAITERS bit is always
+	 * kept enabled while there is PI state around. We must also
+	 * preserve the owner died bit.)
+	 */
+	newval = (uval & FUTEX_OWNER_DIED) | FUTEX_WAITERS | new_owner->pid;
+
+	inc_preempt_count();
+	curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
+	dec_preempt_count();
+
+	if (curval == -EFAULT)
+		return -EFAULT;
+	if (curval != uval)
+		return -EINVAL;
+
+	list_del_init(&pi_state->owner->pi_state_list);
+	list_add(&pi_state->list, &new_owner->pi_state_list);
+	pi_state->owner = new_owner;
+	rt_mutex_unlock(&pi_state->pi_mutex);
+
+	return 0;
+}
+
+static int unlock_futex_pi(u32 __user *uaddr, u32 uval)
+{
+	u32 oldval;
+
+	/*
+	 * There is no waiter, so we unlock the futex. The owner died
+	 * bit has not to be preserved here. We are the owner:
+	 */
+	inc_preempt_count();
+	oldval = futex_atomic_cmpxchg_inatomic(uaddr, uval, 0);
+	dec_preempt_count();
+
+	if (oldval == -EFAULT)
+		return oldval;
+	if (oldval != uval)
+		return -EAGAIN;
+
+	return 0;
+}
+
 /*
  * Wake up all waiters hashed on the physical page that is mapped
  * to this virtual address:
@@ -309,6 +630,8 @@ static int futex_wake(u32 __user *uaddr, int nr_wake)
 
 	list_for_each_entry_safe(this, next, head, list) {
 		if (match_futex (&this->key, &key)) {
+			if (this->pi_state)
+				return -EINVAL;
 			wake_futex(this);
 			if (++ret >= nr_wake)
 				break;
@@ -385,27 +708,9 @@ retry:
 		 * still holding the mmap_sem.
 		 */
 		if (attempt++) {
-			struct vm_area_struct * vma;
-			struct mm_struct *mm = current->mm;
-			unsigned long address = (unsigned long)uaddr2;
-
-			ret = -EFAULT;
-			if (attempt >= 2 ||
-			    !(vma = find_vma(mm, address)) ||
-			    vma->vm_start > address ||
-			    !(vma->vm_flags & VM_WRITE))
+			if (futex_handle_fault((unsigned long)uaddr2,
+					       attempt))
 				goto out;
-
-			switch (handle_mm_fault(mm, vma, address, 1)) {
-			case VM_FAULT_MINOR:
-				current->min_flt++;
-				break;
-			case VM_FAULT_MAJOR:
-				current->maj_flt++;
-				break;
-			default:
-				goto out;
-			}
 			goto retry;
 		}
 
@@ -572,6 +877,7 @@ queue_lock(struct futex_q *q, int fd, struct file *filp)
 static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
 {
 	list_add_tail(&q->list, &hb->chain);
+	q->task = current;
 	spin_unlock(&hb->lock);
 }
 
@@ -626,6 +932,9 @@ static int unqueue_me(struct futex_q *q)
 		}
 		WARN_ON(list_empty(&q->list));
 		list_del(&q->list);
+
+		BUG_ON(q->pi_state);
+
 		spin_unlock(lock_ptr);
 		ret = 1;
 	}
@@ -634,16 +943,36 @@ static int unqueue_me(struct futex_q *q)
 	return ret;
 }
 
+/*
+ * PI futexes can not be requeued and must remove themself from the
+ * hash bucket. The hash bucket lock is held on entry and dropped here.
+ */
+static void unqueue_me_pi(struct futex_q *q, struct futex_hash_bucket *hb)
+{
+	WARN_ON(list_empty(&q->list));
+	list_del(&q->list);
+
+	BUG_ON(!q->pi_state);
+	free_pi_state(q->pi_state);
+	q->pi_state = NULL;
+
+	spin_unlock(&hb->lock);
+
+	drop_key_refs(&q->key);
+}
+
 static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time)
 {
-	DECLARE_WAITQUEUE(wait, current);
+	struct task_struct *curr = current;
+	DECLARE_WAITQUEUE(wait, curr);
 	struct futex_hash_bucket *hb;
 	struct futex_q q;
 	u32 uval;
 	int ret;
 
+	q.pi_state = NULL;
  retry:
-	down_read(&current->mm->mmap_sem);
+	down_read(&curr->mm->mmap_sem);
 
 	ret = get_futex_key(uaddr, &q.key);
 	if (unlikely(ret != 0))
@@ -680,7 +1009,7 @@ static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time)
 		 * If we would have faulted, release mmap_sem, fault it in and
 		 * start all over again.
 		 */
-		up_read(&current->mm->mmap_sem);
+		up_read(&curr->mm->mmap_sem);
 
 		ret = get_user(uval, uaddr);
 
@@ -688,11 +1017,9 @@ static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time)
 			goto retry;
 		return ret;
 	}
-	if (uval != val) {
-		ret = -EWOULDBLOCK;
-		queue_unlock(&q, hb);
-		goto out_release_sem;
-	}
+	ret = -EWOULDBLOCK;
+	if (uval != val)
+		goto out_unlock_release_sem;
 
 	/* Only actually queue if *uaddr contained val.  */
 	__queue_me(&q, hb);
@@ -700,8 +1027,8 @@ static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time)
 	/*
 	 * Now the futex is queued and we have checked the data, we
 	 * don't want to hold mmap_sem while we sleep.
-	 */	
-	up_read(&current->mm->mmap_sem);
+	 */
+	up_read(&curr->mm->mmap_sem);
 
 	/*
 	 * There might have been scheduling since the queue_me(), as we
@@ -739,8 +1066,415 @@ static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time)
 	 */
 	return -EINTR;
 
+ out_unlock_release_sem:
+	queue_unlock(&q, hb);
+
  out_release_sem:
+	up_read(&curr->mm->mmap_sem);
+	return ret;
+}
+
+/*
+ * Userspace tried a 0 -> TID atomic transition of the futex value
+ * and failed. The kernel side here does the whole locking operation:
+ * if there are waiters then it will block, it does PI, etc. (Due to
+ * races the kernel might see a 0 value of the futex too.)
+ */
+static int do_futex_lock_pi(u32 __user *uaddr, int detect, int trylock,
+			    struct hrtimer_sleeper *to)
+{
+	struct task_struct *curr = current;
+	struct futex_hash_bucket *hb;
+	u32 uval, newval, curval;
+	struct futex_q q;
+	int ret, attempt = 0;
+
+	if (refill_pi_state_cache())
+		return -ENOMEM;
+
+	q.pi_state = NULL;
+ retry:
+	down_read(&curr->mm->mmap_sem);
+
+	ret = get_futex_key(uaddr, &q.key);
+	if (unlikely(ret != 0))
+		goto out_release_sem;
+
+	hb = queue_lock(&q, -1, NULL);
+
+ retry_locked:
+	/*
+	 * To avoid races, we attempt to take the lock here again
+	 * (by doing a 0 -> TID atomic cmpxchg), while holding all
+	 * the locks. It will most likely not succeed.
+	 */
+	newval = current->pid;
+
+	inc_preempt_count();
+	curval = futex_atomic_cmpxchg_inatomic(uaddr, 0, newval);
+	dec_preempt_count();
+
+	if (unlikely(curval == -EFAULT))
+		goto uaddr_faulted;
+
+	/* We own the lock already */
+	if (unlikely((curval & FUTEX_TID_MASK) == current->pid)) {
+		if (!detect && 0)
+			force_sig(SIGKILL, current);
+		ret = -EDEADLK;
+		goto out_unlock_release_sem;
+	}
+
+	/*
+	 * Surprise - we got the lock. Just return
+	 * to userspace:
+	 */
+	if (unlikely(!curval))
+		goto out_unlock_release_sem;
+
+	uval = curval;
+	newval = uval | FUTEX_WAITERS;
+
+	inc_preempt_count();
+	curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
+	dec_preempt_count();
+
+	if (unlikely(curval == -EFAULT))
+		goto uaddr_faulted;
+	if (unlikely(curval != uval))
+		goto retry_locked;
+
+	/*
+	 * We dont have the lock. Look up the PI state (or create it if
+	 * we are the first waiter):
+	 */
+	ret = lookup_pi_state(uval, hb, &q);
+
+	if (unlikely(ret)) {
+		/*
+		 * There were no waiters and the owner task lookup
+		 * failed. When the OWNER_DIED bit is set, then we
+		 * know that this is a robust futex and we actually
+		 * take the lock. This is safe as we are protected by
+		 * the hash bucket lock. We also set the waiters bit
+		 * unconditionally here, to simplify glibc handling of
+		 * multiple tasks racing to acquire the lock and
+		 * cleanup the problems which were left by the dead
+		 * owner.
+		 */
+		if (curval & FUTEX_OWNER_DIED) {
+			uval = newval;
+			newval = current->pid |
+				FUTEX_OWNER_DIED | FUTEX_WAITERS;
+
+			inc_preempt_count();
+			curval = futex_atomic_cmpxchg_inatomic(uaddr,
+							       uval, newval);
+			dec_preempt_count();
+
+			if (unlikely(curval == -EFAULT))
+				goto uaddr_faulted;
+			if (unlikely(curval != uval))
+				goto retry_locked;
+			ret = 0;
+		}
+		goto out_unlock_release_sem;
+	}
+
+	/*
+	 * Only actually queue now that the atomic ops are done:
+	 */
+	__queue_me(&q, hb);
+
+	/*
+	 * Now the futex is queued and we have checked the data, we
+	 * don't want to hold mmap_sem while we sleep.
+	 */
+	up_read(&curr->mm->mmap_sem);
+
+	WARN_ON(!q.pi_state);
+	/*
+	 * Block on the PI mutex:
+	 */
+	if (!trylock)
+		ret = rt_mutex_timed_lock(&q.pi_state->pi_mutex, to, 1);
+	else {
+		ret = rt_mutex_trylock(&q.pi_state->pi_mutex);
+		/* Fixup the trylock return value: */
+		ret = ret ? 0 : -EWOULDBLOCK;
+	}
+
+	down_read(&curr->mm->mmap_sem);
+	hb = queue_lock(&q, -1, NULL);
+
+	/*
+	 * Got the lock. We might not be the anticipated owner if we
+	 * did a lock-steal - fix up the PI-state in that case.
+	 */
+	if (!ret && q.pi_state->owner != curr) {
+		u32 newtid = current->pid | FUTEX_WAITERS;
+
+		/* Owner died? */
+		if (q.pi_state->owner != NULL) {
+			spin_lock_irq(&q.pi_state->owner->pi_lock);
+			list_del_init(&q.pi_state->list);
+			spin_unlock_irq(&q.pi_state->owner->pi_lock);
+		} else
+			newtid |= FUTEX_OWNER_DIED;
+
+		q.pi_state->owner = current;
+
+		spin_lock_irq(&current->pi_lock);
+		list_add(&q.pi_state->list, &current->pi_state_list);
+		spin_unlock_irq(&current->pi_lock);
+
+		/* Unqueue and drop the lock */
+		unqueue_me_pi(&q, hb);
+		up_read(&curr->mm->mmap_sem);
+		/*
+		 * We own it, so we have to replace the pending owner
+		 * TID. This must be atomic as we have preserve the
+		 * owner died bit here.
+		 */
+		ret = get_user(uval, uaddr);
+		while (!ret) {
+			newval = (uval & FUTEX_OWNER_DIED) | newtid;
+			curval = futex_atomic_cmpxchg_inatomic(uaddr,
+							       uval, newval);
+			if (curval == -EFAULT)
+				ret = -EFAULT;
+			if (curval == uval)
+				break;
+			uval = curval;
+		}
+	} else {
+		/*
+		 * Catch the rare case, where the lock was released
+		 * when we were on the way back before we locked
+		 * the hash bucket.
+		 */
+		if (ret && q.pi_state->owner == curr) {
+			if (rt_mutex_trylock(&q.pi_state->pi_mutex))
+				ret = 0;
+		}
+		/* Unqueue and drop the lock */
+		unqueue_me_pi(&q, hb);
+		up_read(&curr->mm->mmap_sem);
+	}
+
+	if (!detect && ret == -EDEADLK && 0)
+		force_sig(SIGKILL, current);
+
+	return ret;
+
+ out_unlock_release_sem:
+	queue_unlock(&q, hb);
+
+ out_release_sem:
+	up_read(&curr->mm->mmap_sem);
+	return ret;
+
+ uaddr_faulted:
+	/*
+	 * We have to r/w  *(int __user *)uaddr, but we can't modify it
+	 * non-atomically.  Therefore, if get_user below is not
+	 * enough, we need to handle the fault ourselves, while
+	 * still holding the mmap_sem.
+	 */
+	if (attempt++) {
+		if (futex_handle_fault((unsigned long)uaddr, attempt))
+			goto out_unlock_release_sem;
+
+		goto retry_locked;
+	}
+
+	queue_unlock(&q, hb);
+	up_read(&curr->mm->mmap_sem);
+
+	ret = get_user(uval, uaddr);
+	if (!ret && (uval != -EFAULT))
+		goto retry;
+
+	return ret;
+}
+
+/*
+ * Restart handler
+ */
+static long futex_lock_pi_restart(struct restart_block *restart)
+{
+	struct hrtimer_sleeper timeout, *to = NULL;
+	int ret;
+
+	restart->fn = do_no_restart_syscall;
+
+	if (restart->arg2 || restart->arg3) {
+		to = &timeout;
+		hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_ABS);
+		hrtimer_init_sleeper(to, current);
+		to->timer.expires.tv64 = ((u64)restart->arg1 << 32) |
+			(u64) restart->arg0;
+	}
+
+	pr_debug("lock_pi restart: %p, %d (%d)\n",
+		 (u32 __user *)restart->arg0, current->pid);
+
+	ret = do_futex_lock_pi((u32 __user *)restart->arg0, restart->arg1,
+			       0, to);
+
+	if (ret != -EINTR)
+		return ret;
+
+	restart->fn = futex_lock_pi_restart;
+
+	/* The other values are filled in */
+	return -ERESTART_RESTARTBLOCK;
+}
+
+/*
+ * Called from the syscall entry below.
+ */
+static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
+			 long nsec, int trylock)
+{
+	struct hrtimer_sleeper timeout, *to = NULL;
+	struct restart_block *restart;
+	int ret;
+
+	if (sec != MAX_SCHEDULE_TIMEOUT) {
+		to = &timeout;
+		hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_ABS);
+		hrtimer_init_sleeper(to, current);
+		to->timer.expires = ktime_set(sec, nsec);
+	}
+
+	ret = do_futex_lock_pi(uaddr, detect, trylock, to);
+
+	if (ret != -EINTR)
+		return ret;
+
+	pr_debug("lock_pi interrupted: %p, %d (%d)\n", uaddr, current->pid);
+
+	restart = &current_thread_info()->restart_block;
+	restart->fn = futex_lock_pi_restart;
+	restart->arg0 = (unsigned long) uaddr;
+	restart->arg1 = detect;
+	if (to) {
+		restart->arg2 = to->timer.expires.tv64 & 0xFFFFFFFF;
+		restart->arg3 = to->timer.expires.tv64 >> 32;
+	} else
+		restart->arg2 = restart->arg3 = 0;
+
+	return -ERESTART_RESTARTBLOCK;
+}
+
+/*
+ * Userspace attempted a TID -> 0 atomic transition, and failed.
+ * This is the in-kernel slowpath: we look up the PI state (if any),
+ * and do the rt-mutex unlock.
+ */
+static int futex_unlock_pi(u32 __user *uaddr)
+{
+	struct futex_hash_bucket *hb;
+	struct futex_q *this, *next;
+	u32 uval;
+	struct list_head *head;
+	union futex_key key;
+	int ret, attempt = 0;
+
+retry:
+	if (get_user(uval, uaddr))
+		return -EFAULT;
+	/*
+	 * We release only a lock we actually own:
+	 */
+	if ((uval & FUTEX_TID_MASK) != current->pid)
+		return -EPERM;
+	/*
+	 * First take all the futex related locks:
+	 */
+	down_read(&current->mm->mmap_sem);
+
+	ret = get_futex_key(uaddr, &key);
+	if (unlikely(ret != 0))
+		goto out;
+
+	hb = hash_futex(&key);
+	spin_lock(&hb->lock);
+
+retry_locked:
+	/*
+	 * To avoid races, try to do the TID -> 0 atomic transition
+	 * again. If it succeeds then we can return without waking
+	 * anyone else up:
+	 */
+	inc_preempt_count();
+	uval = futex_atomic_cmpxchg_inatomic(uaddr, current->pid, 0);
+	dec_preempt_count();
+
+	if (unlikely(uval == -EFAULT))
+		goto pi_faulted;
+	/*
+	 * Rare case: we managed to release the lock atomically,
+	 * no need to wake anyone else up:
+	 */
+	if (unlikely(uval == current->pid))
+		goto out_unlock;
+
+	/*
+	 * Ok, other tasks may need to be woken up - check waiters
+	 * and do the wakeup if necessary:
+	 */
+	head = &hb->chain;
+
+	list_for_each_entry_safe(this, next, head, list) {
+		if (!match_futex (&this->key, &key))
+			continue;
+		ret = wake_futex_pi(uaddr, uval, this);
+		/*
+		 * The atomic access to the futex value
+		 * generated a pagefault, so retry the
+		 * user-access and the wakeup:
+		 */
+		if (ret == -EFAULT)
+			goto pi_faulted;
+		goto out_unlock;
+	}
+	/*
+	 * No waiters - kernel unlocks the futex:
+	 */
+	ret = unlock_futex_pi(uaddr, uval);
+	if (ret == -EFAULT)
+		goto pi_faulted;
+
+out_unlock:
+	spin_unlock(&hb->lock);
+out:
+	up_read(&current->mm->mmap_sem);
+
+	return ret;
+
+pi_faulted:
+	/*
+	 * We have to r/w  *(int __user *)uaddr, but we can't modify it
+	 * non-atomically.  Therefore, if get_user below is not
+	 * enough, we need to handle the fault ourselves, while
+	 * still holding the mmap_sem.
+	 */
+	if (attempt++) {
+		if (futex_handle_fault((unsigned long)uaddr, attempt))
+			goto out_unlock;
+
+		goto retry_locked;
+	}
+
+	spin_unlock(&hb->lock);
 	up_read(&current->mm->mmap_sem);
+
+	ret = get_user(uval, uaddr);
+	if (!ret && (uval != -EFAULT))
+		goto retry;
+
 	return ret;
 }
 
@@ -819,6 +1553,7 @@ static int futex_fd(u32 __user *uaddr, int signal)
 		err = -ENOMEM;
 		goto error;
 	}
+	q->pi_state = NULL;
 
 	down_read(&current->mm->mmap_sem);
 	err = get_futex_key(uaddr, &q->key);
@@ -856,7 +1591,7 @@ error:
  * Implementation: user-space maintains a per-thread list of locks it
  * is holding. Upon do_exit(), the kernel carefully walks this list,
  * and marks all locks that are owned by this thread with the
- * FUTEX_OWNER_DEAD bit, and wakes up a waiter (if any). The list is
+ * FUTEX_OWNER_DIED bit, and wakes up a waiter (if any). The list is
  * always manipulated with the lock held, so the list is private and
  * per-thread. Userspace also maintains a per-thread 'list_op_pending'
  * field, to allow the kernel to clean up if the thread dies after
@@ -931,7 +1666,7 @@ err_unlock:
  */
 int handle_futex_death(u32 __user *uaddr, struct task_struct *curr)
 {
-	u32 uval;
+	u32 uval, nval;
 
 retry:
 	if (get_user(uval, uaddr))
@@ -948,8 +1683,12 @@ retry:
 		 * thread-death.) The rest of the cleanup is done in
 		 * userspace.
 		 */
-		if (futex_atomic_cmpxchg_inatomic(uaddr, uval,
-					 uval | FUTEX_OWNER_DIED) != uval)
+		nval = futex_atomic_cmpxchg_inatomic(uaddr, uval,
+						     uval | FUTEX_OWNER_DIED);
+		if (nval == -EFAULT)
+			return -1;
+
+		if (nval != uval)
 			goto retry;
 
 		if (uval & FUTEX_WAITERS)
@@ -994,7 +1733,7 @@ void exit_robust_list(struct task_struct *curr)
 	while (entry != &head->list) {
 		/*
 		 * A pending lock might already be on the list, so
-		 * dont process it twice:
+		 * don't process it twice:
 		 */
 		if (entry != pending)
 			if (handle_futex_death((void *)entry + futex_offset,
@@ -1040,6 +1779,15 @@ long do_futex(u32 __user *uaddr, int op, u32 val, unsigned long timeout,
 	case FUTEX_WAKE_OP:
 		ret = futex_wake_op(uaddr, uaddr2, val, val2, val3);
 		break;
+	case FUTEX_LOCK_PI:
+		ret = futex_lock_pi(uaddr, val, timeout, val2, 0);
+		break;
+	case FUTEX_UNLOCK_PI:
+		ret = futex_unlock_pi(uaddr);
+		break;
+	case FUTEX_TRYLOCK_PI:
+		ret = futex_lock_pi(uaddr, 0, timeout, val2, 1);
+		break;
 	default:
 		ret = -ENOSYS;
 	}
@@ -1055,17 +1803,22 @@ asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val,
 	unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
 	u32 val2 = 0;
 
-	if (utime && (op == FUTEX_WAIT)) {
+	if (utime && (op == FUTEX_WAIT || op == FUTEX_LOCK_PI)) {
 		if (copy_from_user(&t, utime, sizeof(t)) != 0)
 			return -EFAULT;
 		if (!timespec_valid(&t))
 			return -EINVAL;
-		timeout = timespec_to_jiffies(&t) + 1;
+		if (op == FUTEX_WAIT)
+			timeout = timespec_to_jiffies(&t) + 1;
+		else {
+			timeout = t.tv_sec;
+			val2 = t.tv_nsec;
+		}
 	}
 	/*
 	 * requeue parameter in 'utime' if op == FUTEX_REQUEUE.
 	 */
-	if (op >= FUTEX_REQUEUE)
+	if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE)
 		val2 = (u32) (unsigned long) utime;
 
 	return do_futex(uaddr, op, val, timeout, uaddr2, val2, val3);
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index 7e57c31670a3..d1d92b441fb7 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -129,14 +129,19 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val,
 	unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
 	int val2 = 0;
 
-	if (utime && (op == FUTEX_WAIT)) {
+	if (utime && (op == FUTEX_WAIT || op == FUTEX_LOCK_PI)) {
 		if (get_compat_timespec(&t, utime))
 			return -EFAULT;
 		if (!timespec_valid(&t))
 			return -EINVAL;
-		timeout = timespec_to_jiffies(&t) + 1;
+		if (op == FUTEX_WAIT)
+			timeout = timespec_to_jiffies(&t) + 1;
+		else {
+			timeout = t.tv_sec;
+			val2 = t.tv_nsec;
+		}
 	}
-	if (op >= FUTEX_REQUEUE)
+	if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE)
 		val2 = (int) (unsigned long) utime;
 
 	return do_futex(uaddr, op, val, timeout, uaddr2, val2, val3);
diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h
index e068024eeffc..9c75856e791e 100644
--- a/kernel/rtmutex_common.h
+++ b/kernel/rtmutex_common.h
@@ -112,4 +112,12 @@ static inline unsigned long rt_mutex_owner_pending(struct rt_mutex *lock)
 	return (unsigned long)lock->owner & RT_MUTEX_OWNER_PENDING;
 }
 
+/*
+ * PI-futex support (proxy locking functions, etc.):
+ */
+extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock);
+extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
+				       struct task_struct *proxy_owner);
+extern void rt_mutex_proxy_unlock(struct rt_mutex *lock,
+				  struct task_struct *proxy_owner);
 #endif
-- 
cgit v1.2.3


From 95e02ca9bb5324360e7dea1ea1c563036d84a5e6 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 27 Jun 2006 02:55:02 -0700
Subject: [PATCH] rtmutex: Propagate priority settings into PI lock chains

When the priority of a task, which is blocked on a lock, changes we must
propagate this change into the PI lock chain.  Therefor the chain walk code
is changed to get rid of the references to current to avoid false positives
in the deadlock detector, as setscheduler might be called by a task which
holds the lock on which the task whose priority is changed is blocked.

Also add some comments about the get/put_task_struct usage to avoid
confusion.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Cc: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/sched.h |  2 ++
 kernel/rtmutex.c      | 42 ++++++++++++++++++++++++++++++++++++------
 kernel/sched.c        |  2 ++
 3 files changed, 40 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index b4e6be7de5ad..821f0481ebe1 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1044,11 +1044,13 @@ extern void sched_idle_next(void);
 #ifdef CONFIG_RT_MUTEXES
 extern int rt_mutex_getprio(task_t *p);
 extern void rt_mutex_setprio(task_t *p, int prio);
+extern void rt_mutex_adjust_pi(task_t *p);
 #else
 static inline int rt_mutex_getprio(task_t *p)
 {
 	return p->normal_prio;
 }
+# define rt_mutex_adjust_pi(p)		do { } while (0)
 #endif
 
 extern void set_user_nice(task_t *p, long nice);
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index 3fc0f0680ca2..45d61016da57 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -160,7 +160,8 @@ int max_lock_depth = 1024;
 static int rt_mutex_adjust_prio_chain(task_t *task,
 				      int deadlock_detect,
 				      struct rt_mutex *orig_lock,
-				      struct rt_mutex_waiter *orig_waiter
+				      struct rt_mutex_waiter *orig_waiter,
+				      struct task_struct *top_task
 				      __IP_DECL__)
 {
 	struct rt_mutex *lock;
@@ -189,7 +190,7 @@ static int rt_mutex_adjust_prio_chain(task_t *task,
 			prev_max = max_lock_depth;
 			printk(KERN_WARNING "Maximum lock depth %d reached "
 			       "task: %s (%d)\n", max_lock_depth,
-			       current->comm, current->pid);
+			       top_task->comm, top_task->pid);
 		}
 		put_task_struct(task);
 
@@ -229,7 +230,7 @@ static int rt_mutex_adjust_prio_chain(task_t *task,
 	}
 
 	/* Deadlock detection */
-	if (lock == orig_lock || rt_mutex_owner(lock) == current) {
+	if (lock == orig_lock || rt_mutex_owner(lock) == top_task) {
 		debug_rt_mutex_deadlock(deadlock_detect, orig_waiter, lock);
 		spin_unlock(&lock->wait_lock);
 		ret = deadlock_detect ? -EDEADLK : 0;
@@ -433,6 +434,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
 		__rt_mutex_adjust_prio(owner);
 		if (owner->pi_blocked_on) {
 			boost = 1;
+			/* gets dropped in rt_mutex_adjust_prio_chain()! */
 			get_task_struct(owner);
 		}
 		spin_unlock_irqrestore(&owner->pi_lock, flags);
@@ -441,6 +443,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
 		spin_lock_irqsave(&owner->pi_lock, flags);
 		if (owner->pi_blocked_on) {
 			boost = 1;
+			/* gets dropped in rt_mutex_adjust_prio_chain()! */
 			get_task_struct(owner);
 		}
 		spin_unlock_irqrestore(&owner->pi_lock, flags);
@@ -450,8 +453,8 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
 
 	spin_unlock(&lock->wait_lock);
 
-	res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock,
-					 waiter __IP__);
+	res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter,
+					 current __IP__);
 
 	spin_lock(&lock->wait_lock);
 
@@ -552,6 +555,7 @@ static void remove_waiter(struct rt_mutex *lock,
 
 		if (owner->pi_blocked_on) {
 			boost = 1;
+			/* gets dropped in rt_mutex_adjust_prio_chain()! */
 			get_task_struct(owner);
 		}
 		spin_unlock_irqrestore(&owner->pi_lock, flags);
@@ -564,11 +568,36 @@ static void remove_waiter(struct rt_mutex *lock,
 
 	spin_unlock(&lock->wait_lock);
 
-	rt_mutex_adjust_prio_chain(owner, 0, lock, NULL __IP__);
+	rt_mutex_adjust_prio_chain(owner, 0, lock, NULL, current __IP__);
 
 	spin_lock(&lock->wait_lock);
 }
 
+/*
+ * Recheck the pi chain, in case we got a priority setting
+ *
+ * Called from sched_setscheduler
+ */
+void rt_mutex_adjust_pi(struct task_struct *task)
+{
+	struct rt_mutex_waiter *waiter;
+	unsigned long flags;
+
+	spin_lock_irqsave(&task->pi_lock, flags);
+
+	waiter = task->pi_blocked_on;
+	if (!waiter || waiter->list_entry.prio == task->prio) {
+		spin_unlock_irqrestore(&task->pi_lock, flags);
+		return;
+	}
+
+	/* gets dropped in rt_mutex_adjust_prio_chain()! */
+	get_task_struct(task);
+	spin_unlock_irqrestore(&task->pi_lock, flags);
+
+	rt_mutex_adjust_prio_chain(task, 0, NULL, NULL, task __RET_IP__);
+}
+
 /*
  * Slow path lock function:
  */
@@ -636,6 +665,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
 			if (unlikely(ret))
 				break;
 		}
+
 		spin_unlock(&lock->wait_lock);
 
 		debug_rt_mutex_print_deadlock(&waiter);
diff --git a/kernel/sched.c b/kernel/sched.c
index 7a30addfd235..2629c1711fd6 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4070,6 +4070,8 @@ recheck:
 	__task_rq_unlock(rq);
 	spin_unlock_irqrestore(&p->pi_lock, flags);
 
+	rt_mutex_adjust_pi(p);
+
 	return 0;
 }
 EXPORT_SYMBOL_GPL(sched_setscheduler);
-- 
cgit v1.2.3


From 456229a91d2bdf884f0c01b33f1ecee762abba7d Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Tue, 27 Jun 2006 02:55:07 -0700
Subject: [PATCH] drivers/char/ipmi/ipmi_msghandler.c: make proc_ipmi_root
 static

Make struct proc_ipmi_root static.

Besides this, tremove removes an unused #ifdef CONFIG_PROC_FS from
include/linux/ipmi.h.

Acked-by: Corey Minyard <minyard@acm.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/char/ipmi/ipmi_msghandler.c | 3 +--
 include/linux/ipmi.h                | 4 ----
 2 files changed, 1 insertion(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c
index b03ddab1bef5..83ed6ae466a5 100644
--- a/drivers/char/ipmi/ipmi_msghandler.c
+++ b/drivers/char/ipmi/ipmi_msghandler.c
@@ -57,8 +57,7 @@ static int ipmi_init_msghandler(void);
 static int initialized = 0;
 
 #ifdef CONFIG_PROC_FS
-struct proc_dir_entry *proc_ipmi_root = NULL;
-EXPORT_SYMBOL(proc_ipmi_root);
+static struct proc_dir_entry *proc_ipmi_root = NULL;
 #endif /* CONFIG_PROC_FS */
 
 #define MAX_EVENTS_IN_QUEUE	25
diff --git a/include/linux/ipmi.h b/include/linux/ipmi.h
index 5653b2f23b6a..d09fbeabf1dc 100644
--- a/include/linux/ipmi.h
+++ b/include/linux/ipmi.h
@@ -210,11 +210,7 @@ struct kernel_ipmi_msg
 #include <linux/list.h>
 #include <linux/module.h>
 #include <linux/device.h>
-
-#ifdef CONFIG_PROC_FS
 #include <linux/proc_fs.h>
-extern struct proc_dir_entry *proc_ipmi_root;
-#endif /* CONFIG_PROC_FS */
 
 /* Opaque type for a IPMI message user.  One of these is needed to
    send and receive messages. */
-- 
cgit v1.2.3


From f5e54d6e53a20cef45af7499e86164f0e0d16bb2 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 28 Jun 2006 04:26:44 -0700
Subject: [PATCH] mark address_space_operations const

Same as with already do with the file operations: keep them in .rodata and
prevents people from doing runtime patching.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Cc: Steven French <sfrench@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/block/loop.c        | 4 ++--
 drivers/block/rd.c          | 2 +-
 fs/9p/v9fs_vfs.h            | 2 +-
 fs/9p/vfs_addr.c            | 2 +-
 fs/adfs/inode.c             | 2 +-
 fs/affs/affs.h              | 6 +++---
 fs/affs/file.c              | 4 ++--
 fs/affs/symlink.c           | 2 +-
 fs/afs/file.c               | 2 +-
 fs/afs/internal.h           | 2 +-
 fs/befs/linuxvfs.c          | 2 +-
 fs/bfs/bfs.h                | 2 +-
 fs/bfs/file.c               | 2 +-
 fs/block_dev.c              | 2 +-
 fs/buffer.c                 | 2 +-
 fs/cifs/cifsfs.h            | 4 ++--
 fs/cifs/file.c              | 4 ++--
 fs/coda/symlink.c           | 2 +-
 fs/configfs/inode.c         | 2 +-
 fs/cramfs/inode.c           | 4 ++--
 fs/efs/inode.c              | 2 +-
 fs/efs/symlink.c            | 2 +-
 fs/ext2/ext2.h              | 6 +++---
 fs/ext2/inode.c             | 6 +++---
 fs/ext3/inode.c             | 6 +++---
 fs/fat/inode.c              | 2 +-
 fs/freevxfs/vxfs_immed.c    | 2 +-
 fs/freevxfs/vxfs_inode.c    | 6 +++---
 fs/freevxfs/vxfs_subr.c     | 2 +-
 fs/fuse/file.c              | 2 +-
 fs/hfs/hfs_fs.h             | 4 ++--
 fs/hfs/inode.c              | 4 ++--
 fs/hfsplus/hfsplus_fs.h     | 4 ++--
 fs/hfsplus/inode.c          | 4 ++--
 fs/hostfs/hostfs_kern.c     | 6 +++---
 fs/hpfs/file.c              | 2 +-
 fs/hpfs/hpfs_fn.h           | 4 ++--
 fs/hpfs/namei.c             | 2 +-
 fs/hugetlbfs/inode.c        | 4 ++--
 fs/inode.c                  | 2 +-
 fs/isofs/compress.c         | 2 +-
 fs/isofs/inode.c            | 2 +-
 fs/isofs/isofs.h            | 2 +-
 fs/isofs/rock.c             | 2 +-
 fs/isofs/zisofs.h           | 2 +-
 fs/jffs/inode-v23.c         | 4 ++--
 fs/jffs2/file.c             | 2 +-
 fs/jffs2/os-linux.h         | 2 +-
 fs/jfs/inode.c              | 2 +-
 fs/jfs/jfs_inode.h          | 2 +-
 fs/jfs/jfs_metapage.c       | 2 +-
 fs/jfs/jfs_metapage.h       | 2 +-
 fs/minix/inode.c            | 2 +-
 fs/ncpfs/inode.c            | 2 +-
 fs/ncpfs/symlink.c          | 2 +-
 fs/nfs/file.c               | 2 +-
 fs/ntfs/aops.c              | 4 ++--
 fs/ntfs/ntfs.h              | 4 ++--
 fs/ocfs2/aops.c             | 2 +-
 fs/ocfs2/inode.h            | 2 +-
 fs/qnx4/inode.c             | 2 +-
 fs/ramfs/file-mmu.c         | 2 +-
 fs/ramfs/file-nommu.c       | 2 +-
 fs/ramfs/internal.h         | 2 +-
 fs/reiserfs/inode.c         | 2 +-
 fs/romfs/inode.c            | 2 +-
 fs/smbfs/file.c             | 2 +-
 fs/smbfs/proto.h            | 2 +-
 fs/sysfs/inode.c            | 2 +-
 fs/sysv/itree.c             | 2 +-
 fs/sysv/sysv.h              | 2 +-
 fs/udf/file.c               | 2 +-
 fs/udf/inode.c              | 2 +-
 fs/udf/symlink.c            | 2 +-
 fs/udf/udfdecl.h            | 6 +++---
 fs/ufs/inode.c              | 2 +-
 fs/xfs/linux-2.6/xfs_aops.c | 2 +-
 fs/xfs/linux-2.6/xfs_aops.h | 2 +-
 fs/xfs/linux-2.6/xfs_buf.c  | 2 +-
 include/linux/coda_linux.h  | 4 ++--
 include/linux/efs_fs.h      | 2 +-
 include/linux/fs.h          | 4 ++--
 include/linux/nfs_fs.h      | 2 +-
 include/linux/reiserfs_fs.h | 2 +-
 include/linux/ufs_fs.h      | 2 +-
 mm/filemap.c                | 4 ++--
 mm/filemap_xip.c            | 2 +-
 mm/shmem.c                  | 4 ++--
 mm/swap_state.c             | 2 +-
 89 files changed, 121 insertions(+), 121 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 3c74ea729fc7..18dd026f470d 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -210,7 +210,7 @@ static int do_lo_send_aops(struct loop_device *lo, struct bio_vec *bvec,
 {
 	struct file *file = lo->lo_backing_file; /* kudos to NFsckingS */
 	struct address_space *mapping = file->f_mapping;
-	struct address_space_operations *aops = mapping->a_ops;
+	const struct address_space_operations *aops = mapping->a_ops;
 	pgoff_t index;
 	unsigned offset, bv_offs;
 	int len, ret;
@@ -784,7 +784,7 @@ static int loop_set_fd(struct loop_device *lo, struct file *lo_file,
 
 	error = -EINVAL;
 	if (S_ISREG(inode->i_mode) || S_ISBLK(inode->i_mode)) {
-		struct address_space_operations *aops = mapping->a_ops;
+		const struct address_space_operations *aops = mapping->a_ops;
 		/*
 		 * If we can't read - sorry. If we only can't write - well,
 		 * it's going to be read-only.
diff --git a/drivers/block/rd.c b/drivers/block/rd.c
index 940bfd7951e5..0378da04cfa2 100644
--- a/drivers/block/rd.c
+++ b/drivers/block/rd.c
@@ -191,7 +191,7 @@ static int ramdisk_set_page_dirty(struct page *page)
 	return 0;
 }
 
-static struct address_space_operations ramdisk_aops = {
+static const struct address_space_operations ramdisk_aops = {
 	.readpage	= ramdisk_readpage,
 	.prepare_write	= ramdisk_prepare_write,
 	.commit_write	= ramdisk_commit_write,
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index f867b8d3e973..450b0c1b385e 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -38,7 +38,7 @@
  */
 
 extern struct file_system_type v9fs_fs_type;
-extern struct address_space_operations v9fs_addr_operations;
+extern const struct address_space_operations v9fs_addr_operations;
 extern const struct file_operations v9fs_file_operations;
 extern const struct file_operations v9fs_dir_operations;
 extern struct dentry_operations v9fs_dentry_operations;
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index efda46fb64d9..d4f0aa3c87f2 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -103,6 +103,6 @@ UnmapAndUnlock:
 	return retval;
 }
 
-struct address_space_operations v9fs_addr_operations = {
+const struct address_space_operations v9fs_addr_operations = {
       .readpage = v9fs_vfs_readpage,
 };
diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c
index a02802a30798..534f3eecc985 100644
--- a/fs/adfs/inode.c
+++ b/fs/adfs/inode.c
@@ -72,7 +72,7 @@ static sector_t _adfs_bmap(struct address_space *mapping, sector_t block)
 	return generic_block_bmap(mapping, block, adfs_get_block);
 }
 
-static struct address_space_operations adfs_aops = {
+static const struct address_space_operations adfs_aops = {
 	.readpage	= adfs_readpage,
 	.writepage	= adfs_writepage,
 	.sync_page	= block_sync_page,
diff --git a/fs/affs/affs.h b/fs/affs/affs.h
index a43a876742b8..0ddd4cc0d1a0 100644
--- a/fs/affs/affs.h
+++ b/fs/affs/affs.h
@@ -195,9 +195,9 @@ extern struct inode_operations   affs_symlink_inode_operations;
 extern const struct file_operations	 affs_file_operations;
 extern const struct file_operations	 affs_file_operations_ofs;
 extern const struct file_operations	 affs_dir_operations;
-extern struct address_space_operations	 affs_symlink_aops;
-extern struct address_space_operations	 affs_aops;
-extern struct address_space_operations	 affs_aops_ofs;
+extern const struct address_space_operations	 affs_symlink_aops;
+extern const struct address_space_operations	 affs_aops;
+extern const struct address_space_operations	 affs_aops_ofs;
 
 extern struct dentry_operations	 affs_dentry_operations;
 extern struct dentry_operations	 affs_dentry_operations_intl;
diff --git a/fs/affs/file.c b/fs/affs/file.c
index 7076262af39b..3de8590e4f6a 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -406,7 +406,7 @@ static sector_t _affs_bmap(struct address_space *mapping, sector_t block)
 {
 	return generic_block_bmap(mapping,block,affs_get_block);
 }
-struct address_space_operations affs_aops = {
+const struct address_space_operations affs_aops = {
 	.readpage = affs_readpage,
 	.writepage = affs_writepage,
 	.sync_page = block_sync_page,
@@ -759,7 +759,7 @@ out:
 	goto done;
 }
 
-struct address_space_operations affs_aops_ofs = {
+const struct address_space_operations affs_aops_ofs = {
 	.readpage = affs_readpage_ofs,
 	//.writepage = affs_writepage_ofs,
 	//.sync_page = affs_sync_page_ofs,
diff --git a/fs/affs/symlink.c b/fs/affs/symlink.c
index 426f0f094f23..f802256a5933 100644
--- a/fs/affs/symlink.c
+++ b/fs/affs/symlink.c
@@ -66,7 +66,7 @@ fail:
 	return err;
 }
 
-struct address_space_operations affs_symlink_aops = {
+const struct address_space_operations affs_symlink_aops = {
 	.readpage	= affs_symlink_readpage,
 };
 
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 7bb716887e29..67d6634101fd 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -35,7 +35,7 @@ struct inode_operations afs_file_inode_operations = {
 	.getattr	= afs_inode_getattr,
 };
 
-struct address_space_operations afs_fs_aops = {
+const struct address_space_operations afs_fs_aops = {
 	.readpage	= afs_file_readpage,
 	.sync_page	= block_sync_page,
 	.set_page_dirty	= __set_page_dirty_nobuffers,
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 72febdf9a35a..e88b3b65ae49 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -69,7 +69,7 @@ extern const struct file_operations afs_dir_file_operations;
 /*
  * file.c
  */
-extern struct address_space_operations afs_fs_aops;
+extern const struct address_space_operations afs_fs_aops;
 extern struct inode_operations afs_file_inode_operations;
 
 #ifdef AFS_CACHING_SUPPORT
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 08201fab26cd..a83e889a97cd 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -73,7 +73,7 @@ static struct inode_operations befs_dir_inode_operations = {
 	.lookup		= befs_lookup,
 };
 
-static struct address_space_operations befs_aops = {
+static const struct address_space_operations befs_aops = {
 	.readpage	= befs_readpage,
 	.sync_page	= block_sync_page,
 	.bmap		= befs_bmap,
diff --git a/fs/bfs/bfs.h b/fs/bfs/bfs.h
index 9d791004b21c..31973bbbf057 100644
--- a/fs/bfs/bfs.h
+++ b/fs/bfs/bfs.h
@@ -50,7 +50,7 @@ static inline struct bfs_inode_info *BFS_I(struct inode *inode)
 /* file.c */
 extern struct inode_operations bfs_file_inops;
 extern const struct file_operations bfs_file_operations;
-extern struct address_space_operations bfs_aops;
+extern const struct address_space_operations bfs_aops;
 
 /* dir.c */
 extern struct inode_operations bfs_dir_inops;
diff --git a/fs/bfs/file.c b/fs/bfs/file.c
index d83cd74a2e4e..3d5aca28a0a0 100644
--- a/fs/bfs/file.c
+++ b/fs/bfs/file.c
@@ -153,7 +153,7 @@ static sector_t bfs_bmap(struct address_space *mapping, sector_t block)
 	return generic_block_bmap(mapping, block, bfs_get_block);
 }
 
-struct address_space_operations bfs_aops = {
+const struct address_space_operations bfs_aops = {
 	.readpage	= bfs_readpage,
 	.writepage	= bfs_writepage,
 	.sync_page	= block_sync_page,
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 028d9fb9c2d5..7f7600e2381c 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1095,7 +1095,7 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 	return blkdev_ioctl(file->f_mapping->host, file, cmd, arg);
 }
 
-struct address_space_operations def_blk_aops = {
+const struct address_space_operations def_blk_aops = {
 	.readpage	= blkdev_readpage,
 	.writepage	= blkdev_writepage,
 	.sync_page	= block_sync_page,
diff --git a/fs/buffer.c b/fs/buffer.c
index f23bb647db47..e9994722f4a3 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2598,7 +2598,7 @@ int nobh_truncate_page(struct address_space *mapping, loff_t from)
 	unsigned offset = from & (PAGE_CACHE_SIZE-1);
 	unsigned to;
 	struct page *page;
-	struct address_space_operations *a_ops = mapping->a_ops;
+	const struct address_space_operations *a_ops = mapping->a_ops;
 	char *kaddr;
 	int ret = 0;
 
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index a6384d83fdef..8f75c6f24701 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -32,8 +32,8 @@
 #define TRUE 1
 #endif
 
-extern struct address_space_operations cifs_addr_ops;
-extern struct address_space_operations cifs_addr_ops_smallbuf;
+extern const struct address_space_operations cifs_addr_ops;
+extern const struct address_space_operations cifs_addr_ops_smallbuf;
 
 /* Functions related to super block operations */
 extern struct super_operations cifs_super_ops;
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index e9c1573f6aa7..5861eb42e626 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -1942,7 +1942,7 @@ static int cifs_prepare_write(struct file *file, struct page *page,
 	return 0;
 }
 
-struct address_space_operations cifs_addr_ops = {
+const struct address_space_operations cifs_addr_ops = {
 	.readpage = cifs_readpage,
 	.readpages = cifs_readpages,
 	.writepage = cifs_writepage,
@@ -1959,7 +1959,7 @@ struct address_space_operations cifs_addr_ops = {
  * contain the header plus one complete page of data.  Otherwise, we need
  * to leave cifs_readpages out of the address space operations.
  */
-struct address_space_operations cifs_addr_ops_smallbuf = {
+const struct address_space_operations cifs_addr_ops_smallbuf = {
 	.readpage = cifs_readpage,
 	.writepage = cifs_writepage,
 	.writepages = cifs_writepages,
diff --git a/fs/coda/symlink.c b/fs/coda/symlink.c
index b35e5bbd9c99..76e00a65a75b 100644
--- a/fs/coda/symlink.c
+++ b/fs/coda/symlink.c
@@ -50,6 +50,6 @@ fail:
 	return error;
 }
 
-struct address_space_operations coda_symlink_aops = {
+const struct address_space_operations coda_symlink_aops = {
 	.readpage	= coda_symlink_filler,
 };
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index c153bd9534cb..e14488ca6411 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -38,7 +38,7 @@
 
 extern struct super_block * configfs_sb;
 
-static struct address_space_operations configfs_aops = {
+static const struct address_space_operations configfs_aops = {
 	.readpage	= simple_readpage,
 	.prepare_write	= simple_prepare_write,
 	.commit_write	= simple_commit_write
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index c45d73860803..223c0431042d 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -30,7 +30,7 @@
 static struct super_operations cramfs_ops;
 static struct inode_operations cramfs_dir_inode_operations;
 static const struct file_operations cramfs_directory_operations;
-static struct address_space_operations cramfs_aops;
+static const struct address_space_operations cramfs_aops;
 
 static DEFINE_MUTEX(read_mutex);
 
@@ -501,7 +501,7 @@ static int cramfs_readpage(struct file *file, struct page * page)
 	return 0;
 }
 
-static struct address_space_operations cramfs_aops = {
+static const struct address_space_operations cramfs_aops = {
 	.readpage = cramfs_readpage
 };
 
diff --git a/fs/efs/inode.c b/fs/efs/inode.c
index 180607f9314d..174696f9bf14 100644
--- a/fs/efs/inode.c
+++ b/fs/efs/inode.c
@@ -21,7 +21,7 @@ static sector_t _efs_bmap(struct address_space *mapping, sector_t block)
 {
 	return generic_block_bmap(mapping,block,efs_get_block);
 }
-static struct address_space_operations efs_aops = {
+static const struct address_space_operations efs_aops = {
 	.readpage = efs_readpage,
 	.sync_page = block_sync_page,
 	.bmap = _efs_bmap
diff --git a/fs/efs/symlink.c b/fs/efs/symlink.c
index 3d9a350e3e7f..e249cf733a6b 100644
--- a/fs/efs/symlink.c
+++ b/fs/efs/symlink.c
@@ -53,6 +53,6 @@ fail:
 	return err;
 }
 
-struct address_space_operations efs_symlink_aops = {
+const struct address_space_operations efs_symlink_aops = {
 	.readpage	= efs_symlink_readpage
 };
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 9f74a62be555..e65a019fc7a5 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -162,9 +162,9 @@ extern const struct file_operations ext2_file_operations;
 extern const struct file_operations ext2_xip_file_operations;
 
 /* inode.c */
-extern struct address_space_operations ext2_aops;
-extern struct address_space_operations ext2_aops_xip;
-extern struct address_space_operations ext2_nobh_aops;
+extern const struct address_space_operations ext2_aops;
+extern const struct address_space_operations ext2_aops_xip;
+extern const struct address_space_operations ext2_nobh_aops;
 
 /* namei.c */
 extern struct inode_operations ext2_dir_inode_operations;
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 04af9c45dce2..fb4d3220eb8d 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -684,7 +684,7 @@ ext2_writepages(struct address_space *mapping, struct writeback_control *wbc)
 	return mpage_writepages(mapping, wbc, ext2_get_block);
 }
 
-struct address_space_operations ext2_aops = {
+const struct address_space_operations ext2_aops = {
 	.readpage		= ext2_readpage,
 	.readpages		= ext2_readpages,
 	.writepage		= ext2_writepage,
@@ -697,12 +697,12 @@ struct address_space_operations ext2_aops = {
 	.migratepage		= buffer_migrate_page,
 };
 
-struct address_space_operations ext2_aops_xip = {
+const struct address_space_operations ext2_aops_xip = {
 	.bmap			= ext2_bmap,
 	.get_xip_page		= ext2_get_xip_page,
 };
 
-struct address_space_operations ext2_nobh_aops = {
+const struct address_space_operations ext2_nobh_aops = {
 	.readpage		= ext2_readpage,
 	.readpages		= ext2_readpages,
 	.writepage		= ext2_nobh_writepage,
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 0321e1b9034a..f804d5e9d60c 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -1698,7 +1698,7 @@ static int ext3_journalled_set_page_dirty(struct page *page)
 	return __set_page_dirty_nobuffers(page);
 }
 
-static struct address_space_operations ext3_ordered_aops = {
+static const struct address_space_operations ext3_ordered_aops = {
 	.readpage	= ext3_readpage,
 	.readpages	= ext3_readpages,
 	.writepage	= ext3_ordered_writepage,
@@ -1712,7 +1712,7 @@ static struct address_space_operations ext3_ordered_aops = {
 	.migratepage	= buffer_migrate_page,
 };
 
-static struct address_space_operations ext3_writeback_aops = {
+static const struct address_space_operations ext3_writeback_aops = {
 	.readpage	= ext3_readpage,
 	.readpages	= ext3_readpages,
 	.writepage	= ext3_writeback_writepage,
@@ -1726,7 +1726,7 @@ static struct address_space_operations ext3_writeback_aops = {
 	.migratepage	= buffer_migrate_page,
 };
 
-static struct address_space_operations ext3_journalled_aops = {
+static const struct address_space_operations ext3_journalled_aops = {
 	.readpage	= ext3_readpage,
 	.readpages	= ext3_readpages,
 	.writepage	= ext3_journalled_writepage,
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 7c35d582ec10..31b7174176ba 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -196,7 +196,7 @@ static sector_t _fat_bmap(struct address_space *mapping, sector_t block)
 	return generic_block_bmap(mapping, block, fat_get_block);
 }
 
-static struct address_space_operations fat_aops = {
+static const struct address_space_operations fat_aops = {
 	.readpage	= fat_readpage,
 	.readpages	= fat_readpages,
 	.writepage	= fat_writepage,
diff --git a/fs/freevxfs/vxfs_immed.c b/fs/freevxfs/vxfs_immed.c
index 6f5df1700e95..4e25f3fbed86 100644
--- a/fs/freevxfs/vxfs_immed.c
+++ b/fs/freevxfs/vxfs_immed.c
@@ -56,7 +56,7 @@ struct inode_operations vxfs_immed_symlink_iops = {
 /*
  * Adress space operations for immed files and directories.
  */
-struct address_space_operations vxfs_immed_aops = {
+const struct address_space_operations vxfs_immed_aops = {
 	.readpage =		vxfs_immed_readpage,
 };
 
diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c
index f544aae9169f..ca6a39714771 100644
--- a/fs/freevxfs/vxfs_inode.c
+++ b/fs/freevxfs/vxfs_inode.c
@@ -41,8 +41,8 @@
 #include "vxfs_extern.h"
 
 
-extern struct address_space_operations vxfs_aops;
-extern struct address_space_operations vxfs_immed_aops;
+extern const struct address_space_operations vxfs_aops;
+extern const struct address_space_operations vxfs_immed_aops;
 
 extern struct inode_operations vxfs_immed_symlink_iops;
 
@@ -295,7 +295,7 @@ vxfs_read_inode(struct inode *ip)
 {
 	struct super_block		*sbp = ip->i_sb;
 	struct vxfs_inode_info		*vip;
-	struct address_space_operations	*aops;
+	const struct address_space_operations	*aops;
 	ino_t				ino = ip->i_ino;
 
 	if (!(vip = __vxfs_iget(ino, VXFS_SBI(sbp)->vsi_ilist)))
diff --git a/fs/freevxfs/vxfs_subr.c b/fs/freevxfs/vxfs_subr.c
index c1be118fc067..decac62efe57 100644
--- a/fs/freevxfs/vxfs_subr.c
+++ b/fs/freevxfs/vxfs_subr.c
@@ -42,7 +42,7 @@
 static int		vxfs_readpage(struct file *, struct page *);
 static sector_t		vxfs_bmap(struct address_space *, sector_t);
 
-struct address_space_operations vxfs_aops = {
+const struct address_space_operations vxfs_aops = {
 	.readpage =		vxfs_readpage,
 	.bmap =			vxfs_bmap,
 	.sync_page =		block_sync_page,
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 28aa81eae2cc..63614ed16336 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -770,7 +770,7 @@ static const struct file_operations fuse_direct_io_file_operations = {
 	/* no mmap and sendfile */
 };
 
-static struct address_space_operations fuse_file_aops  = {
+static const struct address_space_operations fuse_file_aops  = {
 	.readpage	= fuse_readpage,
 	.prepare_write	= fuse_prepare_write,
 	.commit_write	= fuse_commit_write,
diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h
index 3ed8663a8db1..735332dfd1b8 100644
--- a/fs/hfs/hfs_fs.h
+++ b/fs/hfs/hfs_fs.h
@@ -182,8 +182,8 @@ extern void hfs_file_truncate(struct inode *);
 extern int hfs_get_block(struct inode *, sector_t, struct buffer_head *, int);
 
 /* inode.c */
-extern struct address_space_operations hfs_aops;
-extern struct address_space_operations hfs_btree_aops;
+extern const struct address_space_operations hfs_aops;
+extern const struct address_space_operations hfs_btree_aops;
 
 extern struct inode *hfs_new_inode(struct inode *, struct qstr *, int);
 extern void hfs_inode_write_fork(struct inode *, struct hfs_extent *, __be32 *, __be32 *);
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index 2d4ced22201b..315cf44a90b2 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -114,7 +114,7 @@ static int hfs_writepages(struct address_space *mapping,
 	return mpage_writepages(mapping, wbc, hfs_get_block);
 }
 
-struct address_space_operations hfs_btree_aops = {
+const struct address_space_operations hfs_btree_aops = {
 	.readpage	= hfs_readpage,
 	.writepage	= hfs_writepage,
 	.sync_page	= block_sync_page,
@@ -124,7 +124,7 @@ struct address_space_operations hfs_btree_aops = {
 	.releasepage	= hfs_releasepage,
 };
 
-struct address_space_operations hfs_aops = {
+const struct address_space_operations hfs_aops = {
 	.readpage	= hfs_readpage,
 	.writepage	= hfs_writepage,
 	.sync_page	= block_sync_page,
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index 7ae393637a0c..8a1ca5ef7ada 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -323,8 +323,8 @@ int hfsplus_file_extend(struct inode *);
 void hfsplus_file_truncate(struct inode *);
 
 /* inode.c */
-extern struct address_space_operations hfsplus_aops;
-extern struct address_space_operations hfsplus_btree_aops;
+extern const struct address_space_operations hfsplus_aops;
+extern const struct address_space_operations hfsplus_btree_aops;
 
 void hfsplus_inode_read_fork(struct inode *, struct hfsplus_fork_raw *);
 void hfsplus_inode_write_fork(struct inode *, struct hfsplus_fork_raw *);
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index acf66dba3e01..924ecdef8091 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -109,7 +109,7 @@ static int hfsplus_writepages(struct address_space *mapping,
 	return mpage_writepages(mapping, wbc, hfsplus_get_block);
 }
 
-struct address_space_operations hfsplus_btree_aops = {
+const struct address_space_operations hfsplus_btree_aops = {
 	.readpage	= hfsplus_readpage,
 	.writepage	= hfsplus_writepage,
 	.sync_page	= block_sync_page,
@@ -119,7 +119,7 @@ struct address_space_operations hfsplus_btree_aops = {
 	.releasepage	= hfsplus_releasepage,
 };
 
-struct address_space_operations hfsplus_aops = {
+const struct address_space_operations hfsplus_aops = {
 	.readpage	= hfsplus_readpage,
 	.writepage	= hfsplus_writepage,
 	.sync_page	= block_sync_page,
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 8e0d37743e7c..b82e3d9c8790 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -54,7 +54,7 @@ static int append = 0;
 
 static struct inode_operations hostfs_iops;
 static struct inode_operations hostfs_dir_iops;
-static struct address_space_operations hostfs_link_aops;
+static const struct address_space_operations hostfs_link_aops;
 
 #ifndef MODULE
 static int __init hostfs_args(char *options, int *add)
@@ -518,7 +518,7 @@ int hostfs_commit_write(struct file *file, struct page *page, unsigned from,
 	return(err);
 }
 
-static struct address_space_operations hostfs_aops = {
+static const struct address_space_operations hostfs_aops = {
 	.writepage 	= hostfs_writepage,
 	.readpage	= hostfs_readpage,
 	.set_page_dirty = __set_page_dirty_nobuffers,
@@ -935,7 +935,7 @@ int hostfs_link_readpage(struct file *file, struct page *page)
 	return(err);
 }
 
-static struct address_space_operations hostfs_link_aops = {
+static const struct address_space_operations hostfs_link_aops = {
 	.readpage	= hostfs_link_readpage,
 };
 
diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c
index d3b9fffe45a1..d9eb19b7b8ae 100644
--- a/fs/hpfs/file.c
+++ b/fs/hpfs/file.c
@@ -99,7 +99,7 @@ static sector_t _hpfs_bmap(struct address_space *mapping, sector_t block)
 {
 	return generic_block_bmap(mapping,block,hpfs_get_block);
 }
-struct address_space_operations hpfs_aops = {
+const struct address_space_operations hpfs_aops = {
 	.readpage = hpfs_readpage,
 	.writepage = hpfs_writepage,
 	.sync_page = block_sync_page,
diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h
index 29b7a3e55173..f687d54ed442 100644
--- a/fs/hpfs/hpfs_fn.h
+++ b/fs/hpfs/hpfs_fn.h
@@ -268,7 +268,7 @@ void hpfs_set_ea(struct inode *, struct fnode *, char *, char *, int);
 int hpfs_file_fsync(struct file *, struct dentry *, int);
 extern const struct file_operations hpfs_file_ops;
 extern struct inode_operations hpfs_file_iops;
-extern struct address_space_operations hpfs_aops;
+extern const struct address_space_operations hpfs_aops;
 
 /* inode.c */
 
@@ -304,7 +304,7 @@ void hpfs_decide_conv(struct inode *, unsigned char *, unsigned);
 /* namei.c */
 
 extern struct inode_operations hpfs_dir_iops;
-extern struct address_space_operations hpfs_symlink_aops;
+extern const struct address_space_operations hpfs_symlink_aops;
 
 static inline struct hpfs_inode_info *hpfs_i(struct inode *inode)
 {
diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c
index a03abb12c610..59e7dc182a0c 100644
--- a/fs/hpfs/namei.c
+++ b/fs/hpfs/namei.c
@@ -538,7 +538,7 @@ fail:
 	return err;
 }
 
-struct address_space_operations hpfs_symlink_aops = {
+const struct address_space_operations hpfs_symlink_aops = {
 	.readpage	= hpfs_symlink_readpage
 };
 	
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index e6410d8edd0e..6449cb697967 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -34,7 +34,7 @@
 #define HUGETLBFS_MAGIC	0x958458f6
 
 static struct super_operations hugetlbfs_ops;
-static struct address_space_operations hugetlbfs_aops;
+static const struct address_space_operations hugetlbfs_aops;
 const struct file_operations hugetlbfs_file_operations;
 static struct inode_operations hugetlbfs_dir_inode_operations;
 static struct inode_operations hugetlbfs_inode_operations;
@@ -547,7 +547,7 @@ static void hugetlbfs_destroy_inode(struct inode *inode)
 	kmem_cache_free(hugetlbfs_inode_cachep, HUGETLBFS_I(inode));
 }
 
-static struct address_space_operations hugetlbfs_aops = {
+static const struct address_space_operations hugetlbfs_aops = {
 	.readpage	= hugetlbfs_readpage,
 	.prepare_write	= hugetlbfs_prepare_write,
 	.commit_write	= hugetlbfs_commit_write,
diff --git a/fs/inode.c b/fs/inode.c
index 3a2446a27d2c..f42961eb983b 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -102,7 +102,7 @@ static kmem_cache_t * inode_cachep __read_mostly;
 
 static struct inode *alloc_inode(struct super_block *sb)
 {
-	static struct address_space_operations empty_aops;
+	static const struct address_space_operations empty_aops;
 	static struct inode_operations empty_iops;
 	static const struct file_operations empty_fops;
 	struct inode *inode;
diff --git a/fs/isofs/compress.c b/fs/isofs/compress.c
index 4917315db732..3a39158cca96 100644
--- a/fs/isofs/compress.c
+++ b/fs/isofs/compress.c
@@ -312,7 +312,7 @@ eio:
 	return err;
 }
 
-struct address_space_operations zisofs_aops = {
+const struct address_space_operations zisofs_aops = {
 	.readpage = zisofs_readpage,
 	/* No sync_page operation supported? */
 	/* No bmap operation supported */
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 3f9c8ba1fa1f..bb11c7fb4019 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -1054,7 +1054,7 @@ static sector_t _isofs_bmap(struct address_space *mapping, sector_t block)
 	return generic_block_bmap(mapping,block,isofs_get_block);
 }
 
-static struct address_space_operations isofs_aops = {
+static const struct address_space_operations isofs_aops = {
 	.readpage = isofs_readpage,
 	.sync_page = block_sync_page,
 	.bmap = _isofs_bmap
diff --git a/fs/isofs/isofs.h b/fs/isofs/isofs.h
index b87ba066f5e7..e6308c8b5735 100644
--- a/fs/isofs/isofs.h
+++ b/fs/isofs/isofs.h
@@ -176,5 +176,5 @@ isofs_normalize_block_and_offset(struct iso_directory_record* de,
 
 extern struct inode_operations isofs_dir_inode_operations;
 extern const struct file_operations isofs_dir_operations;
-extern struct address_space_operations isofs_symlink_aops;
+extern const struct address_space_operations isofs_symlink_aops;
 extern struct export_operations isofs_export_ops;
diff --git a/fs/isofs/rock.c b/fs/isofs/rock.c
index 4326cb47f8fa..f3a1db3098de 100644
--- a/fs/isofs/rock.c
+++ b/fs/isofs/rock.c
@@ -754,6 +754,6 @@ error:
 	return -EIO;
 }
 
-struct address_space_operations isofs_symlink_aops = {
+const struct address_space_operations isofs_symlink_aops = {
 	.readpage = rock_ridge_symlink_readpage
 };
diff --git a/fs/isofs/zisofs.h b/fs/isofs/zisofs.h
index d78485d101c2..273795709155 100644
--- a/fs/isofs/zisofs.h
+++ b/fs/isofs/zisofs.h
@@ -15,7 +15,7 @@
  */
 
 #ifdef CONFIG_ZISOFS
-extern struct address_space_operations zisofs_aops;
+extern const struct address_space_operations zisofs_aops;
 extern int __init zisofs_init(void);
 extern void zisofs_cleanup(void);
 #endif
diff --git a/fs/jffs/inode-v23.c b/fs/jffs/inode-v23.c
index 9e46ea6da752..93068697a9bf 100644
--- a/fs/jffs/inode-v23.c
+++ b/fs/jffs/inode-v23.c
@@ -59,7 +59,7 @@ static const struct file_operations jffs_file_operations;
 static struct inode_operations jffs_file_inode_operations;
 static const struct file_operations jffs_dir_operations;
 static struct inode_operations jffs_dir_inode_operations;
-static struct address_space_operations jffs_address_operations;
+static const struct address_space_operations jffs_address_operations;
 
 kmem_cache_t     *node_cache = NULL;
 kmem_cache_t     *fm_cache = NULL;
@@ -1614,7 +1614,7 @@ jffs_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
 } /* jffs_ioctl()  */
 
 
-static struct address_space_operations jffs_address_operations = {
+static const struct address_space_operations jffs_address_operations = {
 	.readpage	= jffs_readpage,
 	.prepare_write	= jffs_prepare_write,
 	.commit_write	= jffs_commit_write,
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index bb8844f40e48..3ed6e3e120b6 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -62,7 +62,7 @@ struct inode_operations jffs2_file_inode_operations =
 	.removexattr =	jffs2_removexattr
 };
 
-struct address_space_operations jffs2_file_address_operations =
+const struct address_space_operations jffs2_file_address_operations =
 {
 	.readpage =	jffs2_readpage,
 	.prepare_write =jffs2_prepare_write,
diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h
index 6b5223565405..9f41fc01a371 100644
--- a/fs/jffs2/os-linux.h
+++ b/fs/jffs2/os-linux.h
@@ -158,7 +158,7 @@ extern struct inode_operations jffs2_dir_inode_operations;
 /* file.c */
 extern const struct file_operations jffs2_file_operations;
 extern struct inode_operations jffs2_file_inode_operations;
-extern struct address_space_operations jffs2_file_address_operations;
+extern const struct address_space_operations jffs2_file_address_operations;
 int jffs2_fsync(struct file *, struct dentry *, int);
 int jffs2_do_readpage_unlock (struct inode *inode, struct page *pg);
 
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index 04eb78f1252e..43e3f566aad6 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -305,7 +305,7 @@ static ssize_t jfs_direct_IO(int rw, struct kiocb *iocb,
 				offset, nr_segs, jfs_get_block, NULL);
 }
 
-struct address_space_operations jfs_aops = {
+const struct address_space_operations jfs_aops = {
 	.readpage	= jfs_readpage,
 	.readpages	= jfs_readpages,
 	.writepage	= jfs_writepage,
diff --git a/fs/jfs/jfs_inode.h b/fs/jfs/jfs_inode.h
index c30072674464..b5c7da6190dc 100644
--- a/fs/jfs/jfs_inode.h
+++ b/fs/jfs/jfs_inode.h
@@ -33,7 +33,7 @@ extern void jfs_free_zero_link(struct inode *);
 extern struct dentry *jfs_get_parent(struct dentry *dentry);
 extern void jfs_set_inode_flags(struct inode *);
 
-extern struct address_space_operations jfs_aops;
+extern const struct address_space_operations jfs_aops;
 extern struct inode_operations jfs_dir_inode_operations;
 extern const struct file_operations jfs_dir_operations;
 extern struct inode_operations jfs_file_inode_operations;
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index 7f6e88039700..e1e0a6e6ebdf 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -577,7 +577,7 @@ static void metapage_invalidatepage(struct page *page, unsigned long offset)
 	metapage_releasepage(page, 0);
 }
 
-struct address_space_operations jfs_metapage_aops = {
+const struct address_space_operations jfs_metapage_aops = {
 	.readpage	= metapage_readpage,
 	.writepage	= metapage_writepage,
 	.sync_page	= block_sync_page,
diff --git a/fs/jfs/jfs_metapage.h b/fs/jfs/jfs_metapage.h
index f0b7d3282b07..d17a3290f5aa 100644
--- a/fs/jfs/jfs_metapage.h
+++ b/fs/jfs/jfs_metapage.h
@@ -139,7 +139,7 @@ static inline void metapage_homeok(struct metapage *mp)
 	put_metapage(mp);
 }
 
-extern struct address_space_operations jfs_metapage_aops;
+extern const struct address_space_operations jfs_metapage_aops;
 
 /*
  * This routines invalidate all pages for an extent.
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index a6fb509b7341..9ea91c5eeb7b 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -335,7 +335,7 @@ static sector_t minix_bmap(struct address_space *mapping, sector_t block)
 {
 	return generic_block_bmap(mapping,block,minix_get_block);
 }
-static struct address_space_operations minix_aops = {
+static const struct address_space_operations minix_aops = {
 	.readpage = minix_readpage,
 	.writepage = minix_writepage,
 	.sync_page = block_sync_page,
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index 90d2ea28f333..6c51c1198464 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -105,7 +105,7 @@ static struct super_operations ncp_sops =
 
 extern struct dentry_operations ncp_root_dentry_operations;
 #if defined(CONFIG_NCPFS_EXTRAS) || defined(CONFIG_NCPFS_NFS_NS)
-extern struct address_space_operations ncp_symlink_aops;
+extern const struct address_space_operations ncp_symlink_aops;
 extern int ncp_symlink(struct inode*, struct dentry*, const char*);
 #endif
 
diff --git a/fs/ncpfs/symlink.c b/fs/ncpfs/symlink.c
index e935f1b34bc2..f76b1392a012 100644
--- a/fs/ncpfs/symlink.c
+++ b/fs/ncpfs/symlink.c
@@ -99,7 +99,7 @@ fail:
 /*
  * symlinks can't do much...
  */
-struct address_space_operations ncp_symlink_aops = {
+const struct address_space_operations ncp_symlink_aops = {
 	.readpage	= ncp_symlink_readpage,
 };
 	
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index add289138836..cc2b874ad5a4 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -315,7 +315,7 @@ static int nfs_release_page(struct page *page, gfp_t gfp)
 	return !nfs_wb_page(page->mapping->host, page);
 }
 
-struct address_space_operations nfs_file_aops = {
+const struct address_space_operations nfs_file_aops = {
 	.readpage = nfs_readpage,
 	.readpages = nfs_readpages,
 	.set_page_dirty = __set_page_dirty_nobuffers,
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index 580412d330cb..bc579bfdfbd8 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -1544,7 +1544,7 @@ err_out:
 /**
  * ntfs_aops - general address space operations for inodes and attributes
  */
-struct address_space_operations ntfs_aops = {
+const struct address_space_operations ntfs_aops = {
 	.readpage	= ntfs_readpage,	/* Fill page with data. */
 	.sync_page	= block_sync_page,	/* Currently, just unplugs the
 						   disk request queue. */
@@ -1560,7 +1560,7 @@ struct address_space_operations ntfs_aops = {
  * ntfs_mst_aops - general address space operations for mst protecteed inodes
  *		   and attributes
  */
-struct address_space_operations ntfs_mst_aops = {
+const struct address_space_operations ntfs_mst_aops = {
 	.readpage	= ntfs_readpage,	/* Fill page with data. */
 	.sync_page	= block_sync_page,	/* Currently, just unplugs the
 						   disk request queue. */
diff --git a/fs/ntfs/ntfs.h b/fs/ntfs/ntfs.h
index bf7b3d7c0930..ddd3d503097c 100644
--- a/fs/ntfs/ntfs.h
+++ b/fs/ntfs/ntfs.h
@@ -57,8 +57,8 @@ extern struct kmem_cache *ntfs_attr_ctx_cache;
 extern struct kmem_cache *ntfs_index_ctx_cache;
 
 /* The various operations structs defined throughout the driver files. */
-extern struct address_space_operations ntfs_aops;
-extern struct address_space_operations ntfs_mst_aops;
+extern const struct address_space_operations ntfs_aops;
+extern const struct address_space_operations ntfs_mst_aops;
 
 extern const struct  file_operations ntfs_file_ops;
 extern struct inode_operations ntfs_file_inode_ops;
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 47152bf9a7f2..cca71317b6d6 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -666,7 +666,7 @@ out:
 	return ret;
 }
 
-struct address_space_operations ocfs2_aops = {
+const struct address_space_operations ocfs2_aops = {
 	.readpage	= ocfs2_readpage,
 	.writepage	= ocfs2_writepage,
 	.prepare_write	= ocfs2_prepare_write,
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 84c507961287..35140f6cf840 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -114,7 +114,7 @@ static inline struct ocfs2_inode_info *OCFS2_I(struct inode *inode)
 
 extern kmem_cache_t *ocfs2_inode_cache;
 
-extern struct address_space_operations ocfs2_aops;
+extern const struct address_space_operations ocfs2_aops;
 
 struct buffer_head *ocfs2_bread(struct inode *inode, int block,
 				int *err, int reada);
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index 2f24c46f72a1..8bc182a88748 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -450,7 +450,7 @@ static sector_t qnx4_bmap(struct address_space *mapping, sector_t block)
 {
 	return generic_block_bmap(mapping,block,qnx4_get_block);
 }
-static struct address_space_operations qnx4_aops = {
+static const struct address_space_operations qnx4_aops = {
 	.readpage	= qnx4_readpage,
 	.writepage	= qnx4_writepage,
 	.sync_page	= block_sync_page,
diff --git a/fs/ramfs/file-mmu.c b/fs/ramfs/file-mmu.c
index 00a933eb820c..86f14cacf641 100644
--- a/fs/ramfs/file-mmu.c
+++ b/fs/ramfs/file-mmu.c
@@ -26,7 +26,7 @@
 
 #include <linux/fs.h>
 
-struct address_space_operations ramfs_aops = {
+const struct address_space_operations ramfs_aops = {
 	.readpage	= simple_readpage,
 	.prepare_write	= simple_prepare_write,
 	.commit_write	= simple_commit_write
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index f443a84b98a5..99fffc9e1bfd 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -27,7 +27,7 @@
 
 static int ramfs_nommu_setattr(struct dentry *, struct iattr *);
 
-struct address_space_operations ramfs_aops = {
+const struct address_space_operations ramfs_aops = {
 	.readpage		= simple_readpage,
 	.prepare_write		= simple_prepare_write,
 	.commit_write		= simple_commit_write
diff --git a/fs/ramfs/internal.h b/fs/ramfs/internal.h
index 313237631b49..c2bb58e74653 100644
--- a/fs/ramfs/internal.h
+++ b/fs/ramfs/internal.h
@@ -10,6 +10,6 @@
  */
 
 
-extern struct address_space_operations ramfs_aops;
+extern const struct address_space_operations ramfs_aops;
 extern const struct file_operations ramfs_file_operations;
 extern struct inode_operations ramfs_file_inode_operations;
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 9857e50f85e7..a24858a632fa 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -2996,7 +2996,7 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
 	return error;
 }
 
-struct address_space_operations reiserfs_address_space_operations = {
+const struct address_space_operations reiserfs_address_space_operations = {
 	.writepage = reiserfs_writepage,
 	.readpage = reiserfs_readpage,
 	.readpages = reiserfs_readpages,
diff --git a/fs/romfs/inode.c b/fs/romfs/inode.c
index 283fbc6b8eea..22eed61ebf69 100644
--- a/fs/romfs/inode.c
+++ b/fs/romfs/inode.c
@@ -459,7 +459,7 @@ err_out:
 
 /* Mapping from our types to the kernel */
 
-static struct address_space_operations romfs_aops = {
+static const struct address_space_operations romfs_aops = {
 	.readpage = romfs_readpage
 };
 
diff --git a/fs/smbfs/file.c b/fs/smbfs/file.c
index ed9a24d19d7d..dae67048baba 100644
--- a/fs/smbfs/file.c
+++ b/fs/smbfs/file.c
@@ -306,7 +306,7 @@ static int smb_commit_write(struct file *file, struct page *page,
 	return status;
 }
 
-struct address_space_operations smb_file_aops = {
+const struct address_space_operations smb_file_aops = {
 	.readpage = smb_readpage,
 	.writepage = smb_writepage,
 	.prepare_write = smb_prepare_write,
diff --git a/fs/smbfs/proto.h b/fs/smbfs/proto.h
index 972ed7dad388..34fb462b2379 100644
--- a/fs/smbfs/proto.h
+++ b/fs/smbfs/proto.h
@@ -63,7 +63,7 @@ extern int smb_revalidate_inode(struct dentry *dentry);
 extern int smb_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat);
 extern int smb_notify_change(struct dentry *dentry, struct iattr *attr);
 /* file.c */
-extern struct address_space_operations smb_file_aops;
+extern const struct address_space_operations smb_file_aops;
 extern const struct file_operations smb_file_operations;
 extern struct inode_operations smb_file_inode_operations;
 /* ioctl.c */
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index f0b347bd12ca..5e0e31cc46f5 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -16,7 +16,7 @@
 
 extern struct super_block * sysfs_sb;
 
-static struct address_space_operations sysfs_aops = {
+static const struct address_space_operations sysfs_aops = {
 	.readpage	= simple_readpage,
 	.prepare_write	= simple_prepare_write,
 	.commit_write	= simple_commit_write
diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c
index 86f5f8d43d0f..f2bcccd1d6fc 100644
--- a/fs/sysv/itree.c
+++ b/fs/sysv/itree.c
@@ -465,7 +465,7 @@ static sector_t sysv_bmap(struct address_space *mapping, sector_t block)
 {
 	return generic_block_bmap(mapping,block,get_block);
 }
-struct address_space_operations sysv_aops = {
+const struct address_space_operations sysv_aops = {
 	.readpage = sysv_readpage,
 	.writepage = sysv_writepage,
 	.sync_page = block_sync_page,
diff --git a/fs/sysv/sysv.h b/fs/sysv/sysv.h
index 393a480e4deb..9dcc82120935 100644
--- a/fs/sysv/sysv.h
+++ b/fs/sysv/sysv.h
@@ -161,7 +161,7 @@ extern struct inode_operations sysv_dir_inode_operations;
 extern struct inode_operations sysv_fast_symlink_inode_operations;
 extern const struct file_operations sysv_file_operations;
 extern const struct file_operations sysv_dir_operations;
-extern struct address_space_operations sysv_aops;
+extern const struct address_space_operations sysv_aops;
 extern struct super_operations sysv_sops;
 extern struct dentry_operations sysv_dentry_operations;
 
diff --git a/fs/udf/file.c b/fs/udf/file.c
index e34b00e303f1..a59e5f33daf6 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -95,7 +95,7 @@ static int udf_adinicb_commit_write(struct file *file, struct page *page, unsign
 	return 0;
 }
 
-struct address_space_operations udf_adinicb_aops = {
+const struct address_space_operations udf_adinicb_aops = {
 	.readpage		= udf_adinicb_readpage,
 	.writepage		= udf_adinicb_writepage,
 	.sync_page		= block_sync_page,
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 2983afd5e7fd..605f5111b6d8 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -132,7 +132,7 @@ static sector_t udf_bmap(struct address_space *mapping, sector_t block)
 	return generic_block_bmap(mapping,block,udf_get_block);
 }
 
-struct address_space_operations udf_aops = {
+const struct address_space_operations udf_aops = {
 	.readpage		= udf_readpage,
 	.writepage		= udf_writepage,
 	.sync_page		= block_sync_page,
diff --git a/fs/udf/symlink.c b/fs/udf/symlink.c
index 674bb40edc83..ba068a786563 100644
--- a/fs/udf/symlink.c
+++ b/fs/udf/symlink.c
@@ -113,6 +113,6 @@ out:
 /*
  * symlinks can't do much...
  */
-struct address_space_operations udf_symlink_aops = {
+const struct address_space_operations udf_symlink_aops = {
 	.readpage		= udf_symlink_filler,
 };
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index 023e19ba5a2e..2f992387cc9e 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -47,9 +47,9 @@ extern struct inode_operations udf_dir_inode_operations;
 extern const struct file_operations udf_dir_operations;
 extern struct inode_operations udf_file_inode_operations;
 extern const struct file_operations udf_file_operations;
-extern struct address_space_operations udf_aops;
-extern struct address_space_operations udf_adinicb_aops;
-extern struct address_space_operations udf_symlink_aops;
+extern const struct address_space_operations udf_aops;
+extern const struct address_space_operations udf_adinicb_aops;
+extern const struct address_space_operations udf_symlink_aops;
 
 struct udf_fileident_bh
 {
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index 259bd196099d..8e1f90e42040 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -574,7 +574,7 @@ static sector_t ufs_bmap(struct address_space *mapping, sector_t block)
 {
 	return generic_block_bmap(mapping,block,ufs_getfrag_block);
 }
-struct address_space_operations ufs_aops = {
+const struct address_space_operations ufs_aops = {
 	.readpage = ufs_readpage,
 	.writepage = ufs_writepage,
 	.sync_page = block_sync_page,
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 3e807b828e22..c40f81ba9b13 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -1454,7 +1454,7 @@ xfs_vm_invalidatepage(
 	block_invalidatepage(page, offset);
 }
 
-struct address_space_operations xfs_address_space_operations = {
+const struct address_space_operations xfs_address_space_operations = {
 	.readpage		= xfs_vm_readpage,
 	.readpages		= xfs_vm_readpages,
 	.writepage		= xfs_vm_writepage,
diff --git a/fs/xfs/linux-2.6/xfs_aops.h b/fs/xfs/linux-2.6/xfs_aops.h
index 706d8c781b8a..2244e516b66a 100644
--- a/fs/xfs/linux-2.6/xfs_aops.h
+++ b/fs/xfs/linux-2.6/xfs_aops.h
@@ -40,7 +40,7 @@ typedef struct xfs_ioend {
 	struct work_struct	io_work;	/* xfsdatad work queue */
 } xfs_ioend_t;
 
-extern struct address_space_operations xfs_address_space_operations;
+extern const struct address_space_operations xfs_address_space_operations;
 extern int xfs_get_blocks(struct inode *, sector_t, struct buffer_head *, int);
 
 #endif /* __XFS_AOPS_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 26fed0756f01..2af528dcfb04 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -1520,7 +1520,7 @@ xfs_mapping_buftarg(
 	struct backing_dev_info	*bdi;
 	struct inode		*inode;
 	struct address_space	*mapping;
-	static struct address_space_operations mapping_aops = {
+	static const struct address_space_operations mapping_aops = {
 		.sync_page = block_sync_page,
 		.migratepage = fail_migrate_page,
 	};
diff --git a/include/linux/coda_linux.h b/include/linux/coda_linux.h
index 7b5c5df5cb69..be512cc98791 100644
--- a/include/linux/coda_linux.h
+++ b/include/linux/coda_linux.h
@@ -27,8 +27,8 @@ extern struct inode_operations coda_dir_inode_operations;
 extern struct inode_operations coda_file_inode_operations;
 extern struct inode_operations coda_ioctl_inode_operations;
 
-extern struct address_space_operations coda_file_aops;
-extern struct address_space_operations coda_symlink_aops;
+extern const struct address_space_operations coda_file_aops;
+extern const struct address_space_operations coda_symlink_aops;
 
 extern const struct file_operations coda_dir_operations;
 extern const struct file_operations coda_file_operations;
diff --git a/include/linux/efs_fs.h b/include/linux/efs_fs.h
index fbfa6b52e2fb..278ef4495819 100644
--- a/include/linux/efs_fs.h
+++ b/include/linux/efs_fs.h
@@ -38,7 +38,7 @@ struct statfs;
 
 extern struct inode_operations efs_dir_inode_operations;
 extern const struct file_operations efs_dir_operations;
-extern struct address_space_operations efs_symlink_aops;
+extern const struct address_space_operations efs_symlink_aops;
 
 extern void efs_read_inode(struct inode *);
 extern efs_block_t efs_map_block(struct inode *, efs_block_t);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 2d8b348c1192..e04a5cfe874f 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -392,7 +392,7 @@ struct address_space {
 	unsigned int		truncate_count;	/* Cover race condition with truncate */
 	unsigned long		nrpages;	/* number of total pages */
 	pgoff_t			writeback_index;/* writeback starts here */
-	struct address_space_operations *a_ops;	/* methods */
+	const struct address_space_operations *a_ops;	/* methods */
 	unsigned long		flags;		/* error bits/gfp mask */
 	struct backing_dev_info *backing_dev_info; /* device readahead, etc */
 	spinlock_t		private_lock;	/* for use by the address_space */
@@ -1405,7 +1405,7 @@ extern void bd_forget(struct inode *inode);
 extern void bdput(struct block_device *);
 extern struct block_device *open_by_devnum(dev_t, unsigned);
 extern const struct file_operations def_blk_fops;
-extern struct address_space_operations def_blk_aops;
+extern const struct address_space_operations def_blk_aops;
 extern const struct file_operations def_chr_fops;
 extern const struct file_operations bad_sock_fops;
 extern const struct file_operations def_fifo_fops;
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 0a1740b2532e..d90b1bb37563 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -335,7 +335,7 @@ extern struct inode_operations nfs_file_inode_operations;
 extern struct inode_operations nfs3_file_inode_operations;
 #endif /* CONFIG_NFS_V3 */
 extern const struct file_operations nfs_file_operations;
-extern struct address_space_operations nfs_file_aops;
+extern const struct address_space_operations nfs_file_aops;
 
 static inline struct rpc_cred *nfs_file_cred(struct file *file)
 {
diff --git a/include/linux/reiserfs_fs.h b/include/linux/reiserfs_fs.h
index 5676c4210e2c..daa2d83cefe8 100644
--- a/include/linux/reiserfs_fs.h
+++ b/include/linux/reiserfs_fs.h
@@ -1973,7 +1973,7 @@ void reiserfs_unmap_buffer(struct buffer_head *);
 /* file.c */
 extern struct inode_operations reiserfs_file_inode_operations;
 extern const struct file_operations reiserfs_file_operations;
-extern struct address_space_operations reiserfs_address_space_operations;
+extern const struct address_space_operations reiserfs_address_space_operations;
 
 /* fix_nodes.c */
 
diff --git a/include/linux/ufs_fs.h b/include/linux/ufs_fs.h
index 914f911325be..e39b7cc43390 100644
--- a/include/linux/ufs_fs.h
+++ b/include/linux/ufs_fs.h
@@ -966,7 +966,7 @@ extern void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de,
 extern struct inode_operations ufs_file_inode_operations;
 extern const struct file_operations ufs_file_operations;
 
-extern struct address_space_operations ufs_aops;
+extern const struct address_space_operations ufs_aops;
 
 /* ialloc.c */
 extern void ufs_free_inode (struct inode *inode);
diff --git a/mm/filemap.c b/mm/filemap.c
index d504d6e98886..4082b3b3cea7 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2069,7 +2069,7 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
 {
 	struct file *file = iocb->ki_filp;
 	struct address_space * mapping = file->f_mapping;
-	struct address_space_operations *a_ops = mapping->a_ops;
+	const struct address_space_operations *a_ops = mapping->a_ops;
 	struct inode 	*inode = mapping->host;
 	long		status = 0;
 	struct page	*page;
@@ -2219,7 +2219,7 @@ __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
 				unsigned long nr_segs, loff_t *ppos)
 {
 	struct file *file = iocb->ki_filp;
-	struct address_space * mapping = file->f_mapping;
+	const struct address_space * mapping = file->f_mapping;
 	size_t ocount;		/* original count */
 	size_t count;		/* after file limit checks */
 	struct inode 	*inode = mapping->host;
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index b960ac8e5918..b4fd0d7c9bfb 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -273,7 +273,7 @@ __xip_file_write(struct file *filp, const char __user *buf,
 		  size_t count, loff_t pos, loff_t *ppos)
 {
 	struct address_space * mapping = filp->f_mapping;
-	struct address_space_operations *a_ops = mapping->a_ops;
+	const struct address_space_operations *a_ops = mapping->a_ops;
 	struct inode 	*inode = mapping->host;
 	long		status = 0;
 	struct page	*page;
diff --git a/mm/shmem.c b/mm/shmem.c
index 38bc3334f263..ea64c07cbe72 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -174,7 +174,7 @@ static inline void shmem_unacct_blocks(unsigned long flags, long pages)
 }
 
 static struct super_operations shmem_ops;
-static struct address_space_operations shmem_aops;
+static const struct address_space_operations shmem_aops;
 static struct file_operations shmem_file_operations;
 static struct inode_operations shmem_inode_operations;
 static struct inode_operations shmem_dir_inode_operations;
@@ -2162,7 +2162,7 @@ static void destroy_inodecache(void)
 		printk(KERN_INFO "shmem_inode_cache: not all structures were freed\n");
 }
 
-static struct address_space_operations shmem_aops = {
+static const struct address_space_operations shmem_aops = {
 	.writepage	= shmem_writepage,
 	.set_page_dirty	= __set_page_dirty_nobuffers,
 #ifdef CONFIG_TMPFS
diff --git a/mm/swap_state.c b/mm/swap_state.c
index e0e1583f32c2..7535211bb495 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -24,7 +24,7 @@
  * vmscan's shrink_list, to make sync_page look nicer, and to allow
  * future use of radix_tree tags in the swap cache.
  */
-static struct address_space_operations swap_aops = {
+static const struct address_space_operations swap_aops = {
 	.writepage	= swap_writepage,
 	.sync_page	= block_sync_page,
 	.set_page_dirty	= __set_page_dirty_nobuffers,
-- 
cgit v1.2.3


From f71d20e961474dde77e6558396efb93d6ac80a4b Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Wed, 28 Jun 2006 04:26:45 -0700
Subject: [PATCH] Add EXPORT_UNUSED_SYMBOL and EXPORT_UNUSED_SYMBOL_GPL

Temporarily add EXPORT_UNUSED_SYMBOL and EXPORT_UNUSED_SYMBOL_GPL.  These
will be used as a transition measure for symbols that aren't used in the
kernel and are on the way out.  When a module uses such a symbol, a warning
is printk'd at modprobe time.

The main reason for removing unused exports is size: eacho export takes
roughly between 100 and 150 bytes of kernel space in the binary.  This
patch gives users the option to immediately get this size gain via a config
option.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/feature-removal-schedule.txt | 10 ++++
 include/asm-generic/vmlinux.lds.h          | 28 +++++++++++
 include/linux/module.h                     | 20 ++++++++
 kernel/module.c                            | 78 +++++++++++++++++++++++++++++-
 lib/Kconfig.debug                          | 16 ++++++
 5 files changed, 150 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt
index 027285d0c26c..033ac91da07a 100644
--- a/Documentation/feature-removal-schedule.txt
+++ b/Documentation/feature-removal-schedule.txt
@@ -177,6 +177,16 @@ Who:	Jean Delvare <khali@linux-fr.org>
 
 ---------------------------
 
+What:	Unused EXPORT_SYMBOL/EXPORT_SYMBOL_GPL exports
+	(temporary transition config option provided until then)
+	The transition config option will also be removed at the same time.
+When:	before 2.6.19
+Why:	Unused symbols are both increasing the size of the kernel binary
+	and are often a sign of "wrong API"
+Who:	Arjan van de Ven <arjan@linux.intel.com>
+
+---------------------------
+
 What:	remove EXPORT_SYMBOL(tasklist_lock)
 When:	August 2006
 Files:	kernel/fork.c
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 9d11550b4818..db5a3732f106 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -58,6 +58,20 @@
 		VMLINUX_SYMBOL(__stop___ksymtab_gpl) = .;		\
 	}								\
 									\
+	/* Kernel symbol table: Normal unused symbols */		\
+	__ksymtab_unused  : AT(ADDR(__ksymtab_unused) - LOAD_OFFSET) {	\
+		VMLINUX_SYMBOL(__start___ksymtab_unused) = .;		\
+		*(__ksymtab_unused)					\
+		VMLINUX_SYMBOL(__stop___ksymtab_unused) = .;		\
+	}								\
+									\
+	/* Kernel symbol table: GPL-only unused symbols */		\
+	__ksymtab_unused_gpl : AT(ADDR(__ksymtab_unused_gpl) - LOAD_OFFSET) { \
+		VMLINUX_SYMBOL(__start___ksymtab_unused_gpl) = .;	\
+		*(__ksymtab_unused_gpl)					\
+		VMLINUX_SYMBOL(__stop___ksymtab_unused_gpl) = .;	\
+	}								\
+									\
 	/* Kernel symbol table: GPL-future-only symbols */		\
 	__ksymtab_gpl_future : AT(ADDR(__ksymtab_gpl_future) - LOAD_OFFSET) { \
 		VMLINUX_SYMBOL(__start___ksymtab_gpl_future) = .;	\
@@ -79,6 +93,20 @@
 		VMLINUX_SYMBOL(__stop___kcrctab_gpl) = .;		\
 	}								\
 									\
+	/* Kernel symbol table: Normal unused symbols */		\
+	__kcrctab_unused  : AT(ADDR(__kcrctab_unused) - LOAD_OFFSET) {	\
+		VMLINUX_SYMBOL(__start___kcrctab_unused) = .;		\
+		*(__kcrctab_unused)					\
+		VMLINUX_SYMBOL(__stop___kcrctab_unused) = .;		\
+	}								\
+									\
+	/* Kernel symbol table: GPL-only unused symbols */		\
+	__kcrctab_unused_gpl : AT(ADDR(__kcrctab_unused_gpl) - LOAD_OFFSET) { \
+		VMLINUX_SYMBOL(__start___kcrctab_unused_gpl) = .;	\
+		*(__kcrctab_unused_gpl)					\
+		VMLINUX_SYMBOL(__stop___kcrctab_unused_gpl) = .;	\
+	}								\
+									\
 	/* Kernel symbol table: GPL-future-only symbols */		\
 	__kcrctab_gpl_future : AT(ADDR(__kcrctab_gpl_future) - LOAD_OFFSET) { \
 		VMLINUX_SYMBOL(__start___kcrctab_gpl_future) = .;	\
diff --git a/include/linux/module.h b/include/linux/module.h
index 9ebbb74b7b72..9e9dc7c24d95 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -203,6 +203,15 @@ void *__symbol_get_gpl(const char *symbol);
 #define EXPORT_SYMBOL_GPL_FUTURE(sym)				\
 	__EXPORT_SYMBOL(sym, "_gpl_future")
 
+
+#ifdef CONFIG_UNUSED_SYMBOLS
+#define EXPORT_UNUSED_SYMBOL(sym) __EXPORT_SYMBOL(sym, "_unused")
+#define EXPORT_UNUSED_SYMBOL_GPL(sym) __EXPORT_SYMBOL(sym, "_unused_gpl")
+#else
+#define EXPORT_UNUSED_SYMBOL(sym)
+#define EXPORT_UNUSED_SYMBOL_GPL(sym)
+#endif
+
 #endif
 
 struct module_ref
@@ -261,6 +270,15 @@ struct module
 	unsigned int num_gpl_syms;
 	const unsigned long *gpl_crcs;
 
+	/* unused exported symbols. */
+	const struct kernel_symbol *unused_syms;
+	unsigned int num_unused_syms;
+	const unsigned long *unused_crcs;
+	/* GPL-only, unused exported symbols. */
+	const struct kernel_symbol *unused_gpl_syms;
+	unsigned int num_unused_gpl_syms;
+	const unsigned long *unused_gpl_crcs;
+
 	/* symbols that will be GPL-only in the near future. */
 	const struct kernel_symbol *gpl_future_syms;
 	unsigned int num_gpl_future_syms;
@@ -456,6 +474,8 @@ void module_remove_driver(struct device_driver *);
 #define EXPORT_SYMBOL(sym)
 #define EXPORT_SYMBOL_GPL(sym)
 #define EXPORT_SYMBOL_GPL_FUTURE(sym)
+#define EXPORT_UNUSED_SYMBOL(sym)
+#define EXPORT_UNUSED_SYMBOL_GPL(sym)
 
 /* Given an address, look for it in the exception tables. */
 static inline const struct exception_table_entry *
diff --git a/kernel/module.c b/kernel/module.c
index 10e5b872adf6..03b738172a8c 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1,4 +1,4 @@
-/* Rewritten by Rusty Russell, on the backs of many others...
+/*
    Copyright (C) 2002 Richard Henderson
    Copyright (C) 2001 Rusty Russell, 2002 Rusty Russell IBM.
 
@@ -122,9 +122,17 @@ extern const struct kernel_symbol __start___ksymtab_gpl[];
 extern const struct kernel_symbol __stop___ksymtab_gpl[];
 extern const struct kernel_symbol __start___ksymtab_gpl_future[];
 extern const struct kernel_symbol __stop___ksymtab_gpl_future[];
+extern const struct kernel_symbol __start___ksymtab_unused[];
+extern const struct kernel_symbol __stop___ksymtab_unused[];
+extern const struct kernel_symbol __start___ksymtab_unused_gpl[];
+extern const struct kernel_symbol __stop___ksymtab_unused_gpl[];
+extern const struct kernel_symbol __start___ksymtab_gpl_future[];
+extern const struct kernel_symbol __stop___ksymtab_gpl_future[];
 extern const unsigned long __start___kcrctab[];
 extern const unsigned long __start___kcrctab_gpl[];
 extern const unsigned long __start___kcrctab_gpl_future[];
+extern const unsigned long __start___kcrctab_unused[];
+extern const unsigned long __start___kcrctab_unused_gpl[];
 
 #ifndef CONFIG_MODVERSIONS
 #define symversion(base, idx) NULL
@@ -144,6 +152,17 @@ static const struct kernel_symbol *lookup_symbol(const char *name,
 	return NULL;
 }
 
+static void printk_unused_warning(const char *name)
+{
+	printk(KERN_WARNING "Symbol %s is marked as UNUSED, "
+		"however this module is using it.\n", name);
+	printk(KERN_WARNING "This symbol will go away in the future.\n");
+	printk(KERN_WARNING "Please evalute if this is the right api to use, "
+		"and if it really is, submit a report the linux kernel "
+		"mailinglist together with submitting your code for "
+		"inclusion.\n");
+}
+
 /* Find a symbol, return value, crc and module which owns it */
 static unsigned long __find_symbol(const char *name,
 				   struct module **owner,
@@ -186,6 +205,25 @@ static unsigned long __find_symbol(const char *name,
 		return ks->value;
 	}
 
+	ks = lookup_symbol(name, __start___ksymtab_unused,
+				 __stop___ksymtab_unused);
+	if (ks) {
+		printk_unused_warning(name);
+		*crc = symversion(__start___kcrctab_unused,
+				  (ks - __start___ksymtab_unused));
+		return ks->value;
+	}
+
+	if (gplok)
+		ks = lookup_symbol(name, __start___ksymtab_unused_gpl,
+				 __stop___ksymtab_unused_gpl);
+	if (ks) {
+		printk_unused_warning(name);
+		*crc = symversion(__start___kcrctab_unused_gpl,
+				  (ks - __start___ksymtab_unused_gpl));
+		return ks->value;
+	}
+
 	/* Now try modules. */ 
 	list_for_each_entry(mod, &modules, list) {
 		*owner = mod;
@@ -204,6 +242,23 @@ static unsigned long __find_symbol(const char *name,
 				return ks->value;
 			}
 		}
+		ks = lookup_symbol(name, mod->unused_syms, mod->unused_syms + mod->num_unused_syms);
+		if (ks) {
+			printk_unused_warning(name);
+			*crc = symversion(mod->unused_crcs, (ks - mod->unused_syms));
+			return ks->value;
+		}
+
+		if (gplok) {
+			ks = lookup_symbol(name, mod->unused_gpl_syms,
+					   mod->unused_gpl_syms + mod->num_unused_gpl_syms);
+			if (ks) {
+				printk_unused_warning(name);
+				*crc = symversion(mod->unused_gpl_crcs,
+						  (ks - mod->unused_gpl_syms));
+				return ks->value;
+			}
+		}
 		ks = lookup_symbol(name, mod->gpl_future_syms,
 				   (mod->gpl_future_syms +
 				    mod->num_gpl_future_syms));
@@ -1407,6 +1462,8 @@ static struct module *load_module(void __user *umod,
 		exportindex, modindex, obsparmindex, infoindex, gplindex,
 		crcindex, gplcrcindex, versindex, pcpuindex, gplfutureindex,
 		gplfuturecrcindex, unwindex = 0;
+	unsigned int unusedindex, unusedcrcindex, unusedgplindex,
+			unusedgplcrcindex;
 	struct module *mod;
 	long err = 0;
 	void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */
@@ -1487,9 +1544,13 @@ static struct module *load_module(void __user *umod,
 	exportindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab");
 	gplindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_gpl");
 	gplfutureindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_gpl_future");
+	unusedindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_unused");
+	unusedgplindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_unused_gpl");
 	crcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab");
 	gplcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_gpl");
 	gplfuturecrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_gpl_future");
+	unusedcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_unused");
+	unusedgplcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_unused_gpl");
 	setupindex = find_sec(hdr, sechdrs, secstrings, "__param");
 	exindex = find_sec(hdr, sechdrs, secstrings, "__ex_table");
 	obsparmindex = find_sec(hdr, sechdrs, secstrings, "__obsparm");
@@ -1638,14 +1699,27 @@ static struct module *load_module(void __user *umod,
 		mod->gpl_crcs = (void *)sechdrs[gplcrcindex].sh_addr;
 	mod->num_gpl_future_syms = sechdrs[gplfutureindex].sh_size /
 					sizeof(*mod->gpl_future_syms);
+	mod->num_unused_syms = sechdrs[unusedindex].sh_size /
+					sizeof(*mod->unused_syms);
+	mod->num_unused_gpl_syms = sechdrs[unusedgplindex].sh_size /
+					sizeof(*mod->unused_gpl_syms);
 	mod->gpl_future_syms = (void *)sechdrs[gplfutureindex].sh_addr;
 	if (gplfuturecrcindex)
 		mod->gpl_future_crcs = (void *)sechdrs[gplfuturecrcindex].sh_addr;
 
+	mod->unused_syms = (void *)sechdrs[unusedindex].sh_addr;
+	if (unusedcrcindex)
+		mod->unused_crcs = (void *)sechdrs[unusedcrcindex].sh_addr;
+	mod->unused_gpl_syms = (void *)sechdrs[unusedgplindex].sh_addr;
+	if (unusedgplcrcindex)
+		mod->unused_crcs = (void *)sechdrs[unusedgplcrcindex].sh_addr;
+
 #ifdef CONFIG_MODVERSIONS
 	if ((mod->num_syms && !crcindex) || 
 	    (mod->num_gpl_syms && !gplcrcindex) ||
-	    (mod->num_gpl_future_syms && !gplfuturecrcindex)) {
+	    (mod->num_gpl_future_syms && !gplfuturecrcindex) ||
+	    (mod->num_unused_syms && !unusedcrcindex) ||
+	    (mod->num_unused_gpl_syms && !unusedgplcrcindex)) {
 		printk(KERN_WARNING "%s: No versions for exported symbols."
 		       " Tainting kernel.\n", mod->name);
 		add_taint(TAINT_FORCED_MODULE);
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 5330911ebd30..e4fcbd12cf6e 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -23,6 +23,22 @@ config MAGIC_SYSRQ
 	  keys are documented in <file:Documentation/sysrq.txt>. Don't say Y
 	  unless you really know what this hack does.
 
+config UNUSED_SYMBOLS
+	bool "Enable unused/obsolete exported symbols"
+	default y if X86
+	help
+	  Unused but exported symbols make the kernel needlessly bigger.  For
+	  that reason most of these unused exports will soon be removed.  This
+	  option is provided temporarily to provide a transition period in case
+	  some external kernel module needs one of these symbols anyway. If you
+	  encounter such a case in your module, consider if you are actually
+	  using the right API.  (rationale: since nobody in the kernel is using
+	  this in a module, there is a pretty good chance it's actually the
+	  wrong interface to use).  If you really need the symbol, please send a
+	  mail to the linux kernel mailing list mentioning the symbol and why
+	  you really need it, and what the merge plan to the mainline kernel for
+	  your module is.
+
 config DEBUG_KERNEL
 	bool "Kernel debugging"
 	help
-- 
cgit v1.2.3


From 817d6d3bceaf34c99f5343820f9b9e6021f0655c Mon Sep 17 00:00:00 2001
From: Paul Fulghum <paulkf@microgate.com>
Date: Wed, 28 Jun 2006 04:26:47 -0700
Subject: [PATCH] remove TTY_DONT_FLIP

Remove TTY_DONT_FLIP tty flag.  This flag was introduced in 2.1.X kernels
to prevent the N_TTY line discipline functions read_chan() and
n_tty_receive_buf() from running at the same time.  2.2.15 introduced
tty->read_lock to protect access to the N_TTY read buffer, which is the
only state requiring protection between these two functions.

The current TTY_DONT_FLIP implementation is broken for SMP, and is not
universally honored by drivers that send data directly to the line
discipline receive_buf function.

Because TTY_DONT_FLIP is not necessary, is broken in implementation, and is
not universally honored, it is removed.

Signed-off-by: Paul Fulghum <paulkf@microgate.com>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Cc: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/char/mxser.c         |  1 -
 drivers/char/n_tty.c         |  6 +-----
 drivers/char/pty.c           |  2 +-
 drivers/char/tty_io.c        | 15 ++-------------
 drivers/serial/crisv10.c     |  6 ------
 drivers/serial/jsm/jsm_tty.c |  7 -------
 drivers/usb/serial/ir-usb.c  |  3 +--
 include/linux/tty.h          |  1 -
 net/bluetooth/rfcomm/tty.c   |  8 ++------
 9 files changed, 7 insertions(+), 42 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/char/mxser.c b/drivers/char/mxser.c
index 645d9d713aec..72cfd09091e0 100644
--- a/drivers/char/mxser.c
+++ b/drivers/char/mxser.c
@@ -996,7 +996,6 @@ static int mxser_open(struct tty_struct *tty, struct file *filp)
 
 	info->session = current->signal->session;
 	info->pgrp = process_group(current);
-	clear_bit(TTY_DONT_FLIP, &tty->flags);
 
 	/*
 	status = mxser_get_msr(info->base, 0, info->port);
diff --git a/drivers/char/n_tty.c b/drivers/char/n_tty.c
index b9371d5bf790..603b9ade5eb0 100644
--- a/drivers/char/n_tty.c
+++ b/drivers/char/n_tty.c
@@ -1132,7 +1132,7 @@ static inline int input_available_p(struct tty_struct *tty, int amt)
  *	buffer, and once to drain the space from the (physical) beginning of
  *	the buffer to head pointer.
  *
- *	Called under the tty->atomic_read_lock sem and with TTY_DONT_FLIP set
+ *	Called under the tty->atomic_read_lock sem
  *
  */
  
@@ -1271,7 +1271,6 @@ do_it_again:
 	}
 
 	add_wait_queue(&tty->read_wait, &wait);
-	set_bit(TTY_DONT_FLIP, &tty->flags);
 	while (nr) {
 		/* First test for status change. */
 		if (tty->packet && tty->link->ctrl_status) {
@@ -1315,9 +1314,7 @@ do_it_again:
 				break;
 			}
 			n_tty_set_room(tty);
-			clear_bit(TTY_DONT_FLIP, &tty->flags);
 			timeout = schedule_timeout(timeout);
-			set_bit(TTY_DONT_FLIP, &tty->flags);
 			continue;
 		}
 		__set_current_state(TASK_RUNNING);
@@ -1394,7 +1391,6 @@ do_it_again:
 		if (time)
 			timeout = time;
 	}
-	clear_bit(TTY_DONT_FLIP, &tty->flags);
 	mutex_unlock(&tty->atomic_read_lock);
 	remove_wait_queue(&tty->read_wait, &wait);
 
diff --git a/drivers/char/pty.c b/drivers/char/pty.c
index 9b5a2c0e7008..0c17f61549b4 100644
--- a/drivers/char/pty.c
+++ b/drivers/char/pty.c
@@ -101,7 +101,7 @@ static void pty_unthrottle(struct tty_struct * tty)
  *
  * FIXME: Our pty_write method is called with our ldisc lock held but
  * not our partners. We can't just take the other one blindly without
- * risking deadlocks.  There is also the small matter of TTY_DONT_FLIP
+ * risking deadlocks.
  */
 static int pty_write(struct tty_struct * tty, const unsigned char *buf, int count)
 {
diff --git a/drivers/char/tty_io.c b/drivers/char/tty_io.c
index bd74e82d8a72..b846d87f2b56 100644
--- a/drivers/char/tty_io.c
+++ b/drivers/char/tty_io.c
@@ -784,11 +784,8 @@ restart:
 	}
 
 	clear_bit(TTY_LDISC, &tty->flags);
-	clear_bit(TTY_DONT_FLIP, &tty->flags);
-	if (o_tty) {
+	if (o_tty)
 		clear_bit(TTY_LDISC, &o_tty->flags);
-		clear_bit(TTY_DONT_FLIP, &o_tty->flags);
-	}
 	spin_unlock_irqrestore(&tty_ldisc_lock, flags);
 
 	/*
@@ -1955,7 +1952,6 @@ static void release_dev(struct file * filp)
 	 * race with the set_ldisc code path.
 	 */
 	clear_bit(TTY_LDISC, &tty->flags);
-	clear_bit(TTY_DONT_FLIP, &tty->flags);
 	cancel_delayed_work(&tty->buf.work);
 
 	/*
@@ -2784,13 +2780,6 @@ static void flush_to_ldisc(void *private_)
 	if (disc == NULL)	/*  !TTY_LDISC */
 		return;
 
-	if (test_bit(TTY_DONT_FLIP, &tty->flags)) {
-		/*
-		 * Do it after the next timer tick:
-		 */
-		schedule_delayed_work(&tty->buf.work, 1);
-		goto out;
-	}
 	spin_lock_irqsave(&tty->buf.lock, flags);
 	while((tbuf = tty->buf.head) != NULL) {
 		while ((count = tbuf->commit - tbuf->read) != 0) {
@@ -2809,7 +2798,7 @@ static void flush_to_ldisc(void *private_)
 		tty_buffer_free(tty, tbuf);
 	}
 	spin_unlock_irqrestore(&tty->buf.lock, flags);
-out:
+
 	tty_ldisc_deref(disc);
 }
 
diff --git a/drivers/serial/crisv10.c b/drivers/serial/crisv10.c
index 89700141f87e..5cacc5e74a92 100644
--- a/drivers/serial/crisv10.c
+++ b/drivers/serial/crisv10.c
@@ -2573,12 +2573,6 @@ static void flush_to_flip_buffer(struct e100_serial *info)
 
 	DFLIP(
 	  if (1) {
-
-		  if (test_bit(TTY_DONT_FLIP, &tty->flags)) {
-			  DEBUG_LOG(info->line, "*** TTY_DONT_FLIP set flip.count %i ***\n", tty->flip.count);
-			  DEBUG_LOG(info->line, "*** recv_cnt %i\n", info->recv_cnt);
-		  } else {
-		  }
 		  DEBUG_LOG(info->line, "*** rxtot %i\n", info->icount.rx);
 		  DEBUG_LOG(info->line, "ldisc %lu\n", tty->ldisc.chars_in_buffer(tty));
 		  DEBUG_LOG(info->line, "room  %lu\n", tty->ldisc.receive_room(tty));
diff --git a/drivers/serial/jsm/jsm_tty.c b/drivers/serial/jsm/jsm_tty.c
index 7d823705193c..f8262e6ad8d3 100644
--- a/drivers/serial/jsm/jsm_tty.c
+++ b/drivers/serial/jsm/jsm_tty.c
@@ -588,13 +588,6 @@ void jsm_input(struct jsm_channel *ch)
 	len = min(len, (N_TTY_BUF_SIZE - 1) - tp->read_cnt);
 	ld = tty_ldisc_ref(tp);
 
-	/*
-	 * If the DONT_FLIP flag is on, don't flush our buffer, and act
-	 * like the ld doesn't have any space to put the data right now.
-	 */
-	if (test_bit(TTY_DONT_FLIP, &tp->flags))
-		len = 0;
-
 	/*
 	 * If we were unable to get a reference to the ld,
 	 * don't flush our buffer, and act like the ld doesn't
diff --git a/drivers/usb/serial/ir-usb.c b/drivers/usb/serial/ir-usb.c
index 9432c7302275..d7f3f736a692 100644
--- a/drivers/usb/serial/ir-usb.c
+++ b/drivers/usb/serial/ir-usb.c
@@ -453,8 +453,7 @@ static void ir_read_bulk_callback (struct urb *urb, struct pt_regs *regs)
 			tty = port->tty;
 
 			/*
-			 *	FIXME: must not do this in IRQ context,
-			 *	must honour TTY_DONT_FLIP
+			 *	FIXME: must not do this in IRQ context
 			 */
 			tty->ldisc.receive_buf(
 				tty,
diff --git a/include/linux/tty.h b/include/linux/tty.h
index cb35ca50a0a6..341cc4552c00 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -259,7 +259,6 @@ struct tty_struct {
 #define TTY_DO_WRITE_WAKEUP 	5	/* Call write_wakeup after queuing new */
 #define TTY_PUSH 		6	/* n_tty private */
 #define TTY_CLOSING 		7	/* ->close() in progress */
-#define TTY_DONT_FLIP 		8	/* Defer buffer flip */
 #define TTY_LDISC 		9	/* Line discipline attached */
 #define TTY_HW_COOK_OUT 	14	/* Hardware can do output cooking */
 #define TTY_HW_COOK_IN 		15	/* Hardware can do input cooking */
diff --git a/net/bluetooth/rfcomm/tty.c b/net/bluetooth/rfcomm/tty.c
index 74368f79ee5d..b81fad893328 100644
--- a/net/bluetooth/rfcomm/tty.c
+++ b/net/bluetooth/rfcomm/tty.c
@@ -480,12 +480,8 @@ static void rfcomm_dev_data_ready(struct rfcomm_dlc *dlc, struct sk_buff *skb)
 
 	BT_DBG("dlc %p tty %p len %d", dlc, tty, skb->len);
 
-	if (test_bit(TTY_DONT_FLIP, &tty->flags)) {
-		tty_buffer_request_room(tty, skb->len);
-		tty_insert_flip_string(tty, skb->data, skb->len);
-		tty_flip_buffer_push(tty);
-	} else
-		tty->ldisc.receive_buf(tty, skb->data, NULL, skb->len);
+	tty_insert_flip_string(tty, skb->data, skb->len);
+	tty_flip_buffer_push(tty);
 
 	kfree_skb(skb);
 }
-- 
cgit v1.2.3


From 33b37a33c242542fac2980b8ccd90977388b7a8d Mon Sep 17 00:00:00 2001
From: Paul Fulghum <paulkf@microgate.com>
Date: Wed, 28 Jun 2006 04:26:49 -0700
Subject: [PATCH] remove active field from tty buffer structure

Remove 'active' field from tty buffer structure.  This was added in 2.6.16
as part of a patch to make the new tty buffering SMP safe.  This field is
unnecessary with the more intelligently written flush_to_ldisc that adds
receive_room handling.

Removing this field reverts to simpler logic where the tail buffer is
always the 'active' buffer, which should not be freed by flush_to_ldisc.
(active == buffer being filled with new data)

The result is simpler, smaller, and faster tty buffer code.

Signed-off-by: Paul Fulghum <paulkf@microgate.com>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Cc: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/char/tty_io.c    | 16 ++++------------
 include/linux/kbd_kern.h |  4 +---
 include/linux/tty.h      |  1 -
 include/linux/tty_flip.h |  2 +-
 4 files changed, 6 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/char/tty_io.c b/drivers/char/tty_io.c
index 1f03ebf165d9..8d19f7281f0b 100644
--- a/drivers/char/tty_io.c
+++ b/drivers/char/tty_io.c
@@ -267,7 +267,6 @@ static struct tty_buffer *tty_buffer_alloc(size_t size)
 	p->used = 0;
 	p->size = size;
 	p->next = NULL;
-	p->active = 0;
 	p->commit = 0;
 	p->read = 0;
 	p->char_buf_ptr = (char *)(p->data);
@@ -327,10 +326,9 @@ int tty_buffer_request_room(struct tty_struct *tty, size_t size)
 	/* OPTIMISATION: We could keep a per tty "zero" sized buffer to
 	   remove this conditional if its worth it. This would be invisible
 	   to the callers */
-	if ((b = tty->buf.tail) != NULL) {
+	if ((b = tty->buf.tail) != NULL)
 		left = b->size - b->used;
-		b->active = 1;
-	} else
+	else
 		left = 0;
 
 	if (left < size) {
@@ -338,12 +336,10 @@ int tty_buffer_request_room(struct tty_struct *tty, size_t size)
 		if ((n = tty_buffer_find(tty, size)) != NULL) {
 			if (b != NULL) {
 				b->next = n;
-				b->active = 0;
 				b->commit = b->used;
 			} else
 				tty->buf.head = n;
 			tty->buf.tail = n;
-			n->active = 1;
 		} else
 			size = left;
 	}
@@ -404,10 +400,8 @@ void tty_schedule_flip(struct tty_struct *tty)
 {
 	unsigned long flags;
 	spin_lock_irqsave(&tty->buf.lock, flags);
-	if (tty->buf.tail != NULL) {
-		tty->buf.tail->active = 0;
+	if (tty->buf.tail != NULL)
 		tty->buf.tail->commit = tty->buf.tail->used;
-	}
 	spin_unlock_irqrestore(&tty->buf.lock, flags);
 	schedule_delayed_work(&tty->buf.work, 1);
 }
@@ -2902,10 +2896,8 @@ void tty_flip_buffer_push(struct tty_struct *tty)
 {
 	unsigned long flags;
 	spin_lock_irqsave(&tty->buf.lock, flags);
-	if (tty->buf.tail != NULL) {
-		tty->buf.tail->active = 0;
+	if (tty->buf.tail != NULL)
 		tty->buf.tail->commit = tty->buf.tail->used;
-	}
 	spin_unlock_irqrestore(&tty->buf.lock, flags);
 
 	if (tty->low_latency)
diff --git a/include/linux/kbd_kern.h b/include/linux/kbd_kern.h
index 4eb851ece080..efe0ee4cc80b 100644
--- a/include/linux/kbd_kern.h
+++ b/include/linux/kbd_kern.h
@@ -155,10 +155,8 @@ static inline void con_schedule_flip(struct tty_struct *t)
 {
 	unsigned long flags;
 	spin_lock_irqsave(&t->buf.lock, flags);
-	if (t->buf.tail != NULL) {
-		t->buf.tail->active = 0;
+	if (t->buf.tail != NULL)
 		t->buf.tail->commit = t->buf.tail->used;
-	}
 	spin_unlock_irqrestore(&t->buf.lock, flags);
 	schedule_work(&t->buf.work);
 }
diff --git a/include/linux/tty.h b/include/linux/tty.h
index 341cc4552c00..b3b807e4b050 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -57,7 +57,6 @@ struct tty_buffer {
 	unsigned char *flag_buf_ptr;
 	int used;
 	int size;
-	int active;
 	int commit;
 	int read;
 	/* Data points here */
diff --git a/include/linux/tty_flip.h b/include/linux/tty_flip.h
index 31548303ee37..eb677cf56106 100644
--- a/include/linux/tty_flip.h
+++ b/include/linux/tty_flip.h
@@ -12,7 +12,7 @@ static inline int tty_insert_flip_char(struct tty_struct *tty,
 					unsigned char ch, char flag)
 {
 	struct tty_buffer *tb = tty->buf.tail;
-	if (tb && tb->active && tb->used < tb->size) {
+	if (tb && tb->used < tb->size) {
 		tb->flag_buf_ptr[tb->used] = flag;
 		tb->char_buf_ptr[tb->used++] = ch;
 		return 1;
-- 
cgit v1.2.3


From be6990e7473fcd11becda747c24b94a478413245 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@xenotime.net>
Date: Wed, 28 Jun 2006 04:26:57 -0700
Subject: [PATCH] ac97_codec: make bitfield unsigned

Make a 1-bit bitfield unsigned (no space for sign bit).
Removes 24 sparse warnings from this one file:
include/linux/ac97_codec.h:262:13: error: dubious one-bit signed bitfield

Signed-off-by: Randy Dunlap <rdunlap@xenotime.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/ac97_codec.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/ac97_codec.h b/include/linux/ac97_codec.h
index c35833824e11..2ed2fd855133 100644
--- a/include/linux/ac97_codec.h
+++ b/include/linux/ac97_codec.h
@@ -259,7 +259,7 @@ struct ac97_codec {
 	int type;
 	u32 model;
 
-	int modem:1;
+	unsigned int modem:1;
 
 	struct ac97_ops *codec_ops;
 
-- 
cgit v1.2.3


From da574af755bcb1d604e01feadf2a8c31b364447c Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@lxorguk.ukuu.org.uk>
Date: Wed, 28 Jun 2006 04:27:01 -0700
Subject: [PATCH] ide: fix error handling for drives which clear the FIFO on
 error

If the controller FIFO cleared automatically on error we must not try
and drain it as this will hang some chips.

Based in concept on a broken patch from -mm some while back

Signed-off-by: Alan Cox <alan@redhat.com>
Cc: Sergei Shtylyov <sshtylyov@ru.mvista.com>
Cc: Bartlomiej Zolnierkiewicz <B.Zolnierkiewicz@elka.pw.edu.pl>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/ide/ide-io.c | 2 +-
 include/linux/ide.h  | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c
index 935cb2583770..26ceab1e90bb 100644
--- a/drivers/ide/ide-io.c
+++ b/drivers/ide/ide-io.c
@@ -505,7 +505,7 @@ static ide_startstop_t ide_ata_error(ide_drive_t *drive, struct request *rq, u8
 		}
 	}
 
-	if ((stat & DRQ_STAT) && rq_data_dir(rq) == READ)
+	if ((stat & DRQ_STAT) && rq_data_dir(rq) == READ && hwif->err_stops_fifo == 0)
 		try_to_flush_leftover_data(drive);
 
 	if (hwif->INB(IDE_STATUS_REG) & (BUSY_STAT|DRQ_STAT))
diff --git a/include/linux/ide.h b/include/linux/ide.h
index ef7bef207f48..0c100168c0cf 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -793,6 +793,7 @@ typedef struct hwif_s {
 	unsigned	auto_poll  : 1; /* supports nop auto-poll */
 	unsigned	sg_mapped  : 1;	/* sg_table and sg_nents are ready */
 	unsigned	no_io_32bit : 1; /* 1 = can not do 32-bit IO ops */
+	unsigned	err_stops_fifo : 1; /* 1=data FIFO is cleared by an error */
 
 	struct device	gendev;
 	struct completion gendev_rel_comp; /* To deal with device release() */
-- 
cgit v1.2.3


From 980a01c9bfb090cb8a991e39e56ac379c30c61b8 Mon Sep 17 00:00:00 2001
From: David Brownell <david-b@pacbell.net>
Date: Wed, 28 Jun 2006 07:47:15 -0700
Subject: [PATCH] SPI: infrastructure to initialize spi_device.mode early

This patch adds earlier initialization of spi_device.mode, as needed
on boards using nondefault chipselect polarity.  An example would be
ones using the RS5C348 RTC without an external signal inverter between
the RTC chipselect and the SPI controller.

Without this mechanism, the first setup() call for that chip would
wrongly enable chips, corrupting transfers to/from other chips sharing
that SPI bus.

Signed-off-by: David Brownell <dbrownell@users.sourceforge.net>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/spi/spi.c       | 1 +
 include/linux/spi/spi.h | 6 +++++-
 2 files changed, 6 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c
index 1cea4a6799fe..ed1cdf6ac8f3 100644
--- a/drivers/spi/spi.c
+++ b/drivers/spi/spi.c
@@ -210,6 +210,7 @@ spi_new_device(struct spi_master *master, struct spi_board_info *chip)
 	proxy->master = master;
 	proxy->chip_select = chip->chip_select;
 	proxy->max_speed_hz = chip->max_speed_hz;
+	proxy->mode = chip->mode;
 	proxy->irq = chip->irq;
 	proxy->modalias = chip->modalias;
 
diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h
index e928c0dcc297..c8bb68099eb9 100644
--- a/include/linux/spi/spi.h
+++ b/include/linux/spi/spi.h
@@ -642,10 +642,14 @@ struct spi_board_info {
 	u16		bus_num;
 	u16		chip_select;
 
+	/* mode becomes spi_device.mode, and is essential for chips
+	 * where the default of SPI_CS_HIGH = 0 is wrong.
+	 */
+	u8		mode;
+
 	/* ... may need additional spi_device chip config data here.
 	 * avoid stuff protocol drivers can set; but include stuff
 	 * needed to behave without being bound to a driver:
-	 *  - chipselect polarity
 	 *  - quirks like clock rate mattering when not selected
 	 */
 };
-- 
cgit v1.2.3


From b44597906e03d5e2b467c17a3b73585596c0d7be Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 28 Jun 2006 17:14:07 +0200
Subject: [PATCH] Fix plist include dependency

plist.h uses container_of, which is defined in kernel.h.
Include kernel.h in plist.h as the kernel.h include does not longer
happen automatically on all architectures.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/plist.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/plist.h b/include/linux/plist.h
index 3404faef542c..b95818a037ad 100644
--- a/include/linux/plist.h
+++ b/include/linux/plist.h
@@ -73,6 +73,7 @@
 #ifndef _LINUX_PLIST_H_
 #define _LINUX_PLIST_H_
 
+#include <linux/kernel.h>
 #include <linux/list.h>
 #include <linux/spinlock_types.h>
 
-- 
cgit v1.2.3


From 4e54f08543d05e519e601368571cc3787fefae96 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 29 Jun 2006 02:24:28 -0700
Subject: [PATCH] Keys: Allow in-kernel key requestor to pass auxiliary data to
 upcaller

The proposed NFS key type uses its own method of passing key requests to
userspace (upcalling) rather than invoking /sbin/request-key.  This is
because the responsible userspace daemon should already be running and will
be contacted through rpc_pipefs.

This patch permits the NFS filesystem to pass auxiliary data to the upcall
operation (struct key_type::request_key) so that the upcaller can use a
pre-existing communications channel more easily.

Signed-off-by: David Howells <dhowells@redhat.com>
Acked-By: Kevin Coffman <kwc@citi.umich.edu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/keys-request-key.txt | 54 +++++++++++++++++++++++++-------------
 Documentation/keys.txt             | 29 ++++++++++++++++++++
 include/linux/key.h                |  8 +++++-
 security/keys/internal.h           |  1 +
 security/keys/keyctl.c             |  2 +-
 security/keys/request_key.c        | 44 ++++++++++++++++++++++++-------
 6 files changed, 108 insertions(+), 30 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/keys-request-key.txt b/Documentation/keys-request-key.txt
index 22488d791168..c1f64fdf84cb 100644
--- a/Documentation/keys-request-key.txt
+++ b/Documentation/keys-request-key.txt
@@ -3,16 +3,23 @@
 			      ===================
 
 The key request service is part of the key retention service (refer to
-Documentation/keys.txt). This document explains more fully how that the
-requesting algorithm works.
+Documentation/keys.txt).  This document explains more fully how the requesting
+algorithm works.
 
 The process starts by either the kernel requesting a service by calling
-request_key():
+request_key*():
 
 	struct key *request_key(const struct key_type *type,
 				const char *description,
 				const char *callout_string);
 
+or:
+
+	struct key *request_key_with_auxdata(const struct key_type *type,
+					     const char *description,
+					     const char *callout_string,
+					     void *aux);
+
 Or by userspace invoking the request_key system call:
 
 	key_serial_t request_key(const char *type,
@@ -20,16 +27,26 @@ Or by userspace invoking the request_key system call:
 				 const char *callout_info,
 				 key_serial_t dest_keyring);
 
-The main difference between the two access points is that the in-kernel
-interface does not need to link the key to a keyring to prevent it from being
-immediately destroyed. The kernel interface returns a pointer directly to the
-key, and it's up to the caller to destroy the key.
+The main difference between the access points is that the in-kernel interface
+does not need to link the key to a keyring to prevent it from being immediately
+destroyed.  The kernel interface returns a pointer directly to the key, and
+it's up to the caller to destroy the key.
+
+The request_key_with_auxdata() call is like the in-kernel request_key() call,
+except that it permits auxiliary data to be passed to the upcaller (the default
+is NULL).  This is only useful for those key types that define their own upcall
+mechanism rather than using /sbin/request-key.
 
 The userspace interface links the key to a keyring associated with the process
 to prevent the key from going away, and returns the serial number of the key to
 the caller.
 
 
+The following example assumes that the key types involved don't define their
+own upcall mechanisms.  If they do, then those should be substituted for the
+forking and execution of /sbin/request-key.
+
+
 ===========
 THE PROCESS
 ===========
@@ -40,8 +57,8 @@ A request proceeds in the following manner:
      interface].
 
  (2) request_key() searches the process's subscribed keyrings to see if there's
-     a suitable key there. If there is, it returns the key. If there isn't, and
-     callout_info is not set, an error is returned. Otherwise the process
+     a suitable key there.  If there is, it returns the key.  If there isn't,
+     and callout_info is not set, an error is returned.  Otherwise the process
      proceeds to the next step.
 
  (3) request_key() sees that A doesn't have the desired key yet, so it creates
@@ -62,7 +79,7 @@ A request proceeds in the following manner:
      instantiation.
 
  (7) The program may want to access another key from A's context (say a
-     Kerberos TGT key). It just requests the appropriate key, and the keyring
+     Kerberos TGT key).  It just requests the appropriate key, and the keyring
      search notes that the session keyring has auth key V in its bottom level.
 
      This will permit it to then search the keyrings of process A with the
@@ -79,10 +96,11 @@ A request proceeds in the following manner:
 (10) The program then exits 0 and request_key() deletes key V and returns key
      U to the caller.
 
-This also extends further. If key W (step 7 above) didn't exist, key W would be
-created uninstantiated, another auth key (X) would be created (as per step 3)
-and another copy of /sbin/request-key spawned (as per step 4); but the context
-specified by auth key X will still be process A, as it was in auth key V.
+This also extends further.  If key W (step 7 above) didn't exist, key W would
+be created uninstantiated, another auth key (X) would be created (as per step
+3) and another copy of /sbin/request-key spawned (as per step 4); but the
+context specified by auth key X will still be process A, as it was in auth key
+V.
 
 This is because process A's keyrings can't simply be attached to
 /sbin/request-key at the appropriate places because (a) execve will discard two
@@ -118,17 +136,17 @@ A search of any particular keyring proceeds in the following fashion:
 
  (2) It considers all the non-keyring keys within that keyring and, if any key
      matches the criteria specified, calls key_permission(SEARCH) on it to see
-     if the key is allowed to be found. If it is, that key is returned; if
+     if the key is allowed to be found.  If it is, that key is returned; if
      not, the search continues, and the error code is retained if of higher
      priority than the one currently set.
 
  (3) It then considers all the keyring-type keys in the keyring it's currently
-     searching. It calls key_permission(SEARCH) on each keyring, and if this
+     searching.  It calls key_permission(SEARCH) on each keyring, and if this
      grants permission, it recurses, executing steps (2) and (3) on that
      keyring.
 
 The process stops immediately a valid key is found with permission granted to
-use it. Any error from a previous match attempt is discarded and the key is
+use it.  Any error from a previous match attempt is discarded and the key is
 returned.
 
 When search_process_keyrings() is invoked, it performs the following searches
@@ -153,7 +171,7 @@ The moment one succeeds, all pending errors are discarded and the found key is
 returned.
 
 Only if all these fail does the whole thing fail with the highest priority
-error. Note that several errors may have come from LSM.
+error.  Note that several errors may have come from LSM.
 
 The error priority is:
 
diff --git a/Documentation/keys.txt b/Documentation/keys.txt
index 61c0fad2fe2f..e373f0212843 100644
--- a/Documentation/keys.txt
+++ b/Documentation/keys.txt
@@ -780,6 +780,17 @@ payload contents" for more information.
     See also Documentation/keys-request-key.txt.
 
 
+(*) To search for a key, passing auxiliary data to the upcaller, call:
+
+	struct key *request_key_with_auxdata(const struct key_type *type,
+					     const char *description,
+					     const char *callout_string,
+					     void *aux);
+
+    This is identical to request_key(), except that the auxiliary data is
+    passed to the key_type->request_key() op if it exists.
+
+
 (*) When it is no longer required, the key should be released using:
 
 	void key_put(struct key *key);
@@ -1031,6 +1042,24 @@ The structure has a number of fields, some of which are mandatory:
      as might happen when the userspace buffer is accessed.
 
 
+ (*) int (*request_key)(struct key *key, struct key *authkey, const char *op,
+			void *aux);
+
+     This method is optional.  If provided, request_key() and
+     request_key_with_auxdata() will invoke this function rather than
+     upcalling to /sbin/request-key to operate upon a key of this type.
+
+     The aux parameter is as passed to request_key_with_auxdata() or is NULL
+     otherwise.  Also passed are the key to be operated upon, the
+     authorisation key for this operation and the operation type (currently
+     only "create").
+
+     This function should return only when the upcall is complete.  Upon return
+     the authorisation key will be revoked, and the target key will be
+     negatively instantiated if it is still uninstantiated.  The error will be
+     returned to the caller of request_key*().
+
+
 ============================
 REQUEST-KEY CALLBACK SERVICE
 ============================
diff --git a/include/linux/key.h b/include/linux/key.h
index e693e729bc92..169f05e4863e 100644
--- a/include/linux/key.h
+++ b/include/linux/key.h
@@ -177,7 +177,8 @@ struct key {
 /*
  * kernel managed key type definition
  */
-typedef int (*request_key_actor_t)(struct key *key, struct key *authkey, const char *op);
+typedef int (*request_key_actor_t)(struct key *key, struct key *authkey,
+				   const char *op, void *aux);
 
 struct key_type {
 	/* name of the type */
@@ -285,6 +286,11 @@ extern struct key *request_key(struct key_type *type,
 			       const char *description,
 			       const char *callout_info);
 
+extern struct key *request_key_with_auxdata(struct key_type *type,
+					    const char *description,
+					    const char *callout_info,
+					    void *aux);
+
 extern int key_validate(struct key *key);
 
 extern key_ref_t key_create_or_update(key_ref_t keyring,
diff --git a/security/keys/internal.h b/security/keys/internal.h
index 3c2877f0663e..1bb416f4bbce 100644
--- a/security/keys/internal.h
+++ b/security/keys/internal.h
@@ -99,6 +99,7 @@ extern int install_process_keyring(struct task_struct *tsk);
 extern struct key *request_key_and_link(struct key_type *type,
 					const char *description,
 					const char *callout_info,
+					void *aux,
 					struct key *dest_keyring,
 					unsigned long flags);
 
diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c
index 329411cf8768..d9ca15c109cc 100644
--- a/security/keys/keyctl.c
+++ b/security/keys/keyctl.c
@@ -183,7 +183,7 @@ asmlinkage long sys_request_key(const char __user *_type,
 	}
 
 	/* do the search */
-	key = request_key_and_link(ktype, description, callout_info,
+	key = request_key_and_link(ktype, description, callout_info, NULL,
 				   key_ref_to_ptr(dest_ref),
 				   KEY_ALLOC_IN_QUOTA);
 	if (IS_ERR(key)) {
diff --git a/security/keys/request_key.c b/security/keys/request_key.c
index 58d1efd4fc2c..f573ac189a0a 100644
--- a/security/keys/request_key.c
+++ b/security/keys/request_key.c
@@ -1,6 +1,6 @@
 /* request_key.c: request a key from userspace
  *
- * Copyright (C) 2004-5 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2004-6 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
  *
  * This program is free software; you can redistribute it and/or
@@ -33,7 +33,8 @@ DECLARE_WAIT_QUEUE_HEAD(request_key_conswq);
  */
 static int call_sbin_request_key(struct key *key,
 				 struct key *authkey,
-				 const char *op)
+				 const char *op,
+				 void *aux)
 {
 	struct task_struct *tsk = current;
 	key_serial_t prkey, sskey;
@@ -127,6 +128,7 @@ error_alloc:
 static struct key *__request_key_construction(struct key_type *type,
 					      const char *description,
 					      const char *callout_info,
+					      void *aux,
 					      unsigned long flags)
 {
 	request_key_actor_t actor;
@@ -164,7 +166,7 @@ static struct key *__request_key_construction(struct key_type *type,
 	actor = call_sbin_request_key;
 	if (type->request_key)
 		actor = type->request_key;
-	ret = actor(key, authkey, "create");
+	ret = actor(key, authkey, "create", aux);
 	if (ret < 0)
 		goto request_failed;
 
@@ -258,8 +260,9 @@ alloc_failed:
  */
 static struct key *request_key_construction(struct key_type *type,
 					    const char *description,
-					    struct key_user *user,
 					    const char *callout_info,
+					    void *aux,
+					    struct key_user *user,
 					    unsigned long flags)
 {
 	struct key_construction *pcons;
@@ -284,7 +287,7 @@ static struct key *request_key_construction(struct key_type *type,
 	}
 
 	/* see about getting userspace to construct the key */
-	key = __request_key_construction(type, description, callout_info,
+	key = __request_key_construction(type, description, callout_info, aux,
 					 flags);
  error:
 	kleave(" = %p", key);
@@ -392,6 +395,7 @@ static void request_key_link(struct key *key, struct key *dest_keyring)
 struct key *request_key_and_link(struct key_type *type,
 				 const char *description,
 				 const char *callout_info,
+				 void *aux,
 				 struct key *dest_keyring,
 				 unsigned long flags)
 {
@@ -399,8 +403,9 @@ struct key *request_key_and_link(struct key_type *type,
 	struct key *key;
 	key_ref_t key_ref;
 
-	kenter("%s,%s,%s,%p,%lx",
-	       type->name, description, callout_info, dest_keyring, flags);
+	kenter("%s,%s,%s,%p,%p,%lx",
+	       type->name, description, callout_info, aux,
+	       dest_keyring, flags);
 
 	/* search all the process keyrings for a key */
 	key_ref = search_process_keyrings(type, description, type->match,
@@ -433,8 +438,8 @@ struct key *request_key_and_link(struct key_type *type,
 			/* ask userspace (returns NULL if it waited on a key
 			 * being constructed) */
 			key = request_key_construction(type, description,
-						       user, callout_info,
-						       flags);
+						       callout_info, aux,
+						       user, flags);
 			if (key)
 				break;
 
@@ -491,8 +496,27 @@ struct key *request_key(struct key_type *type,
 			const char *callout_info)
 {
 	return request_key_and_link(type, description, callout_info, NULL,
-				    KEY_ALLOC_IN_QUOTA);
+				    NULL, KEY_ALLOC_IN_QUOTA);
 
 } /* end request_key() */
 
 EXPORT_SYMBOL(request_key);
+
+/*****************************************************************************/
+/*
+ * request a key with auxiliary data for the upcaller
+ * - search the process's keyrings
+ * - check the list of keys being created or updated
+ * - call out to userspace for a key if supplementary info was provided
+ */
+struct key *request_key_with_auxdata(struct key_type *type,
+				     const char *description,
+				     const char *callout_info,
+				     void *aux)
+{
+	return request_key_and_link(type, description, callout_info, aux,
+				    NULL, KEY_ALLOC_IN_QUOTA);
+
+} /* end request_key_with_auxdata() */
+
+EXPORT_SYMBOL(request_key_with_auxdata);
-- 
cgit v1.2.3


From 9dc3885dfbebc76f4461b19e1af15e704ff4fcb0 Mon Sep 17 00:00:00 2001
From: Karsten Keil <kkeil@suse.de>
Date: Thu, 29 Jun 2006 02:24:33 -0700
Subject: [PATCH] i4l: remove unneeded include/linux/isdn/tpam.h

The TPAM isdn driver was removed in 2.6.12, but include/linux/isdn/tpam.h
was missed.

Signed-off-by: Karsten Keil <kkeil@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/isdn/tpam.h | 55 -----------------------------------------------
 1 file changed, 55 deletions(-)
 delete mode 100644 include/linux/isdn/tpam.h

(limited to 'include/linux')

diff --git a/include/linux/isdn/tpam.h b/include/linux/isdn/tpam.h
deleted file mode 100644
index d18dd0dc570d..000000000000
--- a/include/linux/isdn/tpam.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/* $Id: tpam.h,v 1.1.2.1 2001/06/08 08:23:46 kai Exp $
- *
- * Turbo PAM ISDN driver for Linux. (Kernel Driver)
- *
- * Copyright 2001 Stelian Pop <stelian.pop@fr.alcove.com>, Alc�ve
- *
- * For all support questions please contact: <support@auvertech.fr>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2, or (at your option)
- * any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- */
-
-#ifndef _TPAM_H_
-#define _TPAM_H_
-
-#include <linux/types.h>
-
-/* IOCTL commands */
-#define TPAM_CMD_DSPLOAD	0x0001
-#define TPAM_CMD_DSPSAVE	0x0002
-#define TPAM_CMD_DSPRUN		0x0003
-#define TPAM_CMD_LOOPMODEON	0x0004
-#define TPAM_CMD_LOOPMODEOFF	0x0005
-
-/* addresses of debug information zones on board */
-#define TPAM_TRAPAUDIT_REGISTER		0x005493e4
-#define TPAM_NCOAUDIT_REGISTER		0x00500000
-#define TPAM_MSGAUDIT_REGISTER		0x008E30F0
-
-/* length of debug information zones on board */
-#define TPAM_TRAPAUDIT_LENGTH		10000
-#define TPAM_NCOAUDIT_LENGTH		300000
-#define TPAM_NCOAUDIT_COUNT		30
-#define TPAM_MSGAUDIT_LENGTH		60000
-
-/* IOCTL load/save parameter */
-typedef struct tpam_dsp_ioctl {
-	__u32 address;	/* address to load/save data */
-	__u32 data_len;	/* size of data to be loaded/saved */
-	__u8 data[0];	/* data */
-} tpam_dsp_ioctl;
-
-#endif /* _TPAM_H_ */
-- 
cgit v1.2.3


From d1bef4ed5faf7d9872337b33c4269e45ae1bf960 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 29 Jun 2006 02:24:36 -0700
Subject: [PATCH] genirq: rename desc->handler to desc->chip

This patch-queue improves the generic IRQ layer to be truly generic, by adding
various abstractions and features to it, without impacting existing
functionality.

While the queue can be best described as "fix and improve everything in the
generic IRQ layer that we could think of", and thus it consists of many
smaller features and lots of cleanups, the one feature that stands out most is
the new 'irq chip' abstraction.

The irq-chip abstraction is about describing and coding and IRQ controller
driver by mapping its raw hardware capabilities [and quirks, if needed] in a
straightforward way, without having to think about "IRQ flow"
(level/edge/etc.) type of details.

This stands in contrast with the current 'irq-type' model of genirq
architectures, which 'mixes' raw hardware capabilities with 'flow' details.
The patchset supports both types of irq controller designs at once, and
converts i386 and x86_64 to the new irq-chip design.

As a bonus side-effect of the irq-chip approach, chained interrupt controllers
(master/slave PIC constructs, etc.) are now supported by design as well.

The end result of this patchset intends to be simpler architecture-level code
and more consolidation between architectures.

We reused many bits of code and many concepts from Russell King's ARM IRQ
layer, the merging of which was one of the motivations for this patchset.

This patch:

rename desc->handler to desc->chip.

Originally i did not want to do this, because it's a big patch.  But having
both "desc->handler", "desc->handle_irq" and "action->handler" caused a
large degree of confusion and made the code appear alot less clean than it
truly is.

I have also attempted a dual approach as well by introducing a
desc->chip alias - but that just wasnt robust enough and broke
frequently.

So lets get over with this quickly.  The conversion was done automatically
via scripts and converts all the code in the kernel.

This renaming patch is the first one amongst the patches, so that the
remaining patches can stay flexible and can be merged and split up
without having some big monolithic patch act as a merge barrier.

[akpm@osdl.org: build fix]
[akpm@osdl.org: another build fix]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/alpha/kernel/irq.c                            |  6 ++---
 arch/alpha/kernel/irq_alpha.c                      |  2 +-
 arch/alpha/kernel/irq_i8259.c                      |  2 +-
 arch/alpha/kernel/irq_pyxis.c                      |  2 +-
 arch/alpha/kernel/irq_srm.c                        |  2 +-
 arch/alpha/kernel/sys_alcor.c                      |  2 +-
 arch/alpha/kernel/sys_cabriolet.c                  |  2 +-
 arch/alpha/kernel/sys_dp264.c                      |  2 +-
 arch/alpha/kernel/sys_eb64p.c                      |  2 +-
 arch/alpha/kernel/sys_eiger.c                      |  2 +-
 arch/alpha/kernel/sys_jensen.c                     | 10 ++++----
 arch/alpha/kernel/sys_marvel.c                     |  6 ++---
 arch/alpha/kernel/sys_mikasa.c                     |  2 +-
 arch/alpha/kernel/sys_noritake.c                   |  2 +-
 arch/alpha/kernel/sys_rawhide.c                    |  2 +-
 arch/alpha/kernel/sys_rx164.c                      |  2 +-
 arch/alpha/kernel/sys_sable.c                      |  2 +-
 arch/alpha/kernel/sys_takara.c                     |  2 +-
 arch/alpha/kernel/sys_titan.c                      |  2 +-
 arch/alpha/kernel/sys_wildfire.c                   |  6 ++---
 arch/cris/arch-v10/kernel/irq.c                    |  2 +-
 arch/cris/arch-v32/kernel/irq.c                    |  2 +-
 arch/cris/kernel/irq.c                             |  2 +-
 arch/i386/kernel/i8259.c                           |  6 ++---
 arch/i386/kernel/io_apic.c                         | 16 +++++++------
 arch/i386/kernel/irq.c                             |  6 ++---
 arch/i386/mach-visws/visws_apic.c                  | 12 +++++-----
 arch/i386/mach-voyager/voyager_smp.c               |  2 +-
 arch/ia64/kernel/iosapic.c                         | 10 ++++----
 arch/ia64/kernel/irq.c                             | 18 +++++++-------
 arch/ia64/kernel/irq_ia64.c                        |  2 +-
 arch/ia64/kernel/smpboot.c                         |  6 ++---
 arch/ia64/sn/kernel/irq.c                          |  4 ++--
 arch/m32r/kernel/irq.c                             |  2 +-
 arch/m32r/kernel/setup_m32104ut.c                  |  8 +++----
 arch/m32r/kernel/setup_m32700ut.c                  | 28 +++++++++++-----------
 arch/m32r/kernel/setup_mappi.c                     | 16 ++++++-------
 arch/m32r/kernel/setup_mappi2.c                    | 20 ++++++++--------
 arch/m32r/kernel/setup_mappi3.c                    | 20 ++++++++--------
 arch/m32r/kernel/setup_oaks32r.c                   | 12 +++++-----
 arch/m32r/kernel/setup_opsput.c                    | 28 +++++++++++-----------
 arch/m32r/kernel/setup_usrv.c                      | 18 +++++++-------
 arch/mips/au1000/common/irq.c                      | 20 ++++++++--------
 arch/mips/au1000/pb1200/irqmap.c                   |  2 +-
 arch/mips/ddb5xxx/ddb5477/irq_5477.c               |  2 +-
 arch/mips/dec/ioasic-irq.c                         |  4 ++--
 arch/mips/dec/kn02-irq.c                           |  2 +-
 arch/mips/gt64120/ev64120/irq.c                    |  2 +-
 arch/mips/ite-boards/generic/irq.c                 |  4 ++--
 arch/mips/jazz/irq.c                               |  2 +-
 arch/mips/jmr3927/rbhma3100/irq.c                  |  2 +-
 arch/mips/kernel/i8259.c                           |  4 ++--
 arch/mips/kernel/irq-msc01.c                       |  4 ++--
 arch/mips/kernel/irq-mv6434x.c                     |  2 +-
 arch/mips/kernel/irq-rm7000.c                      |  2 +-
 arch/mips/kernel/irq-rm9000.c                      |  4 ++--
 arch/mips/kernel/irq.c                             |  4 ++--
 arch/mips/kernel/irq_cpu.c                         |  4 ++--
 arch/mips/lasat/interrupt.c                        |  2 +-
 arch/mips/mips-boards/atlas/atlas_int.c            |  2 +-
 arch/mips/momentum/ocelot_c/cpci-irq.c             |  2 +-
 arch/mips/momentum/ocelot_c/uart-irq.c             |  4 ++--
 arch/mips/philips/pnx8550/common/int.c             | 10 ++++----
 arch/mips/sgi-ip22/ip22-eisa.c                     |  4 ++--
 arch/mips/sgi-ip22/ip22-int.c                      |  2 +-
 arch/mips/sgi-ip27/ip27-irq.c                      |  2 +-
 arch/mips/sgi-ip32/ip32-irq.c                      |  2 +-
 arch/mips/sibyte/bcm1480/irq.c                     |  4 ++--
 arch/mips/sibyte/sb1250/irq.c                      |  4 ++--
 arch/mips/sni/irq.c                                |  2 +-
 arch/mips/tx4927/common/tx4927_irq.c               |  4 ++--
 .../tx4927/toshiba_rbtx4927/toshiba_rbtx4927_irq.c | 14 +++++------
 arch/mips/tx4938/common/irq.c                      |  4 ++--
 arch/mips/tx4938/toshiba_rbtx4938/irq.c            |  2 +-
 arch/mips/vr41xx/common/icu.c                      |  4 ++--
 arch/mips/vr41xx/common/irq.c                      |  4 ++--
 arch/mips/vr41xx/common/vrc4173.c                  |  2 +-
 arch/mips/vr41xx/nec-cmbvr4133/irq.c               |  2 +-
 arch/parisc/kernel/irq.c                           | 10 ++++----
 arch/powerpc/kernel/crash.c                        |  4 ++--
 arch/powerpc/kernel/irq.c                          |  8 +++----
 arch/powerpc/platforms/cell/interrupt.c            |  4 ++--
 arch/powerpc/platforms/cell/spider-pic.c           |  4 ++--
 arch/powerpc/platforms/iseries/irq.c               |  6 ++---
 arch/powerpc/platforms/powermac/pic.c              |  4 ++--
 arch/powerpc/platforms/pseries/xics.c              |  8 +++----
 arch/powerpc/sysdev/i8259.c                        |  2 +-
 arch/powerpc/sysdev/ipic.c                         |  2 +-
 arch/powerpc/sysdev/mpic.c                         |  8 +++----
 arch/ppc/8xx_io/commproc.c                         |  2 +-
 arch/ppc/platforms/apus_setup.c                    |  4 ++--
 arch/ppc/platforms/sbc82xx.c                       |  2 +-
 arch/ppc/syslib/cpc700_pic.c                       |  4 ++--
 arch/ppc/syslib/cpm2_pic.c                         |  2 +-
 arch/ppc/syslib/gt64260_pic.c                      |  2 +-
 arch/ppc/syslib/m82xx_pci.c                        |  2 +-
 arch/ppc/syslib/m8xx_setup.c                       |  4 ++--
 arch/ppc/syslib/mpc52xx_pic.c                      |  4 ++--
 arch/ppc/syslib/mv64360_pic.c                      |  2 +-
 arch/ppc/syslib/open_pic.c                         |  4 ++--
 arch/ppc/syslib/open_pic2.c                        |  2 +-
 arch/ppc/syslib/ppc403_pic.c                       |  2 +-
 arch/ppc/syslib/ppc4xx_pic.c                       |  2 +-
 arch/ppc/syslib/xilinx_pic.c                       |  2 +-
 arch/sh/boards/adx/irq_maskreg.c                   |  2 +-
 arch/sh/boards/bigsur/irq.c                        |  4 ++--
 arch/sh/boards/cqreek/irq.c                        |  4 ++--
 arch/sh/boards/dreamcast/setup.c                   |  2 +-
 arch/sh/boards/ec3104/setup.c                      |  2 +-
 arch/sh/boards/harp/irq.c                          |  2 +-
 arch/sh/boards/mpc1211/setup.c                     |  2 +-
 arch/sh/boards/overdrive/irq.c                     |  2 +-
 arch/sh/boards/renesas/hs7751rvoip/irq.c           |  2 +-
 arch/sh/boards/renesas/rts7751r2d/irq.c            |  2 +-
 arch/sh/boards/renesas/systemh/irq.c               |  2 +-
 arch/sh/boards/se/73180/irq.c                      |  2 +-
 arch/sh/boards/superh/microdev/irq.c               |  2 +-
 arch/sh/cchips/hd6446x/hd64461/setup.c             |  2 +-
 arch/sh/cchips/hd6446x/hd64465/setup.c             |  2 +-
 arch/sh/cchips/voyagergx/irq.c                     |  2 +-
 arch/sh/kernel/cpu/irq/imask.c                     |  2 +-
 arch/sh/kernel/cpu/irq/intc2.c                     |  2 +-
 arch/sh/kernel/cpu/irq/ipr.c                       |  2 +-
 arch/sh/kernel/cpu/irq/pint.c                      |  2 +-
 arch/sh/kernel/irq.c                               |  2 +-
 arch/sh64/kernel/irq.c                             |  2 +-
 arch/sh64/kernel/irq_intc.c                        |  4 ++--
 arch/sh64/mach-cayman/irq.c                        |  2 +-
 arch/sparc64/kernel/irq.c                          | 10 ++++----
 arch/um/kernel/irq.c                               |  6 ++---
 arch/v850/kernel/irq.c                             |  6 ++---
 arch/x86_64/kernel/i8259.c                         |  6 ++---
 arch/x86_64/kernel/io_apic.c                       | 16 +++++++------
 arch/x86_64/kernel/irq.c                           |  6 ++---
 arch/xtensa/kernel/irq.c                           |  4 ++--
 drivers/char/vr41xx_giu.c                          |  4 ++--
 drivers/parisc/dino.c                              |  4 ++--
 drivers/parisc/eisa.c                              |  2 +-
 drivers/parisc/gsc.c                               |  8 +++----
 drivers/parisc/iosapic.c                           |  2 +-
 drivers/parisc/superio.c                           |  2 +-
 drivers/pci/msi.c                                  |  6 ++---
 drivers/pcmcia/hd64465_ss.c                        |  6 ++---
 include/asm-powerpc/hw_irq.h                       | 12 +++++-----
 include/linux/irq.h                                |  4 ++--
 kernel/irq/autoprobe.c                             | 10 ++++----
 kernel/irq/handle.c                                | 14 +++++------
 kernel/irq/manage.c                                | 24 +++++++++----------
 kernel/irq/migration.c                             |  8 +++----
 kernel/irq/proc.c                                  |  6 ++---
 kernel/irq/spurious.c                              |  4 ++--
 151 files changed, 384 insertions(+), 380 deletions(-)

(limited to 'include/linux')

diff --git a/arch/alpha/kernel/irq.c b/arch/alpha/kernel/irq.c
index da677f829f76..ec9d243d42c9 100644
--- a/arch/alpha/kernel/irq.c
+++ b/arch/alpha/kernel/irq.c
@@ -49,7 +49,7 @@ select_smp_affinity(unsigned int irq)
 	static int last_cpu;
 	int cpu = last_cpu + 1;
 
-	if (!irq_desc[irq].handler->set_affinity || irq_user_affinity[irq])
+	if (!irq_desc[irq].chip->set_affinity || irq_user_affinity[irq])
 		return 1;
 
 	while (!cpu_possible(cpu))
@@ -57,7 +57,7 @@ select_smp_affinity(unsigned int irq)
 	last_cpu = cpu;
 
 	irq_affinity[irq] = cpumask_of_cpu(cpu);
-	irq_desc[irq].handler->set_affinity(irq, cpumask_of_cpu(cpu));
+	irq_desc[irq].chip->set_affinity(irq, cpumask_of_cpu(cpu));
 	return 0;
 }
 #endif /* CONFIG_SMP */
@@ -93,7 +93,7 @@ show_interrupts(struct seq_file *p, void *v)
 		for_each_online_cpu(j)
 			seq_printf(p, "%10u ", kstat_cpu(j).irqs[irq]);
 #endif
-		seq_printf(p, " %14s", irq_desc[irq].handler->typename);
+		seq_printf(p, " %14s", irq_desc[irq].chip->typename);
 		seq_printf(p, "  %c%s",
 			(action->flags & SA_INTERRUPT)?'+':' ',
 			action->name);
diff --git a/arch/alpha/kernel/irq_alpha.c b/arch/alpha/kernel/irq_alpha.c
index 9d34ce26e5ef..f20f2dff9c43 100644
--- a/arch/alpha/kernel/irq_alpha.c
+++ b/arch/alpha/kernel/irq_alpha.c
@@ -233,7 +233,7 @@ void __init
 init_rtc_irq(void)
 {
 	irq_desc[RTC_IRQ].status = IRQ_DISABLED;
-	irq_desc[RTC_IRQ].handler = &rtc_irq_type;
+	irq_desc[RTC_IRQ].chip = &rtc_irq_type;
 	setup_irq(RTC_IRQ, &timer_irqaction);
 }
 
diff --git a/arch/alpha/kernel/irq_i8259.c b/arch/alpha/kernel/irq_i8259.c
index b188683b83fd..ac893bd48036 100644
--- a/arch/alpha/kernel/irq_i8259.c
+++ b/arch/alpha/kernel/irq_i8259.c
@@ -109,7 +109,7 @@ init_i8259a_irqs(void)
 
 	for (i = 0; i < 16; i++) {
 		irq_desc[i].status = IRQ_DISABLED;
-		irq_desc[i].handler = &i8259a_irq_type;
+		irq_desc[i].chip = &i8259a_irq_type;
 	}
 
 	setup_irq(2, &cascade);
diff --git a/arch/alpha/kernel/irq_pyxis.c b/arch/alpha/kernel/irq_pyxis.c
index 146a20b9e3d5..3b581415bab0 100644
--- a/arch/alpha/kernel/irq_pyxis.c
+++ b/arch/alpha/kernel/irq_pyxis.c
@@ -120,7 +120,7 @@ init_pyxis_irqs(unsigned long ignore_mask)
 		if ((ignore_mask >> i) & 1)
 			continue;
 		irq_desc[i].status = IRQ_DISABLED | IRQ_LEVEL;
-		irq_desc[i].handler = &pyxis_irq_type;
+		irq_desc[i].chip = &pyxis_irq_type;
 	}
 
 	setup_irq(16+7, &isa_cascade_irqaction);
diff --git a/arch/alpha/kernel/irq_srm.c b/arch/alpha/kernel/irq_srm.c
index 0a87e466918c..8e4d121f84cc 100644
--- a/arch/alpha/kernel/irq_srm.c
+++ b/arch/alpha/kernel/irq_srm.c
@@ -67,7 +67,7 @@ init_srm_irqs(long max, unsigned long ignore_mask)
 		if (i < 64 && ((ignore_mask >> i) & 1))
 			continue;
 		irq_desc[i].status = IRQ_DISABLED | IRQ_LEVEL;
-		irq_desc[i].handler = &srm_irq_type;
+		irq_desc[i].chip = &srm_irq_type;
 	}
 }
 
diff --git a/arch/alpha/kernel/sys_alcor.c b/arch/alpha/kernel/sys_alcor.c
index d7f0e97fe56f..1a1a2c7a3d94 100644
--- a/arch/alpha/kernel/sys_alcor.c
+++ b/arch/alpha/kernel/sys_alcor.c
@@ -144,7 +144,7 @@ alcor_init_irq(void)
 		if (i >= 16+20 && i <= 16+30)
 			continue;
 		irq_desc[i].status = IRQ_DISABLED | IRQ_LEVEL;
-		irq_desc[i].handler = &alcor_irq_type;
+		irq_desc[i].chip = &alcor_irq_type;
 	}
 	i8259a_irq_type.ack = alcor_isa_mask_and_ack_irq;
 
diff --git a/arch/alpha/kernel/sys_cabriolet.c b/arch/alpha/kernel/sys_cabriolet.c
index 8e3374d34c95..8c9e443d93ad 100644
--- a/arch/alpha/kernel/sys_cabriolet.c
+++ b/arch/alpha/kernel/sys_cabriolet.c
@@ -124,7 +124,7 @@ common_init_irq(void (*srm_dev_int)(unsigned long v, struct pt_regs *r))
 
 		for (i = 16; i < 35; ++i) {
 			irq_desc[i].status = IRQ_DISABLED | IRQ_LEVEL;
-			irq_desc[i].handler = &cabriolet_irq_type;
+			irq_desc[i].chip = &cabriolet_irq_type;
 		}
 	}
 
diff --git a/arch/alpha/kernel/sys_dp264.c b/arch/alpha/kernel/sys_dp264.c
index d5da6b1b28ee..b28c8f1c6e10 100644
--- a/arch/alpha/kernel/sys_dp264.c
+++ b/arch/alpha/kernel/sys_dp264.c
@@ -300,7 +300,7 @@ init_tsunami_irqs(struct hw_interrupt_type * ops, int imin, int imax)
 	long i;
 	for (i = imin; i <= imax; ++i) {
 		irq_desc[i].status = IRQ_DISABLED | IRQ_LEVEL;
-		irq_desc[i].handler = ops;
+		irq_desc[i].chip = ops;
 	}
 }
 
diff --git a/arch/alpha/kernel/sys_eb64p.c b/arch/alpha/kernel/sys_eb64p.c
index 61a79c354f0b..aeb8e0277905 100644
--- a/arch/alpha/kernel/sys_eb64p.c
+++ b/arch/alpha/kernel/sys_eb64p.c
@@ -137,7 +137,7 @@ eb64p_init_irq(void)
 
 	for (i = 16; i < 32; ++i) {
 		irq_desc[i].status = IRQ_DISABLED | IRQ_LEVEL;
-		irq_desc[i].handler = &eb64p_irq_type;
+		irq_desc[i].chip = &eb64p_irq_type;
 	}		
 
 	common_init_isa_dma();
diff --git a/arch/alpha/kernel/sys_eiger.c b/arch/alpha/kernel/sys_eiger.c
index bd6e5f0e43c7..64a785baf53a 100644
--- a/arch/alpha/kernel/sys_eiger.c
+++ b/arch/alpha/kernel/sys_eiger.c
@@ -154,7 +154,7 @@ eiger_init_irq(void)
 
 	for (i = 16; i < 128; ++i) {
 		irq_desc[i].status = IRQ_DISABLED | IRQ_LEVEL;
-		irq_desc[i].handler = &eiger_irq_type;
+		irq_desc[i].chip = &eiger_irq_type;
 	}
 }
 
diff --git a/arch/alpha/kernel/sys_jensen.c b/arch/alpha/kernel/sys_jensen.c
index fcabb7c96a16..0148e095638f 100644
--- a/arch/alpha/kernel/sys_jensen.c
+++ b/arch/alpha/kernel/sys_jensen.c
@@ -206,11 +206,11 @@ jensen_init_irq(void)
 {
 	init_i8259a_irqs();
 
-	irq_desc[1].handler = &jensen_local_irq_type;
-	irq_desc[4].handler = &jensen_local_irq_type;
-	irq_desc[3].handler = &jensen_local_irq_type;
-	irq_desc[7].handler = &jensen_local_irq_type;
-	irq_desc[9].handler = &jensen_local_irq_type;
+	irq_desc[1].chip = &jensen_local_irq_type;
+	irq_desc[4].chip = &jensen_local_irq_type;
+	irq_desc[3].chip = &jensen_local_irq_type;
+	irq_desc[7].chip = &jensen_local_irq_type;
+	irq_desc[9].chip = &jensen_local_irq_type;
 
 	common_init_isa_dma();
 }
diff --git a/arch/alpha/kernel/sys_marvel.c b/arch/alpha/kernel/sys_marvel.c
index e32fee505220..36d215954376 100644
--- a/arch/alpha/kernel/sys_marvel.c
+++ b/arch/alpha/kernel/sys_marvel.c
@@ -303,7 +303,7 @@ init_io7_irqs(struct io7 *io7,
 	/* Set up the lsi irqs.  */
 	for (i = 0; i < 128; ++i) {
 		irq_desc[base + i].status = IRQ_DISABLED | IRQ_LEVEL;
-		irq_desc[base + i].handler = lsi_ops;
+		irq_desc[base + i].chip = lsi_ops;
 	}
 
 	/* Disable the implemented irqs in hardware.  */
@@ -317,7 +317,7 @@ init_io7_irqs(struct io7 *io7,
 	/* Set up the msi irqs.  */
 	for (i = 128; i < (128 + 512); ++i) {
 		irq_desc[base + i].status = IRQ_DISABLED | IRQ_LEVEL;
-		irq_desc[base + i].handler = msi_ops;
+		irq_desc[base + i].chip = msi_ops;
 	}
 
 	for (i = 0; i < 16; ++i)
@@ -335,7 +335,7 @@ marvel_init_irq(void)
 	/* Reserve the legacy irqs.  */
 	for (i = 0; i < 16; ++i) {
 		irq_desc[i].status = IRQ_DISABLED;
-		irq_desc[i].handler = &marvel_legacy_irq_type;
+		irq_desc[i].chip = &marvel_legacy_irq_type;
 	}
 
 	/* Init the io7 irqs.  */
diff --git a/arch/alpha/kernel/sys_mikasa.c b/arch/alpha/kernel/sys_mikasa.c
index d78a0daa6168..b741600e3761 100644
--- a/arch/alpha/kernel/sys_mikasa.c
+++ b/arch/alpha/kernel/sys_mikasa.c
@@ -117,7 +117,7 @@ mikasa_init_irq(void)
 
 	for (i = 16; i < 32; ++i) {
 		irq_desc[i].status = IRQ_DISABLED | IRQ_LEVEL;
-		irq_desc[i].handler = &mikasa_irq_type;
+		irq_desc[i].chip = &mikasa_irq_type;
 	}
 
 	init_i8259a_irqs();
diff --git a/arch/alpha/kernel/sys_noritake.c b/arch/alpha/kernel/sys_noritake.c
index 65061f5d7410..55db02d318d7 100644
--- a/arch/alpha/kernel/sys_noritake.c
+++ b/arch/alpha/kernel/sys_noritake.c
@@ -139,7 +139,7 @@ noritake_init_irq(void)
 
 	for (i = 16; i < 48; ++i) {
 		irq_desc[i].status = IRQ_DISABLED | IRQ_LEVEL;
-		irq_desc[i].handler = &noritake_irq_type;
+		irq_desc[i].chip = &noritake_irq_type;
 	}
 
 	init_i8259a_irqs();
diff --git a/arch/alpha/kernel/sys_rawhide.c b/arch/alpha/kernel/sys_rawhide.c
index 05888a02a604..949607e3d6fb 100644
--- a/arch/alpha/kernel/sys_rawhide.c
+++ b/arch/alpha/kernel/sys_rawhide.c
@@ -180,7 +180,7 @@ rawhide_init_irq(void)
 
 	for (i = 16; i < 128; ++i) {
 		irq_desc[i].status = IRQ_DISABLED | IRQ_LEVEL;
-		irq_desc[i].handler = &rawhide_irq_type;
+		irq_desc[i].chip = &rawhide_irq_type;
 	}
 
 	init_i8259a_irqs();
diff --git a/arch/alpha/kernel/sys_rx164.c b/arch/alpha/kernel/sys_rx164.c
index 58404243057b..6ae506052635 100644
--- a/arch/alpha/kernel/sys_rx164.c
+++ b/arch/alpha/kernel/sys_rx164.c
@@ -117,7 +117,7 @@ rx164_init_irq(void)
 	rx164_update_irq_hw(0);
 	for (i = 16; i < 40; ++i) {
 		irq_desc[i].status = IRQ_DISABLED | IRQ_LEVEL;
-		irq_desc[i].handler = &rx164_irq_type;
+		irq_desc[i].chip = &rx164_irq_type;
 	}
 
 	init_i8259a_irqs();
diff --git a/arch/alpha/kernel/sys_sable.c b/arch/alpha/kernel/sys_sable.c
index a7ff84474ace..24dea40c9bfe 100644
--- a/arch/alpha/kernel/sys_sable.c
+++ b/arch/alpha/kernel/sys_sable.c
@@ -537,7 +537,7 @@ sable_lynx_init_irq(int nr_irqs)
 
 	for (i = 0; i < nr_irqs; ++i) {
 		irq_desc[i].status = IRQ_DISABLED | IRQ_LEVEL;
-		irq_desc[i].handler = &sable_lynx_irq_type;
+		irq_desc[i].chip = &sable_lynx_irq_type;
 	}
 
 	common_init_isa_dma();
diff --git a/arch/alpha/kernel/sys_takara.c b/arch/alpha/kernel/sys_takara.c
index 7955bdfc2db0..2c75cd1fd81a 100644
--- a/arch/alpha/kernel/sys_takara.c
+++ b/arch/alpha/kernel/sys_takara.c
@@ -154,7 +154,7 @@ takara_init_irq(void)
 
 	for (i = 16; i < 128; ++i) {
 		irq_desc[i].status = IRQ_DISABLED | IRQ_LEVEL;
-		irq_desc[i].handler = &takara_irq_type;
+		irq_desc[i].chip = &takara_irq_type;
 	}
 
 	common_init_isa_dma();
diff --git a/arch/alpha/kernel/sys_titan.c b/arch/alpha/kernel/sys_titan.c
index 2551fb49ae09..13f3ed8ed7ac 100644
--- a/arch/alpha/kernel/sys_titan.c
+++ b/arch/alpha/kernel/sys_titan.c
@@ -189,7 +189,7 @@ init_titan_irqs(struct hw_interrupt_type * ops, int imin, int imax)
 	long i;
 	for (i = imin; i <= imax; ++i) {
 		irq_desc[i].status = IRQ_DISABLED | IRQ_LEVEL;
-		irq_desc[i].handler = ops;
+		irq_desc[i].chip = ops;
 	}
 }
 
diff --git a/arch/alpha/kernel/sys_wildfire.c b/arch/alpha/kernel/sys_wildfire.c
index 1553f470246e..22c5798fe083 100644
--- a/arch/alpha/kernel/sys_wildfire.c
+++ b/arch/alpha/kernel/sys_wildfire.c
@@ -199,14 +199,14 @@ wildfire_init_irq_per_pca(int qbbno, int pcano)
 		if (i == 2)
 			continue;
 		irq_desc[i+irq_bias].status = IRQ_DISABLED | IRQ_LEVEL;
-		irq_desc[i+irq_bias].handler = &wildfire_irq_type;
+		irq_desc[i+irq_bias].chip = &wildfire_irq_type;
 	}
 
 	irq_desc[36+irq_bias].status = IRQ_DISABLED | IRQ_LEVEL;
-	irq_desc[36+irq_bias].handler = &wildfire_irq_type;
+	irq_desc[36+irq_bias].chip = &wildfire_irq_type;
 	for (i = 40; i < 64; ++i) {
 		irq_desc[i+irq_bias].status = IRQ_DISABLED | IRQ_LEVEL;
-		irq_desc[i+irq_bias].handler = &wildfire_irq_type;
+		irq_desc[i+irq_bias].chip = &wildfire_irq_type;
 	}
 
 	setup_irq(32+irq_bias, &isa_enable);	
diff --git a/arch/cris/arch-v10/kernel/irq.c b/arch/cris/arch-v10/kernel/irq.c
index 4b368a122015..2d5be93b5197 100644
--- a/arch/cris/arch-v10/kernel/irq.c
+++ b/arch/cris/arch-v10/kernel/irq.c
@@ -172,7 +172,7 @@ init_IRQ(void)
 
 	/* Initialize IRQ handler descriptiors. */
 	for(i = 2; i < NR_IRQS; i++) {
-		irq_desc[i].handler = &crisv10_irq_type;
+		irq_desc[i].chip = &crisv10_irq_type;
 		set_int_vector(i, interrupt[i]);
 	}
 
diff --git a/arch/cris/arch-v32/kernel/irq.c b/arch/cris/arch-v32/kernel/irq.c
index c78cc2685133..06260874f018 100644
--- a/arch/cris/arch-v32/kernel/irq.c
+++ b/arch/cris/arch-v32/kernel/irq.c
@@ -369,7 +369,7 @@ init_IRQ(void)
 
 	/* Point all IRQ's to bad handlers. */
 	for (i = FIRST_IRQ, j = 0; j < NR_IRQS; i++, j++) {
-		irq_desc[j].handler = &crisv32_irq_type;
+		irq_desc[j].chip = &crisv32_irq_type;
 		set_exception_vector(i, interrupt[j]);
 	}
 
diff --git a/arch/cris/kernel/irq.c b/arch/cris/kernel/irq.c
index b504def3e346..6547bb646364 100644
--- a/arch/cris/kernel/irq.c
+++ b/arch/cris/kernel/irq.c
@@ -69,7 +69,7 @@ int show_interrupts(struct seq_file *p, void *v)
 		for_each_online_cpu(j)
 			seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
 #endif
-		seq_printf(p, " %14s", irq_desc[i].handler->typename);
+		seq_printf(p, " %14s", irq_desc[i].chip->typename);
 		seq_printf(p, "  %s", action->name);
 
 		for (action=action->next; action; action = action->next)
diff --git a/arch/i386/kernel/i8259.c b/arch/i386/kernel/i8259.c
index c1a42feba286..3c6063671a9f 100644
--- a/arch/i386/kernel/i8259.c
+++ b/arch/i386/kernel/i8259.c
@@ -132,7 +132,7 @@ void make_8259A_irq(unsigned int irq)
 {
 	disable_irq_nosync(irq);
 	io_apic_irqs &= ~(1<<irq);
-	irq_desc[irq].handler = &i8259A_irq_type;
+	irq_desc[irq].chip = &i8259A_irq_type;
 	enable_irq(irq);
 }
 
@@ -386,12 +386,12 @@ void __init init_ISA_irqs (void)
 			/*
 			 * 16 old-style INTA-cycle interrupts:
 			 */
-			irq_desc[i].handler = &i8259A_irq_type;
+			irq_desc[i].chip = &i8259A_irq_type;
 		} else {
 			/*
 			 * 'high' PCI IRQs filled in on demand
 			 */
-			irq_desc[i].handler = &no_irq_type;
+			irq_desc[i].chip = &no_irq_type;
 		}
 	}
 }
diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c
index 72ae414e4d49..4a74b696c6a3 100644
--- a/arch/i386/kernel/io_apic.c
+++ b/arch/i386/kernel/io_apic.c
@@ -1205,15 +1205,17 @@ static struct hw_interrupt_type ioapic_edge_type;
 #define IOAPIC_EDGE	0
 #define IOAPIC_LEVEL	1
 
-static inline void ioapic_register_intr(int irq, int vector, unsigned long trigger)
+static void ioapic_register_intr(int irq, int vector, unsigned long trigger)
 {
-	unsigned idx = use_pci_vector() && !platform_legacy_irq(irq) ? vector : irq;
+	unsigned idx;
+
+	idx = use_pci_vector() && !platform_legacy_irq(irq) ? vector : irq;
 
 	if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
 			trigger == IOAPIC_LEVEL)
-		irq_desc[idx].handler = &ioapic_level_type;
+		irq_desc[idx].chip = &ioapic_level_type;
 	else
-		irq_desc[idx].handler = &ioapic_edge_type;
+		irq_desc[idx].chip = &ioapic_edge_type;
 	set_intr_gate(vector, interrupt[idx]);
 }
 
@@ -1325,7 +1327,7 @@ static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, in
 	 * The timer IRQ doesn't have to know that behind the
 	 * scene we have a 8259A-master in AEOI mode ...
 	 */
-	irq_desc[0].handler = &ioapic_edge_type;
+	irq_desc[0].chip = &ioapic_edge_type;
 
 	/*
 	 * Add it to the IO-APIC irq-routing table:
@@ -2135,7 +2137,7 @@ static inline void init_IO_APIC_traps(void)
 				make_8259A_irq(irq);
 			else
 				/* Strange. Oh, well.. */
-				irq_desc[irq].handler = &no_irq_type;
+				irq_desc[irq].chip = &no_irq_type;
 		}
 	}
 }
@@ -2351,7 +2353,7 @@ static inline void check_timer(void)
 	printk(KERN_INFO "...trying to set up timer as Virtual Wire IRQ...");
 
 	disable_8259A_irq(0);
-	irq_desc[0].handler = &lapic_irq_type;
+	irq_desc[0].chip = &lapic_irq_type;
 	apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector);	/* Fixed mode */
 	enable_8259A_irq(0);
 
diff --git a/arch/i386/kernel/irq.c b/arch/i386/kernel/irq.c
index 9eec9435318e..b942a5918dab 100644
--- a/arch/i386/kernel/irq.c
+++ b/arch/i386/kernel/irq.c
@@ -249,7 +249,7 @@ int show_interrupts(struct seq_file *p, void *v)
 		for_each_online_cpu(j)
 			seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
 #endif
-		seq_printf(p, " %14s", irq_desc[i].handler->typename);
+		seq_printf(p, " %14s", irq_desc[i].chip->typename);
 		seq_printf(p, "  %s", action->name);
 
 		for (action=action->next; action; action = action->next)
@@ -296,8 +296,8 @@ void fixup_irqs(cpumask_t map)
 			printk("Breaking affinity for irq %i\n", irq);
 			mask = map;
 		}
-		if (irq_desc[irq].handler->set_affinity)
-			irq_desc[irq].handler->set_affinity(irq, mask);
+		if (irq_desc[irq].chip->set_affinity)
+			irq_desc[irq].chip->set_affinity(irq, mask);
 		else if (irq_desc[irq].action && !(warned++))
 			printk("Cannot set affinity for irq %i\n", irq);
 	}
diff --git a/arch/i386/mach-visws/visws_apic.c b/arch/i386/mach-visws/visws_apic.c
index 3e64fb721291..c418521dd554 100644
--- a/arch/i386/mach-visws/visws_apic.c
+++ b/arch/i386/mach-visws/visws_apic.c
@@ -278,22 +278,22 @@ void init_VISWS_APIC_irqs(void)
 		irq_desc[i].depth = 1;
 
 		if (i == 0) {
-			irq_desc[i].handler = &cobalt_irq_type;
+			irq_desc[i].chip = &cobalt_irq_type;
 		}
 		else if (i == CO_IRQ_IDE0) {
-			irq_desc[i].handler = &cobalt_irq_type;
+			irq_desc[i].chip = &cobalt_irq_type;
 		}
 		else if (i == CO_IRQ_IDE1) {
-			irq_desc[i].handler = &cobalt_irq_type;
+			irq_desc[i].chip = &cobalt_irq_type;
 		}
 		else if (i == CO_IRQ_8259) {
-			irq_desc[i].handler = &piix4_master_irq_type;
+			irq_desc[i].chip = &piix4_master_irq_type;
 		}
 		else if (i < CO_IRQ_APIC0) {
-			irq_desc[i].handler = &piix4_virtual_irq_type;
+			irq_desc[i].chip = &piix4_virtual_irq_type;
 		}
 		else if (IS_CO_APIC(i)) {
-			irq_desc[i].handler = &cobalt_irq_type;
+			irq_desc[i].chip = &cobalt_irq_type;
 		}
 	}
 
diff --git a/arch/i386/mach-voyager/voyager_smp.c b/arch/i386/mach-voyager/voyager_smp.c
index 8242af9ebc6f..5b8b579a079f 100644
--- a/arch/i386/mach-voyager/voyager_smp.c
+++ b/arch/i386/mach-voyager/voyager_smp.c
@@ -1419,7 +1419,7 @@ smp_intr_init(void)
 	 * This is for later: first 16 correspond to PC IRQs; next 16
 	 * are Primary MC IRQs and final 16 are Secondary MC IRQs */
 	for(i = 0; i < 48; i++)
-		irq_desc[i].handler = &vic_irq_type;
+		irq_desc[i].chip = &vic_irq_type;
 }
 
 /* send a CPI at level cpi to a set of cpus in cpuset (set 1 bit per
diff --git a/arch/ia64/kernel/iosapic.c b/arch/ia64/kernel/iosapic.c
index d58c1c5c903a..abb4cb1c831e 100644
--- a/arch/ia64/kernel/iosapic.c
+++ b/arch/ia64/kernel/iosapic.c
@@ -660,13 +660,13 @@ register_intr (unsigned int gsi, int vector, unsigned char delivery,
 		irq_type = &irq_type_iosapic_level;
 
 	idesc = irq_descp(vector);
-	if (idesc->handler != irq_type) {
-		if (idesc->handler != &no_irq_type)
+	if (idesc->chip != irq_type) {
+		if (idesc->chip != &no_irq_type)
 			printk(KERN_WARNING
 			       "%s: changing vector %d from %s to %s\n",
 			       __FUNCTION__, vector,
-			       idesc->handler->typename, irq_type->typename);
-		idesc->handler = irq_type;
+			       idesc->chip->typename, irq_type->typename);
+		idesc->chip = irq_type;
 	}
 	return 0;
 }
@@ -903,7 +903,7 @@ iosapic_unregister_intr (unsigned int gsi)
 			BUG_ON(iosapic_intr_info[vector].count);
 
 			/* Clear the interrupt controller descriptor */
-			idesc->handler = &no_irq_type;
+			idesc->chip = &no_irq_type;
 
 			/* Clear the interrupt information */
 			memset(&iosapic_intr_info[vector], 0,
diff --git a/arch/ia64/kernel/irq.c b/arch/ia64/kernel/irq.c
index 9c72ea3f6432..2645153dba8a 100644
--- a/arch/ia64/kernel/irq.c
+++ b/arch/ia64/kernel/irq.c
@@ -76,7 +76,7 @@ int show_interrupts(struct seq_file *p, void *v)
 			seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
 		}
 #endif
-		seq_printf(p, " %14s", irq_desc[i].handler->typename);
+		seq_printf(p, " %14s", irq_desc[i].chip->typename);
 		seq_printf(p, "  %s", action->name);
 
 		for (action=action->next; action; action = action->next)
@@ -144,15 +144,15 @@ static void migrate_irqs(void)
 			/*
 			 * Al three are essential, currently WARN_ON.. maybe panic?
 			 */
-			if (desc->handler && desc->handler->disable &&
-				desc->handler->enable && desc->handler->set_affinity) {
-				desc->handler->disable(irq);
-				desc->handler->set_affinity(irq, mask);
-				desc->handler->enable(irq);
+			if (desc->chip && desc->chip->disable &&
+				desc->chip->enable && desc->chip->set_affinity) {
+				desc->chip->disable(irq);
+				desc->chip->set_affinity(irq, mask);
+				desc->chip->enable(irq);
 			} else {
-				WARN_ON((!(desc->handler) || !(desc->handler->disable) ||
-						!(desc->handler->enable) ||
-						!(desc->handler->set_affinity)));
+				WARN_ON((!(desc->chip) || !(desc->chip->disable) ||
+						!(desc->chip->enable) ||
+						!(desc->chip->set_affinity)));
 			}
 		}
 	}
diff --git a/arch/ia64/kernel/irq_ia64.c b/arch/ia64/kernel/irq_ia64.c
index ef9a2b49307a..6d8fc9498ed9 100644
--- a/arch/ia64/kernel/irq_ia64.c
+++ b/arch/ia64/kernel/irq_ia64.c
@@ -251,7 +251,7 @@ register_percpu_irq (ia64_vector vec, struct irqaction *action)
 		if (irq_to_vector(irq) == vec) {
 			desc = irq_descp(irq);
 			desc->status |= IRQ_PER_CPU;
-			desc->handler = &irq_type_ia64_lsapic;
+			desc->chip = &irq_type_ia64_lsapic;
 			if (action)
 				setup_irq(irq, action);
 		}
diff --git a/arch/ia64/kernel/smpboot.c b/arch/ia64/kernel/smpboot.c
index 44e9547878ac..d69288055599 100644
--- a/arch/ia64/kernel/smpboot.c
+++ b/arch/ia64/kernel/smpboot.c
@@ -684,9 +684,9 @@ int migrate_platform_irqs(unsigned int cpu)
 			 * polling before making changes.
 			 */
 			if (desc) {
-				desc->handler->disable(ia64_cpe_irq);
-				desc->handler->set_affinity(ia64_cpe_irq, mask);
-				desc->handler->enable(ia64_cpe_irq);
+				desc->chip->disable(ia64_cpe_irq);
+				desc->chip->set_affinity(ia64_cpe_irq, mask);
+				desc->chip->enable(ia64_cpe_irq);
 				printk ("Re-targetting CPEI to cpu %d\n", new_cpei_cpu);
 			}
 		}
diff --git a/arch/ia64/sn/kernel/irq.c b/arch/ia64/sn/kernel/irq.c
index 677c6c0fd661..7bb6ad188ba3 100644
--- a/arch/ia64/sn/kernel/irq.c
+++ b/arch/ia64/sn/kernel/irq.c
@@ -225,8 +225,8 @@ void sn_irq_init(void)
 	ia64_last_device_vector = IA64_SN2_LAST_DEVICE_VECTOR;
 
 	for (i = 0; i < NR_IRQS; i++) {
-		if (base_desc[i].handler == &no_irq_type) {
-			base_desc[i].handler = &irq_type_sn;
+		if (base_desc[i].chip == &no_irq_type) {
+			base_desc[i].chip = &irq_type_sn;
 		}
 	}
 }
diff --git a/arch/m32r/kernel/irq.c b/arch/m32r/kernel/irq.c
index a4634b06f675..3841861df6a2 100644
--- a/arch/m32r/kernel/irq.c
+++ b/arch/m32r/kernel/irq.c
@@ -54,7 +54,7 @@ int show_interrupts(struct seq_file *p, void *v)
 		for_each_online_cpu(j)
 			seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
 #endif
-		seq_printf(p, " %14s", irq_desc[i].handler->typename);
+		seq_printf(p, " %14s", irq_desc[i].chip->typename);
 		seq_printf(p, "  %s", action->name);
 
 		for (action=action->next; action; action = action->next)
diff --git a/arch/m32r/kernel/setup_m32104ut.c b/arch/m32r/kernel/setup_m32104ut.c
index 6328e1357a80..f9f56c270195 100644
--- a/arch/m32r/kernel/setup_m32104ut.c
+++ b/arch/m32r/kernel/setup_m32104ut.c
@@ -87,7 +87,7 @@ void __init init_IRQ(void)
 #if defined(CONFIG_SMC91X)
 	/* INT#0: LAN controller on M32104UT-LAN (SMC91C111)*/
 	irq_desc[M32R_IRQ_INT0].status = IRQ_DISABLED;
-	irq_desc[M32R_IRQ_INT0].handler = &m32104ut_irq_type;
+	irq_desc[M32R_IRQ_INT0].chip = &m32104ut_irq_type;
 	irq_desc[M32R_IRQ_INT0].action = 0;
 	irq_desc[M32R_IRQ_INT0].depth = 1;
 	icu_data[M32R_IRQ_INT0].icucr = M32R_ICUCR_IEN | M32R_ICUCR_ISMOD11; /* "H" level sense */
@@ -96,7 +96,7 @@ void __init init_IRQ(void)
 
 	/* MFT2 : system timer */
 	irq_desc[M32R_IRQ_MFT2].status = IRQ_DISABLED;
-	irq_desc[M32R_IRQ_MFT2].handler = &m32104ut_irq_type;
+	irq_desc[M32R_IRQ_MFT2].chip = &m32104ut_irq_type;
 	irq_desc[M32R_IRQ_MFT2].action = 0;
 	irq_desc[M32R_IRQ_MFT2].depth = 1;
 	icu_data[M32R_IRQ_MFT2].icucr = M32R_ICUCR_IEN;
@@ -105,7 +105,7 @@ void __init init_IRQ(void)
 #ifdef CONFIG_SERIAL_M32R_SIO
 	/* SIO0_R : uart receive data */
 	irq_desc[M32R_IRQ_SIO0_R].status = IRQ_DISABLED;
-	irq_desc[M32R_IRQ_SIO0_R].handler = &m32104ut_irq_type;
+	irq_desc[M32R_IRQ_SIO0_R].chip = &m32104ut_irq_type;
 	irq_desc[M32R_IRQ_SIO0_R].action = 0;
 	irq_desc[M32R_IRQ_SIO0_R].depth = 1;
 	icu_data[M32R_IRQ_SIO0_R].icucr = M32R_ICUCR_IEN;
@@ -113,7 +113,7 @@ void __init init_IRQ(void)
 
 	/* SIO0_S : uart send data */
 	irq_desc[M32R_IRQ_SIO0_S].status = IRQ_DISABLED;
-	irq_desc[M32R_IRQ_SIO0_S].handler = &m32104ut_irq_type;
+	irq_desc[M32R_IRQ_SIO0_S].chip = &m32104ut_irq_type;
 	irq_desc[M32R_IRQ_SIO0_S].action = 0;
 	irq_desc[M32R_IRQ_SIO0_S].depth = 1;
 	icu_data[M32R_IRQ_SIO0_S].icucr = M32R_ICUCR_IEN;
diff --git a/arch/m32r/kernel/setup_m32700ut.c b/arch/m32r/kernel/setup_m32700ut.c
index fad1fc99bb27..b6ab00eff580 100644
--- a/arch/m32r/kernel/setup_m32700ut.c
+++ b/arch/m32r/kernel/setup_m32700ut.c
@@ -301,7 +301,7 @@ void __init init_IRQ(void)
 #if defined(CONFIG_SMC91X)
 	/* INT#0: LAN controller on M32700UT-LAN (SMC91C111)*/
 	irq_desc[M32700UT_LAN_IRQ_LAN].status = IRQ_DISABLED;
-	irq_desc[M32700UT_LAN_IRQ_LAN].handler = &m32700ut_lanpld_irq_type;
+	irq_desc[M32700UT_LAN_IRQ_LAN].chip = &m32700ut_lanpld_irq_type;
 	irq_desc[M32700UT_LAN_IRQ_LAN].action = 0;
 	irq_desc[M32700UT_LAN_IRQ_LAN].depth = 1;	/* disable nested irq */
 	lanpld_icu_data[irq2lanpldirq(M32700UT_LAN_IRQ_LAN)].icucr = PLD_ICUCR_IEN|PLD_ICUCR_ISMOD02;	/* "H" edge sense */
@@ -310,7 +310,7 @@ void __init init_IRQ(void)
 
 	/* MFT2 : system timer */
 	irq_desc[M32R_IRQ_MFT2].status = IRQ_DISABLED;
-	irq_desc[M32R_IRQ_MFT2].handler = &m32700ut_irq_type;
+	irq_desc[M32R_IRQ_MFT2].chip = &m32700ut_irq_type;
 	irq_desc[M32R_IRQ_MFT2].action = 0;
 	irq_desc[M32R_IRQ_MFT2].depth = 1;
 	icu_data[M32R_IRQ_MFT2].icucr = M32R_ICUCR_IEN;
@@ -318,7 +318,7 @@ void __init init_IRQ(void)
 
 	/* SIO0 : receive */
 	irq_desc[M32R_IRQ_SIO0_R].status = IRQ_DISABLED;
-	irq_desc[M32R_IRQ_SIO0_R].handler = &m32700ut_irq_type;
+	irq_desc[M32R_IRQ_SIO0_R].chip = &m32700ut_irq_type;
 	irq_desc[M32R_IRQ_SIO0_R].action = 0;
 	irq_desc[M32R_IRQ_SIO0_R].depth = 1;
 	icu_data[M32R_IRQ_SIO0_R].icucr = 0;
@@ -326,7 +326,7 @@ void __init init_IRQ(void)
 
 	/* SIO0 : send */
 	irq_desc[M32R_IRQ_SIO0_S].status = IRQ_DISABLED;
-	irq_desc[M32R_IRQ_SIO0_S].handler = &m32700ut_irq_type;
+	irq_desc[M32R_IRQ_SIO0_S].chip = &m32700ut_irq_type;
 	irq_desc[M32R_IRQ_SIO0_S].action = 0;
 	irq_desc[M32R_IRQ_SIO0_S].depth = 1;
 	icu_data[M32R_IRQ_SIO0_S].icucr = 0;
@@ -334,7 +334,7 @@ void __init init_IRQ(void)
 
 	/* SIO1 : receive */
 	irq_desc[M32R_IRQ_SIO1_R].status = IRQ_DISABLED;
-	irq_desc[M32R_IRQ_SIO1_R].handler = &m32700ut_irq_type;
+	irq_desc[M32R_IRQ_SIO1_R].chip = &m32700ut_irq_type;
 	irq_desc[M32R_IRQ_SIO1_R].action = 0;
 	irq_desc[M32R_IRQ_SIO1_R].depth = 1;
 	icu_data[M32R_IRQ_SIO1_R].icucr = 0;
@@ -342,7 +342,7 @@ void __init init_IRQ(void)
 
 	/* SIO1 : send */
 	irq_desc[M32R_IRQ_SIO1_S].status = IRQ_DISABLED;
-	irq_desc[M32R_IRQ_SIO1_S].handler = &m32700ut_irq_type;
+	irq_desc[M32R_IRQ_SIO1_S].chip = &m32700ut_irq_type;
 	irq_desc[M32R_IRQ_SIO1_S].action = 0;
 	irq_desc[M32R_IRQ_SIO1_S].depth = 1;
 	icu_data[M32R_IRQ_SIO1_S].icucr = 0;
@@ -350,7 +350,7 @@ void __init init_IRQ(void)
 
 	/* DMA1 : */
 	irq_desc[M32R_IRQ_DMA1].status = IRQ_DISABLED;
-	irq_desc[M32R_IRQ_DMA1].handler = &m32700ut_irq_type;
+	irq_desc[M32R_IRQ_DMA1].chip = &m32700ut_irq_type;
 	irq_desc[M32R_IRQ_DMA1].action = 0;
 	irq_desc[M32R_IRQ_DMA1].depth = 1;
 	icu_data[M32R_IRQ_DMA1].icucr = 0;
@@ -359,7 +359,7 @@ void __init init_IRQ(void)
 #ifdef CONFIG_SERIAL_M32R_PLDSIO
 	/* INT#1: SIO0 Receive on PLD */
 	irq_desc[PLD_IRQ_SIO0_RCV].status = IRQ_DISABLED;
-	irq_desc[PLD_IRQ_SIO0_RCV].handler = &m32700ut_pld_irq_type;
+	irq_desc[PLD_IRQ_SIO0_RCV].chip = &m32700ut_pld_irq_type;
 	irq_desc[PLD_IRQ_SIO0_RCV].action = 0;
 	irq_desc[PLD_IRQ_SIO0_RCV].depth = 1;	/* disable nested irq */
 	pld_icu_data[irq2pldirq(PLD_IRQ_SIO0_RCV)].icucr = PLD_ICUCR_IEN|PLD_ICUCR_ISMOD03;
@@ -367,7 +367,7 @@ void __init init_IRQ(void)
 
 	/* INT#1: SIO0 Send on PLD */
 	irq_desc[PLD_IRQ_SIO0_SND].status = IRQ_DISABLED;
-	irq_desc[PLD_IRQ_SIO0_SND].handler = &m32700ut_pld_irq_type;
+	irq_desc[PLD_IRQ_SIO0_SND].chip = &m32700ut_pld_irq_type;
 	irq_desc[PLD_IRQ_SIO0_SND].action = 0;
 	irq_desc[PLD_IRQ_SIO0_SND].depth = 1;	/* disable nested irq */
 	pld_icu_data[irq2pldirq(PLD_IRQ_SIO0_SND)].icucr = PLD_ICUCR_IEN|PLD_ICUCR_ISMOD03;
@@ -376,7 +376,7 @@ void __init init_IRQ(void)
 
 	/* INT#1: CFC IREQ on PLD */
 	irq_desc[PLD_IRQ_CFIREQ].status = IRQ_DISABLED;
-	irq_desc[PLD_IRQ_CFIREQ].handler = &m32700ut_pld_irq_type;
+	irq_desc[PLD_IRQ_CFIREQ].chip = &m32700ut_pld_irq_type;
 	irq_desc[PLD_IRQ_CFIREQ].action = 0;
 	irq_desc[PLD_IRQ_CFIREQ].depth = 1;	/* disable nested irq */
 	pld_icu_data[irq2pldirq(PLD_IRQ_CFIREQ)].icucr = PLD_ICUCR_IEN|PLD_ICUCR_ISMOD01;	/* 'L' level sense */
@@ -384,7 +384,7 @@ void __init init_IRQ(void)
 
 	/* INT#1: CFC Insert on PLD */
 	irq_desc[PLD_IRQ_CFC_INSERT].status = IRQ_DISABLED;
-	irq_desc[PLD_IRQ_CFC_INSERT].handler = &m32700ut_pld_irq_type;
+	irq_desc[PLD_IRQ_CFC_INSERT].chip = &m32700ut_pld_irq_type;
 	irq_desc[PLD_IRQ_CFC_INSERT].action = 0;
 	irq_desc[PLD_IRQ_CFC_INSERT].depth = 1;	/* disable nested irq */
 	pld_icu_data[irq2pldirq(PLD_IRQ_CFC_INSERT)].icucr = PLD_ICUCR_IEN|PLD_ICUCR_ISMOD00;	/* 'L' edge sense */
@@ -392,7 +392,7 @@ void __init init_IRQ(void)
 
 	/* INT#1: CFC Eject on PLD */
 	irq_desc[PLD_IRQ_CFC_EJECT].status = IRQ_DISABLED;
-	irq_desc[PLD_IRQ_CFC_EJECT].handler = &m32700ut_pld_irq_type;
+	irq_desc[PLD_IRQ_CFC_EJECT].chip = &m32700ut_pld_irq_type;
 	irq_desc[PLD_IRQ_CFC_EJECT].action = 0;
 	irq_desc[PLD_IRQ_CFC_EJECT].depth = 1;	/* disable nested irq */
 	pld_icu_data[irq2pldirq(PLD_IRQ_CFC_EJECT)].icucr = PLD_ICUCR_IEN|PLD_ICUCR_ISMOD02;	/* 'H' edge sense */
@@ -416,7 +416,7 @@ void __init init_IRQ(void)
 	outw(USBCR_OTGS, USBCR); 	/* USBCR: non-OTG */
 
     irq_desc[M32700UT_LCD_IRQ_USB_INT1].status = IRQ_DISABLED;
-    irq_desc[M32700UT_LCD_IRQ_USB_INT1].handler = &m32700ut_lcdpld_irq_type;
+    irq_desc[M32700UT_LCD_IRQ_USB_INT1].chip = &m32700ut_lcdpld_irq_type;
     irq_desc[M32700UT_LCD_IRQ_USB_INT1].action = 0;
     irq_desc[M32700UT_LCD_IRQ_USB_INT1].depth = 1;
     lcdpld_icu_data[irq2lcdpldirq(M32700UT_LCD_IRQ_USB_INT1)].icucr = PLD_ICUCR_IEN|PLD_ICUCR_ISMOD01;	/* "L" level sense */
@@ -434,7 +434,7 @@ void __init init_IRQ(void)
 	 * INT3# is used for AR
 	 */
 	irq_desc[M32R_IRQ_INT3].status = IRQ_DISABLED;
-	irq_desc[M32R_IRQ_INT3].handler = &m32700ut_irq_type;
+	irq_desc[M32R_IRQ_INT3].chip = &m32700ut_irq_type;
 	irq_desc[M32R_IRQ_INT3].action = 0;
 	irq_desc[M32R_IRQ_INT3].depth = 1;
 	icu_data[M32R_IRQ_INT3].icucr = M32R_ICUCR_IEN|M32R_ICUCR_ISMOD10;
diff --git a/arch/m32r/kernel/setup_mappi.c b/arch/m32r/kernel/setup_mappi.c
index 00f253209cb3..c268044185f5 100644
--- a/arch/m32r/kernel/setup_mappi.c
+++ b/arch/m32r/kernel/setup_mappi.c
@@ -86,7 +86,7 @@ void __init init_IRQ(void)
 #ifdef CONFIG_NE2000
 	/* INT0 : LAN controller (RTL8019AS) */
 	irq_desc[M32R_IRQ_INT0].status = IRQ_DISABLED;
-	irq_desc[M32R_IRQ_INT0].handler = &mappi_irq_type;
+	irq_desc[M32R_IRQ_INT0].chip = &mappi_irq_type;
 	irq_desc[M32R_IRQ_INT0].action = 0;
 	irq_desc[M32R_IRQ_INT0].depth = 1;
 	icu_data[M32R_IRQ_INT0].icucr = M32R_ICUCR_IEN|M32R_ICUCR_ISMOD10;
@@ -95,7 +95,7 @@ void __init init_IRQ(void)
 
 	/* MFT2 : system timer */
 	irq_desc[M32R_IRQ_MFT2].status = IRQ_DISABLED;
-	irq_desc[M32R_IRQ_MFT2].handler = &mappi_irq_type;
+	irq_desc[M32R_IRQ_MFT2].chip = &mappi_irq_type;
 	irq_desc[M32R_IRQ_MFT2].action = 0;
 	irq_desc[M32R_IRQ_MFT2].depth = 1;
 	icu_data[M32R_IRQ_MFT2].icucr = M32R_ICUCR_IEN;
@@ -104,7 +104,7 @@ void __init init_IRQ(void)
 #ifdef CONFIG_SERIAL_M32R_SIO
 	/* SIO0_R : uart receive data */
 	irq_desc[M32R_IRQ_SIO0_R].status = IRQ_DISABLED;
-	irq_desc[M32R_IRQ_SIO0_R].handler = &mappi_irq_type;
+	irq_desc[M32R_IRQ_SIO0_R].chip = &mappi_irq_type;
 	irq_desc[M32R_IRQ_SIO0_R].action = 0;
 	irq_desc[M32R_IRQ_SIO0_R].depth = 1;
 	icu_data[M32R_IRQ_SIO0_R].icucr = 0;
@@ -112,7 +112,7 @@ void __init init_IRQ(void)
 
 	/* SIO0_S : uart send data */
 	irq_desc[M32R_IRQ_SIO0_S].status = IRQ_DISABLED;
-	irq_desc[M32R_IRQ_SIO0_S].handler = &mappi_irq_type;
+	irq_desc[M32R_IRQ_SIO0_S].chip = &mappi_irq_type;
 	irq_desc[M32R_IRQ_SIO0_S].action = 0;
 	irq_desc[M32R_IRQ_SIO0_S].depth = 1;
 	icu_data[M32R_IRQ_SIO0_S].icucr = 0;
@@ -120,7 +120,7 @@ void __init init_IRQ(void)
 
 	/* SIO1_R : uart receive data */
 	irq_desc[M32R_IRQ_SIO1_R].status = IRQ_DISABLED;
-	irq_desc[M32R_IRQ_SIO1_R].handler = &mappi_irq_type;
+	irq_desc[M32R_IRQ_SIO1_R].chip = &mappi_irq_type;
 	irq_desc[M32R_IRQ_SIO1_R].action = 0;
 	irq_desc[M32R_IRQ_SIO1_R].depth = 1;
 	icu_data[M32R_IRQ_SIO1_R].icucr = 0;
@@ -128,7 +128,7 @@ void __init init_IRQ(void)
 
 	/* SIO1_S : uart send data */
 	irq_desc[M32R_IRQ_SIO1_S].status = IRQ_DISABLED;
-	irq_desc[M32R_IRQ_SIO1_S].handler = &mappi_irq_type;
+	irq_desc[M32R_IRQ_SIO1_S].chip = &mappi_irq_type;
 	irq_desc[M32R_IRQ_SIO1_S].action = 0;
 	irq_desc[M32R_IRQ_SIO1_S].depth = 1;
 	icu_data[M32R_IRQ_SIO1_S].icucr = 0;
@@ -138,7 +138,7 @@ void __init init_IRQ(void)
 #if defined(CONFIG_M32R_PCC)
 	/* INT1 : pccard0 interrupt */
 	irq_desc[M32R_IRQ_INT1].status = IRQ_DISABLED;
-	irq_desc[M32R_IRQ_INT1].handler = &mappi_irq_type;
+	irq_desc[M32R_IRQ_INT1].chip = &mappi_irq_type;
 	irq_desc[M32R_IRQ_INT1].action = 0;
 	irq_desc[M32R_IRQ_INT1].depth = 1;
 	icu_data[M32R_IRQ_INT1].icucr = M32R_ICUCR_IEN | M32R_ICUCR_ISMOD00;
@@ -146,7 +146,7 @@ void __init init_IRQ(void)
 
 	/* INT2 : pccard1 interrupt */
 	irq_desc[M32R_IRQ_INT2].status = IRQ_DISABLED;
-	irq_desc[M32R_IRQ_INT2].handler = &mappi_irq_type;
+	irq_desc[M32R_IRQ_INT2].chip = &mappi_irq_type;
 	irq_desc[M32R_IRQ_INT2].action = 0;
 	irq_desc[M32R_IRQ_INT2].depth = 1;
 	icu_data[M32R_IRQ_INT2].icucr = M32R_ICUCR_IEN | M32R_ICUCR_ISMOD00;
diff --git a/arch/m32r/kernel/setup_mappi2.c b/arch/m32r/kernel/setup_mappi2.c
index eebc9d8b4e72..bd2327d5cca2 100644
--- a/arch/m32r/kernel/setup_mappi2.c
+++ b/arch/m32r/kernel/setup_mappi2.c
@@ -87,7 +87,7 @@ void __init init_IRQ(void)
 #if defined(CONFIG_SMC91X)
 	/* INT0 : LAN controller (SMC91111) */
 	irq_desc[M32R_IRQ_INT0].status = IRQ_DISABLED;
-	irq_desc[M32R_IRQ_INT0].handler = &mappi2_irq_type;
+	irq_desc[M32R_IRQ_INT0].chip = &mappi2_irq_type;
 	irq_desc[M32R_IRQ_INT0].action = 0;
 	irq_desc[M32R_IRQ_INT0].depth = 1;
 	icu_data[M32R_IRQ_INT0].icucr = M32R_ICUCR_IEN|M32R_ICUCR_ISMOD10;
@@ -96,7 +96,7 @@ void __init init_IRQ(void)
 
 	/* MFT2 : system timer */
 	irq_desc[M32R_IRQ_MFT2].status = IRQ_DISABLED;
-	irq_desc[M32R_IRQ_MFT2].handler = &mappi2_irq_type;
+	irq_desc[M32R_IRQ_MFT2].chip = &mappi2_irq_type;
 	irq_desc[M32R_IRQ_MFT2].action = 0;
 	irq_desc[M32R_IRQ_MFT2].depth = 1;
 	icu_data[M32R_IRQ_MFT2].icucr = M32R_ICUCR_IEN;
@@ -105,7 +105,7 @@ void __init init_IRQ(void)
 #ifdef CONFIG_SERIAL_M32R_SIO
 	/* SIO0_R : uart receive data */
 	irq_desc[M32R_IRQ_SIO0_R].status = IRQ_DISABLED;
-	irq_desc[M32R_IRQ_SIO0_R].handler = &mappi2_irq_type;
+	irq_desc[M32R_IRQ_SIO0_R].chip = &mappi2_irq_type;
 	irq_desc[M32R_IRQ_SIO0_R].action = 0;
 	irq_desc[M32R_IRQ_SIO0_R].depth = 1;
 	icu_data[M32R_IRQ_SIO0_R].icucr = 0;
@@ -113,14 +113,14 @@ void __init init_IRQ(void)
 
 	/* SIO0_S : uart send data */
 	irq_desc[M32R_IRQ_SIO0_S].status = IRQ_DISABLED;
-	irq_desc[M32R_IRQ_SIO0_S].handler = &mappi2_irq_type;
+	irq_desc[M32R_IRQ_SIO0_S].chip = &mappi2_irq_type;
 	irq_desc[M32R_IRQ_SIO0_S].action = 0;
 	irq_desc[M32R_IRQ_SIO0_S].depth = 1;
 	icu_data[M32R_IRQ_SIO0_S].icucr = 0;
 	disable_mappi2_irq(M32R_IRQ_SIO0_S);
 	/* SIO1_R : uart receive data */
 	irq_desc[M32R_IRQ_SIO1_R].status = IRQ_DISABLED;
-	irq_desc[M32R_IRQ_SIO1_R].handler = &mappi2_irq_type;
+	irq_desc[M32R_IRQ_SIO1_R].chip = &mappi2_irq_type;
 	irq_desc[M32R_IRQ_SIO1_R].action = 0;
 	irq_desc[M32R_IRQ_SIO1_R].depth = 1;
 	icu_data[M32R_IRQ_SIO1_R].icucr = 0;
@@ -128,7 +128,7 @@ void __init init_IRQ(void)
 
 	/* SIO1_S : uart send data */
 	irq_desc[M32R_IRQ_SIO1_S].status = IRQ_DISABLED;
-	irq_desc[M32R_IRQ_SIO1_S].handler = &mappi2_irq_type;
+	irq_desc[M32R_IRQ_SIO1_S].chip = &mappi2_irq_type;
 	irq_desc[M32R_IRQ_SIO1_S].action = 0;
 	irq_desc[M32R_IRQ_SIO1_S].depth = 1;
 	icu_data[M32R_IRQ_SIO1_S].icucr = 0;
@@ -138,7 +138,7 @@ void __init init_IRQ(void)
 #if defined(CONFIG_USB)
 	/* INT1 : USB Host controller interrupt */
 	irq_desc[M32R_IRQ_INT1].status = IRQ_DISABLED;
-	irq_desc[M32R_IRQ_INT1].handler = &mappi2_irq_type;
+	irq_desc[M32R_IRQ_INT1].chip = &mappi2_irq_type;
 	irq_desc[M32R_IRQ_INT1].action = 0;
 	irq_desc[M32R_IRQ_INT1].depth = 1;
 	icu_data[M32R_IRQ_INT1].icucr = M32R_ICUCR_ISMOD01;
@@ -147,7 +147,7 @@ void __init init_IRQ(void)
 
 	/* ICUCR40: CFC IREQ */
 	irq_desc[PLD_IRQ_CFIREQ].status = IRQ_DISABLED;
-	irq_desc[PLD_IRQ_CFIREQ].handler = &mappi2_irq_type;
+	irq_desc[PLD_IRQ_CFIREQ].chip = &mappi2_irq_type;
 	irq_desc[PLD_IRQ_CFIREQ].action = 0;
 	irq_desc[PLD_IRQ_CFIREQ].depth = 1;	/* disable nested irq */
 	icu_data[PLD_IRQ_CFIREQ].icucr = M32R_ICUCR_IEN|M32R_ICUCR_ISMOD01;
@@ -156,7 +156,7 @@ void __init init_IRQ(void)
 #if defined(CONFIG_M32R_CFC)
 	/* ICUCR41: CFC Insert */
 	irq_desc[PLD_IRQ_CFC_INSERT].status = IRQ_DISABLED;
-	irq_desc[PLD_IRQ_CFC_INSERT].handler = &mappi2_irq_type;
+	irq_desc[PLD_IRQ_CFC_INSERT].chip = &mappi2_irq_type;
 	irq_desc[PLD_IRQ_CFC_INSERT].action = 0;
 	irq_desc[PLD_IRQ_CFC_INSERT].depth = 1;	/* disable nested irq */
 	icu_data[PLD_IRQ_CFC_INSERT].icucr = M32R_ICUCR_IEN|M32R_ICUCR_ISMOD00;
@@ -164,7 +164,7 @@ void __init init_IRQ(void)
 
 	/* ICUCR42: CFC Eject */
 	irq_desc[PLD_IRQ_CFC_EJECT].status = IRQ_DISABLED;
-	irq_desc[PLD_IRQ_CFC_EJECT].handler = &mappi2_irq_type;
+	irq_desc[PLD_IRQ_CFC_EJECT].chip = &mappi2_irq_type;
 	irq_desc[PLD_IRQ_CFC_EJECT].action = 0;
 	irq_desc[PLD_IRQ_CFC_EJECT].depth = 1;	/* disable nested irq */
 	icu_data[PLD_IRQ_CFC_EJECT].icucr = M32R_ICUCR_IEN|M32R_ICUCR_ISMOD10;
diff --git a/arch/m32r/kernel/setup_mappi3.c b/arch/m32r/kernel/setup_mappi3.c
index d2ff021e2d3d..014b51d17505 100644
--- a/arch/m32r/kernel/setup_mappi3.c
+++ b/arch/m32r/kernel/setup_mappi3.c
@@ -87,7 +87,7 @@ void __init init_IRQ(void)
 #if defined(CONFIG_SMC91X)
 	/* INT0 : LAN controller (SMC91111) */
 	irq_desc[M32R_IRQ_INT0].status = IRQ_DISABLED;
-	irq_desc[M32R_IRQ_INT0].handler = &mappi3_irq_type;
+	irq_desc[M32R_IRQ_INT0].chip = &mappi3_irq_type;
 	irq_desc[M32R_IRQ_INT0].action = 0;
 	irq_desc[M32R_IRQ_INT0].depth = 1;
 	icu_data[M32R_IRQ_INT0].icucr = M32R_ICUCR_IEN|M32R_ICUCR_ISMOD10;
@@ -96,7 +96,7 @@ void __init init_IRQ(void)
 
 	/* MFT2 : system timer */
 	irq_desc[M32R_IRQ_MFT2].status = IRQ_DISABLED;
-	irq_desc[M32R_IRQ_MFT2].handler = &mappi3_irq_type;
+	irq_desc[M32R_IRQ_MFT2].chip = &mappi3_irq_type;
 	irq_desc[M32R_IRQ_MFT2].action = 0;
 	irq_desc[M32R_IRQ_MFT2].depth = 1;
 	icu_data[M32R_IRQ_MFT2].icucr = M32R_ICUCR_IEN;
@@ -105,7 +105,7 @@ void __init init_IRQ(void)
 #ifdef CONFIG_SERIAL_M32R_SIO
 	/* SIO0_R : uart receive data */
 	irq_desc[M32R_IRQ_SIO0_R].status = IRQ_DISABLED;
-	irq_desc[M32R_IRQ_SIO0_R].handler = &mappi3_irq_type;
+	irq_desc[M32R_IRQ_SIO0_R].chip = &mappi3_irq_type;
 	irq_desc[M32R_IRQ_SIO0_R].action = 0;
 	irq_desc[M32R_IRQ_SIO0_R].depth = 1;
 	icu_data[M32R_IRQ_SIO0_R].icucr = 0;
@@ -113,14 +113,14 @@ void __init init_IRQ(void)
 
 	/* SIO0_S : uart send data */
 	irq_desc[M32R_IRQ_SIO0_S].status = IRQ_DISABLED;
-	irq_desc[M32R_IRQ_SIO0_S].handler = &mappi3_irq_type;
+	irq_desc[M32R_IRQ_SIO0_S].chip = &mappi3_irq_type;
 	irq_desc[M32R_IRQ_SIO0_S].action = 0;
 	irq_desc[M32R_IRQ_SIO0_S].depth = 1;
 	icu_data[M32R_IRQ_SIO0_S].icucr = 0;
 	disable_mappi3_irq(M32R_IRQ_SIO0_S);
 	/* SIO1_R : uart receive data */
 	irq_desc[M32R_IRQ_SIO1_R].status = IRQ_DISABLED;
-	irq_desc[M32R_IRQ_SIO1_R].handler = &mappi3_irq_type;
+	irq_desc[M32R_IRQ_SIO1_R].chip = &mappi3_irq_type;
 	irq_desc[M32R_IRQ_SIO1_R].action = 0;
 	irq_desc[M32R_IRQ_SIO1_R].depth = 1;
 	icu_data[M32R_IRQ_SIO1_R].icucr = 0;
@@ -128,7 +128,7 @@ void __init init_IRQ(void)
 
 	/* SIO1_S : uart send data */
 	irq_desc[M32R_IRQ_SIO1_S].status = IRQ_DISABLED;
-	irq_desc[M32R_IRQ_SIO1_S].handler = &mappi3_irq_type;
+	irq_desc[M32R_IRQ_SIO1_S].chip = &mappi3_irq_type;
 	irq_desc[M32R_IRQ_SIO1_S].action = 0;
 	irq_desc[M32R_IRQ_SIO1_S].depth = 1;
 	icu_data[M32R_IRQ_SIO1_S].icucr = 0;
@@ -138,7 +138,7 @@ void __init init_IRQ(void)
 #if defined(CONFIG_USB)
 	/* INT1 : USB Host controller interrupt */
 	irq_desc[M32R_IRQ_INT1].status = IRQ_DISABLED;
-	irq_desc[M32R_IRQ_INT1].handler = &mappi3_irq_type;
+	irq_desc[M32R_IRQ_INT1].chip = &mappi3_irq_type;
 	irq_desc[M32R_IRQ_INT1].action = 0;
 	irq_desc[M32R_IRQ_INT1].depth = 1;
 	icu_data[M32R_IRQ_INT1].icucr = M32R_ICUCR_ISMOD01;
@@ -147,7 +147,7 @@ void __init init_IRQ(void)
 
 	/* CFC IREQ */
 	irq_desc[PLD_IRQ_CFIREQ].status = IRQ_DISABLED;
-	irq_desc[PLD_IRQ_CFIREQ].handler = &mappi3_irq_type;
+	irq_desc[PLD_IRQ_CFIREQ].chip = &mappi3_irq_type;
 	irq_desc[PLD_IRQ_CFIREQ].action = 0;
 	irq_desc[PLD_IRQ_CFIREQ].depth = 1;	/* disable nested irq */
 	icu_data[PLD_IRQ_CFIREQ].icucr = M32R_ICUCR_IEN|M32R_ICUCR_ISMOD01;
@@ -156,7 +156,7 @@ void __init init_IRQ(void)
 #if defined(CONFIG_M32R_CFC)
 	/* ICUCR41: CFC Insert & eject */
 	irq_desc[PLD_IRQ_CFC_INSERT].status = IRQ_DISABLED;
-	irq_desc[PLD_IRQ_CFC_INSERT].handler = &mappi3_irq_type;
+	irq_desc[PLD_IRQ_CFC_INSERT].chip = &mappi3_irq_type;
 	irq_desc[PLD_IRQ_CFC_INSERT].action = 0;
 	irq_desc[PLD_IRQ_CFC_INSERT].depth = 1;	/* disable nested irq */
 	icu_data[PLD_IRQ_CFC_INSERT].icucr = M32R_ICUCR_IEN|M32R_ICUCR_ISMOD00;
@@ -166,7 +166,7 @@ void __init init_IRQ(void)
 
 	/* IDE IREQ */
 	irq_desc[PLD_IRQ_IDEIREQ].status = IRQ_DISABLED;
-	irq_desc[PLD_IRQ_IDEIREQ].handler = &mappi3_irq_type;
+	irq_desc[PLD_IRQ_IDEIREQ].chip = &mappi3_irq_type;
 	irq_desc[PLD_IRQ_IDEIREQ].action = 0;
 	irq_desc[PLD_IRQ_IDEIREQ].depth = 1;	/* disable nested irq */
 	icu_data[PLD_IRQ_IDEIREQ].icucr = M32R_ICUCR_IEN|M32R_ICUCR_ISMOD10;
diff --git a/arch/m32r/kernel/setup_oaks32r.c b/arch/m32r/kernel/setup_oaks32r.c
index 0e9e63538c0f..ea64831aef7a 100644
--- a/arch/m32r/kernel/setup_oaks32r.c
+++ b/arch/m32r/kernel/setup_oaks32r.c
@@ -85,7 +85,7 @@ void __init init_IRQ(void)
 #ifdef CONFIG_NE2000
 	/* INT3 : LAN controller (RTL8019AS) */
 	irq_desc[M32R_IRQ_INT3].status = IRQ_DISABLED;
-	irq_desc[M32R_IRQ_INT3].handler = &oaks32r_irq_type;
+	irq_desc[M32R_IRQ_INT3].chip = &oaks32r_irq_type;
 	irq_desc[M32R_IRQ_INT3].action = 0;
 	irq_desc[M32R_IRQ_INT3].depth = 1;
 	icu_data[M32R_IRQ_INT3].icucr = M32R_ICUCR_IEN|M32R_ICUCR_ISMOD10;
@@ -94,7 +94,7 @@ void __init init_IRQ(void)
 
 	/* MFT2 : system timer */
 	irq_desc[M32R_IRQ_MFT2].status = IRQ_DISABLED;
-	irq_desc[M32R_IRQ_MFT2].handler = &oaks32r_irq_type;
+	irq_desc[M32R_IRQ_MFT2].chip = &oaks32r_irq_type;
 	irq_desc[M32R_IRQ_MFT2].action = 0;
 	irq_desc[M32R_IRQ_MFT2].depth = 1;
 	icu_data[M32R_IRQ_MFT2].icucr = M32R_ICUCR_IEN;
@@ -103,7 +103,7 @@ void __init init_IRQ(void)
 #ifdef CONFIG_SERIAL_M32R_SIO
 	/* SIO0_R : uart receive data */
 	irq_desc[M32R_IRQ_SIO0_R].status = IRQ_DISABLED;
-	irq_desc[M32R_IRQ_SIO0_R].handler = &oaks32r_irq_type;
+	irq_desc[M32R_IRQ_SIO0_R].chip = &oaks32r_irq_type;
 	irq_desc[M32R_IRQ_SIO0_R].action = 0;
 	irq_desc[M32R_IRQ_SIO0_R].depth = 1;
 	icu_data[M32R_IRQ_SIO0_R].icucr = 0;
@@ -111,7 +111,7 @@ void __init init_IRQ(void)
 
 	/* SIO0_S : uart send data */
 	irq_desc[M32R_IRQ_SIO0_S].status = IRQ_DISABLED;
-	irq_desc[M32R_IRQ_SIO0_S].handler = &oaks32r_irq_type;
+	irq_desc[M32R_IRQ_SIO0_S].chip = &oaks32r_irq_type;
 	irq_desc[M32R_IRQ_SIO0_S].action = 0;
 	irq_desc[M32R_IRQ_SIO0_S].depth = 1;
 	icu_data[M32R_IRQ_SIO0_S].icucr = 0;
@@ -119,7 +119,7 @@ void __init init_IRQ(void)
 
 	/* SIO1_R : uart receive data */
 	irq_desc[M32R_IRQ_SIO1_R].status = IRQ_DISABLED;
-	irq_desc[M32R_IRQ_SIO1_R].handler = &oaks32r_irq_type;
+	irq_desc[M32R_IRQ_SIO1_R].chip = &oaks32r_irq_type;
 	irq_desc[M32R_IRQ_SIO1_R].action = 0;
 	irq_desc[M32R_IRQ_SIO1_R].depth = 1;
 	icu_data[M32R_IRQ_SIO1_R].icucr = 0;
@@ -127,7 +127,7 @@ void __init init_IRQ(void)
 
 	/* SIO1_S : uart send data */
 	irq_desc[M32R_IRQ_SIO1_S].status = IRQ_DISABLED;
-	irq_desc[M32R_IRQ_SIO1_S].handler = &oaks32r_irq_type;
+	irq_desc[M32R_IRQ_SIO1_S].chip = &oaks32r_irq_type;
 	irq_desc[M32R_IRQ_SIO1_S].action = 0;
 	irq_desc[M32R_IRQ_SIO1_S].depth = 1;
 	icu_data[M32R_IRQ_SIO1_S].icucr = 0;
diff --git a/arch/m32r/kernel/setup_opsput.c b/arch/m32r/kernel/setup_opsput.c
index 548e8fc7949b..55e8972d455a 100644
--- a/arch/m32r/kernel/setup_opsput.c
+++ b/arch/m32r/kernel/setup_opsput.c
@@ -302,7 +302,7 @@ void __init init_IRQ(void)
 #if defined(CONFIG_SMC91X)
 	/* INT#0: LAN controller on OPSPUT-LAN (SMC91C111)*/
 	irq_desc[OPSPUT_LAN_IRQ_LAN].status = IRQ_DISABLED;
-	irq_desc[OPSPUT_LAN_IRQ_LAN].handler = &opsput_lanpld_irq_type;
+	irq_desc[OPSPUT_LAN_IRQ_LAN].chip = &opsput_lanpld_irq_type;
 	irq_desc[OPSPUT_LAN_IRQ_LAN].action = 0;
 	irq_desc[OPSPUT_LAN_IRQ_LAN].depth = 1;	/* disable nested irq */
 	lanpld_icu_data[irq2lanpldirq(OPSPUT_LAN_IRQ_LAN)].icucr = PLD_ICUCR_IEN|PLD_ICUCR_ISMOD02;	/* "H" edge sense */
@@ -311,7 +311,7 @@ void __init init_IRQ(void)
 
 	/* MFT2 : system timer */
 	irq_desc[M32R_IRQ_MFT2].status = IRQ_DISABLED;
-	irq_desc[M32R_IRQ_MFT2].handler = &opsput_irq_type;
+	irq_desc[M32R_IRQ_MFT2].chip = &opsput_irq_type;
 	irq_desc[M32R_IRQ_MFT2].action = 0;
 	irq_desc[M32R_IRQ_MFT2].depth = 1;
 	icu_data[M32R_IRQ_MFT2].icucr = M32R_ICUCR_IEN;
@@ -319,7 +319,7 @@ void __init init_IRQ(void)
 
 	/* SIO0 : receive */
 	irq_desc[M32R_IRQ_SIO0_R].status = IRQ_DISABLED;
-	irq_desc[M32R_IRQ_SIO0_R].handler = &opsput_irq_type;
+	irq_desc[M32R_IRQ_SIO0_R].chip = &opsput_irq_type;
 	irq_desc[M32R_IRQ_SIO0_R].action = 0;
 	irq_desc[M32R_IRQ_SIO0_R].depth = 1;
 	icu_data[M32R_IRQ_SIO0_R].icucr = 0;
@@ -327,7 +327,7 @@ void __init init_IRQ(void)
 
 	/* SIO0 : send */
 	irq_desc[M32R_IRQ_SIO0_S].status = IRQ_DISABLED;
-	irq_desc[M32R_IRQ_SIO0_S].handler = &opsput_irq_type;
+	irq_desc[M32R_IRQ_SIO0_S].chip = &opsput_irq_type;
 	irq_desc[M32R_IRQ_SIO0_S].action = 0;
 	irq_desc[M32R_IRQ_SIO0_S].depth = 1;
 	icu_data[M32R_IRQ_SIO0_S].icucr = 0;
@@ -335,7 +335,7 @@ void __init init_IRQ(void)
 
 	/* SIO1 : receive */
 	irq_desc[M32R_IRQ_SIO1_R].status = IRQ_DISABLED;
-	irq_desc[M32R_IRQ_SIO1_R].handler = &opsput_irq_type;
+	irq_desc[M32R_IRQ_SIO1_R].chip = &opsput_irq_type;
 	irq_desc[M32R_IRQ_SIO1_R].action = 0;
 	irq_desc[M32R_IRQ_SIO1_R].depth = 1;
 	icu_data[M32R_IRQ_SIO1_R].icucr = 0;
@@ -343,7 +343,7 @@ void __init init_IRQ(void)
 
 	/* SIO1 : send */
 	irq_desc[M32R_IRQ_SIO1_S].status = IRQ_DISABLED;
-	irq_desc[M32R_IRQ_SIO1_S].handler = &opsput_irq_type;
+	irq_desc[M32R_IRQ_SIO1_S].chip = &opsput_irq_type;
 	irq_desc[M32R_IRQ_SIO1_S].action = 0;
 	irq_desc[M32R_IRQ_SIO1_S].depth = 1;
 	icu_data[M32R_IRQ_SIO1_S].icucr = 0;
@@ -351,7 +351,7 @@ void __init init_IRQ(void)
 
 	/* DMA1 : */
 	irq_desc[M32R_IRQ_DMA1].status = IRQ_DISABLED;
-	irq_desc[M32R_IRQ_DMA1].handler = &opsput_irq_type;
+	irq_desc[M32R_IRQ_DMA1].chip = &opsput_irq_type;
 	irq_desc[M32R_IRQ_DMA1].action = 0;
 	irq_desc[M32R_IRQ_DMA1].depth = 1;
 	icu_data[M32R_IRQ_DMA1].icucr = 0;
@@ -360,7 +360,7 @@ void __init init_IRQ(void)
 #ifdef CONFIG_SERIAL_M32R_PLDSIO
 	/* INT#1: SIO0 Receive on PLD */
 	irq_desc[PLD_IRQ_SIO0_RCV].status = IRQ_DISABLED;
-	irq_desc[PLD_IRQ_SIO0_RCV].handler = &opsput_pld_irq_type;
+	irq_desc[PLD_IRQ_SIO0_RCV].chip = &opsput_pld_irq_type;
 	irq_desc[PLD_IRQ_SIO0_RCV].action = 0;
 	irq_desc[PLD_IRQ_SIO0_RCV].depth = 1;	/* disable nested irq */
 	pld_icu_data[irq2pldirq(PLD_IRQ_SIO0_RCV)].icucr = PLD_ICUCR_IEN|PLD_ICUCR_ISMOD03;
@@ -368,7 +368,7 @@ void __init init_IRQ(void)
 
 	/* INT#1: SIO0 Send on PLD */
 	irq_desc[PLD_IRQ_SIO0_SND].status = IRQ_DISABLED;
-	irq_desc[PLD_IRQ_SIO0_SND].handler = &opsput_pld_irq_type;
+	irq_desc[PLD_IRQ_SIO0_SND].chip = &opsput_pld_irq_type;
 	irq_desc[PLD_IRQ_SIO0_SND].action = 0;
 	irq_desc[PLD_IRQ_SIO0_SND].depth = 1;	/* disable nested irq */
 	pld_icu_data[irq2pldirq(PLD_IRQ_SIO0_SND)].icucr = PLD_ICUCR_IEN|PLD_ICUCR_ISMOD03;
@@ -378,7 +378,7 @@ void __init init_IRQ(void)
 #if defined(CONFIG_M32R_CFC)
 	/* INT#1: CFC IREQ on PLD */
 	irq_desc[PLD_IRQ_CFIREQ].status = IRQ_DISABLED;
-	irq_desc[PLD_IRQ_CFIREQ].handler = &opsput_pld_irq_type;
+	irq_desc[PLD_IRQ_CFIREQ].chip = &opsput_pld_irq_type;
 	irq_desc[PLD_IRQ_CFIREQ].action = 0;
 	irq_desc[PLD_IRQ_CFIREQ].depth = 1;	/* disable nested irq */
 	pld_icu_data[irq2pldirq(PLD_IRQ_CFIREQ)].icucr = PLD_ICUCR_IEN|PLD_ICUCR_ISMOD01;	/* 'L' level sense */
@@ -386,7 +386,7 @@ void __init init_IRQ(void)
 
 	/* INT#1: CFC Insert on PLD */
 	irq_desc[PLD_IRQ_CFC_INSERT].status = IRQ_DISABLED;
-	irq_desc[PLD_IRQ_CFC_INSERT].handler = &opsput_pld_irq_type;
+	irq_desc[PLD_IRQ_CFC_INSERT].chip = &opsput_pld_irq_type;
 	irq_desc[PLD_IRQ_CFC_INSERT].action = 0;
 	irq_desc[PLD_IRQ_CFC_INSERT].depth = 1;	/* disable nested irq */
 	pld_icu_data[irq2pldirq(PLD_IRQ_CFC_INSERT)].icucr = PLD_ICUCR_IEN|PLD_ICUCR_ISMOD00;	/* 'L' edge sense */
@@ -394,7 +394,7 @@ void __init init_IRQ(void)
 
 	/* INT#1: CFC Eject on PLD */
 	irq_desc[PLD_IRQ_CFC_EJECT].status = IRQ_DISABLED;
-	irq_desc[PLD_IRQ_CFC_EJECT].handler = &opsput_pld_irq_type;
+	irq_desc[PLD_IRQ_CFC_EJECT].chip = &opsput_pld_irq_type;
 	irq_desc[PLD_IRQ_CFC_EJECT].action = 0;
 	irq_desc[PLD_IRQ_CFC_EJECT].depth = 1;	/* disable nested irq */
 	pld_icu_data[irq2pldirq(PLD_IRQ_CFC_EJECT)].icucr = PLD_ICUCR_IEN|PLD_ICUCR_ISMOD02;	/* 'H' edge sense */
@@ -420,7 +420,7 @@ void __init init_IRQ(void)
 	outw(USBCR_OTGS, USBCR); 	/* USBCR: non-OTG */
 
     irq_desc[OPSPUT_LCD_IRQ_USB_INT1].status = IRQ_DISABLED;
-    irq_desc[OPSPUT_LCD_IRQ_USB_INT1].handler = &opsput_lcdpld_irq_type;
+    irq_desc[OPSPUT_LCD_IRQ_USB_INT1].chip = &opsput_lcdpld_irq_type;
     irq_desc[OPSPUT_LCD_IRQ_USB_INT1].action = 0;
     irq_desc[OPSPUT_LCD_IRQ_USB_INT1].depth = 1;
     lcdpld_icu_data[irq2lcdpldirq(OPSPUT_LCD_IRQ_USB_INT1)].icucr = PLD_ICUCR_IEN|PLD_ICUCR_ISMOD01;	/* "L" level sense */
@@ -438,7 +438,7 @@ void __init init_IRQ(void)
 	 * INT3# is used for AR
 	 */
 	irq_desc[M32R_IRQ_INT3].status = IRQ_DISABLED;
-	irq_desc[M32R_IRQ_INT3].handler = &opsput_irq_type;
+	irq_desc[M32R_IRQ_INT3].chip = &opsput_irq_type;
 	irq_desc[M32R_IRQ_INT3].action = 0;
 	irq_desc[M32R_IRQ_INT3].depth = 1;
 	icu_data[M32R_IRQ_INT3].icucr = M32R_ICUCR_IEN|M32R_ICUCR_ISMOD10;
diff --git a/arch/m32r/kernel/setup_usrv.c b/arch/m32r/kernel/setup_usrv.c
index 64be659a23e7..7fa12d8f66b4 100644
--- a/arch/m32r/kernel/setup_usrv.c
+++ b/arch/m32r/kernel/setup_usrv.c
@@ -158,7 +158,7 @@ void __init init_IRQ(void)
 
 	/* MFT2 : system timer */
 	irq_desc[M32R_IRQ_MFT2].status = IRQ_DISABLED;
-	irq_desc[M32R_IRQ_MFT2].handler = &mappi_irq_type;
+	irq_desc[M32R_IRQ_MFT2].chip = &mappi_irq_type;
 	irq_desc[M32R_IRQ_MFT2].action = 0;
 	irq_desc[M32R_IRQ_MFT2].depth = 1;
 	icu_data[M32R_IRQ_MFT2].icucr = M32R_ICUCR_IEN;
@@ -167,7 +167,7 @@ void __init init_IRQ(void)
 #if defined(CONFIG_SERIAL_M32R_SIO)
 	/* SIO0_R : uart receive data */
 	irq_desc[M32R_IRQ_SIO0_R].status = IRQ_DISABLED;
-	irq_desc[M32R_IRQ_SIO0_R].handler = &mappi_irq_type;
+	irq_desc[M32R_IRQ_SIO0_R].chip = &mappi_irq_type;
 	irq_desc[M32R_IRQ_SIO0_R].action = 0;
 	irq_desc[M32R_IRQ_SIO0_R].depth = 1;
 	icu_data[M32R_IRQ_SIO0_R].icucr = 0;
@@ -175,7 +175,7 @@ void __init init_IRQ(void)
 
 	/* SIO0_S : uart send data */
 	irq_desc[M32R_IRQ_SIO0_S].status = IRQ_DISABLED;
-	irq_desc[M32R_IRQ_SIO0_S].handler = &mappi_irq_type;
+	irq_desc[M32R_IRQ_SIO0_S].chip = &mappi_irq_type;
 	irq_desc[M32R_IRQ_SIO0_S].action = 0;
 	irq_desc[M32R_IRQ_SIO0_S].depth = 1;
 	icu_data[M32R_IRQ_SIO0_S].icucr = 0;
@@ -183,7 +183,7 @@ void __init init_IRQ(void)
 
 	/* SIO1_R : uart receive data */
 	irq_desc[M32R_IRQ_SIO1_R].status = IRQ_DISABLED;
-	irq_desc[M32R_IRQ_SIO1_R].handler = &mappi_irq_type;
+	irq_desc[M32R_IRQ_SIO1_R].chip = &mappi_irq_type;
 	irq_desc[M32R_IRQ_SIO1_R].action = 0;
 	irq_desc[M32R_IRQ_SIO1_R].depth = 1;
 	icu_data[M32R_IRQ_SIO1_R].icucr = 0;
@@ -191,7 +191,7 @@ void __init init_IRQ(void)
 
 	/* SIO1_S : uart send data */
 	irq_desc[M32R_IRQ_SIO1_S].status = IRQ_DISABLED;
-	irq_desc[M32R_IRQ_SIO1_S].handler = &mappi_irq_type;
+	irq_desc[M32R_IRQ_SIO1_S].chip = &mappi_irq_type;
 	irq_desc[M32R_IRQ_SIO1_S].action = 0;
 	irq_desc[M32R_IRQ_SIO1_S].depth = 1;
 	icu_data[M32R_IRQ_SIO1_S].icucr = 0;
@@ -201,7 +201,7 @@ void __init init_IRQ(void)
 	/* INT#67-#71: CFC#0 IREQ on PLD */
 	for (i = 0 ; i < CONFIG_CFC_NUM ; i++ ) {
 		irq_desc[PLD_IRQ_CF0 + i].status = IRQ_DISABLED;
-		irq_desc[PLD_IRQ_CF0 + i].handler = &m32700ut_pld_irq_type;
+		irq_desc[PLD_IRQ_CF0 + i].chip = &m32700ut_pld_irq_type;
 		irq_desc[PLD_IRQ_CF0 + i].action = 0;
 		irq_desc[PLD_IRQ_CF0 + i].depth = 1;	/* disable nested irq */
 		pld_icu_data[irq2pldirq(PLD_IRQ_CF0 + i)].icucr
@@ -212,7 +212,7 @@ void __init init_IRQ(void)
 #if defined(CONFIG_SERIAL_8250) || defined(CONFIG_SERIAL_8250_MODULE)
 	/* INT#76: 16552D#0 IREQ on PLD */
 	irq_desc[PLD_IRQ_UART0].status = IRQ_DISABLED;
-	irq_desc[PLD_IRQ_UART0].handler = &m32700ut_pld_irq_type;
+	irq_desc[PLD_IRQ_UART0].chip = &m32700ut_pld_irq_type;
 	irq_desc[PLD_IRQ_UART0].action = 0;
 	irq_desc[PLD_IRQ_UART0].depth = 1;	/* disable nested irq */
 	pld_icu_data[irq2pldirq(PLD_IRQ_UART0)].icucr
@@ -221,7 +221,7 @@ void __init init_IRQ(void)
 
 	/* INT#77: 16552D#1 IREQ on PLD */
 	irq_desc[PLD_IRQ_UART1].status = IRQ_DISABLED;
-	irq_desc[PLD_IRQ_UART1].handler = &m32700ut_pld_irq_type;
+	irq_desc[PLD_IRQ_UART1].chip = &m32700ut_pld_irq_type;
 	irq_desc[PLD_IRQ_UART1].action = 0;
 	irq_desc[PLD_IRQ_UART1].depth = 1;	/* disable nested irq */
 	pld_icu_data[irq2pldirq(PLD_IRQ_UART1)].icucr
@@ -232,7 +232,7 @@ void __init init_IRQ(void)
 #if defined(CONFIG_IDC_AK4524) || defined(CONFIG_IDC_AK4524_MODULE)
 	/* INT#80: AK4524 IREQ on PLD */
 	irq_desc[PLD_IRQ_SNDINT].status = IRQ_DISABLED;
-	irq_desc[PLD_IRQ_SNDINT].handler = &m32700ut_pld_irq_type;
+	irq_desc[PLD_IRQ_SNDINT].chip = &m32700ut_pld_irq_type;
 	irq_desc[PLD_IRQ_SNDINT].action = 0;
 	irq_desc[PLD_IRQ_SNDINT].depth = 1;	/* disable nested irq */
 	pld_icu_data[irq2pldirq(PLD_IRQ_SNDINT)].icucr
diff --git a/arch/mips/au1000/common/irq.c b/arch/mips/au1000/common/irq.c
index afe05ec12c27..da74ac21954b 100644
--- a/arch/mips/au1000/common/irq.c
+++ b/arch/mips/au1000/common/irq.c
@@ -333,31 +333,31 @@ static void setup_local_irq(unsigned int irq_nr, int type, int int_req)
 				au_writel(1<<(irq_nr-32), IC1_CFG2CLR);
 				au_writel(1<<(irq_nr-32), IC1_CFG1CLR);
 				au_writel(1<<(irq_nr-32), IC1_CFG0SET);
-				irq_desc[irq_nr].handler = &rise_edge_irq_type;
+				irq_desc[irq_nr].chip = &rise_edge_irq_type;
 				break;
 			case INTC_INT_FALL_EDGE: /* 0:1:0 */
 				au_writel(1<<(irq_nr-32), IC1_CFG2CLR);
 				au_writel(1<<(irq_nr-32), IC1_CFG1SET);
 				au_writel(1<<(irq_nr-32), IC1_CFG0CLR);
-				irq_desc[irq_nr].handler = &fall_edge_irq_type;
+				irq_desc[irq_nr].chip = &fall_edge_irq_type;
 				break;
 			case INTC_INT_RISE_AND_FALL_EDGE: /* 0:1:1 */
 				au_writel(1<<(irq_nr-32), IC1_CFG2CLR);
 				au_writel(1<<(irq_nr-32), IC1_CFG1SET);
 				au_writel(1<<(irq_nr-32), IC1_CFG0SET);
-				irq_desc[irq_nr].handler = &either_edge_irq_type;
+				irq_desc[irq_nr].chip = &either_edge_irq_type;
 				break;
 			case INTC_INT_HIGH_LEVEL: /* 1:0:1 */
 				au_writel(1<<(irq_nr-32), IC1_CFG2SET);
 				au_writel(1<<(irq_nr-32), IC1_CFG1CLR);
 				au_writel(1<<(irq_nr-32), IC1_CFG0SET);
-				irq_desc[irq_nr].handler = &level_irq_type;
+				irq_desc[irq_nr].chip = &level_irq_type;
 				break;
 			case INTC_INT_LOW_LEVEL: /* 1:1:0 */
 				au_writel(1<<(irq_nr-32), IC1_CFG2SET);
 				au_writel(1<<(irq_nr-32), IC1_CFG1SET);
 				au_writel(1<<(irq_nr-32), IC1_CFG0CLR);
-				irq_desc[irq_nr].handler = &level_irq_type;
+				irq_desc[irq_nr].chip = &level_irq_type;
 				break;
 			case INTC_INT_DISABLED: /* 0:0:0 */
 				au_writel(1<<(irq_nr-32), IC1_CFG0CLR);
@@ -385,31 +385,31 @@ static void setup_local_irq(unsigned int irq_nr, int type, int int_req)
 				au_writel(1<<irq_nr, IC0_CFG2CLR);
 				au_writel(1<<irq_nr, IC0_CFG1CLR);
 				au_writel(1<<irq_nr, IC0_CFG0SET);
-				irq_desc[irq_nr].handler = &rise_edge_irq_type;
+				irq_desc[irq_nr].chip = &rise_edge_irq_type;
 				break;
 			case INTC_INT_FALL_EDGE: /* 0:1:0 */
 				au_writel(1<<irq_nr, IC0_CFG2CLR);
 				au_writel(1<<irq_nr, IC0_CFG1SET);
 				au_writel(1<<irq_nr, IC0_CFG0CLR);
-				irq_desc[irq_nr].handler = &fall_edge_irq_type;
+				irq_desc[irq_nr].chip = &fall_edge_irq_type;
 				break;
 			case INTC_INT_RISE_AND_FALL_EDGE: /* 0:1:1 */
 				au_writel(1<<irq_nr, IC0_CFG2CLR);
 				au_writel(1<<irq_nr, IC0_CFG1SET);
 				au_writel(1<<irq_nr, IC0_CFG0SET);
-				irq_desc[irq_nr].handler = &either_edge_irq_type;
+				irq_desc[irq_nr].chip = &either_edge_irq_type;
 				break;
 			case INTC_INT_HIGH_LEVEL: /* 1:0:1 */
 				au_writel(1<<irq_nr, IC0_CFG2SET);
 				au_writel(1<<irq_nr, IC0_CFG1CLR);
 				au_writel(1<<irq_nr, IC0_CFG0SET);
-				irq_desc[irq_nr].handler = &level_irq_type;
+				irq_desc[irq_nr].chip = &level_irq_type;
 				break;
 			case INTC_INT_LOW_LEVEL: /* 1:1:0 */
 				au_writel(1<<irq_nr, IC0_CFG2SET);
 				au_writel(1<<irq_nr, IC0_CFG1SET);
 				au_writel(1<<irq_nr, IC0_CFG0CLR);
-				irq_desc[irq_nr].handler = &level_irq_type;
+				irq_desc[irq_nr].chip = &level_irq_type;
 				break;
 			case INTC_INT_DISABLED: /* 0:0:0 */
 				au_writel(1<<irq_nr, IC0_CFG0CLR);
diff --git a/arch/mips/au1000/pb1200/irqmap.c b/arch/mips/au1000/pb1200/irqmap.c
index bacc0c6bfe67..5dd164fc1889 100644
--- a/arch/mips/au1000/pb1200/irqmap.c
+++ b/arch/mips/au1000/pb1200/irqmap.c
@@ -172,7 +172,7 @@ void _board_init_irq(void)
 
 	for (irq_nr = PB1200_INT_BEGIN; irq_nr <= PB1200_INT_END; irq_nr++)
 	{
-		irq_desc[irq_nr].handler = &external_irq_type;
+		irq_desc[irq_nr].chip = &external_irq_type;
 		pb1200_disable_irq(irq_nr);
 	}
 
diff --git a/arch/mips/ddb5xxx/ddb5477/irq_5477.c b/arch/mips/ddb5xxx/ddb5477/irq_5477.c
index 5fcd5f070cdc..63c3d6534b3a 100644
--- a/arch/mips/ddb5xxx/ddb5477/irq_5477.c
+++ b/arch/mips/ddb5xxx/ddb5477/irq_5477.c
@@ -107,7 +107,7 @@ void __init vrc5477_irq_init(u32 irq_base)
 		irq_desc[i].status = IRQ_DISABLED;
 		irq_desc[i].action = NULL;
 		irq_desc[i].depth = 1;
-		irq_desc[i].handler = &vrc5477_irq_controller;
+		irq_desc[i].chip = &vrc5477_irq_controller;
 	}
 
 	vrc5477_irq_base = irq_base;
diff --git a/arch/mips/dec/ioasic-irq.c b/arch/mips/dec/ioasic-irq.c
index d5bca5d233b6..da2dbb42f913 100644
--- a/arch/mips/dec/ioasic-irq.c
+++ b/arch/mips/dec/ioasic-irq.c
@@ -144,13 +144,13 @@ void __init init_ioasic_irqs(int base)
 		irq_desc[i].status = IRQ_DISABLED;
 		irq_desc[i].action = 0;
 		irq_desc[i].depth = 1;
-		irq_desc[i].handler = &ioasic_irq_type;
+		irq_desc[i].chip = &ioasic_irq_type;
 	}
 	for (; i < base + IO_IRQ_LINES; i++) {
 		irq_desc[i].status = IRQ_DISABLED;
 		irq_desc[i].action = 0;
 		irq_desc[i].depth = 1;
-		irq_desc[i].handler = &ioasic_dma_irq_type;
+		irq_desc[i].chip = &ioasic_dma_irq_type;
 	}
 
 	ioasic_irq_base = base;
diff --git a/arch/mips/dec/kn02-irq.c b/arch/mips/dec/kn02-irq.c
index 898bed502a34..d44c00d9e80f 100644
--- a/arch/mips/dec/kn02-irq.c
+++ b/arch/mips/dec/kn02-irq.c
@@ -123,7 +123,7 @@ void __init init_kn02_irqs(int base)
 		irq_desc[i].status = IRQ_DISABLED;
 		irq_desc[i].action = 0;
 		irq_desc[i].depth = 1;
-		irq_desc[i].handler = &kn02_irq_type;
+		irq_desc[i].chip = &kn02_irq_type;
 	}
 
 	kn02_irq_base = base;
diff --git a/arch/mips/gt64120/ev64120/irq.c b/arch/mips/gt64120/ev64120/irq.c
index 46c468b26b30..f489a8067a93 100644
--- a/arch/mips/gt64120/ev64120/irq.c
+++ b/arch/mips/gt64120/ev64120/irq.c
@@ -138,7 +138,7 @@ void __init arch_init_irq(void)
 	/*  Let's initialize our IRQ descriptors  */
 	for (i = 0; i < NR_IRQS; i++) {
 		irq_desc[i].status = 0;
-		irq_desc[i].handler = &no_irq_type;
+		irq_desc[i].chip = &no_irq_type;
 		irq_desc[i].action = NULL;
 		irq_desc[i].depth = 0;
 		spin_lock_init(&irq_desc[i].lock);
diff --git a/arch/mips/ite-boards/generic/irq.c b/arch/mips/ite-boards/generic/irq.c
index 77be7216bdd0..a6749c56fe38 100644
--- a/arch/mips/ite-boards/generic/irq.c
+++ b/arch/mips/ite-boards/generic/irq.c
@@ -208,10 +208,10 @@ void __init arch_init_irq(void)
 #endif
 
 	for (i = 0; i <= IT8172_LAST_IRQ; i++) {
-		irq_desc[i].handler = &it8172_irq_type;
+		irq_desc[i].chip = &it8172_irq_type;
 		spin_lock_init(&irq_desc[i].lock);
 	}
-	irq_desc[MIPS_CPU_TIMER_IRQ].handler = &cp0_irq_type;
+	irq_desc[MIPS_CPU_TIMER_IRQ].chip = &cp0_irq_type;
 	set_c0_status(ALLINTS_NOTIMER);
 }
 
diff --git a/arch/mips/jazz/irq.c b/arch/mips/jazz/irq.c
index becc9accd495..478be9858a1e 100644
--- a/arch/mips/jazz/irq.c
+++ b/arch/mips/jazz/irq.c
@@ -73,7 +73,7 @@ void __init init_r4030_ints(void)
 		irq_desc[i].status     = IRQ_DISABLED;
 		irq_desc[i].action     = 0;
 		irq_desc[i].depth      = 1;
-		irq_desc[i].handler    = &r4030_irq_type;
+		irq_desc[i].chip    = &r4030_irq_type;
 	}
 
 	r4030_write_reg16(JAZZ_IO_IRQ_ENABLE, 0);
diff --git a/arch/mips/jmr3927/rbhma3100/irq.c b/arch/mips/jmr3927/rbhma3100/irq.c
index 11304d1354f4..380046ea1db5 100644
--- a/arch/mips/jmr3927/rbhma3100/irq.c
+++ b/arch/mips/jmr3927/rbhma3100/irq.c
@@ -435,7 +435,7 @@ void jmr3927_irq_init(u32 irq_base)
 		irq_desc[i].status = IRQ_DISABLED;
 		irq_desc[i].action = NULL;
 		irq_desc[i].depth = 1;
-		irq_desc[i].handler = &jmr3927_irq_controller;
+		irq_desc[i].chip = &jmr3927_irq_controller;
 	}
 
 	jmr3927_irq_base = irq_base;
diff --git a/arch/mips/kernel/i8259.c b/arch/mips/kernel/i8259.c
index 0cb8ed5662f3..91ffb1233cad 100644
--- a/arch/mips/kernel/i8259.c
+++ b/arch/mips/kernel/i8259.c
@@ -120,7 +120,7 @@ int i8259A_irq_pending(unsigned int irq)
 void make_8259A_irq(unsigned int irq)
 {
 	disable_irq_nosync(irq);
-	irq_desc[irq].handler = &i8259A_irq_type;
+	irq_desc[irq].chip = &i8259A_irq_type;
 	enable_irq(irq);
 }
 
@@ -327,7 +327,7 @@ void __init init_i8259_irqs (void)
 		irq_desc[i].status = IRQ_DISABLED;
 		irq_desc[i].action = NULL;
 		irq_desc[i].depth = 1;
-		irq_desc[i].handler = &i8259A_irq_type;
+		irq_desc[i].chip = &i8259A_irq_type;
 	}
 
 	setup_irq(2, &irq2);
diff --git a/arch/mips/kernel/irq-msc01.c b/arch/mips/kernel/irq-msc01.c
index 97ebdc754b9e..f8cd1ac64d88 100644
--- a/arch/mips/kernel/irq-msc01.c
+++ b/arch/mips/kernel/irq-msc01.c
@@ -174,14 +174,14 @@ void __init init_msc_irqs(unsigned int base, msc_irqmap_t *imp, int nirq)
 
 		switch (imp->im_type) {
 		case MSC01_IRQ_EDGE:
-			irq_desc[base+n].handler = &msc_edgeirq_type;
+			irq_desc[base+n].chip = &msc_edgeirq_type;
 			if (cpu_has_veic)
 				MSCIC_WRITE(MSC01_IC_SUP+n*8, MSC01_IC_SUP_EDGE_BIT);
 			else
 				MSCIC_WRITE(MSC01_IC_SUP+n*8, MSC01_IC_SUP_EDGE_BIT | imp->im_lvl);
 			break;
 		case MSC01_IRQ_LEVEL:
-			irq_desc[base+n].handler = &msc_levelirq_type;
+			irq_desc[base+n].chip = &msc_levelirq_type;
 			if (cpu_has_veic)
 				MSCIC_WRITE(MSC01_IC_SUP+n*8, 0);
 			else
diff --git a/arch/mips/kernel/irq-mv6434x.c b/arch/mips/kernel/irq-mv6434x.c
index 0613f1f36b1b..f9c763a65547 100644
--- a/arch/mips/kernel/irq-mv6434x.c
+++ b/arch/mips/kernel/irq-mv6434x.c
@@ -155,7 +155,7 @@ void __init mv64340_irq_init(unsigned int base)
 		irq_desc[i].status = IRQ_DISABLED;
 		irq_desc[i].action = 0;
 		irq_desc[i].depth = 2;
-		irq_desc[i].handler = &mv64340_irq_type;
+		irq_desc[i].chip = &mv64340_irq_type;
 	}
 
 	irq_base = base;
diff --git a/arch/mips/kernel/irq-rm7000.c b/arch/mips/kernel/irq-rm7000.c
index 0b130c5ac5d9..121da385a94d 100644
--- a/arch/mips/kernel/irq-rm7000.c
+++ b/arch/mips/kernel/irq-rm7000.c
@@ -91,7 +91,7 @@ void __init rm7k_cpu_irq_init(int base)
 		irq_desc[i].status = IRQ_DISABLED;
 		irq_desc[i].action = NULL;
 		irq_desc[i].depth = 1;
-		irq_desc[i].handler = &rm7k_irq_controller;
+		irq_desc[i].chip = &rm7k_irq_controller;
 	}
 
 	irq_base = base;
diff --git a/arch/mips/kernel/irq-rm9000.c b/arch/mips/kernel/irq-rm9000.c
index 9b5f20c32acb..25109c103e44 100644
--- a/arch/mips/kernel/irq-rm9000.c
+++ b/arch/mips/kernel/irq-rm9000.c
@@ -139,11 +139,11 @@ void __init rm9k_cpu_irq_init(int base)
 		irq_desc[i].status = IRQ_DISABLED;
 		irq_desc[i].action = NULL;
 		irq_desc[i].depth = 1;
-		irq_desc[i].handler = &rm9k_irq_controller;
+		irq_desc[i].chip = &rm9k_irq_controller;
 	}
 
 	rm9000_perfcount_irq = base + 1;
-	irq_desc[rm9000_perfcount_irq].handler = &rm9k_perfcounter_irq;
+	irq_desc[rm9000_perfcount_irq].chip = &rm9k_perfcounter_irq;
 
 	irq_base = base;
 }
diff --git a/arch/mips/kernel/irq.c b/arch/mips/kernel/irq.c
index 3dce742e716f..5c9dcd5eed59 100644
--- a/arch/mips/kernel/irq.c
+++ b/arch/mips/kernel/irq.c
@@ -95,7 +95,7 @@ int show_interrupts(struct seq_file *p, void *v)
 		for_each_online_cpu(j)
 			seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
 #endif
-		seq_printf(p, " %14s", irq_desc[i].handler->typename);
+		seq_printf(p, " %14s", irq_desc[i].chip->typename);
 		seq_printf(p, "  %s", action->name);
 
 		for (action=action->next; action; action = action->next)
@@ -137,7 +137,7 @@ void __init init_IRQ(void)
 		irq_desc[i].status  = IRQ_DISABLED;
 		irq_desc[i].action  = NULL;
 		irq_desc[i].depth   = 1;
-		irq_desc[i].handler = &no_irq_type;
+		irq_desc[i].chip = &no_irq_type;
 		spin_lock_init(&irq_desc[i].lock);
 #ifdef CONFIG_MIPS_MT_SMTC
 		irq_hwmask[i] = 0;
diff --git a/arch/mips/kernel/irq_cpu.c b/arch/mips/kernel/irq_cpu.c
index 5db67e31ec1a..0e455a8ad860 100644
--- a/arch/mips/kernel/irq_cpu.c
+++ b/arch/mips/kernel/irq_cpu.c
@@ -167,14 +167,14 @@ void __init mips_cpu_irq_init(int irq_base)
 			irq_desc[i].status = IRQ_DISABLED;
 			irq_desc[i].action = NULL;
 			irq_desc[i].depth = 1;
-			irq_desc[i].handler = &mips_mt_cpu_irq_controller;
+			irq_desc[i].chip = &mips_mt_cpu_irq_controller;
 		}
 
 	for (i = irq_base + 2; i < irq_base + 8; i++) {
 		irq_desc[i].status = IRQ_DISABLED;
 		irq_desc[i].action = NULL;
 		irq_desc[i].depth = 1;
-		irq_desc[i].handler = &mips_cpu_irq_controller;
+		irq_desc[i].chip = &mips_cpu_irq_controller;
 	}
 
 	mips_cpu_irq_base = irq_base;
diff --git a/arch/mips/lasat/interrupt.c b/arch/mips/lasat/interrupt.c
index 2d3472b21ebb..9316a024a818 100644
--- a/arch/mips/lasat/interrupt.c
+++ b/arch/mips/lasat/interrupt.c
@@ -156,6 +156,6 @@ void __init arch_init_irq(void)
 		irq_desc[i].status	= IRQ_DISABLED;
 		irq_desc[i].action	= 0;
 		irq_desc[i].depth	= 1;
-		irq_desc[i].handler	= &lasat_irq_type;
+		irq_desc[i].chip	= &lasat_irq_type;
 	}
 }
diff --git a/arch/mips/mips-boards/atlas/atlas_int.c b/arch/mips/mips-boards/atlas/atlas_int.c
index db53950b7cfb..9dd6b8925581 100644
--- a/arch/mips/mips-boards/atlas/atlas_int.c
+++ b/arch/mips/mips-boards/atlas/atlas_int.c
@@ -215,7 +215,7 @@ void __init arch_init_irq(void)
 		irq_desc[i].status	= IRQ_DISABLED;
 		irq_desc[i].action	= 0;
 		irq_desc[i].depth	= 1;
-		irq_desc[i].handler	= &atlas_irq_type;
+		irq_desc[i].chip	= &atlas_irq_type;
 		spin_lock_init(&irq_desc[i].lock);
 	}
 }
diff --git a/arch/mips/momentum/ocelot_c/cpci-irq.c b/arch/mips/momentum/ocelot_c/cpci-irq.c
index bd885785e2f9..31d179c4673f 100644
--- a/arch/mips/momentum/ocelot_c/cpci-irq.c
+++ b/arch/mips/momentum/ocelot_c/cpci-irq.c
@@ -147,6 +147,6 @@ void cpci_irq_init(void)
 		irq_desc[i].status = IRQ_DISABLED;
 		irq_desc[i].action = 0;
 		irq_desc[i].depth = 2;
-		irq_desc[i].handler = &cpci_irq_type;
+		irq_desc[i].chip = &cpci_irq_type;
 	}
 }
diff --git a/arch/mips/momentum/ocelot_c/uart-irq.c b/arch/mips/momentum/ocelot_c/uart-irq.c
index 755bde5146be..852265026fd1 100644
--- a/arch/mips/momentum/ocelot_c/uart-irq.c
+++ b/arch/mips/momentum/ocelot_c/uart-irq.c
@@ -137,10 +137,10 @@ void uart_irq_init(void)
 	irq_desc[80].status = IRQ_DISABLED;
 	irq_desc[80].action = 0;
 	irq_desc[80].depth = 2;
-	irq_desc[80].handler = &uart_irq_type;
+	irq_desc[80].chip = &uart_irq_type;
 
 	irq_desc[81].status = IRQ_DISABLED;
 	irq_desc[81].action = 0;
 	irq_desc[81].depth = 2;
-	irq_desc[81].handler = &uart_irq_type;
+	irq_desc[81].chip = &uart_irq_type;
 }
diff --git a/arch/mips/philips/pnx8550/common/int.c b/arch/mips/philips/pnx8550/common/int.c
index 39ee6314f627..8f18764a2359 100644
--- a/arch/mips/philips/pnx8550/common/int.c
+++ b/arch/mips/philips/pnx8550/common/int.c
@@ -236,7 +236,7 @@ void __init arch_init_irq(void)
 	int configPR;
 
 	for (i = 0; i < PNX8550_INT_CP0_TOTINT; i++) {
-		irq_desc[i].handler = &level_irq_type;
+		irq_desc[i].chip = &level_irq_type;
 		pnx8550_ack(i);	/* mask the irq just in case  */
 	}
 
@@ -273,7 +273,7 @@ void __init arch_init_irq(void)
 		/* mask/priority is still 0 so we will not get any
 		 * interrupts until it is unmasked */
 
-		irq_desc[i].handler = &level_irq_type;
+		irq_desc[i].chip = &level_irq_type;
 	}
 
 	/* Priority level 0 */
@@ -282,12 +282,12 @@ void __init arch_init_irq(void)
 	/* Set int vector table address */
 	PNX8550_GIC_VECTOR_0 = PNX8550_GIC_VECTOR_1 = 0;
 
-	irq_desc[MIPS_CPU_GIC_IRQ].handler = &level_irq_type;
+	irq_desc[MIPS_CPU_GIC_IRQ].chip = &level_irq_type;
 	setup_irq(MIPS_CPU_GIC_IRQ, &gic_action);
 
 	/* init of Timer interrupts */
 	for (i = PNX8550_INT_TIMER_MIN; i <= PNX8550_INT_TIMER_MAX; i++) {
-		irq_desc[i].handler = &level_irq_type;
+		irq_desc[i].chip = &level_irq_type;
 	}
 
 	/* Stop Timer 1-3 */
@@ -295,7 +295,7 @@ void __init arch_init_irq(void)
 	configPR |= 0x00000038;
 	write_c0_config7(configPR);
 
-	irq_desc[MIPS_CPU_TIMER_IRQ].handler = &level_irq_type;
+	irq_desc[MIPS_CPU_TIMER_IRQ].chip = &level_irq_type;
 	setup_irq(MIPS_CPU_TIMER_IRQ, &timer_action);
 }
 
diff --git a/arch/mips/sgi-ip22/ip22-eisa.c b/arch/mips/sgi-ip22/ip22-eisa.c
index b19820110aa3..989167b49ce9 100644
--- a/arch/mips/sgi-ip22/ip22-eisa.c
+++ b/arch/mips/sgi-ip22/ip22-eisa.c
@@ -279,9 +279,9 @@ int __init ip22_eisa_init(void)
 		irq_desc[i].action = 0;
 		irq_desc[i].depth = 1;
 		if (i < (SGINT_EISA + 8))
-			irq_desc[i].handler = &ip22_eisa1_irq_type;
+			irq_desc[i].chip = &ip22_eisa1_irq_type;
 		else
-			irq_desc[i].handler = &ip22_eisa2_irq_type;
+			irq_desc[i].chip = &ip22_eisa2_irq_type;
 	}
 
 	/* Cannot use request_irq because of kmalloc not being ready at such
diff --git a/arch/mips/sgi-ip22/ip22-int.c b/arch/mips/sgi-ip22/ip22-int.c
index fc6a7e2b189c..18906af69691 100644
--- a/arch/mips/sgi-ip22/ip22-int.c
+++ b/arch/mips/sgi-ip22/ip22-int.c
@@ -436,7 +436,7 @@ void __init arch_init_irq(void)
 		irq_desc[i].status	= IRQ_DISABLED;
 		irq_desc[i].action	= 0;
 		irq_desc[i].depth	= 1;
-		irq_desc[i].handler	= handler;
+		irq_desc[i].chip	= handler;
 	}
 
 	/* vector handler. this register the IRQ as non-sharable */
diff --git a/arch/mips/sgi-ip27/ip27-irq.c b/arch/mips/sgi-ip27/ip27-irq.c
index 0b61a39ce2bb..869566c360ae 100644
--- a/arch/mips/sgi-ip27/ip27-irq.c
+++ b/arch/mips/sgi-ip27/ip27-irq.c
@@ -386,7 +386,7 @@ void __devinit register_bridge_irq(unsigned int irq)
 	irq_desc[irq].status	= IRQ_DISABLED;
 	irq_desc[irq].action	= 0;
 	irq_desc[irq].depth	= 1;
-	irq_desc[irq].handler	= &bridge_irq_type;
+	irq_desc[irq].chip	= &bridge_irq_type;
 }
 
 int __devinit request_bridge_irq(struct bridge_controller *bc)
diff --git a/arch/mips/sgi-ip32/ip32-irq.c b/arch/mips/sgi-ip32/ip32-irq.c
index 8ba08047d164..00b94aaf6371 100644
--- a/arch/mips/sgi-ip32/ip32-irq.c
+++ b/arch/mips/sgi-ip32/ip32-irq.c
@@ -591,7 +591,7 @@ void __init arch_init_irq(void)
 		irq_desc[irq].status = IRQ_DISABLED;
 		irq_desc[irq].action = 0;
 		irq_desc[irq].depth = 0;
-		irq_desc[irq].handler = controller;
+		irq_desc[irq].chip = controller;
 	}
 	setup_irq(CRIME_MEMERR_IRQ, &memerr_irq);
 	setup_irq(CRIME_CPUERR_IRQ, &cpuerr_irq);
diff --git a/arch/mips/sibyte/bcm1480/irq.c b/arch/mips/sibyte/bcm1480/irq.c
index e61760b14d99..610df40cb820 100644
--- a/arch/mips/sibyte/bcm1480/irq.c
+++ b/arch/mips/sibyte/bcm1480/irq.c
@@ -276,10 +276,10 @@ void __init init_bcm1480_irqs(void)
 		irq_desc[i].action = 0;
 		irq_desc[i].depth = 1;
 		if (i < BCM1480_NR_IRQS) {
-			irq_desc[i].handler = &bcm1480_irq_type;
+			irq_desc[i].chip = &bcm1480_irq_type;
 			bcm1480_irq_owner[i] = 0;
 		} else {
-			irq_desc[i].handler = &no_irq_type;
+			irq_desc[i].chip = &no_irq_type;
 		}
 	}
 }
diff --git a/arch/mips/sibyte/sb1250/irq.c b/arch/mips/sibyte/sb1250/irq.c
index f853c32f60a0..fcc61940f1ff 100644
--- a/arch/mips/sibyte/sb1250/irq.c
+++ b/arch/mips/sibyte/sb1250/irq.c
@@ -246,10 +246,10 @@ void __init init_sb1250_irqs(void)
 		irq_desc[i].action = 0;
 		irq_desc[i].depth = 1;
 		if (i < SB1250_NR_IRQS) {
-			irq_desc[i].handler = &sb1250_irq_type;
+			irq_desc[i].chip = &sb1250_irq_type;
 			sb1250_irq_owner[i] = 0;
 		} else {
-			irq_desc[i].handler = &no_irq_type;
+			irq_desc[i].chip = &no_irq_type;
 		}
 	}
 }
diff --git a/arch/mips/sni/irq.c b/arch/mips/sni/irq.c
index 7365b4853ddb..c19e158ec402 100644
--- a/arch/mips/sni/irq.c
+++ b/arch/mips/sni/irq.c
@@ -203,7 +203,7 @@ void __init arch_init_irq(void)
 		irq_desc[i].status     = IRQ_DISABLED;
 		irq_desc[i].action     = 0;
 		irq_desc[i].depth      = 1;
-		irq_desc[i].handler    = &pciasic_irq_type;
+		irq_desc[i].chip    = &pciasic_irq_type;
 	}
 
 	change_c0_status(ST0_IM, IE_IRQ1|IE_IRQ2|IE_IRQ3|IE_IRQ4);
diff --git a/arch/mips/tx4927/common/tx4927_irq.c b/arch/mips/tx4927/common/tx4927_irq.c
index 8ca68015cf40..a42be00483e6 100644
--- a/arch/mips/tx4927/common/tx4927_irq.c
+++ b/arch/mips/tx4927/common/tx4927_irq.c
@@ -227,7 +227,7 @@ static void __init tx4927_irq_cp0_init(void)
 		irq_desc[i].status = IRQ_DISABLED;
 		irq_desc[i].action = 0;
 		irq_desc[i].depth = 1;
-		irq_desc[i].handler = &tx4927_irq_cp0_type;
+		irq_desc[i].chip = &tx4927_irq_cp0_type;
 	}
 
 	return;
@@ -435,7 +435,7 @@ static void __init tx4927_irq_pic_init(void)
 		irq_desc[i].status = IRQ_DISABLED;
 		irq_desc[i].action = 0;
 		irq_desc[i].depth = 2;
-		irq_desc[i].handler = &tx4927_irq_pic_type;
+		irq_desc[i].chip = &tx4927_irq_pic_type;
 	}
 
 	setup_irq(TX4927_IRQ_NEST_PIC_ON_CP0, &tx4927_irq_pic_action);
diff --git a/arch/mips/tx4927/toshiba_rbtx4927/toshiba_rbtx4927_irq.c b/arch/mips/tx4927/toshiba_rbtx4927/toshiba_rbtx4927_irq.c
index aee07ff2212a..c67978b6dae4 100644
--- a/arch/mips/tx4927/toshiba_rbtx4927/toshiba_rbtx4927_irq.c
+++ b/arch/mips/tx4927/toshiba_rbtx4927/toshiba_rbtx4927_irq.c
@@ -368,7 +368,7 @@ static void __init toshiba_rbtx4927_irq_ioc_init(void)
 		irq_desc[i].status = IRQ_DISABLED;
 		irq_desc[i].action = 0;
 		irq_desc[i].depth = 3;
-		irq_desc[i].handler = &toshiba_rbtx4927_irq_ioc_type;
+		irq_desc[i].chip = &toshiba_rbtx4927_irq_ioc_type;
 	}
 
 	setup_irq(TOSHIBA_RBTX4927_IRQ_NEST_IOC_ON_PIC,
@@ -526,7 +526,7 @@ static void __init toshiba_rbtx4927_irq_isa_init(void)
 		irq_desc[i].action = 0;
 		irq_desc[i].depth =
 		    ((i < TOSHIBA_RBTX4927_IRQ_ISA_MID) ? (4) : (5));
-		irq_desc[i].handler = &toshiba_rbtx4927_irq_isa_type;
+		irq_desc[i].chip = &toshiba_rbtx4927_irq_isa_type;
 	}
 
 	setup_irq(TOSHIBA_RBTX4927_IRQ_NEST_ISA_ON_IOC,
@@ -692,13 +692,13 @@ void toshiba_rbtx4927_irq_dump(char *key)
 	{
 		u32 i, j = 0;
 		for (i = 0; i < NR_IRQS; i++) {
-			if (strcmp(irq_desc[i].handler->typename, "none")
+			if (strcmp(irq_desc[i].chip->typename, "none")
 			    == 0)
 				continue;
 
 			if ((i >= 1)
-			    && (irq_desc[i - 1].handler->typename ==
-				irq_desc[i].handler->typename)) {
+			    && (irq_desc[i - 1].chip->typename ==
+				irq_desc[i].chip->typename)) {
 				j++;
 			} else {
 				j = 0;
@@ -707,12 +707,12 @@ void toshiba_rbtx4927_irq_dump(char *key)
 			    (TOSHIBA_RBTX4927_IRQ_INFO,
 			     "%s irq=0x%02x/%3d s=0x%08x h=0x%08x a=0x%08x ah=0x%08x d=%1d n=%s/%02d\n",
 			     key, i, i, irq_desc[i].status,
-			     (u32) irq_desc[i].handler,
+			     (u32) irq_desc[i].chip,
 			     (u32) irq_desc[i].action,
 			     (u32) (irq_desc[i].action ? irq_desc[i].
 				    action->handler : 0),
 			     irq_desc[i].depth,
-			     irq_desc[i].handler->typename, j);
+			     irq_desc[i].chip->typename, j);
 		}
 	}
 #endif
diff --git a/arch/mips/tx4938/common/irq.c b/arch/mips/tx4938/common/irq.c
index 873805178d8e..0b2f8c849218 100644
--- a/arch/mips/tx4938/common/irq.c
+++ b/arch/mips/tx4938/common/irq.c
@@ -102,7 +102,7 @@ tx4938_irq_cp0_init(void)
 		irq_desc[i].status = IRQ_DISABLED;
 		irq_desc[i].action = 0;
 		irq_desc[i].depth = 1;
-		irq_desc[i].handler = &tx4938_irq_cp0_type;
+		irq_desc[i].chip = &tx4938_irq_cp0_type;
 	}
 
 	return;
@@ -306,7 +306,7 @@ tx4938_irq_pic_init(void)
 		irq_desc[i].status = IRQ_DISABLED;
 		irq_desc[i].action = 0;
 		irq_desc[i].depth = 2;
-		irq_desc[i].handler = &tx4938_irq_pic_type;
+		irq_desc[i].chip = &tx4938_irq_pic_type;
 	}
 
 	setup_irq(TX4938_IRQ_NEST_PIC_ON_CP0, &tx4938_irq_pic_action);
diff --git a/arch/mips/tx4938/toshiba_rbtx4938/irq.c b/arch/mips/tx4938/toshiba_rbtx4938/irq.c
index 9cd9c0fe2265..3b8245dc5bd3 100644
--- a/arch/mips/tx4938/toshiba_rbtx4938/irq.c
+++ b/arch/mips/tx4938/toshiba_rbtx4938/irq.c
@@ -146,7 +146,7 @@ toshiba_rbtx4938_irq_ioc_init(void)
 		irq_desc[i].status = IRQ_DISABLED;
 		irq_desc[i].action = 0;
 		irq_desc[i].depth = 3;
-		irq_desc[i].handler = &toshiba_rbtx4938_irq_ioc_type;
+		irq_desc[i].chip = &toshiba_rbtx4938_irq_ioc_type;
 	}
 
 	setup_irq(RBTX4938_IRQ_IOCINT,
diff --git a/arch/mips/vr41xx/common/icu.c b/arch/mips/vr41xx/common/icu.c
index 07ae19cf0c29..b9323302cc4e 100644
--- a/arch/mips/vr41xx/common/icu.c
+++ b/arch/mips/vr41xx/common/icu.c
@@ -722,10 +722,10 @@ static int __init vr41xx_icu_init(void)
 	icu2_write(MGIUINTHREG, 0xffff);
 
 	for (i = SYSINT1_IRQ_BASE; i <= SYSINT1_IRQ_LAST; i++)
-		irq_desc[i].handler = &sysint1_irq_type;
+		irq_desc[i].chip = &sysint1_irq_type;
 
 	for (i = SYSINT2_IRQ_BASE; i <= SYSINT2_IRQ_LAST; i++)
-		irq_desc[i].handler = &sysint2_irq_type;
+		irq_desc[i].chip = &sysint2_irq_type;
 
 	cascade_irq(INT0_IRQ, icu_get_irq);
 	cascade_irq(INT1_IRQ, icu_get_irq);
diff --git a/arch/mips/vr41xx/common/irq.c b/arch/mips/vr41xx/common/irq.c
index 86796bb63c3c..66aa50802deb 100644
--- a/arch/mips/vr41xx/common/irq.c
+++ b/arch/mips/vr41xx/common/irq.c
@@ -73,13 +73,13 @@ static void irq_dispatch(unsigned int irq, struct pt_regs *regs)
 	if (cascade->get_irq != NULL) {
 		unsigned int source_irq = irq;
 		desc = irq_desc + source_irq;
-		desc->handler->ack(source_irq);
+		desc->chip->ack(source_irq);
 		irq = cascade->get_irq(irq, regs);
 		if (irq < 0)
 			atomic_inc(&irq_err_count);
 		else
 			irq_dispatch(irq, regs);
-		desc->handler->end(source_irq);
+		desc->chip->end(source_irq);
 	} else
 		do_IRQ(irq, regs);
 }
diff --git a/arch/mips/vr41xx/common/vrc4173.c b/arch/mips/vr41xx/common/vrc4173.c
index 3e31f8193d21..2d287b8893d9 100644
--- a/arch/mips/vr41xx/common/vrc4173.c
+++ b/arch/mips/vr41xx/common/vrc4173.c
@@ -483,7 +483,7 @@ static inline int vrc4173_icu_init(int cascade_irq)
 	vr41xx_set_irq_level(GIU_IRQ_TO_PIN(cascade_irq), LEVEL_LOW);
 
 	for (i = VRC4173_IRQ_BASE; i <= VRC4173_IRQ_LAST; i++)
-                irq_desc[i].handler = &vrc4173_irq_type;
+                irq_desc[i].chip = &vrc4173_irq_type;
 
 	return 0;
 }
diff --git a/arch/mips/vr41xx/nec-cmbvr4133/irq.c b/arch/mips/vr41xx/nec-cmbvr4133/irq.c
index 31db6b61a39e..7b2511ca0a61 100644
--- a/arch/mips/vr41xx/nec-cmbvr4133/irq.c
+++ b/arch/mips/vr41xx/nec-cmbvr4133/irq.c
@@ -104,7 +104,7 @@ void __init rockhopper_init_irq(void)
 	}
 
 	for (i = I8259_IRQ_BASE; i <= I8259_IRQ_LAST; i++)
-		irq_desc[i].handler = &i8259_irq_type;
+		irq_desc[i].chip = &i8259_irq_type;
 
 	setup_irq(I8259_SLAVE_IRQ, &i8259_slave_cascade);
 
diff --git a/arch/parisc/kernel/irq.c b/arch/parisc/kernel/irq.c
index 197936d9359a..26e69f73fdb0 100644
--- a/arch/parisc/kernel/irq.c
+++ b/arch/parisc/kernel/irq.c
@@ -158,7 +158,7 @@ int show_interrupts(struct seq_file *p, void *v)
 		seq_printf(p, "%10u ", kstat_irqs(i));
 #endif
 
-		seq_printf(p, " %14s", irq_desc[i].handler->typename);
+		seq_printf(p, " %14s", irq_desc[i].chip->typename);
 #ifndef PARISC_IRQ_CR16_COUNTS
 		seq_printf(p, "  %s", action->name);
 
@@ -210,12 +210,12 @@ int cpu_claim_irq(unsigned int irq, struct hw_interrupt_type *type, void *data)
 {
 	if (irq_desc[irq].action)
 		return -EBUSY;
-	if (irq_desc[irq].handler != &cpu_interrupt_type)
+	if (irq_desc[irq].chip != &cpu_interrupt_type)
 		return -EBUSY;
 
 	if (type) {
-		irq_desc[irq].handler = type;
-		irq_desc[irq].handler_data = data;
+		irq_desc[irq].chip = type;
+		irq_desc[irq].chip_data = data;
 		cpu_interrupt_type.enable(irq);
 	}
 	return 0;
@@ -378,7 +378,7 @@ static void claim_cpu_irqs(void)
 {
 	int i;
 	for (i = CPU_IRQ_BASE; i <= CPU_IRQ_MAX; i++) {
-		irq_desc[i].handler = &cpu_interrupt_type;
+		irq_desc[i].chip = &cpu_interrupt_type;
 	}
 
 	irq_desc[TIMER_IRQ].action = &timer_action;
diff --git a/arch/powerpc/kernel/crash.c b/arch/powerpc/kernel/crash.c
index e253a45dcf10..1bfad7c2f8c8 100644
--- a/arch/powerpc/kernel/crash.c
+++ b/arch/powerpc/kernel/crash.c
@@ -193,10 +193,10 @@ void default_machine_crash_shutdown(struct pt_regs *regs)
 		struct irq_desc *desc = irq_descp(irq);
 
 		if (desc->status & IRQ_INPROGRESS)
-			desc->handler->end(irq);
+			desc->chip->end(irq);
 
 		if (!(desc->status & IRQ_DISABLED))
-			desc->handler->disable(irq);
+			desc->chip->disable(irq);
 	}
 
 	if (ppc_md.kexec_cpu_down)
diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index 40d4c14fde8f..8cfc779d882d 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -120,8 +120,8 @@ int show_interrupts(struct seq_file *p, void *v)
 #else
 		seq_printf(p, "%10u ", kstat_irqs(i));
 #endif /* CONFIG_SMP */
-		if (desc->handler)
-			seq_printf(p, " %s ", desc->handler->typename);
+		if (desc->chip)
+			seq_printf(p, " %s ", desc->chip->typename);
 		else
 			seq_puts(p, "  None      ");
 		seq_printf(p, "%s", (desc->status & IRQ_LEVEL) ? "Level " : "Edge  ");
@@ -169,8 +169,8 @@ void fixup_irqs(cpumask_t map)
 			printk("Breaking affinity for irq %i\n", irq);
 			mask = map;
 		}
-		if (irq_desc[irq].handler->set_affinity)
-			irq_desc[irq].handler->set_affinity(irq, mask);
+		if (irq_desc[irq].chip->set_affinity)
+			irq_desc[irq].chip->set_affinity(irq, mask);
 		else if (irq_desc[irq].action && !(warned++))
 			printk("Cannot set affinity for irq %i\n", irq);
 	}
diff --git a/arch/powerpc/platforms/cell/interrupt.c b/arch/powerpc/platforms/cell/interrupt.c
index 1bbf822b4efc..7bff3cbc5723 100644
--- a/arch/powerpc/platforms/cell/interrupt.c
+++ b/arch/powerpc/platforms/cell/interrupt.c
@@ -307,7 +307,7 @@ static void iic_request_ipi(int ipi, const char *name)
 	irq = iic_ipi_to_irq(ipi);
 	/* IPIs are marked SA_INTERRUPT as they must run with irqs
 	 * disabled */
-	get_irq_desc(irq)->handler = &iic_pic;
+	get_irq_desc(irq)->chip = &iic_pic;
 	get_irq_desc(irq)->status |= IRQ_PER_CPU;
 	request_irq(irq, iic_ipi_action, SA_INTERRUPT, name, NULL);
 }
@@ -330,7 +330,7 @@ static void iic_setup_spe_handlers(void)
 	for (be=0; be < num_present_cpus() / 2; be++) {
 		for (isrc = 0; isrc < IIC_CLASS_STRIDE * 3; isrc++) {
 			int irq = IIC_NODE_STRIDE * be + IIC_SPE_OFFSET + isrc;
-			get_irq_desc(irq)->handler = &iic_pic;
+			get_irq_desc(irq)->chip = &iic_pic;
 		}
 	}
 }
diff --git a/arch/powerpc/platforms/cell/spider-pic.c b/arch/powerpc/platforms/cell/spider-pic.c
index 55cbdd77a62d..7c3a0b6d34fd 100644
--- a/arch/powerpc/platforms/cell/spider-pic.c
+++ b/arch/powerpc/platforms/cell/spider-pic.c
@@ -162,7 +162,7 @@ void spider_init_IRQ_hardcoded(void)
 		spider_pics[node] = ioremap(spiderpic, 0x800);
 		for (n = 0; n < IIC_NUM_EXT; n++) {
 			int irq = n + IIC_EXT_OFFSET + node * IIC_NODE_STRIDE;
-			get_irq_desc(irq)->handler = &spider_pic;
+			get_irq_desc(irq)->chip = &spider_pic;
 		}
 
  		/* do not mask any interrupts because of level */
@@ -217,7 +217,7 @@ void spider_init_IRQ(void)
 
 		for (n = 0; n < IIC_NUM_EXT; n++) {
 			int irq = n + IIC_EXT_OFFSET + node * IIC_NODE_STRIDE;
-			get_irq_desc(irq)->handler = &spider_pic;
+			get_irq_desc(irq)->chip = &spider_pic;
 		}
 
 		/* do not mask any interrupts because of level */
diff --git a/arch/powerpc/platforms/iseries/irq.c b/arch/powerpc/platforms/iseries/irq.c
index 62bbbcf5ded3..33bb4aa0e1e8 100644
--- a/arch/powerpc/platforms/iseries/irq.c
+++ b/arch/powerpc/platforms/iseries/irq.c
@@ -242,9 +242,9 @@ void __init iSeries_activate_IRQs()
 	for_each_irq (irq) {
 		irq_desc_t *desc = get_irq_desc(irq);
 
-		if (desc && desc->handler && desc->handler->startup) {
+		if (desc && desc->chip && desc->chip->startup) {
 			spin_lock_irqsave(&desc->lock, flags);
-			desc->handler->startup(irq);
+			desc->chip->startup(irq);
 			spin_unlock_irqrestore(&desc->lock, flags);
 		}
 	}
@@ -324,7 +324,7 @@ int __init iSeries_allocate_IRQ(HvBusNumber bus,
 		+ function;
 	virtirq = virt_irq_create_mapping(realirq);
 
-	irq_desc[virtirq].handler = &iSeries_IRQ_handler;
+	irq_desc[virtirq].chip = &iSeries_IRQ_handler;
 	return virtirq;
 }
 
diff --git a/arch/powerpc/platforms/powermac/pic.c b/arch/powerpc/platforms/powermac/pic.c
index 18bf3011d1e3..9f6189af6dd6 100644
--- a/arch/powerpc/platforms/powermac/pic.c
+++ b/arch/powerpc/platforms/powermac/pic.c
@@ -446,7 +446,7 @@ static void __init pmac_pic_probe_oldstyle(void)
 
 	/* Set the handler for the main PIC */
 	for ( i = 0; i < max_real_irqs ; i++ )
-		irq_desc[i].handler = &pmac_pic;
+		irq_desc[i].chip = &pmac_pic;
 
 	/* Get addresses of first controller if we have a node for it */
 	BUG_ON(of_address_to_resource(master, 0, &r));
@@ -493,7 +493,7 @@ static void __init pmac_pic_probe_oldstyle(void)
 	/* Setup handlers for secondary controller and hook cascade irq*/
 	if (slave) {
 		for ( i = max_real_irqs ; i < max_irqs ; i++ )
-			irq_desc[i].handler = &gatwick_pic;
+			irq_desc[i].chip = &gatwick_pic;
 		setup_irq(irq_cascade, &gatwick_cascade_action);
 	}
 	printk(KERN_INFO "irq: System has %d possible interrupts\n", max_irqs);
diff --git a/arch/powerpc/platforms/pseries/xics.c b/arch/powerpc/platforms/pseries/xics.c
index b14f9b5c114e..bdc9e26a93cf 100644
--- a/arch/powerpc/platforms/pseries/xics.c
+++ b/arch/powerpc/platforms/pseries/xics.c
@@ -558,7 +558,7 @@ nextnode:
 	}
 
 	for (i = irq_offset_value(); i < NR_IRQS; ++i)
-		get_irq_desc(i)->handler = &xics_pic;
+		get_irq_desc(i)->chip = &xics_pic;
 
 	xics_setup_cpu();
 
@@ -701,9 +701,9 @@ void xics_migrate_irqs_away(void)
 			continue;
 
 		/* We only need to migrate enabled IRQS */
-		if (desc == NULL || desc->handler == NULL
+		if (desc == NULL || desc->chip == NULL
 		    || desc->action == NULL
-		    || desc->handler->set_affinity == NULL)
+		    || desc->chip->set_affinity == NULL)
 			continue;
 
 		spin_lock_irqsave(&desc->lock, flags);
@@ -728,7 +728,7 @@ void xics_migrate_irqs_away(void)
 		       virq, cpu);
 
 		/* Reset affinity to all cpus */
-		desc->handler->set_affinity(virq, CPU_MASK_ALL);
+		desc->chip->set_affinity(virq, CPU_MASK_ALL);
 		irq_affinity[virq] = CPU_MASK_ALL;
 unlock:
 		spin_unlock_irqrestore(&desc->lock, flags);
diff --git a/arch/powerpc/sysdev/i8259.c b/arch/powerpc/sysdev/i8259.c
index b7ac32fdd776..2bff30f6d635 100644
--- a/arch/powerpc/sysdev/i8259.c
+++ b/arch/powerpc/sysdev/i8259.c
@@ -208,7 +208,7 @@ void __init i8259_init(unsigned long intack_addr, int offset)
 	spin_unlock_irqrestore(&i8259_lock, flags);
 
 	for (i = 0; i < NUM_ISA_INTERRUPTS; ++i)
-		irq_desc[offset + i].handler = &i8259_pic;
+		irq_desc[offset + i].chip = &i8259_pic;
 
 	/* reserve our resources */
 	setup_irq(offset + 2, &i8259_irqaction);
diff --git a/arch/powerpc/sysdev/ipic.c b/arch/powerpc/sysdev/ipic.c
index 8f01e0f1d847..46801f5ec03f 100644
--- a/arch/powerpc/sysdev/ipic.c
+++ b/arch/powerpc/sysdev/ipic.c
@@ -472,7 +472,7 @@ void __init ipic_init(phys_addr_t phys_addr,
 	ipic_write(primary_ipic->regs, IPIC_SEMSR, temp);
 
 	for (i = 0 ; i < NR_IPIC_INTS ; i++) {
-		irq_desc[i+irq_offset].handler = &ipic;
+		irq_desc[i+irq_offset].chip = &ipic;
 		irq_desc[i+irq_offset].status = IRQ_LEVEL;
 	}
 
diff --git a/arch/powerpc/sysdev/mpic.c b/arch/powerpc/sysdev/mpic.c
index bffe50d02c99..f4613ee6b7a2 100644
--- a/arch/powerpc/sysdev/mpic.c
+++ b/arch/powerpc/sysdev/mpic.c
@@ -379,14 +379,14 @@ static inline u32 mpic_physmask(u32 cpumask)
 /* Get the mpic structure from the IPI number */
 static inline struct mpic * mpic_from_ipi(unsigned int ipi)
 {
-	return container_of(irq_desc[ipi].handler, struct mpic, hc_ipi);
+	return container_of(irq_desc[ipi].chip, struct mpic, hc_ipi);
 }
 #endif
 
 /* Get the mpic structure from the irq number */
 static inline struct mpic * mpic_from_irq(unsigned int irq)
 {
-	return container_of(irq_desc[irq].handler, struct mpic, hc_irq);
+	return container_of(irq_desc[irq].chip, struct mpic, hc_irq);
 }
 
 /* Send an EOI */
@@ -752,7 +752,7 @@ void __init mpic_init(struct mpic *mpic)
 		if (!(mpic->flags & MPIC_PRIMARY))
 			continue;
 		irq_desc[mpic->ipi_offset+i].status |= IRQ_PER_CPU;
-		irq_desc[mpic->ipi_offset+i].handler = &mpic->hc_ipi;
+		irq_desc[mpic->ipi_offset+i].chip = &mpic->hc_ipi;
 #endif /* CONFIG_SMP */
 	}
 
@@ -813,7 +813,7 @@ void __init mpic_init(struct mpic *mpic)
 		/* init linux descriptors */
 		if (i < mpic->irq_count) {
 			irq_desc[mpic->irq_offset+i].status = level ? IRQ_LEVEL : 0;
-			irq_desc[mpic->irq_offset+i].handler = &mpic->hc_irq;
+			irq_desc[mpic->irq_offset+i].chip = &mpic->hc_irq;
 		}
 	}
 	
diff --git a/arch/ppc/8xx_io/commproc.c b/arch/ppc/8xx_io/commproc.c
index 12b84ca51327..9b3ace26280c 100644
--- a/arch/ppc/8xx_io/commproc.c
+++ b/arch/ppc/8xx_io/commproc.c
@@ -187,7 +187,7 @@ cpm_interrupt_init(void)
          * interrupt vectors
          */
         for ( i = CPM_IRQ_OFFSET ; i < CPM_IRQ_OFFSET + NR_CPM_INTS ; i++ )
-                irq_desc[i].handler = &cpm_pic;
+                irq_desc[i].chip = &cpm_pic;
 
 	/* Set our interrupt handler with the core CPU.	*/
 	if (setup_irq(CPM_INTERRUPT, &cpm_interrupt_irqaction))
diff --git a/arch/ppc/platforms/apus_setup.c b/arch/ppc/platforms/apus_setup.c
index fe0cdc04d436..5c4118a459f3 100644
--- a/arch/ppc/platforms/apus_setup.c
+++ b/arch/ppc/platforms/apus_setup.c
@@ -734,9 +734,9 @@ void apus_init_IRQ(void)
 	for ( i = 0 ; i < AMI_IRQS; i++ ) {
 		irq_desc[i].status = IRQ_LEVEL;
 		if (i < IRQ_AMIGA_AUTO) {
-			irq_desc[i].handler = &amiga_irqctrl;
+			irq_desc[i].chip = &amiga_irqctrl;
 		} else {
-			irq_desc[i].handler = &amiga_sys_irqctrl;
+			irq_desc[i].chip = &amiga_sys_irqctrl;
 			action = &amiga_sys_irqaction[i-IRQ_AMIGA_AUTO];
 			if (action->name)
 				setup_irq(i, action);
diff --git a/arch/ppc/platforms/sbc82xx.c b/arch/ppc/platforms/sbc82xx.c
index 866807b4ad0b..41006d2b4b38 100644
--- a/arch/ppc/platforms/sbc82xx.c
+++ b/arch/ppc/platforms/sbc82xx.c
@@ -172,7 +172,7 @@ void __init sbc82xx_init_IRQ(void)
 	
 	/* Set up the interrupt handlers for the i8259 IRQs */
 	for (i = NR_SIU_INTS; i < NR_SIU_INTS + 8; i++) {
-                irq_desc[i].handler = &sbc82xx_i8259_ic;
+                irq_desc[i].chip = &sbc82xx_i8259_ic;
 		irq_desc[i].status |= IRQ_LEVEL;
 	}
 
diff --git a/arch/ppc/syslib/cpc700_pic.c b/arch/ppc/syslib/cpc700_pic.c
index 5add0a919ef6..172aa215fdb0 100644
--- a/arch/ppc/syslib/cpc700_pic.c
+++ b/arch/ppc/syslib/cpc700_pic.c
@@ -140,12 +140,12 @@ cpc700_init_IRQ(void)
 						        /* IRQ 0 is highest */
 
 	for (i = 0; i < 17; i++) {
-		irq_desc[i].handler = &cpc700_pic;
+		irq_desc[i].chip = &cpc700_pic;
 		cpc700_pic_init_irq(i);
 	}
 
 	for (i = 20; i < 32; i++) {
-		irq_desc[i].handler = &cpc700_pic;
+		irq_desc[i].chip = &cpc700_pic;
 		cpc700_pic_init_irq(i);
 	}
 
diff --git a/arch/ppc/syslib/cpm2_pic.c b/arch/ppc/syslib/cpm2_pic.c
index 29d95d415ceb..c0fee0beb815 100644
--- a/arch/ppc/syslib/cpm2_pic.c
+++ b/arch/ppc/syslib/cpm2_pic.c
@@ -171,7 +171,7 @@ void cpm2_init_IRQ(void)
 	/* Enable chaining to OpenPIC, and make everything level
 	 */
 	for (i = 0; i < NR_CPM_INTS; i++) {
-		irq_desc[i+CPM_IRQ_OFFSET].handler = &cpm2_pic;
+		irq_desc[i+CPM_IRQ_OFFSET].chip = &cpm2_pic;
 		irq_desc[i+CPM_IRQ_OFFSET].status |= IRQ_LEVEL;
 	}
 }
diff --git a/arch/ppc/syslib/gt64260_pic.c b/arch/ppc/syslib/gt64260_pic.c
index dc3bd9ecbbf6..91096b38ae70 100644
--- a/arch/ppc/syslib/gt64260_pic.c
+++ b/arch/ppc/syslib/gt64260_pic.c
@@ -98,7 +98,7 @@ gt64260_init_irq(void)
 
 	/* use the gt64260 for all (possible) interrupt sources */
 	for (i = gt64260_irq_base; i < (gt64260_irq_base + 96); i++)
-		irq_desc[i].handler = &gt64260_pic;
+		irq_desc[i].chip = &gt64260_pic;
 
 	if (ppc_md.progress)
 		ppc_md.progress("gt64260_init_irq: exit", 0x0);
diff --git a/arch/ppc/syslib/m82xx_pci.c b/arch/ppc/syslib/m82xx_pci.c
index 1941a8c7ca9a..63fa5b313396 100644
--- a/arch/ppc/syslib/m82xx_pci.c
+++ b/arch/ppc/syslib/m82xx_pci.c
@@ -159,7 +159,7 @@ pq2pci_init_irq(void)
 	immap->im_memctl.memc_or8 = 0xffff8010;
 #endif
 	for (irq = NR_CPM_INTS; irq < NR_CPM_INTS + 4; irq++)
-		irq_desc[irq].handler = &pq2pci_ic;
+		irq_desc[irq].chip = &pq2pci_ic;
 
 	/* make PCI IRQ level sensitive */
 	immap->im_intctl.ic_siexr &=
diff --git a/arch/ppc/syslib/m8xx_setup.c b/arch/ppc/syslib/m8xx_setup.c
index dae9af78bde1..0c4c0de7c59f 100644
--- a/arch/ppc/syslib/m8xx_setup.c
+++ b/arch/ppc/syslib/m8xx_setup.c
@@ -347,13 +347,13 @@ m8xx_init_IRQ(void)
 	int i;
 
 	for (i = SIU_IRQ_OFFSET ; i < SIU_IRQ_OFFSET + NR_SIU_INTS ; i++)
-		irq_desc[i].handler = &ppc8xx_pic;
+		irq_desc[i].chip = &ppc8xx_pic;
 
 	cpm_interrupt_init();
 
 #if defined(CONFIG_PCI)
 	for (i = I8259_IRQ_OFFSET ; i < I8259_IRQ_OFFSET + NR_8259_INTS ; i++)
-		irq_desc[i].handler = &i8259_pic;
+		irq_desc[i].chip = &i8259_pic;
 
 	i8259_pic_irq_offset = I8259_IRQ_OFFSET;
 	i8259_init(0);
diff --git a/arch/ppc/syslib/mpc52xx_pic.c b/arch/ppc/syslib/mpc52xx_pic.c
index c4406f9dc6a3..6425b5cee7db 100644
--- a/arch/ppc/syslib/mpc52xx_pic.c
+++ b/arch/ppc/syslib/mpc52xx_pic.c
@@ -204,9 +204,9 @@ mpc52xx_init_irq(void)
 	out_be32(&intr->main_pri1, 0);
 	out_be32(&intr->main_pri2, 0);
 
-	/* Initialize irq_desc[i].handler's with mpc52xx_ic. */
+	/* Initialize irq_desc[i].chip's with mpc52xx_ic. */
 	for (i = 0; i < NR_IRQS; i++) {
-		irq_desc[i].handler = &mpc52xx_ic;
+		irq_desc[i].chip = &mpc52xx_ic;
 		irq_desc[i].status = IRQ_LEVEL;
 	}
 
diff --git a/arch/ppc/syslib/mv64360_pic.c b/arch/ppc/syslib/mv64360_pic.c
index 5a19697060f0..a4244d468381 100644
--- a/arch/ppc/syslib/mv64360_pic.c
+++ b/arch/ppc/syslib/mv64360_pic.c
@@ -119,7 +119,7 @@ mv64360_init_irq(void)
 	/* All interrupts are level interrupts */
 	for (i = mv64360_irq_base; i < (mv64360_irq_base + 96); i++) {
 		irq_desc[i].status |= IRQ_LEVEL;
-		irq_desc[i].handler = &mv64360_pic;
+		irq_desc[i].chip = &mv64360_pic;
 	}
 
 	if (ppc_md.progress)
diff --git a/arch/ppc/syslib/open_pic.c b/arch/ppc/syslib/open_pic.c
index 70456c8f998c..822058c2983e 100644
--- a/arch/ppc/syslib/open_pic.c
+++ b/arch/ppc/syslib/open_pic.c
@@ -373,7 +373,7 @@ void __init openpic_init(int offset)
 				OPENPIC_VEC_IPI+i+offset);
 		/* IPIs are per-CPU */
 		irq_desc[OPENPIC_VEC_IPI+i+offset].status |= IRQ_PER_CPU;
-		irq_desc[OPENPIC_VEC_IPI+i+offset].handler = &open_pic_ipi;
+		irq_desc[OPENPIC_VEC_IPI+i+offset].chip = &open_pic_ipi;
 	}
 #endif
 
@@ -408,7 +408,7 @@ void __init openpic_init(int offset)
 
 	/* Init descriptors */
 	for (i = offset; i < NumSources + offset; i++)
-		irq_desc[i].handler = &open_pic;
+		irq_desc[i].chip = &open_pic;
 
 	/* Initialize the spurious interrupt */
 	if (ppc_md.progress) ppc_md.progress("openpic: spurious",0x3bd);
diff --git a/arch/ppc/syslib/open_pic2.c b/arch/ppc/syslib/open_pic2.c
index bcbe40de26fe..b8154efff6ed 100644
--- a/arch/ppc/syslib/open_pic2.c
+++ b/arch/ppc/syslib/open_pic2.c
@@ -290,7 +290,7 @@ void __init openpic2_init(int offset)
 
 	/* Init descriptors */
 	for (i = offset; i < NumSources + offset; i++)
-		irq_desc[i].handler = &open_pic2;
+		irq_desc[i].chip = &open_pic2;
 
 	/* Initialize the spurious interrupt */
 	if (ppc_md.progress) ppc_md.progress("openpic2: spurious",0x3bd);
diff --git a/arch/ppc/syslib/ppc403_pic.c b/arch/ppc/syslib/ppc403_pic.c
index c46043c47225..1584c8b1229f 100644
--- a/arch/ppc/syslib/ppc403_pic.c
+++ b/arch/ppc/syslib/ppc403_pic.c
@@ -121,5 +121,5 @@ ppc4xx_pic_init(void)
 	ppc_md.get_irq = ppc403_pic_get_irq;
 
 	for (i = 0; i < NR_IRQS; i++)
-		irq_desc[i].handler = &ppc403_aic;
+		irq_desc[i].chip = &ppc403_aic;
 }
diff --git a/arch/ppc/syslib/ppc4xx_pic.c b/arch/ppc/syslib/ppc4xx_pic.c
index fd9af0fc0e9f..e669c1335d47 100644
--- a/arch/ppc/syslib/ppc4xx_pic.c
+++ b/arch/ppc/syslib/ppc4xx_pic.c
@@ -276,7 +276,7 @@ void __init ppc4xx_pic_init(void)
 
 	/* Attach low-level handlers */
 	for (i = 0; i < (NR_UICS << 5); ++i) {
-		irq_desc[i].handler = &__uic[i >> 5].decl;
+		irq_desc[i].chip = &__uic[i >> 5].decl;
 		if (is_level_sensitive(i))
 			irq_desc[i].status |= IRQ_LEVEL;
 	}
diff --git a/arch/ppc/syslib/xilinx_pic.c b/arch/ppc/syslib/xilinx_pic.c
index e672b600f315..39a93dc6375b 100644
--- a/arch/ppc/syslib/xilinx_pic.c
+++ b/arch/ppc/syslib/xilinx_pic.c
@@ -143,7 +143,7 @@ ppc4xx_pic_init(void)
 	ppc_md.get_irq = xilinx_pic_get_irq;
 
 	for (i = 0; i < NR_IRQS; ++i) {
-		irq_desc[i].handler = &xilinx_intc;
+		irq_desc[i].chip = &xilinx_intc;
 
 		if (XPAR_INTC_0_KIND_OF_INTR & (0x00000001 << i))
 			irq_desc[i].status &= ~IRQ_LEVEL;
diff --git a/arch/sh/boards/adx/irq_maskreg.c b/arch/sh/boards/adx/irq_maskreg.c
index c0973f8d57ba..357fab1bac2b 100644
--- a/arch/sh/boards/adx/irq_maskreg.c
+++ b/arch/sh/boards/adx/irq_maskreg.c
@@ -102,6 +102,6 @@ static void end_maskreg_irq(unsigned int irq)
 void make_maskreg_irq(unsigned int irq)
 {
 	disable_irq_nosync(irq);
-	irq_desc[irq].handler = &maskreg_irq_type;
+	irq_desc[irq].chip = &maskreg_irq_type;
 	disable_maskreg_irq(irq);
 }
diff --git a/arch/sh/boards/bigsur/irq.c b/arch/sh/boards/bigsur/irq.c
index 6ddbcc77244d..1d32425782c0 100644
--- a/arch/sh/boards/bigsur/irq.c
+++ b/arch/sh/boards/bigsur/irq.c
@@ -253,7 +253,7 @@ static void make_bigsur_l1isr(unsigned int irq) {
         /* sanity check first */
         if(irq >= BIGSUR_IRQ_LOW && irq < BIGSUR_IRQ_HIGH) {
                 /* save the handler in the main description table */
-                irq_desc[irq].handler = &bigsur_l1irq_type;
+                irq_desc[irq].chip = &bigsur_l1irq_type;
                 irq_desc[irq].status = IRQ_DISABLED;
                 irq_desc[irq].action = 0;
                 irq_desc[irq].depth = 1;
@@ -270,7 +270,7 @@ static void make_bigsur_l2isr(unsigned int irq) {
         /* sanity check first */
         if(irq >= BIGSUR_2NDLVL_IRQ_LOW && irq < BIGSUR_2NDLVL_IRQ_HIGH) {
                 /* save the handler in the main description table */
-                irq_desc[irq].handler = &bigsur_l2irq_type;
+                irq_desc[irq].chip = &bigsur_l2irq_type;
                 irq_desc[irq].status = IRQ_DISABLED;
                 irq_desc[irq].action = 0;
                 irq_desc[irq].depth = 1;
diff --git a/arch/sh/boards/cqreek/irq.c b/arch/sh/boards/cqreek/irq.c
index d1da0d844567..2955adc52310 100644
--- a/arch/sh/boards/cqreek/irq.c
+++ b/arch/sh/boards/cqreek/irq.c
@@ -103,7 +103,7 @@ void __init init_cqreek_IRQ(void)
 		cqreek_irq_data[14].stat_port = BRIDGE_IDE_INTR_STAT;
 		cqreek_irq_data[14].bit = 1;
 
-		irq_desc[14].handler = &cqreek_irq_type;
+		irq_desc[14].chip = &cqreek_irq_type;
 		irq_desc[14].status = IRQ_DISABLED;
 		irq_desc[14].action = 0;
 		irq_desc[14].depth = 1;
@@ -117,7 +117,7 @@ void __init init_cqreek_IRQ(void)
 		cqreek_irq_data[10].bit = (1 << 10);
 
 		/* XXX: Err... we may need demultiplexer for ISA irq... */
-		irq_desc[10].handler = &cqreek_irq_type;
+		irq_desc[10].chip = &cqreek_irq_type;
 		irq_desc[10].status = IRQ_DISABLED;
 		irq_desc[10].action = 0;
 		irq_desc[10].depth = 1;
diff --git a/arch/sh/boards/dreamcast/setup.c b/arch/sh/boards/dreamcast/setup.c
index 55dece35cde5..0027b80a2343 100644
--- a/arch/sh/boards/dreamcast/setup.c
+++ b/arch/sh/boards/dreamcast/setup.c
@@ -70,7 +70,7 @@ int __init platform_setup(void)
 
 	/* Assign all virtual IRQs to the System ASIC int. handler */
 	for (i = HW_EVENT_IRQ_BASE; i < HW_EVENT_IRQ_MAX; i++)
-		irq_desc[i].handler = &systemasic_int;
+		irq_desc[i].chip = &systemasic_int;
 
 	board_time_init = aica_time_init;
 
diff --git a/arch/sh/boards/ec3104/setup.c b/arch/sh/boards/ec3104/setup.c
index 5130ba2b6ff1..4b3ef16a0e96 100644
--- a/arch/sh/boards/ec3104/setup.c
+++ b/arch/sh/boards/ec3104/setup.c
@@ -63,7 +63,7 @@ int __init platform_setup(void)
 		str[i] = ctrl_readb(EC3104_BASE + i);
 
 	for (i = EC3104_IRQBASE; i < EC3104_IRQBASE + 32; i++)
-		irq_desc[i].handler = &ec3104_int;
+		irq_desc[i].chip = &ec3104_int;
 
 	printk("initializing EC3104 \"%.8s\" at %08x, IRQ %d, IRQ base %d\n",
 	       str, EC3104_BASE, EC3104_IRQ, EC3104_IRQBASE);
diff --git a/arch/sh/boards/harp/irq.c b/arch/sh/boards/harp/irq.c
index 52d0ba39031b..701fa55d5297 100644
--- a/arch/sh/boards/harp/irq.c
+++ b/arch/sh/boards/harp/irq.c
@@ -114,7 +114,7 @@ static void enable_harp_irq(unsigned int irq)
 static void __init make_harp_irq(unsigned int irq)
 {
 	disable_irq_nosync(irq);
-	irq_desc[irq].handler = &harp_irq_type;
+	irq_desc[irq].chip = &harp_irq_type;
 	disable_harp_irq(irq);
 }
 
diff --git a/arch/sh/boards/mpc1211/setup.c b/arch/sh/boards/mpc1211/setup.c
index 2bb581b91683..b72f009c52c2 100644
--- a/arch/sh/boards/mpc1211/setup.c
+++ b/arch/sh/boards/mpc1211/setup.c
@@ -194,7 +194,7 @@ static struct hw_interrupt_type mpc1211_irq_type = {
 
 static void make_mpc1211_irq(unsigned int irq)
 {
-	irq_desc[irq].handler = &mpc1211_irq_type;
+	irq_desc[irq].chip = &mpc1211_irq_type;
 	irq_desc[irq].status  = IRQ_DISABLED;
 	irq_desc[irq].action  = 0;
 	irq_desc[irq].depth   = 1;
diff --git a/arch/sh/boards/overdrive/irq.c b/arch/sh/boards/overdrive/irq.c
index 715e8feb3a68..2c13a7de6b22 100644
--- a/arch/sh/boards/overdrive/irq.c
+++ b/arch/sh/boards/overdrive/irq.c
@@ -150,7 +150,7 @@ static void enable_od_irq(unsigned int irq)
 static void __init make_od_irq(unsigned int irq)
 {
 	disable_irq_nosync(irq);
-	irq_desc[irq].handler = &od_irq_type;
+	irq_desc[irq].chip = &od_irq_type;
 	disable_od_irq(irq);
 }
 
diff --git a/arch/sh/boards/renesas/hs7751rvoip/irq.c b/arch/sh/boards/renesas/hs7751rvoip/irq.c
index ed4c5b50ea45..52a98b524e1f 100644
--- a/arch/sh/boards/renesas/hs7751rvoip/irq.c
+++ b/arch/sh/boards/renesas/hs7751rvoip/irq.c
@@ -86,7 +86,7 @@ static struct hw_interrupt_type hs7751rvoip_irq_type = {
 static void make_hs7751rvoip_irq(unsigned int irq)
 {
 	disable_irq_nosync(irq);
-	irq_desc[irq].handler = &hs7751rvoip_irq_type;
+	irq_desc[irq].chip = &hs7751rvoip_irq_type;
 	disable_hs7751rvoip_irq(irq);
 }
 
diff --git a/arch/sh/boards/renesas/rts7751r2d/irq.c b/arch/sh/boards/renesas/rts7751r2d/irq.c
index d36c9374aed1..e16915d9cda4 100644
--- a/arch/sh/boards/renesas/rts7751r2d/irq.c
+++ b/arch/sh/boards/renesas/rts7751r2d/irq.c
@@ -100,7 +100,7 @@ static struct hw_interrupt_type rts7751r2d_irq_type = {
 static void make_rts7751r2d_irq(unsigned int irq)
 {
 	disable_irq_nosync(irq);
-	irq_desc[irq].handler = &rts7751r2d_irq_type;
+	irq_desc[irq].chip = &rts7751r2d_irq_type;
 	disable_rts7751r2d_irq(irq);
 }
 
diff --git a/arch/sh/boards/renesas/systemh/irq.c b/arch/sh/boards/renesas/systemh/irq.c
index 7a2eb10edb56..845979181059 100644
--- a/arch/sh/boards/renesas/systemh/irq.c
+++ b/arch/sh/boards/renesas/systemh/irq.c
@@ -105,7 +105,7 @@ static void end_systemh_irq(unsigned int irq)
 void make_systemh_irq(unsigned int irq)
 {
 	disable_irq_nosync(irq);
-	irq_desc[irq].handler = &systemh_irq_type;
+	irq_desc[irq].chip = &systemh_irq_type;
 	disable_systemh_irq(irq);
 }
 
diff --git a/arch/sh/boards/se/73180/irq.c b/arch/sh/boards/se/73180/irq.c
index 70f04caad9a4..402735c7c898 100644
--- a/arch/sh/boards/se/73180/irq.c
+++ b/arch/sh/boards/se/73180/irq.c
@@ -85,7 +85,7 @@ void
 make_intreq_irq(unsigned int irq)
 {
 	disable_irq_nosync(irq);
-	irq_desc[irq].handler = &intreq_irq_type;
+	irq_desc[irq].chip = &intreq_irq_type;
 	disable_intreq_irq(irq);
 }
 
diff --git a/arch/sh/boards/superh/microdev/irq.c b/arch/sh/boards/superh/microdev/irq.c
index efcbd86b7cd2..cb5999425d16 100644
--- a/arch/sh/boards/superh/microdev/irq.c
+++ b/arch/sh/boards/superh/microdev/irq.c
@@ -147,7 +147,7 @@ static void enable_microdev_irq(unsigned int irq)
 static void __init make_microdev_irq(unsigned int irq)
 {
 	disable_irq_nosync(irq);
-	irq_desc[irq].handler = &microdev_irq_type;
+	irq_desc[irq].chip = &microdev_irq_type;
 	disable_microdev_irq(irq);
 }
 
diff --git a/arch/sh/cchips/hd6446x/hd64461/setup.c b/arch/sh/cchips/hd6446x/hd64461/setup.c
index f014b9bf6922..724db04cb392 100644
--- a/arch/sh/cchips/hd6446x/hd64461/setup.c
+++ b/arch/sh/cchips/hd6446x/hd64461/setup.c
@@ -154,7 +154,7 @@ int __init setup_hd64461(void)
 	outw(0xffff, HD64461_NIMR);
 
 	for (i = HD64461_IRQBASE; i < HD64461_IRQBASE + 16; i++) {
-		irq_desc[i].handler = &hd64461_irq_type;
+		irq_desc[i].chip = &hd64461_irq_type;
 	}
 
 	setup_irq(CONFIG_HD64461_IRQ, &irq0);
diff --git a/arch/sh/cchips/hd6446x/hd64465/setup.c b/arch/sh/cchips/hd6446x/hd64465/setup.c
index 68e4c4e4283d..cf9142c620b7 100644
--- a/arch/sh/cchips/hd6446x/hd64465/setup.c
+++ b/arch/sh/cchips/hd6446x/hd64465/setup.c
@@ -182,7 +182,7 @@ static int __init setup_hd64465(void)
 	outw(0xffff, HD64465_REG_NIMR); 	/* mask all interrupts */
 
 	for (i = 0; i < HD64465_IRQ_NUM ; i++) {
-		irq_desc[HD64465_IRQ_BASE + i].handler = &hd64465_irq_type;
+		irq_desc[HD64465_IRQ_BASE + i].chip = &hd64465_irq_type;
 	}
 
 	setup_irq(CONFIG_HD64465_IRQ, &irq0);
diff --git a/arch/sh/cchips/voyagergx/irq.c b/arch/sh/cchips/voyagergx/irq.c
index 2ee330b3c38f..892214bade19 100644
--- a/arch/sh/cchips/voyagergx/irq.c
+++ b/arch/sh/cchips/voyagergx/irq.c
@@ -191,7 +191,7 @@ void __init setup_voyagergx_irq(void)
 			flag = 1;
 		}
 		if (flag == 1)
-			irq_desc[VOYAGER_IRQ_BASE + i].handler = &voyagergx_irq_type;
+			irq_desc[VOYAGER_IRQ_BASE + i].chip = &voyagergx_irq_type;
 	}
 
 	setup_irq(IRQ_VOYAGER, &irq0);
diff --git a/arch/sh/kernel/cpu/irq/imask.c b/arch/sh/kernel/cpu/irq/imask.c
index baed9a550d39..a33ae3e0a5a5 100644
--- a/arch/sh/kernel/cpu/irq/imask.c
+++ b/arch/sh/kernel/cpu/irq/imask.c
@@ -105,6 +105,6 @@ static void shutdown_imask_irq(unsigned int irq)
 void make_imask_irq(unsigned int irq)
 {
 	disable_irq_nosync(irq);
-	irq_desc[irq].handler = &imask_irq_type;
+	irq_desc[irq].chip = &imask_irq_type;
 	enable_irq(irq);
 }
diff --git a/arch/sh/kernel/cpu/irq/intc2.c b/arch/sh/kernel/cpu/irq/intc2.c
index 06e8afab32e4..30064bf6e154 100644
--- a/arch/sh/kernel/cpu/irq/intc2.c
+++ b/arch/sh/kernel/cpu/irq/intc2.c
@@ -137,7 +137,7 @@ void make_intc2_irq(unsigned int irq,
 
 	local_irq_restore(flags);
 
-	irq_desc[irq].handler = &intc2_irq_type;
+	irq_desc[irq].chip = &intc2_irq_type;
 
 	disable_intc2_irq(irq);
 }
diff --git a/arch/sh/kernel/cpu/irq/ipr.c b/arch/sh/kernel/cpu/irq/ipr.c
index e55150ed0856..0373b65c77f9 100644
--- a/arch/sh/kernel/cpu/irq/ipr.c
+++ b/arch/sh/kernel/cpu/irq/ipr.c
@@ -115,7 +115,7 @@ void make_ipr_irq(unsigned int irq, unsigned int addr, int pos, int priority)
 	ipr_data[irq].shift = pos*4; /* POSition (0-3) x 4 means shift */
 	ipr_data[irq].priority = priority;
 
-	irq_desc[irq].handler = &ipr_irq_type;
+	irq_desc[irq].chip = &ipr_irq_type;
 	disable_ipr_irq(irq);
 }
 
diff --git a/arch/sh/kernel/cpu/irq/pint.c b/arch/sh/kernel/cpu/irq/pint.c
index 95d6024fe1ae..714963a25bba 100644
--- a/arch/sh/kernel/cpu/irq/pint.c
+++ b/arch/sh/kernel/cpu/irq/pint.c
@@ -85,7 +85,7 @@ static void end_pint_irq(unsigned int irq)
 void make_pint_irq(unsigned int irq)
 {
 	disable_irq_nosync(irq);
-	irq_desc[irq].handler = &pint_irq_type;
+	irq_desc[irq].chip = &pint_irq_type;
 	disable_pint_irq(irq);
 }
 
diff --git a/arch/sh/kernel/irq.c b/arch/sh/kernel/irq.c
index b56e79632f24..c2e07f7f3496 100644
--- a/arch/sh/kernel/irq.c
+++ b/arch/sh/kernel/irq.c
@@ -47,7 +47,7 @@ int show_interrupts(struct seq_file *p, void *v)
 			goto unlock;
 		seq_printf(p, "%3d: ",i);
 		seq_printf(p, "%10u ", kstat_irqs(i));
-		seq_printf(p, " %14s", irq_desc[i].handler->typename);
+		seq_printf(p, " %14s", irq_desc[i].chip->typename);
 		seq_printf(p, "  %s", action->name);
 
 		for (action=action->next; action; action = action->next)
diff --git a/arch/sh64/kernel/irq.c b/arch/sh64/kernel/irq.c
index d69879c0e063..675776a5477e 100644
--- a/arch/sh64/kernel/irq.c
+++ b/arch/sh64/kernel/irq.c
@@ -65,7 +65,7 @@ int show_interrupts(struct seq_file *p, void *v)
 			goto unlock;
 		seq_printf(p, "%3d: ",i);
 		seq_printf(p, "%10u ", kstat_irqs(i));
-		seq_printf(p, " %14s", irq_desc[i].handler->typename);
+		seq_printf(p, " %14s", irq_desc[i].chip->typename);
 		seq_printf(p, "  %s", action->name);
 
 		for (action=action->next; action; action = action->next)
diff --git a/arch/sh64/kernel/irq_intc.c b/arch/sh64/kernel/irq_intc.c
index fc99bf4e362c..fa730f5fe2e6 100644
--- a/arch/sh64/kernel/irq_intc.c
+++ b/arch/sh64/kernel/irq_intc.c
@@ -178,7 +178,7 @@ static void end_intc_irq(unsigned int irq)
 void make_intc_irq(unsigned int irq)
 {
 	disable_irq_nosync(irq);
-	irq_desc[irq].handler = &intc_irq_type;
+	irq_desc[irq].chip = &intc_irq_type;
 	disable_intc_irq(irq);
 }
 
@@ -208,7 +208,7 @@ void __init init_IRQ(void)
 	/* Set default: per-line enable/disable, priority driven ack/eoi */
 	for (i = 0; i < NR_INTC_IRQS; i++) {
 		if (platform_int_priority[i] != NO_PRIORITY) {
-			irq_desc[i].handler = &intc_irq_type;
+			irq_desc[i].chip = &intc_irq_type;
 		}
 	}
 
diff --git a/arch/sh64/mach-cayman/irq.c b/arch/sh64/mach-cayman/irq.c
index f797c84bfdd1..05eb7cdc26f0 100644
--- a/arch/sh64/mach-cayman/irq.c
+++ b/arch/sh64/mach-cayman/irq.c
@@ -187,7 +187,7 @@ void init_cayman_irq(void)
 	}
 
 	for (i=0; i<NR_EXT_IRQS; i++) {
-		irq_desc[START_EXT_IRQS + i].handler = &cayman_irq_type;
+		irq_desc[START_EXT_IRQS + i].chip = &cayman_irq_type;
 	}
 
 	/* Setup the SMSC interrupt */
diff --git a/arch/sparc64/kernel/irq.c b/arch/sparc64/kernel/irq.c
index cc89b06d0178..5d7c821ab7f6 100644
--- a/arch/sparc64/kernel/irq.c
+++ b/arch/sparc64/kernel/irq.c
@@ -151,7 +151,7 @@ int show_interrupts(struct seq_file *p, void *v)
 		for_each_online_cpu(j)
 			seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
 #endif
-		seq_printf(p, " %9s", irq_desc[i].handler->typename);
+		seq_printf(p, " %9s", irq_desc[i].chip->typename);
 		seq_printf(p, "  %s", action->name);
 
 		for (action=action->next; action; action = action->next)
@@ -414,8 +414,8 @@ void irq_install_pre_handler(int virt_irq,
 	data->pre_handler_arg1 = arg1;
 	data->pre_handler_arg2 = arg2;
 
-	desc->handler = (desc->handler == &sun4u_irq ?
-			 &sun4u_irq_ack : &sun4v_irq_ack);
+	desc->chip = (desc->chip == &sun4u_irq ?
+		      &sun4u_irq_ack : &sun4v_irq_ack);
 }
 
 unsigned int build_irq(int inofixup, unsigned long iclr, unsigned long imap)
@@ -431,7 +431,7 @@ unsigned int build_irq(int inofixup, unsigned long iclr, unsigned long imap)
 	bucket = &ivector_table[ino];
 	if (!bucket->virt_irq) {
 		bucket->virt_irq = virt_irq_alloc(__irq(bucket));
-		irq_desc[bucket->virt_irq].handler = &sun4u_irq;
+		irq_desc[bucket->virt_irq].chip = &sun4u_irq;
 	}
 
 	desc = irq_desc + bucket->virt_irq;
@@ -465,7 +465,7 @@ unsigned int sun4v_build_irq(u32 devhandle, unsigned int devino)
 	bucket = &ivector_table[sysino];
 	if (!bucket->virt_irq) {
 		bucket->virt_irq = virt_irq_alloc(__irq(bucket));
-		irq_desc[bucket->virt_irq].handler = &sun4v_irq;
+		irq_desc[bucket->virt_irq].chip = &sun4v_irq;
 	}
 
 	desc = irq_desc + bucket->virt_irq;
diff --git a/arch/um/kernel/irq.c b/arch/um/kernel/irq.c
index 2ffda012385e..fae43a3054a0 100644
--- a/arch/um/kernel/irq.c
+++ b/arch/um/kernel/irq.c
@@ -63,7 +63,7 @@ int show_interrupts(struct seq_file *p, void *v)
 		for_each_online_cpu(j)
 			seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
 #endif
-		seq_printf(p, " %14s", irq_desc[i].handler->typename);
+		seq_printf(p, " %14s", irq_desc[i].chip->typename);
 		seq_printf(p, "  %s", action->name);
 
 		for (action=action->next; action; action = action->next)
@@ -451,13 +451,13 @@ void __init init_IRQ(void)
 	irq_desc[TIMER_IRQ].status = IRQ_DISABLED;
 	irq_desc[TIMER_IRQ].action = NULL;
 	irq_desc[TIMER_IRQ].depth = 1;
-	irq_desc[TIMER_IRQ].handler = &SIGVTALRM_irq_type;
+	irq_desc[TIMER_IRQ].chip = &SIGVTALRM_irq_type;
 	enable_irq(TIMER_IRQ);
 	for (i = 1; i < NR_IRQS; i++) {
 		irq_desc[i].status = IRQ_DISABLED;
 		irq_desc[i].action = NULL;
 		irq_desc[i].depth = 1;
-		irq_desc[i].handler = &normal_irq_type;
+		irq_desc[i].chip = &normal_irq_type;
 		enable_irq(i);
 	}
 }
diff --git a/arch/v850/kernel/irq.c b/arch/v850/kernel/irq.c
index 7a151c26f82e..858c45819aab 100644
--- a/arch/v850/kernel/irq.c
+++ b/arch/v850/kernel/irq.c
@@ -65,10 +65,10 @@ int show_interrupts(struct seq_file *p, void *v)
 			int j;
 			int count = 0;
 			int num = -1;
-			const char *type_name = irq_desc[irq].handler->typename;
+			const char *type_name = irq_desc[irq].chip->typename;
 
 			for (j = 0; j < NR_IRQS; j++)
-				if (irq_desc[j].handler->typename == type_name){
+				if (irq_desc[j].chip->typename == type_name){
 					if (irq == j)
 						num = count;
 					count++;
@@ -117,7 +117,7 @@ init_irq_handlers (int base_irq, int num, int interval,
 		irq_desc[base_irq].status  = IRQ_DISABLED;
 		irq_desc[base_irq].action  = NULL;
 		irq_desc[base_irq].depth   = 1;
-		irq_desc[base_irq].handler = irq_type;
+		irq_desc[base_irq].chip = irq_type;
 		base_irq += interval;
 	}
 }
diff --git a/arch/x86_64/kernel/i8259.c b/arch/x86_64/kernel/i8259.c
index 86b2c1e197aa..3dd1659427dc 100644
--- a/arch/x86_64/kernel/i8259.c
+++ b/arch/x86_64/kernel/i8259.c
@@ -235,7 +235,7 @@ void make_8259A_irq(unsigned int irq)
 {
 	disable_irq_nosync(irq);
 	io_apic_irqs &= ~(1<<irq);
-	irq_desc[irq].handler = &i8259A_irq_type;
+	irq_desc[irq].chip = &i8259A_irq_type;
 	enable_irq(irq);
 }
 
@@ -468,12 +468,12 @@ void __init init_ISA_irqs (void)
 			/*
 			 * 16 old-style INTA-cycle interrupts:
 			 */
-			irq_desc[i].handler = &i8259A_irq_type;
+			irq_desc[i].chip = &i8259A_irq_type;
 		} else {
 			/*
 			 * 'high' PCI IRQs filled in on demand
 			 */
-			irq_desc[i].handler = &no_irq_type;
+			irq_desc[i].chip = &no_irq_type;
 		}
 	}
 }
diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c
index c768d8a036d0..88a8736eb8ce 100644
--- a/arch/x86_64/kernel/io_apic.c
+++ b/arch/x86_64/kernel/io_apic.c
@@ -876,15 +876,17 @@ static struct hw_interrupt_type ioapic_edge_type;
 #define IOAPIC_EDGE	0
 #define IOAPIC_LEVEL	1
 
-static inline void ioapic_register_intr(int irq, int vector, unsigned long trigger)
+static void ioapic_register_intr(int irq, int vector, unsigned long trigger)
 {
-	unsigned idx = use_pci_vector() && !platform_legacy_irq(irq) ? vector : irq;
+	unsigned idx;
+
+	idx = use_pci_vector() && !platform_legacy_irq(irq) ? vector : irq;
 
 	if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
 			trigger == IOAPIC_LEVEL)
-		irq_desc[idx].handler = &ioapic_level_type;
+		irq_desc[idx].chip = &ioapic_level_type;
 	else
-		irq_desc[idx].handler = &ioapic_edge_type;
+		irq_desc[idx].chip = &ioapic_edge_type;
 	set_intr_gate(vector, interrupt[idx]);
 }
 
@@ -986,7 +988,7 @@ static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, in
 	 * The timer IRQ doesn't have to know that behind the
 	 * scene we have a 8259A-master in AEOI mode ...
 	 */
-	irq_desc[0].handler = &ioapic_edge_type;
+	irq_desc[0].chip = &ioapic_edge_type;
 
 	/*
 	 * Add it to the IO-APIC irq-routing table:
@@ -1683,7 +1685,7 @@ static inline void init_IO_APIC_traps(void)
 				make_8259A_irq(irq);
 			else
 				/* Strange. Oh, well.. */
-				irq_desc[irq].handler = &no_irq_type;
+				irq_desc[irq].chip = &no_irq_type;
 		}
 	}
 }
@@ -1900,7 +1902,7 @@ static inline void check_timer(void)
 	apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as Virtual Wire IRQ...");
 
 	disable_8259A_irq(0);
-	irq_desc[0].handler = &lapic_irq_type;
+	irq_desc[0].chip = &lapic_irq_type;
 	apic_write(APIC_LVT0, APIC_DM_FIXED | vector);	/* Fixed mode */
 	enable_8259A_irq(0);
 
diff --git a/arch/x86_64/kernel/irq.c b/arch/x86_64/kernel/irq.c
index bfa82f52a5cc..0c2fb2c77bd0 100644
--- a/arch/x86_64/kernel/irq.c
+++ b/arch/x86_64/kernel/irq.c
@@ -79,7 +79,7 @@ int show_interrupts(struct seq_file *p, void *v)
 		for_each_online_cpu(j)
 			seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
 #endif
-		seq_printf(p, " %14s", irq_desc[i].handler->typename);
+		seq_printf(p, " %14s", irq_desc[i].chip->typename);
 
 		seq_printf(p, "  %s", action->name);
 		for (action=action->next; action; action = action->next)
@@ -151,8 +151,8 @@ void fixup_irqs(cpumask_t map)
 			printk("Breaking affinity for irq %i\n", irq);
 			mask = map;
 		}
-		if (irq_desc[irq].handler->set_affinity)
-			irq_desc[irq].handler->set_affinity(irq, mask);
+		if (irq_desc[irq].chip->set_affinity)
+			irq_desc[irq].chip->set_affinity(irq, mask);
 		else if (irq_desc[irq].action && !(warned++))
 			printk("Cannot set affinity for irq %i\n", irq);
 	}
diff --git a/arch/xtensa/kernel/irq.c b/arch/xtensa/kernel/irq.c
index 51f9bed455fa..1cf744ee0959 100644
--- a/arch/xtensa/kernel/irq.c
+++ b/arch/xtensa/kernel/irq.c
@@ -100,7 +100,7 @@ int show_interrupts(struct seq_file *p, void *v)
 		for_each_online_cpu(j)
 			seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
 #endif
-		seq_printf(p, " %14s", irq_desc[i].handler->typename);
+		seq_printf(p, " %14s", irq_desc[i].chip->typename);
 		seq_printf(p, "  %s", action->name);
 
 		for (action=action->next; action; action = action->next)
@@ -181,7 +181,7 @@ void __init init_IRQ(void)
 	int i;
 
 	for (i=0; i < XTENSA_NR_IRQS; i++)
-		irq_desc[i].handler = &xtensa_irq_type;
+		irq_desc[i].chip = &xtensa_irq_type;
 
 	cached_irq_mask = 0;
 
diff --git a/drivers/char/vr41xx_giu.c b/drivers/char/vr41xx_giu.c
index 05e6e814d86f..073da48c092e 100644
--- a/drivers/char/vr41xx_giu.c
+++ b/drivers/char/vr41xx_giu.c
@@ -689,9 +689,9 @@ static int __devinit giu_probe(struct platform_device *dev)
 
 	for (i = GIU_IRQ_BASE; i <= GIU_IRQ_LAST; i++) {
 		if (i < GIU_IRQ(GIUINT_HIGH_OFFSET))
-			irq_desc[i].handler = &giuint_low_irq_type;
+			irq_desc[i].chip = &giuint_low_irq_type;
 		else
-			irq_desc[i].handler = &giuint_high_irq_type;
+			irq_desc[i].chip = &giuint_high_irq_type;
 	}
 
 	return cascade_irq(GIUINT_IRQ, giu_get_irq);
diff --git a/drivers/parisc/dino.c b/drivers/parisc/dino.c
index 6e8ed0c81a6c..ce0a6ebcff15 100644
--- a/drivers/parisc/dino.c
+++ b/drivers/parisc/dino.c
@@ -299,7 +299,7 @@ struct pci_port_ops dino_port_ops = {
 
 static void dino_disable_irq(unsigned int irq)
 {
-	struct dino_device *dino_dev = irq_desc[irq].handler_data;
+	struct dino_device *dino_dev = irq_desc[irq].chip_data;
 	int local_irq = gsc_find_local_irq(irq, dino_dev->global_irq, DINO_LOCAL_IRQS);
 
 	DBG(KERN_WARNING "%s(0x%p, %d)\n", __FUNCTION__, dino_dev, irq);
@@ -311,7 +311,7 @@ static void dino_disable_irq(unsigned int irq)
 
 static void dino_enable_irq(unsigned int irq)
 {
-	struct dino_device *dino_dev = irq_desc[irq].handler_data;
+	struct dino_device *dino_dev = irq_desc[irq].chip_data;
 	int local_irq = gsc_find_local_irq(irq, dino_dev->global_irq, DINO_LOCAL_IRQS);
 	u32 tmp;
 
diff --git a/drivers/parisc/eisa.c b/drivers/parisc/eisa.c
index 9d3bd15bf53b..58f0ce8d78e0 100644
--- a/drivers/parisc/eisa.c
+++ b/drivers/parisc/eisa.c
@@ -350,7 +350,7 @@ static int __devinit eisa_probe(struct parisc_device *dev)
 	irq_desc[2].action = &irq2_action;
 	
 	for (i = 0; i < 16; i++) {
-		irq_desc[i].handler = &eisa_interrupt_type;
+		irq_desc[i].chip = &eisa_interrupt_type;
 	}
 	
 	EISA_bus = 1;
diff --git a/drivers/parisc/gsc.c b/drivers/parisc/gsc.c
index 16d40f95978d..5476ba7709b3 100644
--- a/drivers/parisc/gsc.c
+++ b/drivers/parisc/gsc.c
@@ -109,7 +109,7 @@ int gsc_find_local_irq(unsigned int irq, int *global_irqs, int limit)
 
 static void gsc_asic_disable_irq(unsigned int irq)
 {
-	struct gsc_asic *irq_dev = irq_desc[irq].handler_data;
+	struct gsc_asic *irq_dev = irq_desc[irq].chip_data;
 	int local_irq = gsc_find_local_irq(irq, irq_dev->global_irq, 32);
 	u32 imr;
 
@@ -124,7 +124,7 @@ static void gsc_asic_disable_irq(unsigned int irq)
 
 static void gsc_asic_enable_irq(unsigned int irq)
 {
-	struct gsc_asic *irq_dev = irq_desc[irq].handler_data;
+	struct gsc_asic *irq_dev = irq_desc[irq].chip_data;
 	int local_irq = gsc_find_local_irq(irq, irq_dev->global_irq, 32);
 	u32 imr;
 
@@ -164,8 +164,8 @@ int gsc_assign_irq(struct hw_interrupt_type *type, void *data)
 	if (irq > GSC_IRQ_MAX)
 		return NO_IRQ;
 
-	irq_desc[irq].handler = type;
-	irq_desc[irq].handler_data = data;
+	irq_desc[irq].chip = type;
+	irq_desc[irq].chip_data = data;
 	return irq++;
 }
 
diff --git a/drivers/parisc/iosapic.c b/drivers/parisc/iosapic.c
index 7a458d5bc751..1fbda77cefc2 100644
--- a/drivers/parisc/iosapic.c
+++ b/drivers/parisc/iosapic.c
@@ -619,7 +619,7 @@ iosapic_set_irt_data( struct vector_info *vi, u32 *dp0, u32 *dp1)
 
 static struct vector_info *iosapic_get_vector(unsigned int irq)
 {
-	return irq_desc[irq].handler_data;
+	return irq_desc[irq].chip_data;
 }
 
 static void iosapic_disable_irq(unsigned int irq)
diff --git a/drivers/parisc/superio.c b/drivers/parisc/superio.c
index 828eb45062de..a988dc7a9abd 100644
--- a/drivers/parisc/superio.c
+++ b/drivers/parisc/superio.c
@@ -360,7 +360,7 @@ int superio_fixup_irq(struct pci_dev *pcidev)
 #endif
 
 	for (i = 0; i < 16; i++) {
-		irq_desc[i].handler = &superio_interrupt_type;
+		irq_desc[i].chip = &superio_interrupt_type;
 	}
 
 	/*
diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index 7f8429284fab..76d023d8a33b 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -429,12 +429,12 @@ static void irq_handler_init(int cap_id, int pos, int mask)
 
 	spin_lock_irqsave(&irq_desc[pos].lock, flags);
 	if (cap_id == PCI_CAP_ID_MSIX)
-		irq_desc[pos].handler = &msix_irq_type;
+		irq_desc[pos].chip = &msix_irq_type;
 	else {
 		if (!mask)
-			irq_desc[pos].handler = &msi_irq_wo_maskbit_type;
+			irq_desc[pos].chip = &msi_irq_wo_maskbit_type;
 		else
-			irq_desc[pos].handler = &msi_irq_w_maskbit_type;
+			irq_desc[pos].chip = &msi_irq_w_maskbit_type;
 	}
 	spin_unlock_irqrestore(&irq_desc[pos].lock, flags);
 }
diff --git a/drivers/pcmcia/hd64465_ss.c b/drivers/pcmcia/hd64465_ss.c
index b39435bbfaeb..c662e4f89d46 100644
--- a/drivers/pcmcia/hd64465_ss.c
+++ b/drivers/pcmcia/hd64465_ss.c
@@ -244,8 +244,8 @@ static void hs_map_irq(hs_socket_t *sp, unsigned int irq)
 
     	hs_mapped_irq[irq].sock = sp;
 	/* insert ourselves as the irq controller */
-	hs_mapped_irq[irq].old_handler = irq_desc[irq].handler;
-	irq_desc[irq].handler = &hd64465_ss_irq_type;
+	hs_mapped_irq[irq].old_handler = irq_desc[irq].chip;
+	irq_desc[irq].chip = &hd64465_ss_irq_type;
 }
 
 
@@ -260,7 +260,7 @@ static void hs_unmap_irq(hs_socket_t *sp, unsigned int irq)
 	    return;
 		
 	/* restore the original irq controller */
-	irq_desc[irq].handler = hs_mapped_irq[irq].old_handler;
+	irq_desc[irq].chip = hs_mapped_irq[irq].old_handler;
 }
 
 /*============================================================*/
diff --git a/include/asm-powerpc/hw_irq.h b/include/asm-powerpc/hw_irq.h
index ce0f7db63c16..a27ed3feb315 100644
--- a/include/asm-powerpc/hw_irq.h
+++ b/include/asm-powerpc/hw_irq.h
@@ -86,20 +86,20 @@ static inline void local_irq_save_ptr(unsigned long *flags)
 #define mask_irq(irq)						\
 	({							\
 	 	irq_desc_t *desc = get_irq_desc(irq);		\
-		if (desc->handler && desc->handler->disable)	\
-			desc->handler->disable(irq);		\
+		if (desc->chip && desc->chip->disable)	\
+			desc->chip->disable(irq);		\
 	})
 #define unmask_irq(irq)						\
 	({							\
 	 	irq_desc_t *desc = get_irq_desc(irq);		\
-		if (desc->handler && desc->handler->enable)	\
-			desc->handler->enable(irq);		\
+		if (desc->chip && desc->chip->enable)	\
+			desc->chip->enable(irq);		\
 	})
 #define ack_irq(irq)						\
 	({							\
 	 	irq_desc_t *desc = get_irq_desc(irq);		\
-		if (desc->handler && desc->handler->ack)	\
-			desc->handler->ack(irq);		\
+		if (desc->chip && desc->chip->ack)	\
+			desc->chip->ack(irq);		\
 	})
 
 /* Should we handle this via lost interrupts and IPIs or should we don't care like
diff --git a/include/linux/irq.h b/include/linux/irq.h
index 676e00dfb21a..9597a6904239 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -69,8 +69,8 @@ typedef struct hw_interrupt_type  hw_irq_controller;
  * Pad this out to 32 bytes for cache and indexing reasons.
  */
 typedef struct irq_desc {
-	hw_irq_controller *handler;
-	void *handler_data;
+	hw_irq_controller *chip;
+	void *chip_data;
 	struct irqaction *action;	/* IRQ action list */
 	unsigned int status;		/* IRQ status */
 	unsigned int depth;		/* nested irq disables */
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c
index 3467097ca61a..6f1e68a46cbc 100644
--- a/kernel/irq/autoprobe.c
+++ b/kernel/irq/autoprobe.c
@@ -41,7 +41,7 @@ unsigned long probe_irq_on(void)
 
 		spin_lock_irq(&desc->lock);
 		if (!irq_desc[i].action)
-			irq_desc[i].handler->startup(i);
+			irq_desc[i].chip->startup(i);
 		spin_unlock_irq(&desc->lock);
 	}
 
@@ -59,7 +59,7 @@ unsigned long probe_irq_on(void)
 		spin_lock_irq(&desc->lock);
 		if (!desc->action) {
 			desc->status |= IRQ_AUTODETECT | IRQ_WAITING;
-			if (desc->handler->startup(i))
+			if (desc->chip->startup(i))
 				desc->status |= IRQ_PENDING;
 		}
 		spin_unlock_irq(&desc->lock);
@@ -85,7 +85,7 @@ unsigned long probe_irq_on(void)
 			/* It triggered already - consider it spurious. */
 			if (!(status & IRQ_WAITING)) {
 				desc->status = status & ~IRQ_AUTODETECT;
-				desc->handler->shutdown(i);
+				desc->chip->shutdown(i);
 			} else
 				if (i < 32)
 					val |= 1 << i;
@@ -128,7 +128,7 @@ unsigned int probe_irq_mask(unsigned long val)
 				mask |= 1 << i;
 
 			desc->status = status & ~IRQ_AUTODETECT;
-			desc->handler->shutdown(i);
+			desc->chip->shutdown(i);
 		}
 		spin_unlock_irq(&desc->lock);
 	}
@@ -173,7 +173,7 @@ int probe_irq_off(unsigned long val)
 				nr_irqs++;
 			}
 			desc->status = status & ~IRQ_AUTODETECT;
-			desc->handler->shutdown(i);
+			desc->chip->shutdown(i);
 		}
 		spin_unlock_irq(&desc->lock);
 	}
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 0f6530117105..f9d95705a4ac 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -31,7 +31,7 @@
 irq_desc_t irq_desc[NR_IRQS] __cacheline_aligned = {
 	[0 ... NR_IRQS-1] = {
 		.status = IRQ_DISABLED,
-		.handler = &no_irq_type,
+		.chip = &no_irq_type,
 		.lock = SPIN_LOCK_UNLOCKED
 	}
 };
@@ -118,16 +118,16 @@ fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs)
 		/*
 		 * No locking required for CPU-local interrupts:
 		 */
-		if (desc->handler->ack)
-			desc->handler->ack(irq);
+		if (desc->chip->ack)
+			desc->chip->ack(irq);
 		action_ret = handle_IRQ_event(irq, regs, desc->action);
-		desc->handler->end(irq);
+		desc->chip->end(irq);
 		return 1;
 	}
 
 	spin_lock(&desc->lock);
-	if (desc->handler->ack)
-		desc->handler->ack(irq);
+	if (desc->chip->ack)
+		desc->chip->ack(irq);
 	/*
 	 * REPLAY is when Linux resends an IRQ that was dropped earlier
 	 * WAITING is used by probe to mark irqs that are being tested
@@ -187,7 +187,7 @@ out:
 	 * The ->end() handler has to deal with interrupts which got
 	 * disabled while the handler was running.
 	 */
-	desc->handler->end(irq);
+	desc->chip->end(irq);
 	spin_unlock(&desc->lock);
 
 	return 1;
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 1279e3499534..31ee1f3bfcf9 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -69,7 +69,7 @@ void disable_irq_nosync(unsigned int irq)
 	spin_lock_irqsave(&desc->lock, flags);
 	if (!desc->depth++) {
 		desc->status |= IRQ_DISABLED;
-		desc->handler->disable(irq);
+		desc->chip->disable(irq);
 	}
 	spin_unlock_irqrestore(&desc->lock, flags);
 }
@@ -131,9 +131,9 @@ void enable_irq(unsigned int irq)
 		desc->status = status;
 		if ((status & (IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) {
 			desc->status = status | IRQ_REPLAY;
-			hw_resend_irq(desc->handler,irq);
+			hw_resend_irq(desc->chip,irq);
 		}
-		desc->handler->enable(irq);
+		desc->chip->enable(irq);
 		/* fall-through */
 	}
 	default:
@@ -178,7 +178,7 @@ int setup_irq(unsigned int irq, struct irqaction * new)
 	if (irq >= NR_IRQS)
 		return -EINVAL;
 
-	if (desc->handler == &no_irq_type)
+	if (desc->chip == &no_irq_type)
 		return -ENOSYS;
 	/*
 	 * Some drivers like serial.c use request_irq() heavily,
@@ -230,10 +230,10 @@ int setup_irq(unsigned int irq, struct irqaction * new)
 		desc->depth = 0;
 		desc->status &= ~(IRQ_DISABLED | IRQ_AUTODETECT |
 				  IRQ_WAITING | IRQ_INPROGRESS);
-		if (desc->handler->startup)
-			desc->handler->startup(irq);
+		if (desc->chip->startup)
+			desc->chip->startup(irq);
 		else
-			desc->handler->enable(irq);
+			desc->chip->enable(irq);
 	}
 	spin_unlock_irqrestore(&desc->lock,flags);
 
@@ -295,16 +295,16 @@ void free_irq(unsigned int irq, void *dev_id)
 
 			/* Currently used only by UML, might disappear one day.*/
 #ifdef CONFIG_IRQ_RELEASE_METHOD
-			if (desc->handler->release)
-				desc->handler->release(irq, dev_id);
+			if (desc->chip->release)
+				desc->chip->release(irq, dev_id);
 #endif
 
 			if (!desc->action) {
 				desc->status |= IRQ_DISABLED;
-				if (desc->handler->shutdown)
-					desc->handler->shutdown(irq);
+				if (desc->chip->shutdown)
+					desc->chip->shutdown(irq);
 				else
-					desc->handler->disable(irq);
+					desc->chip->disable(irq);
 			}
 			spin_unlock_irqrestore(&desc->lock,flags);
 			unregister_handler_proc(irq, action);
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index a12d00eb5e7c..d978c87bca93 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -33,7 +33,7 @@ void move_native_irq(int irq)
 	if (unlikely(cpus_empty(pending_irq_cpumask[irq])))
 		return;
 
-	if (!desc->handler->set_affinity)
+	if (!desc->chip->set_affinity)
 		return;
 
 	assert_spin_locked(&desc->lock);
@@ -51,12 +51,12 @@ void move_native_irq(int irq)
 	 */
 	if (likely(!cpus_empty(tmp))) {
 		if (likely(!(desc->status & IRQ_DISABLED)))
-			desc->handler->disable(irq);
+			desc->chip->disable(irq);
 
-		desc->handler->set_affinity(irq,tmp);
+		desc->chip->set_affinity(irq,tmp);
 
 		if (likely(!(desc->status & IRQ_DISABLED)))
-			desc->handler->enable(irq);
+			desc->chip->enable(irq);
 	}
 	cpus_clear(pending_irq_cpumask[irq]);
 }
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index afacd6f585fa..90fe05f23e69 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -37,7 +37,7 @@ void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val)
 {
 	set_balance_irq_affinity(irq, mask_val);
 	irq_affinity[irq] = mask_val;
-	irq_desc[irq].handler->set_affinity(irq, mask_val);
+	irq_desc[irq].chip->set_affinity(irq, mask_val);
 }
 #endif
 
@@ -59,7 +59,7 @@ static int irq_affinity_write_proc(struct file *file, const char __user *buffer,
 	unsigned int irq = (int)(long)data, full_count = count, err;
 	cpumask_t new_value, tmp;
 
-	if (!irq_desc[irq].handler->set_affinity || no_irq_affinity)
+	if (!irq_desc[irq].chip->set_affinity || no_irq_affinity)
 		return -EIO;
 
 	err = cpumask_parse(buffer, count, new_value);
@@ -122,7 +122,7 @@ void register_irq_proc(unsigned int irq)
 	char name [MAX_NAMELEN];
 
 	if (!root_irq_dir ||
-		(irq_desc[irq].handler == &no_irq_type) ||
+		(irq_desc[irq].chip == &no_irq_type) ||
 			irq_dir[irq])
 		return;
 
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index b2fb3c18d06b..ea3ceed362da 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -81,7 +81,7 @@ static int misrouted_irq(int irq, struct pt_regs *regs)
 		 * IRQ controller clean up too
 		 */
 		if(work)
-			desc->handler->end(i);
+			desc->chip->end(i);
 		spin_unlock(&desc->lock);
 	}
 	/* So the caller can adjust the irq error counts */
@@ -166,7 +166,7 @@ void note_interrupt(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret,
 		 */
 		printk(KERN_EMERG "Disabling IRQ #%d\n", irq);
 		desc->status |= IRQ_DISABLED;
-		desc->handler->disable(irq);
+		desc->chip->disable(irq);
 	}
 	desc->irqs_unhandled = 0;
 }
-- 
cgit v1.2.3


From a53da52fd743fd637637572838c0a7af23a2d038 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 29 Jun 2006 02:24:38 -0700
Subject: [PATCH] genirq: cleanup: merge irq_affinity[] into irq_desc[]

Consolidation: remove the irq_affinity[NR_IRQS] array and move it into the
irq_desc[NR_IRQS].affinity field.

[akpm@osdl.org: sparc64 build fix]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/alpha/kernel/irq.c               | 2 +-
 arch/i386/kernel/irq.c                | 2 +-
 arch/ia64/kernel/irq.c                | 4 ++--
 arch/parisc/kernel/irq.c              | 8 ++++----
 arch/powerpc/kernel/irq.c             | 2 +-
 arch/powerpc/platforms/pseries/xics.c | 4 ++--
 arch/powerpc/sysdev/mpic.c            | 2 +-
 arch/ppc/syslib/open_pic.c            | 4 ++--
 arch/sparc64/kernel/irq.c             | 2 +-
 arch/x86_64/kernel/irq.c              | 2 +-
 include/linux/irq.h                   | 7 ++++---
 kernel/irq/handle.c                   | 5 ++++-
 kernel/irq/manage.c                   | 2 --
 kernel/irq/proc.c                     | 4 ++--
 14 files changed, 26 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/arch/alpha/kernel/irq.c b/arch/alpha/kernel/irq.c
index ec9d243d42c9..63af36cf7f6e 100644
--- a/arch/alpha/kernel/irq.c
+++ b/arch/alpha/kernel/irq.c
@@ -56,7 +56,7 @@ select_smp_affinity(unsigned int irq)
 		cpu = (cpu < (NR_CPUS-1) ? cpu + 1 : 0);
 	last_cpu = cpu;
 
-	irq_affinity[irq] = cpumask_of_cpu(cpu);
+	irq_desc[irq].affinity = cpumask_of_cpu(cpu);
 	irq_desc[irq].chip->set_affinity(irq, cpumask_of_cpu(cpu));
 	return 0;
 }
diff --git a/arch/i386/kernel/irq.c b/arch/i386/kernel/irq.c
index b942a5918dab..3336cef9605a 100644
--- a/arch/i386/kernel/irq.c
+++ b/arch/i386/kernel/irq.c
@@ -291,7 +291,7 @@ void fixup_irqs(cpumask_t map)
 		if (irq == 2)
 			continue;
 
-		cpus_and(mask, irq_affinity[irq], map);
+		cpus_and(mask, irq_desc[irq].affinity, map);
 		if (any_online_cpu(mask) == NR_CPUS) {
 			printk("Breaking affinity for irq %i\n", irq);
 			mask = map;
diff --git a/arch/ia64/kernel/irq.c b/arch/ia64/kernel/irq.c
index 2645153dba8a..c4e1b3b60b48 100644
--- a/arch/ia64/kernel/irq.c
+++ b/arch/ia64/kernel/irq.c
@@ -100,7 +100,7 @@ void set_irq_affinity_info (unsigned int irq, int hwid, int redir)
 	cpu_set(cpu_logical_id(hwid), mask);
 
 	if (irq < NR_IRQS) {
-		irq_affinity[irq] = mask;
+		irq_desc[irq].affinity = mask;
 		irq_redir[irq] = (char) (redir & 0xff);
 	}
 }
@@ -131,7 +131,7 @@ static void migrate_irqs(void)
 		if (desc->status == IRQ_PER_CPU)
 			continue;
 
-		cpus_and(mask, irq_affinity[irq], cpu_online_map);
+		cpus_and(mask, irq_desc[irq].affinity, cpu_online_map);
 		if (any_online_cpu(mask) == NR_CPUS) {
 			/*
 			 * Save it for phase 2 processing
diff --git a/arch/parisc/kernel/irq.c b/arch/parisc/kernel/irq.c
index 26e69f73fdb0..ea6a55e1a0c5 100644
--- a/arch/parisc/kernel/irq.c
+++ b/arch/parisc/kernel/irq.c
@@ -94,7 +94,7 @@ int cpu_check_affinity(unsigned int irq, cpumask_t *dest)
 	if (irq == TIMER_IRQ || irq == IPI_IRQ) {
 		/* Bad linux design decision.  The mask has already
 		 * been set; we must reset it */
-		irq_affinity[irq] = CPU_MASK_ALL;
+		irq_desc[irq].affinity = CPU_MASK_ALL;
 		return -EINVAL;
 	}
 
@@ -110,7 +110,7 @@ static void cpu_set_affinity_irq(unsigned int irq, cpumask_t dest)
 	if (cpu_check_affinity(irq, &dest))
 		return;
 
-	irq_affinity[irq] = dest;
+	irq_desc[irq].affinity = dest;
 }
 #endif
 
@@ -265,7 +265,7 @@ int txn_alloc_irq(unsigned int bits_wide)
 unsigned long txn_affinity_addr(unsigned int irq, int cpu)
 {
 #ifdef CONFIG_SMP
-	irq_affinity[irq] = cpumask_of_cpu(cpu);
+	irq_desc[irq].affinity = cpumask_of_cpu(cpu);
 #endif
 
 	return cpu_data[cpu].txn_addr;
@@ -326,7 +326,7 @@ void do_cpu_irq_mask(struct pt_regs *regs)
 		/* Work our way from MSb to LSb...same order we alloc EIRs */
 		for (irq = TIMER_IRQ; eirr_val && bit; bit>>=1, irq++) {
 #ifdef CONFIG_SMP
-			cpumask_t dest = irq_affinity[irq];
+			cpumask_t dest = irq_desc[irq].affinity;
 #endif
 			if (!(bit & eirr_val))
 				continue;
diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index 8cfc779d882d..24f6050aa4ab 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -164,7 +164,7 @@ void fixup_irqs(cpumask_t map)
 		if (irq_desc[irq].status & IRQ_PER_CPU)
 			continue;
 
-		cpus_and(mask, irq_affinity[irq], map);
+		cpus_and(mask, irq_desc[irq].affinity, map);
 		if (any_online_cpu(mask) == NR_CPUS) {
 			printk("Breaking affinity for irq %i\n", irq);
 			mask = map;
diff --git a/arch/powerpc/platforms/pseries/xics.c b/arch/powerpc/platforms/pseries/xics.c
index bdc9e26a93cf..19c03dd43000 100644
--- a/arch/powerpc/platforms/pseries/xics.c
+++ b/arch/powerpc/platforms/pseries/xics.c
@@ -238,7 +238,7 @@ static int get_irq_server(unsigned int irq)
 {
 	unsigned int server;
 	/* For the moment only implement delivery to all cpus or one cpu */
-	cpumask_t cpumask = irq_affinity[irq];
+	cpumask_t cpumask = irq_desc[irq].affinity;
 	cpumask_t tmp = CPU_MASK_NONE;
 
 	if (!distribute_irqs)
@@ -729,7 +729,7 @@ void xics_migrate_irqs_away(void)
 
 		/* Reset affinity to all cpus */
 		desc->chip->set_affinity(virq, CPU_MASK_ALL);
-		irq_affinity[virq] = CPU_MASK_ALL;
+		irq_desc[irq].affinity = CPU_MASK_ALL;
 unlock:
 		spin_unlock_irqrestore(&desc->lock, flags);
 	}
diff --git a/arch/powerpc/sysdev/mpic.c b/arch/powerpc/sysdev/mpic.c
index f4613ee6b7a2..28df9c827ca6 100644
--- a/arch/powerpc/sysdev/mpic.c
+++ b/arch/powerpc/sysdev/mpic.c
@@ -906,7 +906,7 @@ void mpic_setup_this_cpu(void)
  	/* let the mpic know we want intrs. default affinity is 0xffffffff
 	 * until changed via /proc. That's how it's done on x86. If we want
 	 * it differently, then we should make sure we also change the default
-	 * values of irq_affinity in irq.c.
+	 * values of irq_desc[].affinity in irq.c.
  	 */
 	if (distribute_irqs) {
 	 	for (i = 0; i < mpic->num_sources ; i++)
diff --git a/arch/ppc/syslib/open_pic.c b/arch/ppc/syslib/open_pic.c
index 822058c2983e..767a0bc95817 100644
--- a/arch/ppc/syslib/open_pic.c
+++ b/arch/ppc/syslib/open_pic.c
@@ -615,8 +615,8 @@ void __devinit do_openpic_setup_cpu(void)
  	/* let the openpic know we want intrs. default affinity
  	 * is 0xffffffff until changed via /proc
  	 * That's how it's done on x86. If we want it differently, then
- 	 * we should make sure we also change the default values of irq_affinity
- 	 * in irq.c.
+ 	 * we should make sure we also change the default values of
+	 * irq_desc[].affinity in irq.c.
  	 */
  	for (i = 0; i < NumSources; i++)
 		openpic_mapirq(i, msk, CPU_MASK_ALL);
diff --git a/arch/sparc64/kernel/irq.c b/arch/sparc64/kernel/irq.c
index 5d7c821ab7f6..ab9e640df228 100644
--- a/arch/sparc64/kernel/irq.c
+++ b/arch/sparc64/kernel/irq.c
@@ -224,7 +224,7 @@ static inline struct ino_bucket *virt_irq_to_bucket(unsigned int virt_irq)
 #ifdef CONFIG_SMP
 static int irq_choose_cpu(unsigned int virt_irq)
 {
-	cpumask_t mask = irq_affinity[virt_irq];
+	cpumask_t mask = irq_desc[virt_irq].affinity;
 	int cpuid;
 
 	if (cpus_equal(mask, CPU_MASK_ALL)) {
diff --git a/arch/x86_64/kernel/irq.c b/arch/x86_64/kernel/irq.c
index 0c2fb2c77bd0..a1f1df5f7bfc 100644
--- a/arch/x86_64/kernel/irq.c
+++ b/arch/x86_64/kernel/irq.c
@@ -146,7 +146,7 @@ void fixup_irqs(cpumask_t map)
 		if (irq == 2)
 			continue;
 
-		cpus_and(mask, irq_affinity[irq], map);
+		cpus_and(mask, irq_desc[irq].affinity, map);
 		if (any_online_cpu(mask) == NR_CPUS) {
 			printk("Breaking affinity for irq %i\n", irq);
 			mask = map;
diff --git a/include/linux/irq.h b/include/linux/irq.h
index 9597a6904239..6e3ad6245bd3 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -77,6 +77,9 @@ typedef struct irq_desc {
 	unsigned int irq_count;		/* For detecting broken interrupts */
 	unsigned int irqs_unhandled;
 	spinlock_t lock;
+#ifdef CONFIG_SMP
+	cpumask_t affinity;
+#endif
 #if defined (CONFIG_GENERIC_PENDING_IRQ) || defined (CONFIG_IRQBALANCE)
 	unsigned int move_irq;		/* Flag need to re-target intr dest*/
 #endif
@@ -96,12 +99,10 @@ irq_descp (int irq)
 extern int setup_irq(unsigned int irq, struct irqaction * new);
 
 #ifdef CONFIG_GENERIC_HARDIRQS
-extern cpumask_t irq_affinity[NR_IRQS];
-
 #ifdef CONFIG_SMP
 static inline void set_native_irq_info(int irq, cpumask_t mask)
 {
-	irq_affinity[irq] = mask;
+	irq_desc[irq].affinity = mask;
 }
 #else
 static inline void set_native_irq_info(int irq, cpumask_t mask)
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index f9d95705a4ac..cc786aaf30d6 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -32,7 +32,10 @@ irq_desc_t irq_desc[NR_IRQS] __cacheline_aligned = {
 	[0 ... NR_IRQS-1] = {
 		.status = IRQ_DISABLED,
 		.chip = &no_irq_type,
-		.lock = SPIN_LOCK_UNLOCKED
+		.lock = SPIN_LOCK_UNLOCKED,
+#ifdef CONFIG_SMP
+		.affinity = CPU_MASK_ALL
+#endif
 	}
 };
 
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 31ee1f3bfcf9..c53662edc73d 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -16,8 +16,6 @@
 
 #ifdef CONFIG_SMP
 
-cpumask_t irq_affinity[NR_IRQS] = { [0 ... NR_IRQS-1] = CPU_MASK_ALL };
-
 #if defined (CONFIG_GENERIC_PENDING_IRQ) || defined (CONFIG_IRQBALANCE)
 cpumask_t __cacheline_aligned pending_irq_cpumask[NR_IRQS];
 #endif
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 90fe05f23e69..847b98a611e0 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -36,7 +36,7 @@ void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val)
 void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val)
 {
 	set_balance_irq_affinity(irq, mask_val);
-	irq_affinity[irq] = mask_val;
+	irq_desc[irq].affinity = mask_val;
 	irq_desc[irq].chip->set_affinity(irq, mask_val);
 }
 #endif
@@ -44,7 +44,7 @@ void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val)
 static int irq_affinity_read_proc(char *page, char **start, off_t off,
 				  int count, int *eof, void *data)
 {
-	int len = cpumask_scnprintf(page, count, irq_affinity[(long)data]);
+	int len = cpumask_scnprintf(page, count, irq_desc[(long)data].affinity);
 
 	if (count - len < 2)
 		return -EINVAL;
-- 
cgit v1.2.3


From a8553acd6c14e827078779c0a0ee1c18f27b2403 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 29 Jun 2006 02:24:38 -0700
Subject: [PATCH] genirq: cleanup: remove irq_descp()

Cleanup: remove irq_descp() - explicit use of irq_desc[] is shorter and more
readable.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/ia64/hp/sim/hpsim_irq.c |  6 +++---
 arch/ia64/kernel/iosapic.c   | 14 +++++++-------
 arch/ia64/kernel/irq.c       |  2 +-
 arch/ia64/kernel/irq_ia64.c  |  2 +-
 arch/ia64/kernel/mca.c       |  2 +-
 arch/ia64/kernel/smpboot.c   |  2 +-
 arch/powerpc/kernel/crash.c  |  2 +-
 include/linux/irq.h          |  7 -------
 kernel/irq/migration.c       |  2 +-
 9 files changed, 16 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/arch/ia64/hp/sim/hpsim_irq.c b/arch/ia64/hp/sim/hpsim_irq.c
index c0d25a2a3e9c..8145547bb52d 100644
--- a/arch/ia64/hp/sim/hpsim_irq.c
+++ b/arch/ia64/hp/sim/hpsim_irq.c
@@ -44,8 +44,8 @@ hpsim_irq_init (void)
 	int i;
 
 	for (i = 0; i < NR_IRQS; ++i) {
-		idesc = irq_descp(i);
-		if (idesc->handler == &no_irq_type)
-			idesc->handler = &irq_type_hp_sim;
+		idesc = irq_desc + i;
+		if (idesc->chip == &no_irq_type)
+			idesc->chip = &irq_type_hp_sim;
 	}
 }
diff --git a/arch/ia64/kernel/iosapic.c b/arch/ia64/kernel/iosapic.c
index abb4cb1c831e..efc7df4b0fd2 100644
--- a/arch/ia64/kernel/iosapic.c
+++ b/arch/ia64/kernel/iosapic.c
@@ -456,7 +456,7 @@ iosapic_startup_edge_irq (unsigned int irq)
 static void
 iosapic_ack_edge_irq (unsigned int irq)
 {
-	irq_desc_t *idesc = irq_descp(irq);
+	irq_desc_t *idesc = irq_desc + irq;
 
 	move_native_irq(irq);
 	/*
@@ -659,7 +659,7 @@ register_intr (unsigned int gsi, int vector, unsigned char delivery,
 	else
 		irq_type = &irq_type_iosapic_level;
 
-	idesc = irq_descp(vector);
+	idesc = irq_desc + vector;
 	if (idesc->chip != irq_type) {
 		if (idesc->chip != &no_irq_type)
 			printk(KERN_WARNING
@@ -793,14 +793,14 @@ again:
 			return -ENOSPC;
 	}
 
-	spin_lock_irqsave(&irq_descp(vector)->lock, flags);
+	spin_lock_irqsave(&irq_desc[vector].lock, flags);
 	spin_lock(&iosapic_lock);
 	{
 		if (gsi_to_vector(gsi) > 0) {
 			if (list_empty(&iosapic_intr_info[vector].rtes))
 				free_irq_vector(vector);
 			spin_unlock(&iosapic_lock);
-			spin_unlock_irqrestore(&irq_descp(vector)->lock,
+			spin_unlock_irqrestore(&irq_desc[vector].lock,
 					       flags);
 			goto again;
 		}
@@ -810,7 +810,7 @@ again:
 			      polarity, trigger);
 		if (err < 0) {
 			spin_unlock(&iosapic_lock);
-			spin_unlock_irqrestore(&irq_descp(vector)->lock,
+			spin_unlock_irqrestore(&irq_desc[vector].lock,
 					       flags);
 			return err;
 		}
@@ -825,7 +825,7 @@ again:
 		set_rte(gsi, vector, dest, mask);
 	}
 	spin_unlock(&iosapic_lock);
-	spin_unlock_irqrestore(&irq_descp(vector)->lock, flags);
+	spin_unlock_irqrestore(&irq_desc[vector].lock, flags);
 
 	printk(KERN_INFO "GSI %u (%s, %s) -> CPU %d (0x%04x) vector %d\n",
 	       gsi, (trigger == IOSAPIC_EDGE ? "edge" : "level"),
@@ -860,7 +860,7 @@ iosapic_unregister_intr (unsigned int gsi)
 	}
 	vector = irq_to_vector(irq);
 
-	idesc = irq_descp(irq);
+	idesc = irq_desc + irq;
 	spin_lock_irqsave(&idesc->lock, flags);
 	spin_lock(&iosapic_lock);
 	{
diff --git a/arch/ia64/kernel/irq.c b/arch/ia64/kernel/irq.c
index c4e1b3b60b48..7852382de2fa 100644
--- a/arch/ia64/kernel/irq.c
+++ b/arch/ia64/kernel/irq.c
@@ -120,7 +120,7 @@ static void migrate_irqs(void)
 	int 		irq, new_cpu;
 
 	for (irq=0; irq < NR_IRQS; irq++) {
-		desc = irq_descp(irq);
+		desc = irq_desc + irq;
 
 		/*
 		 * No handling for now.
diff --git a/arch/ia64/kernel/irq_ia64.c b/arch/ia64/kernel/irq_ia64.c
index 6d8fc9498ed9..f5035304594e 100644
--- a/arch/ia64/kernel/irq_ia64.c
+++ b/arch/ia64/kernel/irq_ia64.c
@@ -249,7 +249,7 @@ register_percpu_irq (ia64_vector vec, struct irqaction *action)
 
 	for (irq = 0; irq < NR_IRQS; ++irq)
 		if (irq_to_vector(irq) == vec) {
-			desc = irq_descp(irq);
+			desc = irq_desc + irq;
 			desc->status |= IRQ_PER_CPU;
 			desc->chip = &irq_type_ia64_lsapic;
 			if (action)
diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c
index 6a0880639bc9..d7dc5e63de63 100644
--- a/arch/ia64/kernel/mca.c
+++ b/arch/ia64/kernel/mca.c
@@ -1788,7 +1788,7 @@ ia64_mca_late_init(void)
 			cpe_poll_enabled = 0;
 			for (irq = 0; irq < NR_IRQS; ++irq)
 				if (irq_to_vector(irq) == cpe_vector) {
-					desc = irq_descp(irq);
+					desc = irq_desc + irq;
 					desc->status |= IRQ_PER_CPU;
 					setup_irq(irq, &mca_cpe_irqaction);
 					ia64_cpe_irq = irq;
diff --git a/arch/ia64/kernel/smpboot.c b/arch/ia64/kernel/smpboot.c
index d69288055599..5203df78f150 100644
--- a/arch/ia64/kernel/smpboot.c
+++ b/arch/ia64/kernel/smpboot.c
@@ -677,7 +677,7 @@ int migrate_platform_irqs(unsigned int cpu)
 			new_cpei_cpu = any_online_cpu(cpu_online_map);
 			mask = cpumask_of_cpu(new_cpei_cpu);
 			set_cpei_target_cpu(new_cpei_cpu);
-			desc = irq_descp(ia64_cpe_irq);
+			desc = irq_desc + ia64_cpe_irq;
 			/*
 			 * Switch for now, immediatly, we need to do fake intr
 			 * as other interrupts, but need to study CPEI behaviour with
diff --git a/arch/powerpc/kernel/crash.c b/arch/powerpc/kernel/crash.c
index 1bfad7c2f8c8..22ceba844bf4 100644
--- a/arch/powerpc/kernel/crash.c
+++ b/arch/powerpc/kernel/crash.c
@@ -190,7 +190,7 @@ void default_machine_crash_shutdown(struct pt_regs *regs)
 	local_irq_disable();
 
 	for_each_irq(irq) {
-		struct irq_desc *desc = irq_descp(irq);
+		struct irq_desc *desc = irq_desc + irq;
 
 		if (desc->status & IRQ_INPROGRESS)
 			desc->chip->end(irq);
diff --git a/include/linux/irq.h b/include/linux/irq.h
index 6e3ad6245bd3..80713d6e8c8d 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -87,13 +87,6 @@ typedef struct irq_desc {
 
 extern irq_desc_t irq_desc [NR_IRQS];
 
-/* Return a pointer to the irq descriptor for IRQ.  */
-static inline irq_desc_t *
-irq_descp (int irq)
-{
-	return irq_desc + irq;
-}
-
 #include <asm/hw_irq.h> /* the arch dependent stuff */
 
 extern int setup_irq(unsigned int irq, struct irqaction * new);
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index d978c87bca93..b4a4354d03d5 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -15,7 +15,7 @@ void set_pending_irq(unsigned int irq, cpumask_t mask)
 void move_native_irq(int irq)
 {
 	cpumask_t tmp;
-	irq_desc_t *desc = irq_descp(irq);
+	irq_desc_t *desc = irq_desc + irq;
 
 	if (likely(!desc->move_irq))
 		return;
-- 
cgit v1.2.3


From 2e60bbb6d50de654d8e68f115161e27878b5e72d Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 29 Jun 2006 02:24:39 -0700
Subject: [PATCH] genirq: cleanup: remove fastcall

Now that i386 defaults to regparm, explicit uses of fastcall are not needed
anymore.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/irq.h | 10 +++++++---
 kernel/irq/handle.c |  4 ++--
 2 files changed, 9 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index 80713d6e8c8d..eac1273dc4e1 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -170,11 +170,15 @@ static inline void set_balance_irq_affinity(unsigned int irq, cpumask_t mask)
 extern int no_irq_affinity;
 extern int noirqdebug_setup(char *str);
 
-extern fastcall irqreturn_t handle_IRQ_event(unsigned int irq, struct pt_regs *regs,
-					struct irqaction *action);
+extern irqreturn_t handle_IRQ_event(unsigned int irq, struct pt_regs *regs,
+				    struct irqaction *action);
+/*
+ * Explicit fastcall, because i386 4KSTACKS calls it from assembly:
+ */
 extern fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs);
+
 extern void note_interrupt(unsigned int irq, irq_desc_t *desc,
-					int action_ret, struct pt_regs *regs);
+			   int action_ret, struct pt_regs *regs);
 extern int can_request_irq(unsigned int irq, unsigned long irqflags);
 
 extern void init_irq_proc(void);
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index cc786aaf30d6..6b313ccf0edd 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -79,8 +79,8 @@ irqreturn_t no_action(int cpl, void *dev_id, struct pt_regs *regs)
 /*
  * Have got an event to handle:
  */
-fastcall irqreturn_t handle_IRQ_event(unsigned int irq, struct pt_regs *regs,
-				struct irqaction *action)
+irqreturn_t handle_IRQ_event(unsigned int irq, struct pt_regs *regs,
+			     struct irqaction *action)
 {
 	irqreturn_t ret, retval = IRQ_NONE;
 	unsigned int status = 0;
-- 
cgit v1.2.3


From 06fcb0c6fb3aae9570a32ac3b72a8222563baa69 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 29 Jun 2006 02:24:40 -0700
Subject: [PATCH] genirq: cleanup: misc code cleanups

Assorted code cleanups to the generic IRQ code.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/irq.h    | 54 ++++++++++++++++++++++++++++++--------------------
 kernel/irq/autoprobe.c | 16 +++++++--------
 kernel/irq/handle.c    |  4 ++--
 kernel/irq/manage.c    | 29 +++++++++++----------------
 kernel/irq/spurious.c  | 23 +++++++++++----------
 5 files changed, 66 insertions(+), 60 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index eac1273dc4e1..92c685414622 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -1,5 +1,5 @@
-#ifndef __irq_h
-#define __irq_h
+#ifndef _LINUX_IRQ_H
+#define _LINUX_IRQ_H
 
 /*
  * Please do not include this file in generic code.  There is currently
@@ -11,7 +11,7 @@
 
 #include <linux/smp.h>
 
-#if !defined(CONFIG_S390)
+#ifndef CONFIG_S390
 
 #include <linux/linkage.h>
 #include <linux/cache.h>
@@ -33,7 +33,7 @@
 #define IRQ_WAITING	32	/* IRQ not yet seen - for autodetection */
 #define IRQ_LEVEL	64	/* IRQ level triggered */
 #define IRQ_MASKED	128	/* IRQ masked - shouldn't be seen again */
-#if defined(ARCH_HAS_IRQ_PER_CPU)
+#ifdef ARCH_HAS_IRQ_PER_CPU
 # define IRQ_PER_CPU	256	/* IRQ is per CPU */
 # define CHECK_IRQ_PER_CPU(var) ((var) & IRQ_PER_CPU)
 #else
@@ -45,7 +45,7 @@
  * to describe about the low-level hardware. 
  */
 struct hw_interrupt_type {
-	const char * typename;
+	const char *typename;
 	unsigned int (*startup)(unsigned int irq);
 	void (*shutdown)(unsigned int irq);
 	void (*enable)(unsigned int irq);
@@ -80,7 +80,7 @@ typedef struct irq_desc {
 #ifdef CONFIG_SMP
 	cpumask_t affinity;
 #endif
-#if defined (CONFIG_GENERIC_PENDING_IRQ) || defined (CONFIG_IRQBALANCE)
+#if defined(CONFIG_GENERIC_PENDING_IRQ) || defined(CONFIG_IRQBALANCE)
 	unsigned int move_irq;		/* Flag need to re-target intr dest*/
 #endif
 } ____cacheline_aligned irq_desc_t;
@@ -89,9 +89,10 @@ extern irq_desc_t irq_desc [NR_IRQS];
 
 #include <asm/hw_irq.h> /* the arch dependent stuff */
 
-extern int setup_irq(unsigned int irq, struct irqaction * new);
+extern int setup_irq(unsigned int irq, struct irqaction *new);
 
 #ifdef CONFIG_GENERIC_HARDIRQS
+
 #ifdef CONFIG_SMP
 static inline void set_native_irq_info(int irq, cpumask_t mask)
 {
@@ -105,7 +106,7 @@ static inline void set_native_irq_info(int irq, cpumask_t mask)
 
 #ifdef CONFIG_SMP
 
-#if defined (CONFIG_GENERIC_PENDING_IRQ) || defined (CONFIG_IRQBALANCE)
+#if defined(CONFIG_GENERIC_PENDING_IRQ) || defined(CONFIG_IRQBALANCE)
 extern cpumask_t pending_irq_cpumask[NR_IRQS];
 
 void set_pending_irq(unsigned int irq, cpumask_t mask);
@@ -127,7 +128,7 @@ static inline void set_irq_info(int irq, cpumask_t mask)
 {
 }
 
-#else // CONFIG_PCI_MSI
+#else /* CONFIG_PCI_MSI */
 
 static inline void move_irq(int irq)
 {
@@ -138,26 +139,36 @@ static inline void set_irq_info(int irq, cpumask_t mask)
 {
 	set_native_irq_info(irq, mask);
 }
-#endif // CONFIG_PCI_MSI
 
-#else	// CONFIG_GENERIC_PENDING_IRQ || CONFIG_IRQBALANCE
+#endif /* CONFIG_PCI_MSI */
+
+#else /* CONFIG_GENERIC_PENDING_IRQ || CONFIG_IRQBALANCE */
+
+static inline void move_irq(int irq)
+{
+}
+
+static inline void move_native_irq(int irq)
+{
+}
+
+static inline void set_pending_irq(unsigned int irq, cpumask_t mask)
+{
+}
 
-#define move_irq(x)
-#define move_native_irq(x)
-#define set_pending_irq(x,y)
 static inline void set_irq_info(int irq, cpumask_t mask)
 {
 	set_native_irq_info(irq, mask);
 }
 
-#endif // CONFIG_GENERIC_PENDING_IRQ
+#endif /* CONFIG_GENERIC_PENDING_IRQ */
 
-#else // CONFIG_SMP
+#else /* CONFIG_SMP */
 
 #define move_irq(x)
 #define move_native_irq(x)
 
-#endif // CONFIG_SMP
+#endif /* CONFIG_SMP */
 
 #ifdef CONFIG_IRQBALANCE
 extern void set_balance_irq_affinity(unsigned int irq, cpumask_t mask);
@@ -186,17 +197,16 @@ extern void init_irq_proc(void);
 #ifdef CONFIG_AUTO_IRQ_AFFINITY
 extern int select_smp_affinity(unsigned int irq);
 #else
-static inline int
-select_smp_affinity(unsigned int irq)
+static inline int select_smp_affinity(unsigned int irq)
 {
 	return 1;
 }
 #endif
 
-#endif
+#endif /* CONFIG_GENERIC_HARDIRQS */
 
 extern hw_irq_controller no_irq_type;  /* needed in every arch ? */
 
-#endif
+#endif /* !CONFIG_S390 */
 
-#endif /* __irq_h */
+#endif /* _LINUX_IRQ_H */
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c
index d1f1e6f10a23..d6eab98a3173 100644
--- a/kernel/irq/autoprobe.c
+++ b/kernel/irq/autoprobe.c
@@ -27,7 +27,7 @@ static DEFINE_MUTEX(probing_active);
  */
 unsigned long probe_irq_on(void)
 {
-	unsigned long val;
+	unsigned long mask;
 	irq_desc_t *desc;
 	unsigned int i;
 
@@ -40,8 +40,8 @@ unsigned long probe_irq_on(void)
 		desc = irq_desc + i;
 
 		spin_lock_irq(&desc->lock);
-		if (!irq_desc[i].action)
-			irq_desc[i].chip->startup(i);
+		if (!desc->action)
+			desc->chip->startup(i);
 		spin_unlock_irq(&desc->lock);
 	}
 
@@ -73,11 +73,11 @@ unsigned long probe_irq_on(void)
 	/*
 	 * Now filter out any obviously spurious interrupts
 	 */
-	val = 0;
+	mask = 0;
 	for (i = 0; i < NR_IRQS; i++) {
-		irq_desc_t *desc = irq_desc + i;
 		unsigned int status;
 
+		desc = irq_desc + i;
 		spin_lock_irq(&desc->lock);
 		status = desc->status;
 
@@ -88,14 +88,13 @@ unsigned long probe_irq_on(void)
 				desc->chip->shutdown(i);
 			} else
 				if (i < 32)
-					val |= 1 << i;
+					mask |= 1 << i;
 		}
 		spin_unlock_irq(&desc->lock);
 	}
 
-	return val;
+	return mask;
 }
-
 EXPORT_SYMBOL(probe_irq_on);
 
 /**
@@ -184,6 +183,5 @@ int probe_irq_off(unsigned long val)
 
 	return irq_found;
 }
-
 EXPORT_SYMBOL(probe_irq_off);
 
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 6b313ccf0edd..f9c33a86cbdf 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -18,7 +18,7 @@
  * Linux has a controller-independent interrupt architecture.
  * Every controller has a 'controller-template', that is used
  * by the main code to do the right thing. Each driver-visible
- * interrupt source is transparently wired to the apropriate
+ * interrupt source is transparently wired to the appropriate
  * controller. Thus drivers need not be aware of the
  * interrupt-controller.
  *
@@ -111,7 +111,7 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct pt_regs *regs,
 fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs)
 {
 	irq_desc_t *desc = irq_desc + irq;
-	struct irqaction * action;
+	struct irqaction *action;
 	unsigned int status;
 
 	kstat_this_cpu.irqs[irq]++;
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index c53662edc73d..261906ebdf04 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -40,7 +40,6 @@ void synchronize_irq(unsigned int irq)
 	while (desc->status & IRQ_INPROGRESS)
 		cpu_relax();
 }
-
 EXPORT_SYMBOL(synchronize_irq);
 
 #endif
@@ -71,7 +70,6 @@ void disable_irq_nosync(unsigned int irq)
 	}
 	spin_unlock_irqrestore(&desc->lock, flags);
 }
-
 EXPORT_SYMBOL(disable_irq_nosync);
 
 /**
@@ -97,7 +95,6 @@ void disable_irq(unsigned int irq)
 	if (desc->action)
 		synchronize_irq(irq);
 }
-
 EXPORT_SYMBOL(disable_irq);
 
 /**
@@ -139,7 +136,6 @@ void enable_irq(unsigned int irq)
 	}
 	spin_unlock_irqrestore(&desc->lock, flags);
 }
-
 EXPORT_SYMBOL(enable_irq);
 
 /*
@@ -166,7 +162,7 @@ int can_request_irq(unsigned int irq, unsigned long irqflags)
  * Internal function to register an irqaction - typically used to
  * allocate special interrupts that are part of the architecture.
  */
-int setup_irq(unsigned int irq, struct irqaction * new)
+int setup_irq(unsigned int irq, struct irqaction *new)
 {
 	struct irq_desc *desc = irq_desc + irq;
 	struct irqaction *old, **p;
@@ -198,9 +194,10 @@ int setup_irq(unsigned int irq, struct irqaction * new)
 	/*
 	 * The following block of code has to be executed atomically
 	 */
-	spin_lock_irqsave(&desc->lock,flags);
+	spin_lock_irqsave(&desc->lock, flags);
 	p = &desc->action;
-	if ((old = *p) != NULL) {
+	old = *p;
+	if (old) {
 		/* Can't share interrupts unless both agree to */
 		if (!(old->flags & new->flags & SA_SHIRQ))
 			goto mismatch;
@@ -233,7 +230,7 @@ int setup_irq(unsigned int irq, struct irqaction * new)
 		else
 			desc->chip->enable(irq);
 	}
-	spin_unlock_irqrestore(&desc->lock,flags);
+	spin_unlock_irqrestore(&desc->lock, flags);
 
 	new->irq = irq;
 	register_irq_proc(irq);
@@ -276,10 +273,10 @@ void free_irq(unsigned int irq, void *dev_id)
 		return;
 
 	desc = irq_desc + irq;
-	spin_lock_irqsave(&desc->lock,flags);
+	spin_lock_irqsave(&desc->lock, flags);
 	p = &desc->action;
 	for (;;) {
-		struct irqaction * action = *p;
+		struct irqaction *action = *p;
 
 		if (action) {
 			struct irqaction **pp = p;
@@ -304,7 +301,7 @@ void free_irq(unsigned int irq, void *dev_id)
 				else
 					desc->chip->disable(irq);
 			}
-			spin_unlock_irqrestore(&desc->lock,flags);
+			spin_unlock_irqrestore(&desc->lock, flags);
 			unregister_handler_proc(irq, action);
 
 			/* Make sure it's not being used on another CPU */
@@ -312,12 +309,11 @@ void free_irq(unsigned int irq, void *dev_id)
 			kfree(action);
 			return;
 		}
-		printk(KERN_ERR "Trying to free free IRQ%d\n",irq);
-		spin_unlock_irqrestore(&desc->lock,flags);
+		printk(KERN_ERR "Trying to free free IRQ%d\n", irq);
+		spin_unlock_irqrestore(&desc->lock, flags);
 		return;
 	}
 }
-
 EXPORT_SYMBOL(free_irq);
 
 /**
@@ -351,9 +347,9 @@ EXPORT_SYMBOL(free_irq);
  */
 int request_irq(unsigned int irq,
 		irqreturn_t (*handler)(int, void *, struct pt_regs *),
-		unsigned long irqflags, const char * devname, void *dev_id)
+		unsigned long irqflags, const char *devname, void *dev_id)
 {
-	struct irqaction * action;
+	struct irqaction *action;
 	int retval;
 
 	/*
@@ -388,6 +384,5 @@ int request_irq(unsigned int irq,
 
 	return retval;
 }
-
 EXPORT_SYMBOL(request_irq);
 
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index ea3ceed362da..5eae7bf3c347 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -16,22 +16,20 @@ static int irqfixup __read_mostly;
 /*
  * Recovery handler for misrouted interrupts.
  */
-
 static int misrouted_irq(int irq, struct pt_regs *regs)
 {
 	int i;
-	irq_desc_t *desc;
 	int ok = 0;
 	int work = 0;	/* Did we do work for a real IRQ */
 
-	for(i = 1; i < NR_IRQS; i++) {
+	for (i = 1; i < NR_IRQS; i++) {
+		struct irq_desc *desc = irq_desc + i;
 		struct irqaction *action;
 
 		if (i == irq)	/* Already tried */
 			continue;
-		desc = &irq_desc[i];
+
 		spin_lock(&desc->lock);
-		action = desc->action;
 		/* Already running on another processor */
 		if (desc->status & IRQ_INPROGRESS) {
 			/*
@@ -45,7 +43,9 @@ static int misrouted_irq(int irq, struct pt_regs *regs)
 		}
 		/* Honour the normal IRQ locking */
 		desc->status |= IRQ_INPROGRESS;
+		action = desc->action;
 		spin_unlock(&desc->lock);
+
 		while (action) {
 			/* Only shared IRQ handlers are safe to call */
 			if (action->flags & SA_SHIRQ) {
@@ -62,9 +62,8 @@ static int misrouted_irq(int irq, struct pt_regs *regs)
 
 		/*
 		 * While we were looking for a fixup someone queued a real
-		 * IRQ clashing with our walk
+		 * IRQ clashing with our walk:
 		 */
-
 		while ((desc->status & IRQ_PENDING) && action) {
 			/*
 			 * Perform real IRQ processing for the IRQ we deferred
@@ -80,7 +79,7 @@ static int misrouted_irq(int irq, struct pt_regs *regs)
 		 * If we did actual work for the real IRQ line we must let the
 		 * IRQ controller clean up too
 		 */
-		if(work)
+		if (work)
 			desc->chip->end(i);
 		spin_unlock(&desc->lock);
 	}
@@ -113,6 +112,7 @@ __report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret)
 	}
 	dump_stack();
 	printk(KERN_ERR "handlers:\n");
+
 	action = desc->action;
 	while (action) {
 		printk(KERN_ERR "[<%p>]", action->handler);
@@ -123,7 +123,8 @@ __report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret)
 	}
 }
 
-static void report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret)
+static void
+report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret)
 {
 	static int count = 100;
 
@@ -134,7 +135,7 @@ static void report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t actio
 }
 
 void note_interrupt(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret,
-			struct pt_regs *regs)
+		    struct pt_regs *regs)
 {
 	if (unlikely(action_ret != IRQ_HANDLED)) {
 		desc->irqs_unhandled++;
@@ -177,6 +178,7 @@ int __init noirqdebug_setup(char *str)
 {
 	noirqdebug = 1;
 	printk(KERN_INFO "IRQ lockup detection disabled\n");
+
 	return 1;
 }
 
@@ -187,6 +189,7 @@ static int __init irqfixup_setup(char *str)
 	irqfixup = 1;
 	printk(KERN_WARNING "Misrouted IRQ fixup support enabled.\n");
 	printk(KERN_WARNING "This may impact system performance.\n");
+
 	return 1;
 }
 
-- 
cgit v1.2.3


From 34ffdb7233d5847808d2b63ca6761dac3af9c942 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 29 Jun 2006 02:24:40 -0700
Subject: [PATCH] genirq: cleanup: reduce irq_desc_t use, mark it obsolete

Cleanup: remove irq_desc_t use from the generic IRQ code, and mark it
obsolete.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/irq.h    | 18 +++++++++++++-----
 kernel/irq/autoprobe.c |  6 +++---
 kernel/irq/handle.c    |  4 ++--
 kernel/irq/manage.c    |  6 +++---
 kernel/irq/migration.c |  4 ++--
 kernel/irq/spurious.c  |  9 +++++----
 6 files changed, 28 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index 92c685414622..9ce276a2374a 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -68,7 +68,7 @@ typedef struct hw_interrupt_type  hw_irq_controller;
  *
  * Pad this out to 32 bytes for cache and indexing reasons.
  */
-typedef struct irq_desc {
+struct irq_desc {
 	hw_irq_controller *chip;
 	void *chip_data;
 	struct irqaction *action;	/* IRQ action list */
@@ -83,11 +83,19 @@ typedef struct irq_desc {
 #if defined(CONFIG_GENERIC_PENDING_IRQ) || defined(CONFIG_IRQBALANCE)
 	unsigned int move_irq;		/* Flag need to re-target intr dest*/
 #endif
-} ____cacheline_aligned irq_desc_t;
+} ____cacheline_aligned;
 
-extern irq_desc_t irq_desc [NR_IRQS];
+extern struct irq_desc irq_desc[NR_IRQS];
 
-#include <asm/hw_irq.h> /* the arch dependent stuff */
+/*
+ * Migration helpers for obsolete names, they will go away:
+ */
+typedef struct irq_desc		irq_desc_t;
+
+/*
+ * Pick up the arch-dependent methods:
+ */
+#include <asm/hw_irq.h>
 
 extern int setup_irq(unsigned int irq, struct irqaction *new);
 
@@ -188,7 +196,7 @@ extern irqreturn_t handle_IRQ_event(unsigned int irq, struct pt_regs *regs,
  */
 extern fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs);
 
-extern void note_interrupt(unsigned int irq, irq_desc_t *desc,
+extern void note_interrupt(unsigned int irq, struct irq_desc *desc,
 			   int action_ret, struct pt_regs *regs);
 extern int can_request_irq(unsigned int irq, unsigned long irqflags);
 
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c
index d6eab98a3173..5c988bba401f 100644
--- a/kernel/irq/autoprobe.c
+++ b/kernel/irq/autoprobe.c
@@ -27,8 +27,8 @@ static DEFINE_MUTEX(probing_active);
  */
 unsigned long probe_irq_on(void)
 {
+	struct irq_desc *desc;
 	unsigned long mask;
-	irq_desc_t *desc;
 	unsigned int i;
 
 	mutex_lock(&probing_active);
@@ -116,7 +116,7 @@ unsigned int probe_irq_mask(unsigned long val)
 
 	mask = 0;
 	for (i = 0; i < NR_IRQS; i++) {
-		irq_desc_t *desc = irq_desc + i;
+		struct irq_desc *desc = irq_desc + i;
 		unsigned int status;
 
 		spin_lock_irq(&desc->lock);
@@ -159,7 +159,7 @@ int probe_irq_off(unsigned long val)
 	int i, irq_found = 0, nr_irqs = 0;
 
 	for (i = 0; i < NR_IRQS; i++) {
-		irq_desc_t *desc = irq_desc + i;
+		struct irq_desc *desc = irq_desc + i;
 		unsigned int status;
 
 		spin_lock_irq(&desc->lock);
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index f9c33a86cbdf..8eda1005d10a 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -28,7 +28,7 @@
  *
  * Controller mappings for all interrupt sources:
  */
-irq_desc_t irq_desc[NR_IRQS] __cacheline_aligned = {
+struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned = {
 	[0 ... NR_IRQS-1] = {
 		.status = IRQ_DISABLED,
 		.chip = &no_irq_type,
@@ -110,7 +110,7 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct pt_regs *regs,
  */
 fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs)
 {
-	irq_desc_t *desc = irq_desc + irq;
+	struct irq_desc *desc = irq_desc + irq;
 	struct irqaction *action;
 	unsigned int status;
 
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 261906ebdf04..6a6f1d3dd399 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -57,7 +57,7 @@ EXPORT_SYMBOL(synchronize_irq);
  */
 void disable_irq_nosync(unsigned int irq)
 {
-	irq_desc_t *desc = irq_desc + irq;
+	struct irq_desc *desc = irq_desc + irq;
 	unsigned long flags;
 
 	if (irq >= NR_IRQS)
@@ -86,7 +86,7 @@ EXPORT_SYMBOL(disable_irq_nosync);
  */
 void disable_irq(unsigned int irq)
 {
-	irq_desc_t *desc = irq_desc + irq;
+	struct irq_desc *desc = irq_desc + irq;
 
 	if (irq >= NR_IRQS)
 		return;
@@ -109,7 +109,7 @@ EXPORT_SYMBOL(disable_irq);
  */
 void enable_irq(unsigned int irq)
 {
-	irq_desc_t *desc = irq_desc + irq;
+	struct irq_desc *desc = irq_desc + irq;
 	unsigned long flags;
 
 	if (irq >= NR_IRQS)
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index b4a4354d03d5..a571c3abb793 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -3,7 +3,7 @@
 
 void set_pending_irq(unsigned int irq, cpumask_t mask)
 {
-	irq_desc_t *desc = irq_desc + irq;
+	struct irq_desc *desc = irq_desc + irq;
 	unsigned long flags;
 
 	spin_lock_irqsave(&desc->lock, flags);
@@ -14,8 +14,8 @@ void set_pending_irq(unsigned int irq, cpumask_t mask)
 
 void move_native_irq(int irq)
 {
+	struct irq_desc *desc = irq_desc + irq;
 	cpumask_t tmp;
-	irq_desc_t *desc = irq_desc + irq;
 
 	if (likely(!desc->move_irq))
 		return;
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 5eae7bf3c347..3a0a62123301 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -99,7 +99,8 @@ static int misrouted_irq(int irq, struct pt_regs *regs)
  */
 
 static void
-__report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret)
+__report_bad_irq(unsigned int irq, struct irq_desc *desc,
+		 irqreturn_t action_ret)
 {
 	struct irqaction *action;
 
@@ -124,7 +125,7 @@ __report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret)
 }
 
 static void
-report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret)
+report_bad_irq(unsigned int irq, struct irq_desc *desc, irqreturn_t action_ret)
 {
 	static int count = 100;
 
@@ -134,8 +135,8 @@ report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret)
 	}
 }
 
-void note_interrupt(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret,
-		    struct pt_regs *regs)
+void note_interrupt(unsigned int irq, struct irq_desc *desc,
+		    irqreturn_t action_ret, struct pt_regs *regs)
 {
 	if (unlikely(action_ret != IRQ_HANDLED)) {
 		desc->irqs_unhandled++;
-- 
cgit v1.2.3


From 71d218b75fa91219c6bd310fbdd257dfbcac6c88 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 29 Jun 2006 02:24:41 -0700
Subject: [PATCH] genirq: cleanup: include/linux/irq.h

Small cleanups in include/linux/irq.h.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/irq.h | 56 ++++++++++++++++++++++++++---------------------------
 1 file changed, 28 insertions(+), 28 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index 9ce276a2374a..c13f23dee286 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -45,17 +45,17 @@
  * to describe about the low-level hardware. 
  */
 struct hw_interrupt_type {
-	const char *typename;
-	unsigned int (*startup)(unsigned int irq);
-	void (*shutdown)(unsigned int irq);
-	void (*enable)(unsigned int irq);
-	void (*disable)(unsigned int irq);
-	void (*ack)(unsigned int irq);
-	void (*end)(unsigned int irq);
-	void (*set_affinity)(unsigned int irq, cpumask_t dest);
+	const char	*typename;
+	unsigned int	(*startup)(unsigned int irq);
+	void		(*shutdown)(unsigned int irq);
+	void		(*enable)(unsigned int irq);
+	void		(*disable)(unsigned int irq);
+	void		(*ack)(unsigned int irq);
+	void		(*end)(unsigned int irq);
+	void		(*set_affinity)(unsigned int irq, cpumask_t dest);
 	/* Currently used only by UML, might disappear one day.*/
 #ifdef CONFIG_IRQ_RELEASE_METHOD
-	void (*release)(unsigned int irq, void *dev_id);
+	void		(*release)(unsigned int irq, void *dev_id);
 #endif
 };
 
@@ -69,19 +69,19 @@ typedef struct hw_interrupt_type  hw_irq_controller;
  * Pad this out to 32 bytes for cache and indexing reasons.
  */
 struct irq_desc {
-	hw_irq_controller *chip;
-	void *chip_data;
-	struct irqaction *action;	/* IRQ action list */
-	unsigned int status;		/* IRQ status */
-	unsigned int depth;		/* nested irq disables */
-	unsigned int irq_count;		/* For detecting broken interrupts */
-	unsigned int irqs_unhandled;
-	spinlock_t lock;
+	hw_irq_controller	*chip;
+	void			*chip_data;
+	struct irqaction	*action;	/* IRQ action list */
+	unsigned int		status;		/* IRQ status */
+	unsigned int		depth;		/* nested irq disables */
+	unsigned int		irq_count;	/* For detecting broken IRQs */
+	unsigned int		irqs_unhandled;
+	spinlock_t		lock;
 #ifdef CONFIG_SMP
-	cpumask_t affinity;
+	cpumask_t		affinity;
 #endif
 #if defined(CONFIG_GENERIC_PENDING_IRQ) || defined(CONFIG_IRQBALANCE)
-	unsigned int move_irq;		/* Flag need to re-target intr dest*/
+	unsigned int		move_irq;	/* need to re-target IRQ dest */
 #endif
 } ____cacheline_aligned;
 
@@ -186,6 +186,15 @@ static inline void set_balance_irq_affinity(unsigned int irq, cpumask_t mask)
 }
 #endif
 
+#ifdef CONFIG_AUTO_IRQ_AFFINITY
+extern int select_smp_affinity(unsigned int irq);
+#else
+static inline int select_smp_affinity(unsigned int irq)
+{
+	return 1;
+}
+#endif
+
 extern int no_irq_affinity;
 extern int noirqdebug_setup(char *str);
 
@@ -202,15 +211,6 @@ extern int can_request_irq(unsigned int irq, unsigned long irqflags);
 
 extern void init_irq_proc(void);
 
-#ifdef CONFIG_AUTO_IRQ_AFFINITY
-extern int select_smp_affinity(unsigned int irq);
-#else
-static inline int select_smp_affinity(unsigned int irq)
-{
-	return 1;
-}
-#endif
-
 #endif /* CONFIG_GENERIC_HARDIRQS */
 
 extern hw_irq_controller no_irq_type;  /* needed in every arch ? */
-- 
cgit v1.2.3


From 4a733ee12618cf3ec25cbc337a5e0ba3ad5d7fb6 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 29 Jun 2006 02:24:42 -0700
Subject: [PATCH] genirq: cleanup: merge irq_dir[], smp_affinity_entry[] into
 irq_desc[]

Consolidation: remove the irq_dir[NR_IRQS] and the smp_affinity_entry[NR_IRQS]
arrays and move them into the irq_desc[] array.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/irq.h |  5 +++++
 kernel/irq/proc.c   | 20 +++++++-------------
 2 files changed, 12 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index c13f23dee286..1022c5d42546 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -61,6 +61,8 @@ struct hw_interrupt_type {
 
 typedef struct hw_interrupt_type  hw_irq_controller;
 
+struct proc_dir_entry;
+
 /*
  * This is the "IRQ descriptor", which contains various information
  * about the irq, including what kind of hardware handling it has,
@@ -83,6 +85,9 @@ struct irq_desc {
 #if defined(CONFIG_GENERIC_PENDING_IRQ) || defined(CONFIG_IRQBALANCE)
 	unsigned int		move_irq;	/* need to re-target IRQ dest */
 #endif
+#ifdef CONFIG_PROC_FS
+	struct proc_dir_entry *dir;
+#endif
 } ____cacheline_aligned;
 
 extern struct irq_desc irq_desc[NR_IRQS];
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 847b98a611e0..f60b85b61e8b 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -12,15 +12,10 @@
 
 #include "internals.h"
 
-static struct proc_dir_entry *root_irq_dir, *irq_dir[NR_IRQS];
+static struct proc_dir_entry *root_irq_dir;
 
 #ifdef CONFIG_SMP
 
-/*
- * The /proc/irq/<irq>/smp_affinity values:
- */
-static struct proc_dir_entry *smp_affinity_entry[NR_IRQS];
-
 #ifdef CONFIG_GENERIC_PENDING_IRQ
 void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val)
 {
@@ -102,7 +97,7 @@ void register_handler_proc(unsigned int irq, struct irqaction *action)
 {
 	char name [MAX_NAMELEN];
 
-	if (!irq_dir[irq] || action->dir || !action->name ||
+	if (!irq_desc[irq].dir || action->dir || !action->name ||
 					!name_unique(irq, action))
 		return;
 
@@ -110,7 +105,7 @@ void register_handler_proc(unsigned int irq, struct irqaction *action)
 	snprintf(name, MAX_NAMELEN, "%s", action->name);
 
 	/* create /proc/irq/1234/handler/ */
-	action->dir = proc_mkdir(name, irq_dir[irq]);
+	action->dir = proc_mkdir(name, irq_desc[irq].dir);
 }
 
 #undef MAX_NAMELEN
@@ -123,21 +118,21 @@ void register_irq_proc(unsigned int irq)
 
 	if (!root_irq_dir ||
 		(irq_desc[irq].chip == &no_irq_type) ||
-			irq_dir[irq])
+			irq_desc[irq].dir)
 		return;
 
 	memset(name, 0, MAX_NAMELEN);
 	sprintf(name, "%d", irq);
 
 	/* create /proc/irq/1234 */
-	irq_dir[irq] = proc_mkdir(name, root_irq_dir);
+	irq_desc[irq].dir = proc_mkdir(name, root_irq_dir);
 
 #ifdef CONFIG_SMP
 	{
 		struct proc_dir_entry *entry;
 
 		/* create /proc/irq/<irq>/smp_affinity */
-		entry = create_proc_entry("smp_affinity", 0600, irq_dir[irq]);
+		entry = create_proc_entry("smp_affinity", 0600, irq_desc[irq].dir);
 
 		if (entry) {
 			entry->nlink = 1;
@@ -145,7 +140,6 @@ void register_irq_proc(unsigned int irq)
 			entry->read_proc = irq_affinity_read_proc;
 			entry->write_proc = irq_affinity_write_proc;
 		}
-		smp_affinity_entry[irq] = entry;
 	}
 #endif
 }
@@ -155,7 +149,7 @@ void register_irq_proc(unsigned int irq)
 void unregister_handler_proc(unsigned int irq, struct irqaction *action)
 {
 	if (action->dir)
-		remove_proc_entry(action->dir->name, irq_dir[irq]);
+		remove_proc_entry(action->dir->name, irq_desc[irq].dir);
 }
 
 void init_irq_proc(void)
-- 
cgit v1.2.3


From cd916d31cc31273eca8a620fae02b7bf7f577559 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 29 Jun 2006 02:24:42 -0700
Subject: [PATCH] genirq: cleanup: merge pending_irq_cpumask[] into irq_desc[]

Consolidation: remove the pending_irq_cpumask[NR_IRQS] array and move it into
the irq_desc[NR_IRQS].pending_mask field.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/io_apic.c | 2 +-
 include/linux/irq.h        | 2 +-
 kernel/irq/manage.c        | 4 ----
 kernel/irq/migration.c     | 8 ++++----
 4 files changed, 6 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c
index 4a74b696c6a3..afe54f257cb8 100644
--- a/arch/i386/kernel/io_apic.c
+++ b/arch/i386/kernel/io_apic.c
@@ -581,7 +581,7 @@ static int balanced_irq(void *unused)
 	
 	/* push everything to CPU 0 to give us a starting point.  */
 	for (i = 0 ; i < NR_IRQS ; i++) {
-		pending_irq_cpumask[i] = cpumask_of_cpu(0);
+		irq_desc[i].pending_mask = cpumask_of_cpu(0);
 		set_pending_irq(i, cpumask_of_cpu(0));
 	}
 
diff --git a/include/linux/irq.h b/include/linux/irq.h
index 1022c5d42546..81f3d976bb32 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -83,6 +83,7 @@ struct irq_desc {
 	cpumask_t		affinity;
 #endif
 #if defined(CONFIG_GENERIC_PENDING_IRQ) || defined(CONFIG_IRQBALANCE)
+	cpumask_t		pending_mask;
 	unsigned int		move_irq;	/* need to re-target IRQ dest */
 #endif
 #ifdef CONFIG_PROC_FS
@@ -120,7 +121,6 @@ static inline void set_native_irq_info(int irq, cpumask_t mask)
 #ifdef CONFIG_SMP
 
 #if defined(CONFIG_GENERIC_PENDING_IRQ) || defined(CONFIG_IRQBALANCE)
-extern cpumask_t pending_irq_cpumask[NR_IRQS];
 
 void set_pending_irq(unsigned int irq, cpumask_t mask);
 void move_native_irq(int irq);
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 6a6f1d3dd399..ca9b5d36abe8 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -16,10 +16,6 @@
 
 #ifdef CONFIG_SMP
 
-#if defined (CONFIG_GENERIC_PENDING_IRQ) || defined (CONFIG_IRQBALANCE)
-cpumask_t __cacheline_aligned pending_irq_cpumask[NR_IRQS];
-#endif
-
 /**
  *	synchronize_irq - wait for pending IRQ handlers (on other CPUs)
  *	@irq: interrupt number to wait for
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index a571c3abb793..a57ebe9fa6f6 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -8,7 +8,7 @@ void set_pending_irq(unsigned int irq, cpumask_t mask)
 
 	spin_lock_irqsave(&desc->lock, flags);
 	desc->move_irq = 1;
-	pending_irq_cpumask[irq] = mask;
+	irq_desc[irq].pending_mask = mask;
 	spin_unlock_irqrestore(&desc->lock, flags);
 }
 
@@ -30,7 +30,7 @@ void move_native_irq(int irq)
 
 	desc->move_irq = 0;
 
-	if (unlikely(cpus_empty(pending_irq_cpumask[irq])))
+	if (unlikely(cpus_empty(irq_desc[irq].pending_mask)))
 		return;
 
 	if (!desc->chip->set_affinity)
@@ -38,7 +38,7 @@ void move_native_irq(int irq)
 
 	assert_spin_locked(&desc->lock);
 
-	cpus_and(tmp, pending_irq_cpumask[irq], cpu_online_map);
+	cpus_and(tmp, irq_desc[irq].pending_mask, cpu_online_map);
 
 	/*
 	 * If there was a valid mask to work with, please
@@ -58,5 +58,5 @@ void move_native_irq(int irq)
 		if (likely(!(desc->status & IRQ_DISABLED)))
 			desc->chip->enable(irq);
 	}
-	cpus_clear(pending_irq_cpumask[irq]);
+	cpus_clear(irq_desc[irq].pending_mask);
 }
-- 
cgit v1.2.3


From 0d7012a968d006e277eb0fe20edd7a9b5563c2b7 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 29 Jun 2006 02:24:43 -0700
Subject: [PATCH] genirq: cleanup: turn ARCH_HAS_IRQ_PER_CPU into
 CONFIG_IRQ_PER_CPU

Cleanup: change ARCH_HAS_IRQ_PER_CPU into a Kconfig method.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/cris/Kconfig                | 4 ++++
 arch/ia64/Kconfig                | 4 ++++
 arch/mips/Kconfig                | 5 +++++
 arch/parisc/Kconfig              | 4 ++++
 arch/powerpc/Kconfig             | 4 ++++
 include/asm-cris/irq.h           | 5 -----
 include/asm-ia64/irq.h           | 5 -----
 include/asm-mips/mach-mips/irq.h | 6 ------
 include/asm-parisc/irq.h         | 5 -----
 include/asm-powerpc/irq.h        | 5 -----
 include/linux/irq.h              | 2 +-
 kernel/irq/manage.c              | 4 ++--
 12 files changed, 24 insertions(+), 29 deletions(-)

(limited to 'include/linux')

diff --git a/arch/cris/Kconfig b/arch/cris/Kconfig
index 856b665020e7..6a1238a29d6c 100644
--- a/arch/cris/Kconfig
+++ b/arch/cris/Kconfig
@@ -28,6 +28,10 @@ config GENERIC_CALIBRATE_DELAY
 	bool
 	default y
 
+config IRQ_PER_CPU
+	bool
+	default y
+
 config CRIS
 	bool
 	default y
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index 5faacbb8d16c..b487e227a1f7 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -492,6 +492,10 @@ config GENERIC_PENDING_IRQ
 	depends on GENERIC_HARDIRQS && SMP
 	default y
 
+config IRQ_PER_CPU
+	bool
+	default y
+
 source "arch/ia64/hp/sim/Kconfig"
 
 menu "Instrumentation Support"
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index 35e038a974c6..08c2ece4ae40 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -1618,6 +1618,11 @@ config GENERIC_IRQ_PROBE
 	bool
 	default y
 
+config IRQ_PER_CPU
+	depends on SMP
+	bool
+	default y
+
 #
 # - Highmem only makes sense for the 32-bit kernel.
 # - The current highmem code will only work properly on physically indexed
diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig
index 910fb3afc0b5..6dd0ea8f88e0 100644
--- a/arch/parisc/Kconfig
+++ b/arch/parisc/Kconfig
@@ -51,6 +51,10 @@ config GENERIC_HARDIRQS
 config GENERIC_IRQ_PROBE
 	def_bool y
 
+config IRQ_PER_CPU
+	bool
+	default y
+
 # unless you want to implement ACPI on PA-RISC ... ;-)
 config PM
 	bool
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index e2e9df344ab7..d43e4521abf2 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -30,6 +30,10 @@ config GENERIC_HARDIRQS
 	bool
 	default y
 
+config IRQ_PER_CPU
+	bool
+	default y
+
 config RWSEM_GENERIC_SPINLOCK
 	bool
 
diff --git a/include/asm-cris/irq.h b/include/asm-cris/irq.h
index 4b338792218b..998cce9f3200 100644
--- a/include/asm-cris/irq.h
+++ b/include/asm-cris/irq.h
@@ -1,11 +1,6 @@
 #ifndef _ASM_IRQ_H
 #define _ASM_IRQ_H
 
-/*
- * IRQ line status macro IRQ_PER_CPU is used
- */
-#define ARCH_HAS_IRQ_PER_CPU
-
 #include <asm/arch/irq.h>
 
 static inline int irq_canonicalize(int irq)
diff --git a/include/asm-ia64/irq.h b/include/asm-ia64/irq.h
index dbe86c0bbce5..79479e2c6966 100644
--- a/include/asm-ia64/irq.h
+++ b/include/asm-ia64/irq.h
@@ -14,11 +14,6 @@
 #define NR_IRQS		256
 #define NR_IRQ_VECTORS	NR_IRQS
 
-/*
- * IRQ line status macro IRQ_PER_CPU is used
- */
-#define ARCH_HAS_IRQ_PER_CPU
-
 static __inline__ int
 irq_canonicalize (int irq)
 {
diff --git a/include/asm-mips/mach-mips/irq.h b/include/asm-mips/mach-mips/irq.h
index 083d9c512a04..e994b0c01227 100644
--- a/include/asm-mips/mach-mips/irq.h
+++ b/include/asm-mips/mach-mips/irq.h
@@ -4,10 +4,4 @@
 
 #define NR_IRQS	256
 
-#ifdef CONFIG_SMP
-
-#define ARCH_HAS_IRQ_PER_CPU
-
-#endif
-
 #endif /* __ASM_MACH_MIPS_IRQ_H */
diff --git a/include/asm-parisc/irq.h b/include/asm-parisc/irq.h
index 377ba90c7d02..5cae260615a2 100644
--- a/include/asm-parisc/irq.h
+++ b/include/asm-parisc/irq.h
@@ -26,11 +26,6 @@
 
 #define NR_IRQS		(CPU_IRQ_MAX + 1)
 
-/*
- * IRQ line status macro IRQ_PER_CPU is used
- */
-#define ARCH_HAS_IRQ_PER_CPU
-
 static __inline__ int irq_canonicalize(int irq)
 {
 	return (irq == 2) ? 9 : irq;
diff --git a/include/asm-powerpc/irq.h b/include/asm-powerpc/irq.h
index a10feec29d4d..eb5f33e1977a 100644
--- a/include/asm-powerpc/irq.h
+++ b/include/asm-powerpc/irq.h
@@ -30,11 +30,6 @@
 #define IRQ_POLARITY_POSITIVE	0x2	/* high level or low->high edge */
 #define IRQ_POLARITY_NEGATIVE	0x0	/* low level or high->low edge */
 
-/*
- * IRQ line status macro IRQ_PER_CPU is used
- */
-#define ARCH_HAS_IRQ_PER_CPU
-
 #define get_irq_desc(irq) (&irq_desc[(irq)])
 
 /* Define a way to iterate across irqs. */
diff --git a/include/linux/irq.h b/include/linux/irq.h
index 81f3d976bb32..519a1cb7c331 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -33,7 +33,7 @@
 #define IRQ_WAITING	32	/* IRQ not yet seen - for autodetection */
 #define IRQ_LEVEL	64	/* IRQ level triggered */
 #define IRQ_MASKED	128	/* IRQ masked - shouldn't be seen again */
-#ifdef ARCH_HAS_IRQ_PER_CPU
+#ifdef CONFIG_IRQ_PER_CPU
 # define IRQ_PER_CPU	256	/* IRQ is per CPU */
 # define CHECK_IRQ_PER_CPU(var) ((var) & IRQ_PER_CPU)
 #else
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index ca9b5d36abe8..8389d1817fe8 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -198,7 +198,7 @@ int setup_irq(unsigned int irq, struct irqaction *new)
 		if (!(old->flags & new->flags & SA_SHIRQ))
 			goto mismatch;
 
-#if defined(ARCH_HAS_IRQ_PER_CPU) && defined(SA_PERCPU_IRQ)
+#if defined(CONFIG_IRQ_PER_CPU) && defined(SA_PERCPU_IRQ)
 		/* All handlers must agree on per-cpuness */
 		if ((old->flags & IRQ_PER_CPU) != (new->flags & IRQ_PER_CPU))
 			goto mismatch;
@@ -213,7 +213,7 @@ int setup_irq(unsigned int irq, struct irqaction *new)
 	}
 
 	*p = new;
-#if defined(ARCH_HAS_IRQ_PER_CPU) && defined(SA_PERCPU_IRQ)
+#if defined(CONFIG_IRQ_PER_CPU) && defined(SA_PERCPU_IRQ)
 	if (new->flags & SA_PERCPU_IRQ)
 		desc->status |= IRQ_PER_CPU;
 #endif
-- 
cgit v1.2.3


From c0ad90a32fb60f4129d0e24dfd5fd7128e2e09f2 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 29 Jun 2006 02:24:44 -0700
Subject: [PATCH] genirq: add ->retrigger() irq op to consolidate
 hw_irq_resend()

Add ->retrigger() irq op to consolidate hw_irq_resend() implementations.
(Most architectures had it defined to NOP anyway.)

NOTE: ia64 needs testing. i386 and x86_64 tested.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/io_apic.c    |  9 +++++++++
 arch/ia64/kernel/irq_lsapic.c | 10 +++++++++-
 arch/ia64/kernel/perfmon.c    |  4 ++--
 arch/parisc/kernel/irq.c      | 11 ++++-------
 arch/x86_64/kernel/io_apic.c  |  9 +++++++++
 include/asm-alpha/hw_irq.h    |  2 --
 include/asm-cris/hw_irq.h     |  2 --
 include/asm-i386/hw_irq.h     | 10 ----------
 include/asm-ia64/hw_irq.h     |  3 +--
 include/asm-m32r/hw_irq.h     |  5 -----
 include/asm-mips/hw_irq.h     |  8 ++++----
 include/asm-parisc/hw_irq.h   |  9 ---------
 include/asm-powerpc/hw_irq.h  |  6 +++---
 include/asm-sh/hw_irq.h       |  5 -----
 include/asm-sh64/hw_irq.h     |  1 -
 include/asm-um/hw_irq.h       |  3 ---
 include/asm-v850/hw_irq.h     |  4 ----
 include/asm-x86_64/hw_irq.h   |  9 ---------
 include/asm-xtensa/hw_irq.h   |  4 ----
 include/linux/irq.h           |  2 ++
 kernel/irq/manage.c           |  3 ++-
 21 files changed, 45 insertions(+), 74 deletions(-)

(limited to 'include/linux')

diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c
index afe54f257cb8..ec9ea0269d36 100644
--- a/arch/i386/kernel/io_apic.c
+++ b/arch/i386/kernel/io_apic.c
@@ -2071,6 +2071,13 @@ static void set_ioapic_affinity_vector (unsigned int vector,
 #endif
 #endif
 
+static int ioapic_retrigger(unsigned int irq)
+{
+	send_IPI_self(IO_APIC_VECTOR(irq));
+
+	return 1;
+}
+
 /*
  * Level and edge triggered IO-APIC interrupts need different handling,
  * so we use two separate IRQ descriptors. Edge triggered IRQs can be
@@ -2090,6 +2097,7 @@ static struct hw_interrupt_type ioapic_edge_type __read_mostly = {
 #ifdef CONFIG_SMP
 	.set_affinity 	= set_ioapic_affinity,
 #endif
+	.retrigger	= ioapic_retrigger,
 };
 
 static struct hw_interrupt_type ioapic_level_type __read_mostly = {
@@ -2103,6 +2111,7 @@ static struct hw_interrupt_type ioapic_level_type __read_mostly = {
 #ifdef CONFIG_SMP
 	.set_affinity 	= set_ioapic_affinity,
 #endif
+	.retrigger	= ioapic_retrigger,
 };
 
 static inline void init_IO_APIC_traps(void)
diff --git a/arch/ia64/kernel/irq_lsapic.c b/arch/ia64/kernel/irq_lsapic.c
index ea14e6a04409..1ab58b09f3d7 100644
--- a/arch/ia64/kernel/irq_lsapic.c
+++ b/arch/ia64/kernel/irq_lsapic.c
@@ -26,6 +26,13 @@ lsapic_noop (unsigned int irq)
 	/* nuthing to do... */
 }
 
+static int lsapic_retrigger(unsigned int irq)
+{
+	ia64_resend_irq(irq);
+
+	return 1;
+}
+
 struct hw_interrupt_type irq_type_ia64_lsapic = {
 	.typename =	"LSAPIC",
 	.startup =	lsapic_noop_startup,
@@ -33,5 +40,6 @@ struct hw_interrupt_type irq_type_ia64_lsapic = {
 	.enable =	lsapic_noop,
 	.disable =	lsapic_noop,
 	.ack =		lsapic_noop,
-	.end =		lsapic_noop
+	.end =		lsapic_noop,
+	.retrigger =	lsapic_retrigger,
 };
diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c
index 6d7bc8ff7b3a..a0055d3d695c 100644
--- a/arch/ia64/kernel/perfmon.c
+++ b/arch/ia64/kernel/perfmon.c
@@ -6165,7 +6165,7 @@ pfm_load_regs (struct task_struct *task)
 		/*
 		 * will replay the PMU interrupt
 		 */
-		if (need_irq_resend) hw_resend_irq(NULL, IA64_PERFMON_VECTOR);
+		if (need_irq_resend) ia64_resend_irq(IA64_PERFMON_VECTOR);
 
 		pfm_stats[smp_processor_id()].pfm_replay_ovfl_intr_count++;
 	}
@@ -6305,7 +6305,7 @@ pfm_load_regs (struct task_struct *task)
 		/*
 		 * will replay the PMU interrupt
 		 */
-		if (need_irq_resend) hw_resend_irq(NULL, IA64_PERFMON_VECTOR);
+		if (need_irq_resend) ia64_resend_irq(IA64_PERFMON_VECTOR);
 
 		pfm_stats[smp_processor_id()].pfm_replay_ovfl_intr_count++;
 	}
diff --git a/arch/parisc/kernel/irq.c b/arch/parisc/kernel/irq.c
index ea6a55e1a0c5..82fe6ba29727 100644
--- a/arch/parisc/kernel/irq.c
+++ b/arch/parisc/kernel/irq.c
@@ -125,6 +125,10 @@ static struct hw_interrupt_type cpu_interrupt_type = {
 #ifdef CONFIG_SMP
 	.set_affinity	= cpu_set_affinity_irq,
 #endif
+	/* XXX: Needs to be written.  We managed without it so far, but
+	 * we really ought to write it.
+	 */
+	.retrigger	= NULL,
 };
 
 int show_interrupts(struct seq_file *p, void *v)
@@ -404,13 +408,6 @@ void __init init_IRQ(void)
 
 }
 
-void hw_resend_irq(struct hw_interrupt_type *type, unsigned int irq)
-{
-	/* XXX: Needs to be written.  We managed without it so far, but
-	 * we really ought to write it.
-	 */
-}
-
 void ack_bad_irq(unsigned int irq)
 {
 	printk("unexpected IRQ %d\n", irq);
diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c
index 88a8736eb8ce..401b687fef21 100644
--- a/arch/x86_64/kernel/io_apic.c
+++ b/arch/x86_64/kernel/io_apic.c
@@ -1618,6 +1618,13 @@ static void set_ioapic_affinity_vector (unsigned int vector,
 #endif // CONFIG_SMP
 #endif // CONFIG_PCI_MSI
 
+static int ioapic_retrigger(unsigned int irq)
+{
+	send_IPI_self(IO_APIC_VECTOR(irq));
+
+	return 1;
+}
+
 /*
  * Level and edge triggered IO-APIC interrupts need different handling,
  * so we use two separate IRQ descriptors. Edge triggered IRQs can be
@@ -1638,6 +1645,7 @@ static struct hw_interrupt_type ioapic_edge_type __read_mostly = {
 #ifdef CONFIG_SMP
 	.set_affinity = set_ioapic_affinity,
 #endif
+	.retrigger	= ioapic_retrigger,
 };
 
 static struct hw_interrupt_type ioapic_level_type __read_mostly = {
@@ -1651,6 +1659,7 @@ static struct hw_interrupt_type ioapic_level_type __read_mostly = {
 #ifdef CONFIG_SMP
 	.set_affinity = set_ioapic_affinity,
 #endif
+	.retrigger	= ioapic_retrigger,
 };
 
 static inline void init_IO_APIC_traps(void)
diff --git a/include/asm-alpha/hw_irq.h b/include/asm-alpha/hw_irq.h
index ca9d43b63502..a37db0f95092 100644
--- a/include/asm-alpha/hw_irq.h
+++ b/include/asm-alpha/hw_irq.h
@@ -2,8 +2,6 @@
 #define _ALPHA_HW_IRQ_H
 
 
-static inline void hw_resend_irq(struct hw_interrupt_type *h, unsigned int i) {}
-
 extern volatile unsigned long irq_err_count;
 
 #ifdef CONFIG_ALPHA_GENERIC
diff --git a/include/asm-cris/hw_irq.h b/include/asm-cris/hw_irq.h
index 341536a234e9..298066020af2 100644
--- a/include/asm-cris/hw_irq.h
+++ b/include/asm-cris/hw_irq.h
@@ -1,7 +1,5 @@
 #ifndef _ASM_HW_IRQ_H
 #define _ASM_HW_IRQ_H
 
-static inline void hw_resend_irq(struct hw_interrupt_type *h, unsigned int i) {}
-
 #endif
 
diff --git a/include/asm-i386/hw_irq.h b/include/asm-i386/hw_irq.h
index a4c0a5a9ffd8..87e5a351d881 100644
--- a/include/asm-i386/hw_irq.h
+++ b/include/asm-i386/hw_irq.h
@@ -69,14 +69,4 @@ extern atomic_t irq_mis_count;
 
 #define IO_APIC_IRQ(x) (((x) >= 16) || ((1<<(x)) & io_apic_irqs))
 
-#if defined(CONFIG_X86_IO_APIC)
-static inline void hw_resend_irq(struct hw_interrupt_type *h, unsigned int i)
-{
-	if (IO_APIC_IRQ(i))
-		send_IPI_self(IO_APIC_VECTOR(i));
-}
-#else
-static inline void hw_resend_irq(struct hw_interrupt_type *h, unsigned int i) {}
-#endif
-
 #endif /* _ASM_HW_IRQ_H */
diff --git a/include/asm-ia64/hw_irq.h b/include/asm-ia64/hw_irq.h
index ea8b8c407ab4..27f9df6b9145 100644
--- a/include/asm-ia64/hw_irq.h
+++ b/include/asm-ia64/hw_irq.h
@@ -97,8 +97,7 @@ extern int reserve_irq_vector (int vector);
 extern void ia64_send_ipi (int cpu, int vector, int delivery_mode, int redirect);
 extern void register_percpu_irq (ia64_vector vec, struct irqaction *action);
 
-static inline void
-hw_resend_irq (struct hw_interrupt_type *h, unsigned int vector)
+static inline void ia64_resend_irq(unsigned int vector)
 {
 	platform_send_ipi(smp_processor_id(), vector, IA64_IPI_DM_INT, 0);
 }
diff --git a/include/asm-m32r/hw_irq.h b/include/asm-m32r/hw_irq.h
index 8d7e9d0e09e8..7138537cda03 100644
--- a/include/asm-m32r/hw_irq.h
+++ b/include/asm-m32r/hw_irq.h
@@ -1,9 +1,4 @@
 #ifndef _ASM_M32R_HW_IRQ_H
 #define _ASM_M32R_HW_IRQ_H
 
-static inline void hw_resend_irq(struct hw_interrupt_type *h, unsigned int i)
-{
-	/* Nothing to do */
-}
-
 #endif /* _ASM_M32R_HW_IRQ_H */
diff --git a/include/asm-mips/hw_irq.h b/include/asm-mips/hw_irq.h
index c854d017c0e5..458d9fdc76bf 100644
--- a/include/asm-mips/hw_irq.h
+++ b/include/asm-mips/hw_irq.h
@@ -19,9 +19,9 @@ extern void init_8259A(int aeoi);
 
 extern atomic_t irq_err_count;
 
-/* This may not be apropriate for all machines, we'll see ...  */
-static inline void hw_resend_irq(struct hw_interrupt_type *h, unsigned int i)
-{
-}
+/*
+ * interrupt-retrigger: NOP for now. This may not be apropriate for all
+ * machines, we'll see ...
+ */
 
 #endif /* __ASM_HW_IRQ_H */
diff --git a/include/asm-parisc/hw_irq.h b/include/asm-parisc/hw_irq.h
index 151426e27521..6707f7df3921 100644
--- a/include/asm-parisc/hw_irq.h
+++ b/include/asm-parisc/hw_irq.h
@@ -3,15 +3,6 @@
 
 /*
  *	linux/include/asm/hw_irq.h
- *
- *	(C) 1992, 1993 Linus Torvalds, (C) 1997 Ingo Molnar
- *
- *	moved some of the old arch/i386/kernel/irq.h to here. VY
- *
- *	IRQ/IPI changes taken from work by Thomas Radke
- *	<tomsoft@informatik.tu-chemnitz.de>
  */
 
-extern void hw_resend_irq(struct hw_interrupt_type *, unsigned int);
-
 #endif
diff --git a/include/asm-powerpc/hw_irq.h b/include/asm-powerpc/hw_irq.h
index a27ed3feb315..d40359204aba 100644
--- a/include/asm-powerpc/hw_irq.h
+++ b/include/asm-powerpc/hw_irq.h
@@ -102,11 +102,11 @@ static inline void local_irq_save_ptr(unsigned long *flags)
 			desc->chip->ack(irq);		\
 	})
 
-/* Should we handle this via lost interrupts and IPIs or should we don't care like
- * we do now ? --BenH.
+/*
+ * interrupt-retrigger: should we handle this via lost interrupts and IPIs
+ * or should we not care like we do now ? --BenH.
  */
 struct hw_interrupt_type;
-static inline void hw_resend_irq(struct hw_interrupt_type *h, unsigned int i) {}
 
 #endif	/* __KERNEL__ */
 #endif	/* _ASM_POWERPC_HW_IRQ_H */
diff --git a/include/asm-sh/hw_irq.h b/include/asm-sh/hw_irq.h
index 1d934fb2c581..fed26616967a 100644
--- a/include/asm-sh/hw_irq.h
+++ b/include/asm-sh/hw_irq.h
@@ -1,9 +1,4 @@
 #ifndef __ASM_SH_HW_IRQ_H
 #define __ASM_SH_HW_IRQ_H
 
-static inline void hw_resend_irq(struct hw_interrupt_type *h, unsigned int i)
-{
-	/* Nothing to do */
-}
-
 #endif /* __ASM_SH_HW_IRQ_H */
diff --git a/include/asm-sh64/hw_irq.h b/include/asm-sh64/hw_irq.h
index ae718d1f2d6c..ebb39089b0ac 100644
--- a/include/asm-sh64/hw_irq.h
+++ b/include/asm-sh64/hw_irq.h
@@ -11,6 +11,5 @@
  * Copyright (C) 2000, 2001  Paolo Alberelli
  *
  */
-static __inline__ void hw_resend_irq(struct hw_interrupt_type *h, unsigned int i) { /* Nothing to do */ }
 
 #endif /* __ASM_SH64_HW_IRQ_H */
diff --git a/include/asm-um/hw_irq.h b/include/asm-um/hw_irq.h
index 4ee38c0b6a64..1cf84cf5f21a 100644
--- a/include/asm-um/hw_irq.h
+++ b/include/asm-um/hw_irq.h
@@ -4,7 +4,4 @@
 #include "asm/irq.h"
 #include "asm/archparam.h"
 
-static inline void hw_resend_irq(struct hw_interrupt_type *h, unsigned int i)
-{}
-
 #endif
diff --git a/include/asm-v850/hw_irq.h b/include/asm-v850/hw_irq.h
index a8aab4342712..043e94bb6bd8 100644
--- a/include/asm-v850/hw_irq.h
+++ b/include/asm-v850/hw_irq.h
@@ -1,8 +1,4 @@
 #ifndef __V850_HW_IRQ_H__
 #define __V850_HW_IRQ_H__
 
-static inline void hw_resend_irq (struct hw_interrupt_type *h, unsigned int i)
-{
-}
-
 #endif /* __V850_HW_IRQ_H__ */
diff --git a/include/asm-x86_64/hw_irq.h b/include/asm-x86_64/hw_irq.h
index 931877462788..48a4a5364e85 100644
--- a/include/asm-x86_64/hw_irq.h
+++ b/include/asm-x86_64/hw_irq.h
@@ -127,15 +127,6 @@ __asm__( \
 	"push $~(" #nr ") ; " \
 	"jmp common_interrupt");
 
-#if defined(CONFIG_X86_IO_APIC)
-static inline void hw_resend_irq(struct hw_interrupt_type *h, unsigned int i) {
-	if (IO_APIC_IRQ(i))
-		send_IPI_self(IO_APIC_VECTOR(i));
-}
-#else
-static inline void hw_resend_irq(struct hw_interrupt_type *h, unsigned int i) {}
-#endif
-
 #define platform_legacy_irq(irq)	((irq) < 16)
 
 #endif
diff --git a/include/asm-xtensa/hw_irq.h b/include/asm-xtensa/hw_irq.h
index ccf436249eaa..3ddbea759b2b 100644
--- a/include/asm-xtensa/hw_irq.h
+++ b/include/asm-xtensa/hw_irq.h
@@ -11,8 +11,4 @@
 #ifndef _XTENSA_HW_IRQ_H
 #define _XTENSA_HW_IRQ_H
 
-static inline void hw_resend_irq(struct hw_interrupt_type *h, unsigned int i)
-{
-}
-
 #endif
diff --git a/include/linux/irq.h b/include/linux/irq.h
index 519a1cb7c331..e58cfb9c66a3 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -53,6 +53,8 @@ struct hw_interrupt_type {
 	void		(*ack)(unsigned int irq);
 	void		(*end)(unsigned int irq);
 	void		(*set_affinity)(unsigned int irq, cpumask_t dest);
+	int		(*retrigger)(unsigned int irq);
+
 	/* Currently used only by UML, might disappear one day.*/
 #ifdef CONFIG_IRQ_RELEASE_METHOD
 	void		(*release)(unsigned int irq, void *dev_id);
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 76c8eda67295..19b438e09f12 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -123,7 +123,8 @@ void enable_irq(unsigned int irq)
 		desc->status = status;
 		if ((status & (IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) {
 			desc->status = status | IRQ_REPLAY;
-			hw_resend_irq(desc->chip,irq);
+			if (desc->chip && desc->chip->retrigger)
+				desc->chip->retrigger(irq);
 		}
 		desc->chip->enable(irq);
 		/* fall-through */
-- 
cgit v1.2.3


From 8fee5c36177ee098fa41f5fe72999609fef4df6b Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 29 Jun 2006 02:24:45 -0700
Subject: [PATCH] genirq: doc: comment include/linux/irq.h structures

Better document the hw_interrupt_type and irq_desc structures.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/irq.h | 44 +++++++++++++++++++++++++++++++++++++-------
 1 file changed, 37 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index e58cfb9c66a3..b2688157b51b 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -40,9 +40,27 @@
 # define CHECK_IRQ_PER_CPU(var) 0
 #endif
 
-/*
- * Interrupt controller descriptor. This is all we need
- * to describe about the low-level hardware. 
+/**
+ * struct hw_interrupt_type - hardware interrupt type descriptor
+ *
+ * @name:		name for /proc/interrupts
+ * @startup:		start up the interrupt (defaults to ->enable if NULL)
+ * @shutdown:		shut down the interrupt (defaults to ->disable if NULL)
+ * @enable:		enable the interrupt (defaults to chip->unmask if NULL)
+ * @disable:		disable the interrupt (defaults to chip->mask if NULL)
+ * @handle_irq:		irq flow handler called from the arch IRQ glue code
+ * @ack:		start of a new interrupt
+ * @mask:		mask an interrupt source
+ * @mask_ack:		ack and mask an interrupt source
+ * @unmask:		unmask an interrupt source
+ * @hold:		same interrupt while the handler is running
+ * @end:		end of interrupt
+ * @set_affinity:	set the CPU affinity on SMP machines
+ * @retrigger:		resend an IRQ to the CPU
+ * @set_type:		set the flow type (IRQ_TYPE_LEVEL/etc.) of an IRQ
+ * @set_wake:		enable/disable power-management wake-on of an IRQ
+ *
+ * @release:		release function solely used by UML
  */
 struct hw_interrupt_type {
 	const char	*typename;
@@ -65,10 +83,22 @@ typedef struct hw_interrupt_type  hw_irq_controller;
 
 struct proc_dir_entry;
 
-/*
- * This is the "IRQ descriptor", which contains various information
- * about the irq, including what kind of hardware handling it has,
- * whether it is disabled etc etc.
+/**
+ * struct irq_desc - interrupt descriptor
+ *
+ * @handler:		interrupt type dependent handler functions
+ * @handler_data:	data for the type handlers
+ * @action:		the irq action chain
+ * @status:		status information
+ * @depth:		disable-depth, for nested irq_disable() calls
+ * @irq_count:		stats field to detect stalled irqs
+ * @irqs_unhandled:	stats field for spurious unhandled interrupts
+ * @lock:		locking for SMP
+ * @affinity:		IRQ affinity on SMP
+ * @pending_mask:	pending rebalanced interrupts
+ * @move_irq:		need to re-target IRQ destination
+ * @dir:		/proc/irq/ procfs entry
+ * @affinity_entry:	/proc/irq/smp_affinity procfs entry on SMP
  *
  * Pad this out to 32 bytes for cache and indexing reasons.
  */
-- 
cgit v1.2.3


From a4633adcdbc15ac51afcd0e1395de58cee27cf92 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 29 Jun 2006 02:24:48 -0700
Subject: [PATCH] genirq: add genirq sw IRQ-retrigger

Enable platforms that do not have a hardware-assisted hardirq-resend mechanism
to resend them via a softirq-driven IRQ emulation mechanism.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/irq.h |  3 +++
 kernel/irq/Makefile |  2 +-
 kernel/irq/manage.c | 10 +------
 kernel/irq/resend.c | 78 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 83 insertions(+), 10 deletions(-)
 create mode 100644 kernel/irq/resend.c

(limited to 'include/linux')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index b2688157b51b..9a39756bfd31 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -246,6 +246,9 @@ extern void note_interrupt(unsigned int irq, struct irq_desc *desc,
 			   int action_ret, struct pt_regs *regs);
 extern int can_request_irq(unsigned int irq, unsigned long irqflags);
 
+/* Resending of interrupts :*/
+void check_irq_resend(struct irq_desc *desc, unsigned int irq);
+
 extern void init_irq_proc(void);
 
 #endif /* CONFIG_GENERIC_HARDIRQS */
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index 9f77f50d8143..627ace98d4a2 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -1,5 +1,5 @@
 
-obj-y := handle.o manage.o spurious.o
+obj-y := handle.o manage.o spurious.o resend.o
 obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o
 obj-$(CONFIG_PROC_FS) += proc.o
 obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 19b438e09f12..cffde4843897 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -118,15 +118,7 @@ void enable_irq(unsigned int irq)
 		WARN_ON(1);
 		break;
 	case 1: {
-		unsigned int status = desc->status & ~IRQ_DISABLED;
-
-		desc->status = status;
-		if ((status & (IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) {
-			desc->status = status | IRQ_REPLAY;
-			if (desc->chip && desc->chip->retrigger)
-				desc->chip->retrigger(irq);
-		}
-		desc->chip->enable(irq);
+		check_irq_resend(desc, irq);
 		/* fall-through */
 	}
 	default:
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
new file mode 100644
index 000000000000..096b102fb392
--- /dev/null
+++ b/kernel/irq/resend.c
@@ -0,0 +1,78 @@
+/*
+ * linux/kernel/irq/resend.c
+ *
+ * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar
+ * Copyright (C) 2005-2006, Thomas Gleixner
+ *
+ * This file contains the IRQ-resend code
+ *
+ * If the interrupt is waiting to be processed, we try to re-run it.
+ * We can't directly run it from here since the caller might be in an
+ * interrupt-protected region. Not all irq controller chips can
+ * retrigger interrupts at the hardware level, so in those cases
+ * we allow the resending of IRQs via a tasklet.
+ */
+
+#include <linux/irq.h>
+#include <linux/module.h>
+#include <linux/random.h>
+#include <linux/interrupt.h>
+
+#include "internals.h"
+
+#ifdef CONFIG_HARDIRQS_SW_RESEND
+
+/* Bitmap to handle software resend of interrupts: */
+static DECLARE_BITMAP(irqs_resend, NR_IRQS);
+
+/*
+ * Run software resends of IRQ's
+ */
+static void resend_irqs(unsigned long arg)
+{
+	struct irq_desc *desc;
+	int irq;
+
+	while (!bitmap_empty(irqs_resend, NR_IRQS)) {
+		irq = find_first_bit(irqs_resend, NR_IRQS);
+		clear_bit(irq, irqs_resend);
+		desc = irq_desc + irq;
+		spin_lock_irqsave(&desc->lock, flags);
+		desc->handle_irq(irq, desc, NULL);
+		spin_unlock_irqrestore(&desc->lock, flags);
+	}
+}
+
+/* Tasklet to handle resend: */
+static DECLARE_TASKLET(resend_tasklet, resend_irqs, 0);
+
+#endif
+
+/*
+ * IRQ resend
+ *
+ * Is called with interrupts disabled and desc->lock held.
+ */
+void check_irq_resend(struct irq_desc *desc, unsigned int irq)
+{
+	unsigned int status = desc->status;
+
+	/*
+	 * Make sure the interrupt is enabled, before resending it:
+	 */
+	desc->chip->enable(irq);
+
+	if ((status & (IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) {
+		desc->status &= ~IRQ_PENDING;
+		desc->status = status | IRQ_REPLAY;
+
+		if (!desc->chip || !desc->chip->retrigger ||
+					!desc->chip->retrigger(irq)) {
+#ifdef CONFIG_HARDIRQS_SW_RESEND
+			/* Set it pending and activate the softirq: */
+			set_bit(irq, irqs_resend);
+			tasklet_schedule(&resend_tasklet);
+#endif
+		}
+	}
+}
-- 
cgit v1.2.3


From 3418d72404e35eb19e7995cbf3e7a76ba8fefbce Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 29 Jun 2006 02:24:49 -0700
Subject: [PATCH] genirq: add IRQ_NOPROBE support

Introduce IRQ_NOPROBE: enables platforms to control chip-probing.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/irq.h    | 1 +
 kernel/irq/autoprobe.c | 4 ++--
 kernel/irq/manage.c    | 4 ++++
 3 files changed, 7 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index 9a39756bfd31..b2fcf330cf2e 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -40,6 +40,7 @@
 # define CHECK_IRQ_PER_CPU(var) 0
 #endif
 
+#define IRQ_NOPROBE	512	/* IRQ is not valid for probing */
 /**
  * struct hw_interrupt_type - hardware interrupt type descriptor
  *
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c
index 5c988bba401f..ed98c7d46cf2 100644
--- a/kernel/irq/autoprobe.c
+++ b/kernel/irq/autoprobe.c
@@ -40,7 +40,7 @@ unsigned long probe_irq_on(void)
 		desc = irq_desc + i;
 
 		spin_lock_irq(&desc->lock);
-		if (!desc->action)
+		if (!desc->action && !(desc->status & IRQ_NOPROBE))
 			desc->chip->startup(i);
 		spin_unlock_irq(&desc->lock);
 	}
@@ -57,7 +57,7 @@ unsigned long probe_irq_on(void)
 		desc = irq_desc + i;
 
 		spin_lock_irq(&desc->lock);
-		if (!desc->action) {
+		if (!desc->action && !(desc->status & IRQ_NOPROBE)) {
 			desc->status |= IRQ_AUTODETECT | IRQ_WAITING;
 			if (desc->chip->startup(i))
 				desc->status |= IRQ_PENDING;
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index cffde4843897..90a944a7fadc 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -118,6 +118,10 @@ void enable_irq(unsigned int irq)
 		WARN_ON(1);
 		break;
 	case 1: {
+		unsigned int status = desc->status & ~IRQ_DISABLED;
+
+		/* Prevent probing on this irq: */
+		desc->status = status | IRQ_NOPROBE;
 		check_irq_resend(desc, irq);
 		/* fall-through */
 	}
-- 
cgit v1.2.3


From 6550c775cb5ee94c132d93d84de3bb23f0abf37b Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 29 Jun 2006 02:24:49 -0700
Subject: [PATCH] genirq: add IRQ_NOREQUEST support

Enable platforms to disable request_irq() for certain interrupts.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/irq.h | 1 +
 kernel/irq/manage.c | 4 +++-
 2 files changed, 4 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index b2fcf330cf2e..1df49ec7f820 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -41,6 +41,7 @@
 #endif
 
 #define IRQ_NOPROBE	512	/* IRQ is not valid for probing */
+#define IRQ_NOREQUEST	1024	/* IRQ cannot be requested */
 /**
  * struct hw_interrupt_type - hardware interrupt type descriptor
  *
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 90a944a7fadc..cae900a849c4 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -141,7 +141,7 @@ int can_request_irq(unsigned int irq, unsigned long irqflags)
 {
 	struct irqaction *action;
 
-	if (irq >= NR_IRQS)
+	if (irq >= NR_IRQS || irq_desc[irq].status & IRQ_NOREQUEST)
 		return 0;
 
 	action = irq_desc[irq].action;
@@ -356,6 +356,8 @@ int request_irq(unsigned int irq,
 		return -EINVAL;
 	if (irq >= NR_IRQS)
 		return -EINVAL;
+	if (irq_desc[irq].status & IRQ_NOREQUEST)
+		return -EINVAL;
 	if (!handler)
 		return -EINVAL;
 
-- 
cgit v1.2.3


From 94d39e1f6e8132ea982a1d61acbe0423d3d14365 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 29 Jun 2006 02:24:50 -0700
Subject: [PATCH] genirq: add IRQ_NOAUTOEN support

Enable platforms to disable the automatic enabling of freshly set up irqs.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/irq.h |  1 +
 kernel/irq/handle.c |  1 +
 kernel/irq/manage.c | 18 +++++++++++-------
 3 files changed, 13 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index 1df49ec7f820..14d7e94048dd 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -42,6 +42,7 @@
 
 #define IRQ_NOPROBE	512	/* IRQ is not valid for probing */
 #define IRQ_NOREQUEST	1024	/* IRQ cannot be requested */
+#define IRQ_NOAUTOEN	2048	/* IRQ will not be enabled on request irq */
 /**
  * struct hw_interrupt_type - hardware interrupt type descriptor
  *
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 402fa3aec1e4..9b398d52f1b7 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -32,6 +32,7 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned = {
 	[0 ... NR_IRQS-1] = {
 		.status = IRQ_DISABLED,
 		.chip = &no_irq_type,
+		.depth = 1,
 		.lock = SPIN_LOCK_UNLOCKED,
 #ifdef CONFIG_SMP
 		.affinity = CPU_MASK_ALL
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index cae900a849c4..9ea18879fb62 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -216,13 +216,17 @@ int setup_irq(unsigned int irq, struct irqaction *new)
 		desc->status |= IRQ_PER_CPU;
 #endif
 	if (!shared) {
-		desc->depth = 0;
-		desc->status &= ~(IRQ_DISABLED | IRQ_AUTODETECT |
-				  IRQ_WAITING | IRQ_INPROGRESS);
-		if (desc->chip->startup)
-			desc->chip->startup(irq);
-		else
-			desc->chip->enable(irq);
+		desc->status &= ~(IRQ_AUTODETECT | IRQ_WAITING |
+				  IRQ_INPROGRESS);
+
+		if (!(desc->status & IRQ_NOAUTOEN)) {
+			desc->depth = 0;
+			desc->status &= ~IRQ_DISABLED;
+			if (desc->chip->startup)
+				desc->chip->startup(irq);
+			else
+				desc->chip->enable(irq);
+		}
 	}
 	spin_unlock_irqrestore(&desc->lock, flags);
 
-- 
cgit v1.2.3


From 6a6de9ef5850d063c3d3fb50784bfe3a6d0712c6 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 29 Jun 2006 02:24:51 -0700
Subject: [PATCH] genirq: core

Core genirq support: add the irq-chip and irq-flow abstractions.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/irq.h    | 160 +++++++++++++++++++++++++++++++++++++++++++------
 kernel/irq/autoprobe.c |   9 ++-
 kernel/irq/handle.c    |  10 ++++
 kernel/irq/internals.h |   6 ++
 kernel/irq/manage.c    |  14 +++++
 kernel/irq/resend.c    |   4 +-
 kernel/irq/spurious.c  |   1 +
 7 files changed, 182 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index 14d7e94048dd..437f2c635db6 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -43,20 +43,36 @@
 #define IRQ_NOPROBE	512	/* IRQ is not valid for probing */
 #define IRQ_NOREQUEST	1024	/* IRQ cannot be requested */
 #define IRQ_NOAUTOEN	2048	/* IRQ will not be enabled on request irq */
+#define IRQ_DELAYED_DISABLE \
+			4096	/* IRQ disable (masking) happens delayed. */
+
+/*
+ * IRQ types, see also include/linux/interrupt.h
+ */
+#define IRQ_TYPE_NONE		0x0000		/* Default, unspecified type */
+#define IRQ_TYPE_EDGE_RISING	0x0001		/* Edge rising type */
+#define IRQ_TYPE_EDGE_FALLING	0x0002		/* Edge falling type */
+#define IRQ_TYPE_EDGE_BOTH (IRQ_TYPE_EDGE_FALLING | IRQ_TYPE_EDGE_RISING)
+#define IRQ_TYPE_LEVEL_HIGH	0x0004		/* Level high type */
+#define IRQ_TYPE_LEVEL_LOW	0x0008		/* Level low type */
+#define IRQ_TYPE_SIMPLE		0x0010		/* Simple type */
+#define IRQ_TYPE_PERCPU		0x0020		/* Per CPU type */
+#define IRQ_TYPE_PROBE		0x0040		/* Probing in progress */
+
+struct proc_dir_entry;
+
 /**
- * struct hw_interrupt_type - hardware interrupt type descriptor
+ * struct irq_chip - hardware interrupt chip descriptor
  *
  * @name:		name for /proc/interrupts
  * @startup:		start up the interrupt (defaults to ->enable if NULL)
  * @shutdown:		shut down the interrupt (defaults to ->disable if NULL)
  * @enable:		enable the interrupt (defaults to chip->unmask if NULL)
  * @disable:		disable the interrupt (defaults to chip->mask if NULL)
- * @handle_irq:		irq flow handler called from the arch IRQ glue code
  * @ack:		start of a new interrupt
  * @mask:		mask an interrupt source
  * @mask_ack:		ack and mask an interrupt source
  * @unmask:		unmask an interrupt source
- * @hold:		same interrupt while the handler is running
  * @end:		end of interrupt
  * @set_affinity:	set the CPU affinity on SMP machines
  * @retrigger:		resend an IRQ to the CPU
@@ -64,33 +80,45 @@
  * @set_wake:		enable/disable power-management wake-on of an IRQ
  *
  * @release:		release function solely used by UML
+ * @typename:		obsoleted by name, kept as migration helper
  */
-struct hw_interrupt_type {
-	const char	*typename;
+struct irq_chip {
+	const char	*name;
 	unsigned int	(*startup)(unsigned int irq);
 	void		(*shutdown)(unsigned int irq);
 	void		(*enable)(unsigned int irq);
 	void		(*disable)(unsigned int irq);
+
 	void		(*ack)(unsigned int irq);
+	void		(*mask)(unsigned int irq);
+	void		(*mask_ack)(unsigned int irq);
+	void		(*unmask)(unsigned int irq);
+
 	void		(*end)(unsigned int irq);
 	void		(*set_affinity)(unsigned int irq, cpumask_t dest);
 	int		(*retrigger)(unsigned int irq);
+	int		(*set_type)(unsigned int irq, unsigned int flow_type);
+	int		(*set_wake)(unsigned int irq, unsigned int on);
 
 	/* Currently used only by UML, might disappear one day.*/
 #ifdef CONFIG_IRQ_RELEASE_METHOD
 	void		(*release)(unsigned int irq, void *dev_id);
 #endif
+	/*
+	 * For compatibility, ->typename is copied into ->name.
+	 * Will disappear.
+	 */
+	const char	*typename;
 };
 
-typedef struct hw_interrupt_type  hw_irq_controller;
-
-struct proc_dir_entry;
-
 /**
  * struct irq_desc - interrupt descriptor
  *
- * @handler:		interrupt type dependent handler functions
- * @handler_data:	data for the type handlers
+ * @handle_irq:		highlevel irq-events handler [if NULL, __do_IRQ()]
+ * @chip:		low level interrupt hardware access
+ * @handler_data:	per-IRQ data for the irq_chip methods
+ * @chip_data:		platform-specific per-chip private data for the chip
+ *			methods, to allow shared chip implementations
  * @action:		the irq action chain
  * @status:		status information
  * @depth:		disable-depth, for nested irq_disable() calls
@@ -98,6 +126,7 @@ struct proc_dir_entry;
  * @irqs_unhandled:	stats field for spurious unhandled interrupts
  * @lock:		locking for SMP
  * @affinity:		IRQ affinity on SMP
+ * @cpu:		cpu index useful for balancing
  * @pending_mask:	pending rebalanced interrupts
  * @move_irq:		need to re-target IRQ destination
  * @dir:		/proc/irq/ procfs entry
@@ -106,16 +135,22 @@ struct proc_dir_entry;
  * Pad this out to 32 bytes for cache and indexing reasons.
  */
 struct irq_desc {
-	hw_irq_controller	*chip;
+	void fastcall		(*handle_irq)(unsigned int irq,
+					      struct irq_desc *desc,
+					      struct pt_regs *regs);
+	struct irq_chip		*chip;
+	void			*handler_data;
 	void			*chip_data;
 	struct irqaction	*action;	/* IRQ action list */
 	unsigned int		status;		/* IRQ status */
+
 	unsigned int		depth;		/* nested irq disables */
 	unsigned int		irq_count;	/* For detecting broken IRQs */
 	unsigned int		irqs_unhandled;
 	spinlock_t		lock;
 #ifdef CONFIG_SMP
 	cpumask_t		affinity;
+	unsigned int		cpu;
 #endif
 #if defined(CONFIG_GENERIC_PENDING_IRQ) || defined(CONFIG_IRQBALANCE)
 	cpumask_t		pending_mask;
@@ -131,6 +166,9 @@ extern struct irq_desc irq_desc[NR_IRQS];
 /*
  * Migration helpers for obsolete names, they will go away:
  */
+#define hw_interrupt_type	irq_chip
+typedef struct irq_chip		hw_irq_controller;
+#define no_irq_type		no_irq_chip
 typedef struct irq_desc		irq_desc_t;
 
 /*
@@ -138,6 +176,17 @@ typedef struct irq_desc		irq_desc_t;
  */
 #include <asm/hw_irq.h>
 
+/*
+ * Architectures call this to let the generic IRQ layer
+ * handle an interrupt:
+ */
+static inline void generic_handle_irq(unsigned int irq, struct pt_regs *regs)
+{
+	struct irq_desc *desc = irq_desc + irq;
+
+	desc->handle_irq(irq, desc, regs);
+}
+
 extern int setup_irq(unsigned int irq, struct irqaction *new);
 
 #ifdef CONFIG_GENERIC_HARDIRQS
@@ -236,27 +285,100 @@ static inline int select_smp_affinity(unsigned int irq)
 #endif
 
 extern int no_irq_affinity;
-extern int noirqdebug_setup(char *str);
 
-extern irqreturn_t handle_IRQ_event(unsigned int irq, struct pt_regs *regs,
-				    struct irqaction *action);
+/* Handle irq action chains: */
+extern int handle_IRQ_event(unsigned int irq, struct pt_regs *regs,
+			    struct irqaction *action);
+
+/*
+ * Built-in IRQ handlers for various IRQ types,
+ * callable via desc->chip->handle_irq()
+ */
+extern void fastcall
+handle_level_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs);
+extern void fastcall
+handle_fastack_irq(unsigned int irq, struct irq_desc *desc,
+			 struct pt_regs *regs);
+extern void fastcall
+handle_edge_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs);
+extern void fastcall
+handle_simple_irq(unsigned int irq, struct irq_desc *desc,
+		  struct pt_regs *regs);
+extern void fastcall
+handle_percpu_irq(unsigned int irq, struct irq_desc *desc,
+		  struct pt_regs *regs);
+extern void fastcall
+handle_bad_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs);
+
+/*
+ * Get a descriptive string for the highlevel handler, for
+ * /proc/interrupts output:
+ */
+extern const char *
+handle_irq_name(void fastcall (*handle)(unsigned int, struct irq_desc *,
+					struct pt_regs *));
+
 /*
- * Explicit fastcall, because i386 4KSTACKS calls it from assembly:
+ * Monolithic do_IRQ implementation.
+ * (is an explicit fastcall, because i386 4KSTACKS calls it from assembly)
  */
 extern fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs);
 
+/* Handling of unhandled and spurious interrupts: */
 extern void note_interrupt(unsigned int irq, struct irq_desc *desc,
 			   int action_ret, struct pt_regs *regs);
-extern int can_request_irq(unsigned int irq, unsigned long irqflags);
 
 /* Resending of interrupts :*/
 void check_irq_resend(struct irq_desc *desc, unsigned int irq);
 
+/* Initialize /proc/irq/ */
 extern void init_irq_proc(void);
 
-#endif /* CONFIG_GENERIC_HARDIRQS */
+/* Enable/disable irq debugging output: */
+extern int noirqdebug_setup(char *str);
+
+/* Checks whether the interrupt can be requested by request_irq(): */
+extern int can_request_irq(unsigned int irq, unsigned long irqflags);
+
+/* Dummy irq-chip implementation: */
+extern struct irq_chip no_irq_chip;
+
+extern void
+set_irq_chip_and_handler(unsigned int irq, struct irq_chip *chip,
+			 void fastcall (*handle)(unsigned int,
+						 struct irq_desc *,
+						 struct pt_regs *));
+extern void
+__set_irq_handler(unsigned int irq,
+		  void fastcall (*handle)(unsigned int, struct irq_desc *,
+					  struct pt_regs *),
+		  int is_chained);
 
-extern hw_irq_controller no_irq_type;  /* needed in every arch ? */
+/*
+ * Set a highlevel flow handler for a given IRQ:
+ */
+static inline void
+set_irq_handler(unsigned int irq,
+		void fastcall (*handle)(unsigned int, struct irq_desc *,
+					struct pt_regs *))
+{
+	__set_irq_handler(irq, handle, 0);
+}
+
+/*
+ * Set a highlevel chained flow handler for a given IRQ.
+ * (a chained handler is automatically enabled and set to
+ *  IRQ_NOREQUEST and IRQ_NOPROBE)
+ */
+static inline void
+set_irq_chained_handler(unsigned int irq,
+			void fastcall (*handle)(unsigned int, struct irq_desc *,
+						struct pt_regs *))
+{
+	__set_irq_handler(irq, handle, 1);
+}
+
+#endif /* CONFIG_GENERIC_HARDIRQS */
 
 #endif /* !CONFIG_S390 */
 
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c
index ed98c7d46cf2..cfdb63eb5c94 100644
--- a/kernel/irq/autoprobe.c
+++ b/kernel/irq/autoprobe.c
@@ -40,8 +40,15 @@ unsigned long probe_irq_on(void)
 		desc = irq_desc + i;
 
 		spin_lock_irq(&desc->lock);
-		if (!desc->action && !(desc->status & IRQ_NOPROBE))
+		if (!desc->action && !(desc->status & IRQ_NOPROBE)) {
+			/*
+			 * Some chips need to know about probing in
+			 * progress:
+			 */
+			if (desc->chip->set_type)
+				desc->chip->set_type(i, IRQ_TYPE_PROBE);
 			desc->chip->startup(i);
+		}
 		spin_unlock_irq(&desc->lock);
 	}
 
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index bddcb8f5fea4..a04b516afa59 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -18,6 +18,16 @@
 
 #include "internals.h"
 
+/**
+ * handle_bad_irq - handle spurious and unhandled irqs
+ */
+void fastcall
+handle_bad_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs)
+{
+	kstat_this_cpu.irqs[irq]++;
+	ack_bad_irq(irq);
+}
+
 /*
  * Linux has a controller-independent interrupt architecture.
  * Every controller has a 'controller-template', that is used
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 46feba630266..2ba8ae3c8e96 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -4,6 +4,12 @@
 
 extern int noirqdebug;
 
+/* Set default functions for irq_chip structures: */
+extern void irq_chip_set_defaults(struct irq_chip *chip);
+
+/* Set default handler: */
+extern void compat_irq_chip_set_default_handler(struct irq_desc *desc);
+
 #ifdef CONFIG_PROC_FS
 extern void register_irq_proc(unsigned int irq);
 extern void register_handler_proc(unsigned int irq, struct irqaction *action);
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 1a2e7663096a..b61784ee78b2 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -153,6 +153,17 @@ int can_request_irq(unsigned int irq, unsigned long irqflags)
 	return !action;
 }
 
+void compat_irq_chip_set_default_handler(struct irq_desc *desc)
+{
+	/*
+	 * If the architecture still has not overriden
+	 * the flow handler then zap the default. This
+	 * should catch incorrect flow-type setting.
+	 */
+	if (desc->handle_irq == &handle_bad_irq)
+		desc->handle_irq = NULL;
+}
+
 /*
  * Internal function to register an irqaction - typically used to
  * allocate special interrupts that are part of the architecture.
@@ -217,6 +228,9 @@ int setup_irq(unsigned int irq, struct irqaction *new)
 		desc->status |= IRQ_PER_CPU;
 #endif
 	if (!shared) {
+		irq_chip_set_defaults(desc->chip);
+		compat_irq_chip_set_default_handler(desc);
+
 		desc->status &= ~(IRQ_AUTODETECT | IRQ_WAITING |
 				  IRQ_INPROGRESS);
 
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index 096b102fb392..872f91ba2ce8 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -37,9 +37,9 @@ static void resend_irqs(unsigned long arg)
 		irq = find_first_bit(irqs_resend, NR_IRQS);
 		clear_bit(irq, irqs_resend);
 		desc = irq_desc + irq;
-		spin_lock_irqsave(&desc->lock, flags);
+		local_irq_disable();
 		desc->handle_irq(irq, desc, NULL);
-		spin_unlock_irqrestore(&desc->lock, flags);
+		local_irq_enable();
 	}
 }
 
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 3a0a62123301..ca187b83f897 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -168,6 +168,7 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
 		 */
 		printk(KERN_EMERG "Disabling IRQ #%d\n", irq);
 		desc->status |= IRQ_DISABLED;
+		desc->depth = 1;
 		desc->chip->disable(irq);
 	}
 	desc->irqs_unhandled = 0;
-- 
cgit v1.2.3


From dae8620421833bb2e9a01c4ccc42bdc3759b81df Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 29 Jun 2006 02:24:52 -0700
Subject: [PATCH] genirq MSI fixes

This is a fixed up and cleaned up replacement for genirq-msi-fixes.patch,
which should solve the i386 4KSTACKS problem.  I also added Ben's idea of
pushing the __do_IRQ() check into generic_handle_irq().

I booted this with MSI enabled, but i only have MSI devices, not MSI-X
devices.  I'd still expect MSI-X to work now.

irqchip migration helper: call __do_IRQ() if a descriptor is attached to an
irqtype-style controller.  This also fixes MSI-X IRQ handling on i386 and
x86_64.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Roland Dreier <rolandd@cisco.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/irq.c |  5 +++++
 include/linux/irq.h    | 27 ++++++++++++++++-----------
 2 files changed, 21 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/arch/i386/kernel/irq.c b/arch/i386/kernel/irq.c
index 3336cef9605a..16b491703967 100644
--- a/arch/i386/kernel/irq.c
+++ b/arch/i386/kernel/irq.c
@@ -82,6 +82,10 @@ fastcall unsigned int do_IRQ(struct pt_regs *regs)
 	}
 #endif
 
+	if (!irq_desc[irq].handle_irq) {
+		__do_IRQ(irq, regs);
+		goto out_exit;
+	}
 #ifdef CONFIG_4KSTACKS
 
 	curctx = (union irq_ctx *) current_thread_info();
@@ -121,6 +125,7 @@ fastcall unsigned int do_IRQ(struct pt_regs *regs)
 #endif
 		__do_IRQ(irq, regs);
 
+out_exit:
 	irq_exit();
 
 	return 1;
diff --git a/include/linux/irq.h b/include/linux/irq.h
index 437f2c635db6..b40771dd114a 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -176,17 +176,6 @@ typedef struct irq_desc		irq_desc_t;
  */
 #include <asm/hw_irq.h>
 
-/*
- * Architectures call this to let the generic IRQ layer
- * handle an interrupt:
- */
-static inline void generic_handle_irq(unsigned int irq, struct pt_regs *regs)
-{
-	struct irq_desc *desc = irq_desc + irq;
-
-	desc->handle_irq(irq, desc, regs);
-}
-
 extern int setup_irq(unsigned int irq, struct irqaction *new);
 
 #ifdef CONFIG_GENERIC_HARDIRQS
@@ -324,6 +313,22 @@ handle_irq_name(void fastcall (*handle)(unsigned int, struct irq_desc *,
  */
 extern fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs);
 
+/*
+ * Architectures call this to let the generic IRQ layer
+ * handle an interrupt. If the descriptor is attached to an
+ * irqchip-style controller then we call the ->handle_irq() handler,
+ * and it calls __do_IRQ() if it's attached to an irqtype-style controller.
+ */
+static inline void generic_handle_irq(unsigned int irq, struct pt_regs *regs)
+{
+	struct irq_desc *desc = irq_desc + irq;
+
+	if (likely(desc->handle_irq))
+		desc->handle_irq(irq, desc, regs);
+	else
+		__do_IRQ(irq, regs);
+}
+
 /* Handling of unhandled and spurious interrupts: */
 extern void note_interrupt(unsigned int irq, struct irq_desc *desc,
 			   int action_ret, struct pt_regs *regs);
-- 
cgit v1.2.3


From dd87eb3a24c4527741122713e223d74b85d43c85 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 29 Jun 2006 02:24:53 -0700
Subject: [PATCH] genirq: add irq-chip support

Enable platforms to use the irq-chip and irq-flow abstractions: allow setting
of the chip, the type and provide highlevel handlers for common irq-flows.

[rostedt@goodmis.org: misroute-irq: Don't call desc->chip->end because of edge interrupts]
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/irq.h   |  11 ++
 kernel/irq/Makefile   |   2 +-
 kernel/irq/chip.c     | 525 ++++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/irq/spurious.c |   2 +-
 4 files changed, 538 insertions(+), 2 deletions(-)
 create mode 100644 kernel/irq/chip.c

(limited to 'include/linux')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index b40771dd114a..ca8d2a849cff 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -383,6 +383,17 @@ set_irq_chained_handler(unsigned int irq,
 	__set_irq_handler(irq, handle, 1);
 }
 
+/* Set/get chip/data for an IRQ: */
+
+extern int set_irq_chip(unsigned int irq, struct irq_chip *chip);
+extern int set_irq_data(unsigned int irq, void *data);
+extern int set_irq_chip_data(unsigned int irq, void *data);
+extern int set_irq_type(unsigned int irq, unsigned int type);
+
+#define get_irq_chip(irq)	(irq_desc[irq].chip)
+#define get_irq_chip_data(irq)	(irq_desc[irq].chip_data)
+#define get_irq_data(irq)	(irq_desc[irq].handler_data)
+
 #endif /* CONFIG_GENERIC_HARDIRQS */
 
 #endif /* !CONFIG_S390 */
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index 627ace98d4a2..1dab0ac3f797 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -1,5 +1,5 @@
 
-obj-y := handle.o manage.o spurious.o resend.o
+obj-y := handle.o manage.o spurious.o resend.o chip.o
 obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o
 obj-$(CONFIG_PROC_FS) += proc.o
 obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
new file mode 100644
index 000000000000..8736f2ca8a3b
--- /dev/null
+++ b/kernel/irq/chip.c
@@ -0,0 +1,525 @@
+/*
+ * linux/kernel/irq/chip.c
+ *
+ * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar
+ * Copyright (C) 2005-2006, Thomas Gleixner, Russell King
+ *
+ * This file contains the core interrupt handling code, for irq-chip
+ * based architectures.
+ *
+ * Detailed information is available in Documentation/DocBook/genericirq
+ */
+
+#include <linux/irq.h>
+#include <linux/module.h>
+#include <linux/interrupt.h>
+#include <linux/kernel_stat.h>
+
+#include "internals.h"
+
+/**
+ *	set_irq_chip - set the irq chip for an irq
+ *	@irq:	irq number
+ *	@chip:	pointer to irq chip description structure
+ */
+int set_irq_chip(unsigned int irq, struct irq_chip *chip)
+{
+	struct irq_desc *desc;
+	unsigned long flags;
+
+	if (irq >= NR_IRQS) {
+		printk(KERN_ERR "Trying to install chip for IRQ%d\n", irq);
+		WARN_ON(1);
+		return -EINVAL;
+	}
+
+	if (!chip)
+		chip = &no_irq_chip;
+
+	desc = irq_desc + irq;
+	spin_lock_irqsave(&desc->lock, flags);
+	irq_chip_set_defaults(chip);
+	desc->chip = chip;
+	/*
+	 * For compatibility only:
+	 */
+	desc->chip = chip;
+	spin_unlock_irqrestore(&desc->lock, flags);
+
+	return 0;
+}
+EXPORT_SYMBOL(set_irq_chip);
+
+/**
+ *	set_irq_type - set the irq type for an irq
+ *	@irq:	irq number
+ *	@type:	interrupt type - see include/linux/interrupt.h
+ */
+int set_irq_type(unsigned int irq, unsigned int type)
+{
+	struct irq_desc *desc;
+	unsigned long flags;
+	int ret = -ENXIO;
+
+	if (irq >= NR_IRQS) {
+		printk(KERN_ERR "Trying to set irq type for IRQ%d\n", irq);
+		return -ENODEV;
+	}
+
+	desc = irq_desc + irq;
+	if (desc->chip->set_type) {
+		spin_lock_irqsave(&desc->lock, flags);
+		ret = desc->chip->set_type(irq, type);
+		spin_unlock_irqrestore(&desc->lock, flags);
+	}
+	return ret;
+}
+EXPORT_SYMBOL(set_irq_type);
+
+/**
+ *	set_irq_data - set irq type data for an irq
+ *	@irq:	Interrupt number
+ *	@data:	Pointer to interrupt specific data
+ *
+ *	Set the hardware irq controller data for an irq
+ */
+int set_irq_data(unsigned int irq, void *data)
+{
+	struct irq_desc *desc;
+	unsigned long flags;
+
+	if (irq >= NR_IRQS) {
+		printk(KERN_ERR
+		       "Trying to install controller data for IRQ%d\n", irq);
+		return -EINVAL;
+	}
+
+	desc = irq_desc + irq;
+	spin_lock_irqsave(&desc->lock, flags);
+	desc->handler_data = data;
+	spin_unlock_irqrestore(&desc->lock, flags);
+	return 0;
+}
+EXPORT_SYMBOL(set_irq_data);
+
+/**
+ *	set_irq_chip_data - set irq chip data for an irq
+ *	@irq:	Interrupt number
+ *	@data:	Pointer to chip specific data
+ *
+ *	Set the hardware irq chip data for an irq
+ */
+int set_irq_chip_data(unsigned int irq, void *data)
+{
+	struct irq_desc *desc = irq_desc + irq;
+	unsigned long flags;
+
+	if (irq >= NR_IRQS || !desc->chip) {
+		printk(KERN_ERR "BUG: bad set_irq_chip_data(IRQ#%d)\n", irq);
+		return -EINVAL;
+	}
+
+	spin_lock_irqsave(&desc->lock, flags);
+	desc->chip_data = data;
+	spin_unlock_irqrestore(&desc->lock, flags);
+
+	return 0;
+}
+EXPORT_SYMBOL(set_irq_chip_data);
+
+/*
+ * default enable function
+ */
+static void default_enable(unsigned int irq)
+{
+	struct irq_desc *desc = irq_desc + irq;
+
+	desc->chip->unmask(irq);
+	desc->status &= ~IRQ_MASKED;
+}
+
+/*
+ * default disable function
+ */
+static void default_disable(unsigned int irq)
+{
+	struct irq_desc *desc = irq_desc + irq;
+
+	if (!(desc->status & IRQ_DELAYED_DISABLE))
+		irq_desc[irq].chip->mask(irq);
+}
+
+/*
+ * default startup function
+ */
+static unsigned int default_startup(unsigned int irq)
+{
+	irq_desc[irq].chip->enable(irq);
+
+	return 0;
+}
+
+/*
+ * Fixup enable/disable function pointers
+ */
+void irq_chip_set_defaults(struct irq_chip *chip)
+{
+	if (!chip->enable)
+		chip->enable = default_enable;
+	if (!chip->disable)
+		chip->disable = default_disable;
+	if (!chip->startup)
+		chip->startup = default_startup;
+	if (!chip->shutdown)
+		chip->shutdown = chip->disable;
+	if (!chip->name)
+		chip->name = chip->typename;
+}
+
+static inline void mask_ack_irq(struct irq_desc *desc, int irq)
+{
+	if (desc->chip->mask_ack)
+		desc->chip->mask_ack(irq);
+	else {
+		desc->chip->mask(irq);
+		desc->chip->ack(irq);
+	}
+}
+
+/**
+ *	handle_simple_irq - Simple and software-decoded IRQs.
+ *	@irq:	the interrupt number
+ *	@desc:	the interrupt description structure for this irq
+ *	@regs:	pointer to a register structure
+ *
+ *	Simple interrupts are either sent from a demultiplexing interrupt
+ *	handler or come from hardware, where no interrupt hardware control
+ *	is necessary.
+ *
+ *	Note: The caller is expected to handle the ack, clear, mask and
+ *	unmask issues if necessary.
+ */
+void fastcall
+handle_simple_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs)
+{
+	struct irqaction *action;
+	irqreturn_t action_ret;
+	const unsigned int cpu = smp_processor_id();
+
+	spin_lock(&desc->lock);
+
+	if (unlikely(desc->status & IRQ_INPROGRESS))
+		goto out_unlock;
+	desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
+	kstat_cpu(cpu).irqs[irq]++;
+
+	action = desc->action;
+	if (unlikely(!action || (desc->status & IRQ_DISABLED)))
+		goto out_unlock;
+
+	desc->status |= IRQ_INPROGRESS;
+	spin_unlock(&desc->lock);
+
+	action_ret = handle_IRQ_event(irq, regs, action);
+	if (!noirqdebug)
+		note_interrupt(irq, desc, action_ret, regs);
+
+	spin_lock(&desc->lock);
+	desc->status &= ~IRQ_INPROGRESS;
+out_unlock:
+	spin_unlock(&desc->lock);
+}
+
+/**
+ *	handle_level_irq - Level type irq handler
+ *	@irq:	the interrupt number
+ *	@desc:	the interrupt description structure for this irq
+ *	@regs:	pointer to a register structure
+ *
+ *	Level type interrupts are active as long as the hardware line has
+ *	the active level. This may require to mask the interrupt and unmask
+ *	it after the associated handler has acknowledged the device, so the
+ *	interrupt line is back to inactive.
+ */
+void fastcall
+handle_level_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs)
+{
+	unsigned int cpu = smp_processor_id();
+	struct irqaction *action;
+	irqreturn_t action_ret;
+
+	spin_lock(&desc->lock);
+	mask_ack_irq(desc, irq);
+
+	if (unlikely(desc->status & IRQ_INPROGRESS))
+		goto out;
+	desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
+	kstat_cpu(cpu).irqs[irq]++;
+
+	/*
+	 * If its disabled or no action available
+	 * keep it masked and get out of here
+	 */
+	action = desc->action;
+	if (unlikely(!action || (desc->status & IRQ_DISABLED)))
+		goto out;
+
+	desc->status |= IRQ_INPROGRESS;
+	spin_unlock(&desc->lock);
+
+	action_ret = handle_IRQ_event(irq, regs, action);
+	if (!noirqdebug)
+		note_interrupt(irq, desc, action_ret, regs);
+
+	spin_lock(&desc->lock);
+	desc->status &= ~IRQ_INPROGRESS;
+out:
+	if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask)
+		desc->chip->unmask(irq);
+	spin_unlock(&desc->lock);
+}
+
+/**
+ *	handle_fastack_irq - irq handler for transparent controllers
+ *	@irq:	the interrupt number
+ *	@desc:	the interrupt description structure for this irq
+ *	@regs:	pointer to a register structure
+ *
+ *	Only a single callback will be issued to the chip: an ->ack()
+ *	call when the interrupt has been serviced. This enables support
+ *	for modern forms of interrupt handlers, which handle the flow
+ *	details in hardware, transparently.
+ */
+void fastcall
+handle_fastack_irq(unsigned int irq, struct irq_desc *desc,
+		   struct pt_regs *regs)
+{
+	unsigned int cpu = smp_processor_id();
+	struct irqaction *action;
+	irqreturn_t action_ret;
+
+	spin_lock(&desc->lock);
+
+	if (unlikely(desc->status & IRQ_INPROGRESS))
+		goto out;
+
+	desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
+	kstat_cpu(cpu).irqs[irq]++;
+
+	/*
+	 * If its disabled or no action available
+	 * keep it masked and get out of here
+	 */
+	action = desc->action;
+	if (unlikely(!action || (desc->status & IRQ_DISABLED)))
+		goto out;
+
+	desc->status |= IRQ_INPROGRESS;
+	spin_unlock(&desc->lock);
+
+	action_ret = handle_IRQ_event(irq, regs, action);
+	if (!noirqdebug)
+		note_interrupt(irq, desc, action_ret, regs);
+
+	spin_lock(&desc->lock);
+	desc->status &= ~IRQ_INPROGRESS;
+out:
+	if (!(desc->status & IRQ_DISABLED))
+		desc->chip->ack(irq);
+	else
+		desc->chip->mask(irq);
+
+	spin_unlock(&desc->lock);
+}
+
+/**
+ *	handle_edge_irq - edge type IRQ handler
+ *	@irq:	the interrupt number
+ *	@desc:	the interrupt description structure for this irq
+ *	@regs:	pointer to a register structure
+ *
+ *	Interrupt occures on the falling and/or rising edge of a hardware
+ *	signal. The occurence is latched into the irq controller hardware
+ *	and must be acked in order to be reenabled. After the ack another
+ *	interrupt can happen on the same source even before the first one
+ *	is handled by the assosiacted event handler. If this happens it
+ *	might be necessary to disable (mask) the interrupt depending on the
+ *	controller hardware. This requires to reenable the interrupt inside
+ *	of the loop which handles the interrupts which have arrived while
+ *	the handler was running. If all pending interrupts are handled, the
+ *	loop is left.
+ */
+void fastcall
+handle_edge_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs)
+{
+	const unsigned int cpu = smp_processor_id();
+
+	spin_lock(&desc->lock);
+
+	desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
+
+	/*
+	 * If we're currently running this IRQ, or its disabled,
+	 * we shouldn't process the IRQ. Mark it pending, handle
+	 * the necessary masking and go out
+	 */
+	if (unlikely((desc->status & (IRQ_INPROGRESS | IRQ_DISABLED)) ||
+		    !desc->action)) {
+		desc->status |= (IRQ_PENDING | IRQ_MASKED);
+		mask_ack_irq(desc, irq);
+		goto out_unlock;
+	}
+
+	kstat_cpu(cpu).irqs[irq]++;
+
+	/* Start handling the irq */
+	desc->chip->ack(irq);
+
+	/* Mark the IRQ currently in progress.*/
+	desc->status |= IRQ_INPROGRESS;
+
+	do {
+		struct irqaction *action = desc->action;
+		irqreturn_t action_ret;
+
+		if (unlikely(!action)) {
+			desc->chip->mask(irq);
+			goto out_unlock;
+		}
+
+		/*
+		 * When another irq arrived while we were handling
+		 * one, we could have masked the irq.
+		 * Renable it, if it was not disabled in meantime.
+		 */
+		if (unlikely((desc->status &
+			       (IRQ_PENDING | IRQ_MASKED | IRQ_DISABLED)) ==
+			      (IRQ_PENDING | IRQ_MASKED))) {
+			desc->chip->unmask(irq);
+			desc->status &= ~IRQ_MASKED;
+		}
+
+		desc->status &= ~IRQ_PENDING;
+		spin_unlock(&desc->lock);
+		action_ret = handle_IRQ_event(irq, regs, action);
+		if (!noirqdebug)
+			note_interrupt(irq, desc, action_ret, regs);
+		spin_lock(&desc->lock);
+
+	} while ((desc->status & (IRQ_PENDING | IRQ_DISABLED)) == IRQ_PENDING);
+
+	desc->status &= ~IRQ_INPROGRESS;
+out_unlock:
+	spin_unlock(&desc->lock);
+}
+
+#ifdef CONFIG_SMP
+/**
+ *	handle_percpu_IRQ - Per CPU local irq handler
+ *	@irq:	the interrupt number
+ *	@desc:	the interrupt description structure for this irq
+ *	@regs:	pointer to a register structure
+ *
+ *	Per CPU interrupts on SMP machines without locking requirements
+ */
+void fastcall
+handle_percpu_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs)
+{
+	irqreturn_t action_ret;
+
+	kstat_this_cpu.irqs[irq]++;
+
+	if (desc->chip->ack)
+		desc->chip->ack(irq);
+
+	action_ret = handle_IRQ_event(irq, regs, desc->action);
+	if (!noirqdebug)
+		note_interrupt(irq, desc, action_ret, regs);
+
+	if (desc->chip->eoi)
+		desc->chip->eoi(irq);
+}
+
+#endif /* CONFIG_SMP */
+
+void
+__set_irq_handler(unsigned int irq,
+		  void fastcall (*handle)(unsigned int, irq_desc_t *,
+					  struct pt_regs *),
+		  int is_chained)
+{
+	struct irq_desc *desc;
+	unsigned long flags;
+
+	if (irq >= NR_IRQS) {
+		printk(KERN_ERR
+		       "Trying to install type control for IRQ%d\n", irq);
+		return;
+	}
+
+	desc = irq_desc + irq;
+
+	if (!handle)
+		handle = handle_bad_irq;
+
+	if (is_chained && desc->chip == &no_irq_chip)
+		printk(KERN_WARNING "Trying to install "
+		       "chained interrupt type for IRQ%d\n", irq);
+
+	spin_lock_irqsave(&desc->lock, flags);
+
+	/* Uninstall? */
+	if (handle == handle_bad_irq) {
+		if (desc->chip != &no_irq_chip) {
+			desc->chip->mask(irq);
+			desc->chip->ack(irq);
+		}
+		desc->status |= IRQ_DISABLED;
+		desc->depth = 1;
+	}
+	desc->handle_irq = handle;
+
+	if (handle != handle_bad_irq && is_chained) {
+		desc->status &= ~IRQ_DISABLED;
+		desc->status |= IRQ_NOREQUEST | IRQ_NOPROBE;
+		desc->depth = 0;
+		desc->chip->unmask(irq);
+	}
+	spin_unlock_irqrestore(&desc->lock, flags);
+}
+
+void
+set_irq_chip_and_handler(unsigned int irq, struct irq_chip *chip,
+			 void fastcall (*handle)(unsigned int,
+						 struct irq_desc *,
+						 struct pt_regs *))
+{
+	set_irq_chip(irq, chip);
+	__set_irq_handler(irq, handle, 0);
+}
+
+/*
+ * Get a descriptive string for the highlevel handler, for
+ * /proc/interrupts output:
+ */
+const char *
+handle_irq_name(void fastcall (*handle)(unsigned int, struct irq_desc *,
+					struct pt_regs *))
+{
+	if (handle == handle_level_irq)
+		return "level ";
+	if (handle == handle_fastack_irq)
+		return "level ";
+	if (handle == handle_edge_irq)
+		return "edge  ";
+	if (handle == handle_simple_irq)
+		return "simple";
+#ifdef CONFIG_SMP
+	if (handle == handle_percpu_irq)
+		return "percpu";
+#endif
+	if (handle == handle_bad_irq)
+		return "bad   ";
+
+	return NULL;
+}
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index ca187b83f897..b483deed311c 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -79,7 +79,7 @@ static int misrouted_irq(int irq, struct pt_regs *regs)
 		 * If we did actual work for the real IRQ line we must let the
 		 * IRQ controller clean up too
 		 */
-		if (work)
+		if (work && desc->chip && desc->chip->end)
 			desc->chip->end(i);
 		spin_unlock(&desc->lock);
 	}
-- 
cgit v1.2.3


From ba9a2331bae5da8f65be3722b9e2d210f1987857 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 29 Jun 2006 02:24:55 -0700
Subject: [PATCH] genirq: add irq-wake (power-management) support

Enable platforms to set the irq-wake (power-management) properties of an IRQ.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/interrupt.h | 14 ++++++++++++++
 kernel/irq/manage.c       | 21 +++++++++++++++++++++
 2 files changed, 35 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 70741e170114..db2a63a11633 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -36,6 +36,20 @@ extern void free_irq(unsigned int, void *);
 extern void disable_irq_nosync(unsigned int irq);
 extern void disable_irq(unsigned int irq);
 extern void enable_irq(unsigned int irq);
+
+/* IRQ wakeup (PM) control: */
+extern int set_irq_wake(unsigned int irq, unsigned int on);
+
+static inline int enable_irq_wake(unsigned int irq)
+{
+	return set_irq_wake(irq, 1);
+}
+
+static inline int disable_irq_wake(unsigned int irq)
+{
+	return set_irq_wake(irq, 0);
+}
+
 #endif
 
 #ifndef __ARCH_SET_SOFTIRQ_PENDING
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index b61784ee78b2..3ed7aee84865 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -133,6 +133,27 @@ void enable_irq(unsigned int irq)
 }
 EXPORT_SYMBOL(enable_irq);
 
+/**
+ *	set_irq_wake - control irq power management wakeup
+ *	@irq:	interrupt to control
+ *	@on:	enable/disable power management wakeup
+ *
+ *	Enable/disable power management wakeup mode
+ */
+int set_irq_wake(unsigned int irq, unsigned int on)
+{
+	struct irq_desc *desc = irq_desc + irq;
+	unsigned long flags;
+	int ret = -ENXIO;
+
+	spin_lock_irqsave(&desc->lock, flags);
+	if (desc->chip->set_wake)
+		ret = desc->chip->set_wake(irq, on);
+	spin_unlock_irqrestore(&desc->lock, flags);
+	return ret;
+}
+EXPORT_SYMBOL(set_irq_wake);
+
 /*
  * Internal function that tells the architecture code whether a
  * particular irq has been exclusively allocated or is available
-- 
cgit v1.2.3


From f210be198ddd3f54b17d4aa6e69b829f75f226e5 Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Thu, 29 Jun 2006 02:25:00 -0700
Subject: [PATCH] genirq: add IRQ_TYPE_SENSE_MASK

Add a #define for the mask of the part of IRQ_TYPE that represents the
trigger type.  I use that in my in-progress work as I've standardized the
way the irq description in the firmware device-tree get translated to linux
useable things by using those constants.  Having this mask to isolate the
"trigger type" part of the flags is useful in a few places.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/irq.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index ca8d2a849cff..0d8eaf3e4036 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -55,6 +55,7 @@
 #define IRQ_TYPE_EDGE_BOTH (IRQ_TYPE_EDGE_FALLING | IRQ_TYPE_EDGE_RISING)
 #define IRQ_TYPE_LEVEL_HIGH	0x0004		/* Level high type */
 #define IRQ_TYPE_LEVEL_LOW	0x0008		/* Level low type */
+#define IRQ_TYPE_SENSE_MASK	0x000f		/* Mask of the above */
 #define IRQ_TYPE_SIMPLE		0x0010		/* Simple type */
 #define IRQ_TYPE_PERCPU		0x0020		/* Per CPU type */
 #define IRQ_TYPE_PROBE		0x0040		/* Probing in progress */
-- 
cgit v1.2.3


From 47c2a3aa4475d27073dd3c7e183fcc13f495c8f5 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 29 Jun 2006 02:25:03 -0700
Subject: [PATCH] genirq: add chip->eoi(), fastack -> fasteoi

Clean up the fastack concept by turning it into fasteoi and introducing the
->eoi() method for chips.

This also allows the cleanup of an i386 EOI quirk - now the quirk is
cleanly separated from the pure ACK implementation.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Roland Dreier <rolandd@cisco.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/irq.h |  6 ++++--
 kernel/irq/chip.c   | 25 +++++++++++--------------
 2 files changed, 15 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index 0d8eaf3e4036..0832149cdb18 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -74,7 +74,8 @@ struct proc_dir_entry;
  * @mask:		mask an interrupt source
  * @mask_ack:		ack and mask an interrupt source
  * @unmask:		unmask an interrupt source
- * @end:		end of interrupt
+ * @eoi:		end of interrupt - chip level
+ * @end:		end of interrupt - flow level
  * @set_affinity:	set the CPU affinity on SMP machines
  * @retrigger:		resend an IRQ to the CPU
  * @set_type:		set the flow type (IRQ_TYPE_LEVEL/etc.) of an IRQ
@@ -94,6 +95,7 @@ struct irq_chip {
 	void		(*mask)(unsigned int irq);
 	void		(*mask_ack)(unsigned int irq);
 	void		(*unmask)(unsigned int irq);
+	void		(*eoi)(unsigned int irq);
 
 	void		(*end)(unsigned int irq);
 	void		(*set_affinity)(unsigned int irq, cpumask_t dest);
@@ -287,7 +289,7 @@ extern int handle_IRQ_event(unsigned int irq, struct pt_regs *regs,
 extern void fastcall
 handle_level_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs);
 extern void fastcall
-handle_fastack_irq(unsigned int irq, struct irq_desc *desc,
+handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc,
 			 struct pt_regs *regs);
 extern void fastcall
 handle_edge_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs);
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index a99047a324eb..4a0952d9458b 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -280,18 +280,18 @@ out:
 }
 
 /**
- *	handle_fastack_irq - irq handler for transparent controllers
+ *	handle_fasteoi_irq - irq handler for transparent controllers
  *	@irq:	the interrupt number
  *	@desc:	the interrupt description structure for this irq
  *	@regs:	pointer to a register structure
  *
- *	Only a single callback will be issued to the chip: an ->ack()
+ *	Only a single callback will be issued to the chip: an ->eoi()
  *	call when the interrupt has been serviced. This enables support
  *	for modern forms of interrupt handlers, which handle the flow
  *	details in hardware, transparently.
  */
 void fastcall
-handle_fastack_irq(unsigned int irq, struct irq_desc *desc,
+handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc,
 		   struct pt_regs *regs)
 {
 	unsigned int cpu = smp_processor_id();
@@ -327,10 +327,7 @@ handle_fastack_irq(unsigned int irq, struct irq_desc *desc,
 	spin_lock(&desc->lock);
 	desc->status &= ~IRQ_INPROGRESS;
 out:
-	if (!(desc->status & IRQ_DISABLED))
-		desc->chip->ack(irq);
-	else
-		desc->chip->mask(irq);
+	desc->chip->eoi(irq);
 
 	spin_unlock(&desc->lock);
 }
@@ -510,19 +507,19 @@ handle_irq_name(void fastcall (*handle)(unsigned int, struct irq_desc *,
 					struct pt_regs *))
 {
 	if (handle == handle_level_irq)
-		return "level ";
-	if (handle == handle_fastack_irq)
-		return "level ";
+		return "level  ";
+	if (handle == handle_fasteoi_irq)
+		return "fasteoi";
 	if (handle == handle_edge_irq)
-		return "edge  ";
+		return "edge   ";
 	if (handle == handle_simple_irq)
-		return "simple";
+		return "simple ";
 #ifdef CONFIG_SMP
 	if (handle == handle_percpu_irq)
-		return "percpu";
+		return "percpu ";
 #endif
 	if (handle == handle_bad_irq)
-		return "bad   ";
+		return "bad    ";
 
 	return NULL;
 }
-- 
cgit v1.2.3