From ee05e0b0a30a5e1965d1baeadaa3b534ad7a1686 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Mon, 7 Mar 2005 17:23:53 -0800 Subject: [PATCH] remove drivers/char/tpqic02.c Since at about half a year, this driver was no longer selectable via Kconfig. Since it seems noone missed this driver, therefore this patch removes it. Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/major.h | 1 - include/linux/mtio.h | 28 -- include/linux/tpqic02.h | 738 ------------------------------------------------ 3 files changed, 767 deletions(-) delete mode 100644 include/linux/tpqic02.h (limited to 'include/linux') diff --git a/include/linux/major.h b/include/linux/major.h index 8585730af484..4b62c42b842c 100644 --- a/include/linux/major.h +++ b/include/linux/major.h @@ -25,7 +25,6 @@ #define MISC_MAJOR 10 #define SCSI_CDROM_MAJOR 11 #define MUX_MAJOR 11 /* PA-RISC only */ -#define QIC02_TAPE_MAJOR 12 #define XT_DISK_MAJOR 13 #define INPUT_MAJOR 13 #define SOUND_MAJOR 14 diff --git a/include/linux/mtio.h b/include/linux/mtio.h index 4f2daa83d3ad..8c66151821e3 100644 --- a/include/linux/mtio.h +++ b/include/linux/mtio.h @@ -150,34 +150,6 @@ struct mtpos { }; -/* structure for MTIOCGETCONFIG/MTIOCSETCONFIG primarily intended - * as an interim solution for QIC-02 until DDI is fully implemented. 
- */ -struct mtconfiginfo { - long mt_type; /* drive type */ - long ifc_type; /* interface card type */ - unsigned short irqnr; /* IRQ number to use */ - unsigned short dmanr; /* DMA channel to use */ - unsigned short port; /* IO port base address */ - - unsigned long debug; /* debugging flags */ - - unsigned have_dens:1; - unsigned have_bsf:1; - unsigned have_fsr:1; - unsigned have_bsr:1; - unsigned have_eod:1; - unsigned have_seek:1; - unsigned have_tell:1; - unsigned have_ras1:1; - unsigned have_ras2:1; - unsigned have_ras3:1; - unsigned have_qfa:1; - - unsigned pad1:5; - char reserved[10]; -}; - /* structure for MTIOCVOLINFO, query information about the volume * currently positioned at (zftape) */ diff --git a/include/linux/tpqic02.h b/include/linux/tpqic02.h deleted file mode 100644 index f0dfcfa56273..000000000000 --- a/include/linux/tpqic02.h +++ /dev/null @@ -1,738 +0,0 @@ -/* $Id: tpqic02.h,v 1.5 1996/12/14 23:01:38 root Exp root $ - * - * Include file for QIC-02 driver for Linux. - * - * Copyright (c) 1992--1995 by H. H. Bergman. All rights reserved. - * - * ******* USER CONFIG SECTION BELOW (Near line 70) ******* - */ - -#ifndef _LINUX_TPQIC02_H -#define _LINUX_TPQIC02_H - -#include - -#if defined(CONFIG_QIC02_TAPE) || defined(CONFIG_QIC02_TAPE_MODULE) - -/* need to have QIC02_TAPE_DRIVE and QIC02_TAPE_IFC expand to something */ -#include - - -/* Make QIC02_TAPE_IFC expand to something. - * - * The only difference between WANGTEK and EVEREX is in the - * handling of the DMA channel 3. - * Note that the driver maps EVEREX to WANGTEK internally for speed - * reasons. Externally WANGTEK==1, EVEREX==2, ARCHIVE==3. - * These must correspond to the values used in qic02config(1). - * - * Support for Mountain controllers was added by Erik Jacobson - * and severely hacked by me. -- hhb - * - * Support for Emerald controllers by Alan Bain - * with more hacks by me. 
-- hhb - */ -#define WANGTEK 1 /* don't know about Wangtek QIC-36 */ -#define EVEREX (WANGTEK+1) /* I heard *some* of these are identical */ -#define EVEREX_811V EVEREX /* With TEAC MT 2ST 45D */ -#define EVEREX_831V EVEREX -#define ARCHIVE 3 -#define ARCHIVE_SC400 ARCHIVE /* rumoured to be from the pre-SMD-age */ -#define ARCHIVE_SC402 ARCHIVE /* don't know much about SC400 */ -#define ARCHIVE_SC499 ARCHIVE /* SC402 and SC499R should be identical */ - -#define MOUNTAIN 5 /* Mountain Computer Interface */ -#define EMERALD 6 /* Emerald Interface card */ - - - -#define QIC02_TAPE_PORT_RANGE 8 /* number of IO locations to reserve */ - - -/*********** START OF USER CONFIGURABLE SECTION ************/ - -/* Tape configuration: Select DRIVE, IFC, PORT, IRQ and DMA below. - * Runtime (re)configuration is not supported yet. - * - * Tape drive configuration: (MT_IS* constants are defined in mtio.h) - * - * QIC02_TAPE_DRIVE = MT_ISWT5150 - * - Wangtek 5150, format: up to QIC-150. - * QIC02_TAPE_DRIVE = MT_ISQIC02_ALL_FEATURES - * - Enables some optional QIC02 commands that some drives may lack. - * It is provided so you can check which are supported by your drive. - * Refer to tpqic02.h for others. - * - * Supported interface cards: QIC02_TAPE_IFC = - * WANGTEK, - * ARCHIVE_SC402, ARCHIVE_SC499. (both same programming interface) - * - * Make sure you have the I/O ports/DMA channels - * and IRQ stuff configured properly! - * NOTE: There may be other device drivers using the same major - * number. This must be avoided. Check for timer.h conflicts too. - * - * If you have an EVEREX EV-831 card and you are using DMA channel 3, - * you will probably have to ``#define QIC02_TAPE_DMA3_FIX'' below. 
- */ - -/* CONFIG_QIC02_DYNCONF can be defined in autoconf.h, by `make config' */ - -/*** #undef CONFIG_QIC02_DYNCONF ***/ - -#ifndef CONFIG_QIC02_DYNCONF - -#define QIC02_TAPE_DRIVE MT_ISQIC02_ALL_FEATURES /* drive type */ -/* #define QIC02_TAPE_DRIVE MT_ISWT5150 */ -/* #define QIC02_TAPE_DRIVE MT_ISARCHIVE_5945L2 */ -/* #define QIC02_TAPE_DRIVE MT_ISTEAC_MT2ST */ -/* #define QIC02_TAPE_DRIVE MT_ISARCHIVE_2150L */ -/* #define QIC02_TAPE_DRIVE MT_ISARCHIVESC499 */ - -/* Either WANGTEK, ARCHIVE or MOUNTAIN. Not EVEREX. - * If you have an EVEREX, use WANGTEK and try the DMA3_FIX below. - */ -#define QIC02_TAPE_IFC WANGTEK /* interface card type */ -/* #define QIC02_TAPE_IFC ARCHIVE */ -/* #define QIC02_TAPE_IFC MOUNTAIN */ - -#define QIC02_TAPE_PORT 0x300 /* controller port address */ -#define QIC02_TAPE_IRQ 5 /* For IRQ2, use 9 here, others normal. */ -#define QIC02_TAPE_DMA 1 /* either 1 or 3, because 2 is used by the floppy */ - -/* If DMA3 doesn't work, but DMA1 does, and you have a - * Wangtek/Everex card, you can try #define-ing the flag - * below. Note that you should also change the DACK jumper - * for Wangtek/Everex cards when changing the DMA channel. - */ -#undef QIC02_TAPE_DMA3_FIX - -/************ END OF USER CONFIGURABLE SECTION *************/ - -/* I put the stuff above in config.in, but a few recompiles, to - * verify different configurations, and several days later I decided - * to change it back again. - */ - - - -/* NOTE: TP_HAVE_DENS should distinguish between available densities (?) - * NOTE: Drive select is not implemented -- I have only one tape streamer, - * so I'm unable and unmotivated to test and implement that. 
;-) ;-) - */ -#if QIC02_TAPE_DRIVE == MT_ISWT5150 -#define TP_HAVE_DENS 1 -#define TP_HAVE_BSF 0 /* nope */ -#define TP_HAVE_FSR 0 /* nope */ -#define TP_HAVE_BSR 0 /* nope */ -#define TP_HAVE_EOD 0 /* most of the time */ -#define TP_HAVE_SEEK 0 -#define TP_HAVE_TELL 0 -#define TP_HAVE_RAS1 1 -#define TP_HAVE_RAS2 1 - -#elif QIC02_TAPE_DRIVE == MT_ISARCHIVESC499 /* Archive SC-499 QIC-36 controller */ -#define TP_HAVE_DENS 1 /* can do set density (QIC-11 / QIC-24) */ -#define TP_HAVE_BSF 0 -#define TP_HAVE_FSR 1 /* can skip one block forwards */ -#define TP_HAVE_BSR 1 /* can skip one block backwards */ -#define TP_HAVE_EOD 1 /* can seek to end of recorded data */ -#define TP_HAVE_SEEK 0 -#define TP_HAVE_TELL 0 -#define TP_HAVE_RAS1 1 /* can run selftest 1 */ -#define TP_HAVE_RAS2 1 /* can run selftest 2 */ -/* These last two selftests shouldn't be used yet! */ - -#elif (QIC02_TAPE_DRIVE == MT_ISARCHIVE_2060L) || (QIC02_TAPE_DRIVE == MT_ISARCHIVE_2150L) -#define TP_HAVE_DENS 1 /* can do set density (QIC-24 / QIC-120 / QIC-150) */ -#define TP_HAVE_BSF 0 -#define TP_HAVE_FSR 1 /* can skip one block forwards */ -#define TP_HAVE_BSR 1 /* can skip one block backwards */ -#define TP_HAVE_EOD 1 /* can seek to end of recorded data */ -#define TP_HAVE_TELL 1 /* can read current block address */ -#define TP_HAVE_SEEK 1 /* can seek to block */ -#define TP_HAVE_RAS1 1 /* can run selftest 1 */ -#define TP_HAVE_RAS2 1 /* can run selftest 2 */ -/* These last two selftests shouldn't be used yet! */ - -#elif QIC02_TAPE_DRIVE == MT_ISARCHIVE_5945L2 -/* can anyone verify this entry?? */ -#define TP_HAVE_DENS 1 /* can do set density?? (QIC-24??) 
*/ -#define TP_HAVE_BSF 0 -#define TP_HAVE_FSR 1 /* can skip one block forwards */ -#define TP_HAVE_BSR 1 /* can skip one block backwards */ -#define TP_HAVE_EOD 1 /* can seek to end of recorded data */ -#define TP_HAVE_TELL 1 /* can read current block address */ -#define TP_HAVE_SEEK 1 /* can seek to block */ -#define TP_HAVE_RAS1 1 /* can run selftest 1 */ -#define TP_HAVE_RAS2 1 /* can run selftest 2 */ -/* These last two selftests shouldn't be used yet! */ - -#elif QIC02_TAPE_DRIVE == MT_ISTEAC_MT2ST -/* can anyone verify this entry?? */ -#define TP_HAVE_DENS 0 /* cannot do set density?? (QIC-150?) */ -#define TP_HAVE_BSF 0 -#define TP_HAVE_FSR 1 /* can skip one block forwards */ -#define TP_HAVE_BSR 1 /* can skip one block backwards */ -#define TP_HAVE_EOD 1 /* can seek to end of recorded data */ -#define TP_HAVE_SEEK 1 /* can seek to block */ -#define TP_HAVE_TELL 1 /* can read current block address */ -#define TP_HAVE_RAS1 1 /* can run selftest 1 */ -#define TP_HAVE_RAS2 1 /* can run selftest 2 */ -/* These last two selftests shouldn't be used yet! */ - -#elif QIC02_TAPE_DRIVE == MT_ISQIC02_ALL_FEATURES -#define TP_HAVE_DENS 1 /* can do set density */ -#define TP_HAVE_BSF 1 /* can search filemark backwards */ -#define TP_HAVE_FSR 1 /* can skip one block forwards */ -#define TP_HAVE_BSR 1 /* can skip one block backwards */ -#define TP_HAVE_EOD 1 /* can seek to end of recorded data */ -#define TP_HAVE_SEEK 1 /* seek to block address */ -#define TP_HAVE_TELL 1 /* tell current block address */ -#define TP_HAVE_RAS1 1 /* can run selftest 1 */ -#define TP_HAVE_RAS2 1 /* can run selftest 2 */ -/* These last two selftests shouldn't be used yet! */ - - -#else -#error No QIC-02 tape drive type defined! -/* If your drive is not listed above, first try the 'ALL_FEATURES', - * to see what commands are supported, then create your own entry in - * the list above. You may want to mail it to me, so that I can include - * it in the next release. 
- */ -#endif - -#endif /* !CONFIG_QIC02_DYNCONF */ - - -/* WANGTEK interface card specifics */ -#define WT_QIC02_STAT_PORT (QIC02_TAPE_PORT) -#define WT_QIC02_CTL_PORT (QIC02_TAPE_PORT) -#define WT_QIC02_CMD_PORT (QIC02_TAPE_PORT+1) -#define WT_QIC02_DATA_PORT (QIC02_TAPE_PORT+1) - -/* status register bits (Active LOW!) */ -#define WT_QIC02_STAT_POLARITY 0 -#define WT_QIC02_STAT_READY 0x01 -#define WT_QIC02_STAT_EXCEPTION 0x02 -#define WT_QIC02_STAT_MASK (WT_QIC02_STAT_READY|WT_QIC02_STAT_EXCEPTION) - -#define WT_QIC02_STAT_RESETMASK 0x07 -#define WT_QIC02_STAT_RESETVAL (WT_QIC02_STAT_RESETMASK & ~WT_QIC02_STAT_EXCEPTION) - -/* controller register (QIC02_CTL_PORT) bits */ -#define WT_QIC02_CTL_RESET 0x02 -#define WT_QIC02_CTL_REQUEST 0x04 -#define WT_CTL_ONLINE 0x01 -#define WT_CTL_CMDOFF 0xC0 - -#define WT_CTL_DMA3 0x10 /* enable dma chan3 */ -#define WT_CTL_DMA1 0x08 /* enable dma chan1 or chan2 */ - -/* EMERALD interface card specifics - * Much like Wangtek, only different polarity and bit locations - */ -#define EMR_QIC02_STAT_PORT (QIC02_TAPE_PORT) -#define EMR_QIC02_CTL_PORT (QIC02_TAPE_PORT) -#define EMR_QIC02_CMD_PORT (QIC02_TAPE_PORT+1) -#define EMR_QIC02_DATA_PORT (QIC02_TAPE_PORT+1) - -/* status register bits (Active High!) 
*/ -#define EMR_QIC02_STAT_POLARITY 1 -#define EMR_QIC02_STAT_READY 0x01 -#define EMR_QIC02_STAT_EXCEPTION 0x02 -#define EMR_QIC02_STAT_MASK (EMR_QIC02_STAT_READY|EMR_QIC02_STAT_EXCEPTION) - -#define EMR_QIC02_STAT_RESETMASK 0x07 -#define EMR_QIC02_STAT_RESETVAL (EMR_QIC02_STAT_RESETMASK & ~EMR_QIC02_STAT_EXCEPTION) - -/* controller register (QIC02_CTL_PORT) bits */ -#define EMR_QIC02_CTL_RESET 0x02 -#define EMR_QIC02_CTL_REQUEST 0x04 -#define EMR_CTL_ONLINE 0x01 -#define EMR_CTL_CMDOFF 0xC0 - -#define EMR_CTL_DMA3 0x10 /* enable dma chan3 */ -#define EMR_CTL_DMA1 0x08 /* enable dma chan1 or chan2 */ - - - -/* ARCHIVE interface card specifics */ -#define AR_QIC02_STAT_PORT (QIC02_TAPE_PORT+1) -#define AR_QIC02_CTL_PORT (QIC02_TAPE_PORT+1) -#define AR_QIC02_CMD_PORT (QIC02_TAPE_PORT) -#define AR_QIC02_DATA_PORT (QIC02_TAPE_PORT) - -#define AR_START_DMA_PORT (QIC02_TAPE_PORT+2) -#define AR_RESET_DMA_PORT (QIC02_TAPE_PORT+3) - -/* STAT port bits */ -#define AR_QIC02_STAT_POLARITY 0 -#define AR_STAT_IRQF 0x80 /* active high, interrupt request flag */ -#define AR_QIC02_STAT_READY 0x40 /* active low */ -#define AR_QIC02_STAT_EXCEPTION 0x20 /* active low */ -#define AR_QIC02_STAT_MASK (AR_QIC02_STAT_READY|AR_QIC02_STAT_EXCEPTION) -#define AR_STAT_DMADONE 0x10 /* active high, DMA done */ -#define AR_STAT_DIRC 0x08 /* active high, direction */ - -#define AR_QIC02_STAT_RESETMASK 0x70 /* check RDY,EXC,DMADONE */ -#define AR_QIC02_STAT_RESETVAL ((AR_QIC02_STAT_RESETMASK & ~AR_STAT_IRQF & ~AR_QIC02_STAT_EXCEPTION) | AR_STAT_DMADONE) - -/* CTL port bits */ -#define AR_QIC02_CTL_RESET 0x80 /* drive reset */ -#define AR_QIC02_CTL_REQUEST 0x40 /* notify of new command */ -#define AR_CTL_IEN 0x20 /* interrupt enable */ -#define AR_CTL_DNIEN 0x10 /* done-interrupt enable */ - /* Note: All of these bits are cleared automatically when writing to - * AR_RESET_DMA_PORT. So AR_CTL_IEN and AR_CTL_DNIEN must be - * reprogrammed before the write to AR_START_DMA_PORT. 
- */ - - -/* MOUNTAIN interface specifics */ -#define MTN_QIC02_STAT_PORT (QIC02_TAPE_PORT+1) -#define MTN_QIC02_CTL_PORT (QIC02_TAPE_PORT+1) -#define MTN_QIC02_CMD_PORT (QIC02_TAPE_PORT) -#define MTN_QIC02_DATA_PORT (QIC02_TAPE_PORT) - -#define MTN_W_SELECT_DMA_PORT (QIC02_TAPE_PORT+2) -#define MTN_R_DESELECT_DMA_PORT (QIC02_TAPE_PORT+2) -#define MTN_W_DMA_WRITE_PORT (QIC02_TAPE_PORT+3) - -/* STAT port bits */ -#define MTN_QIC02_STAT_POLARITY 0 -#define MTN_QIC02_STAT_READY 0x02 /* active low */ -#define MTN_QIC02_STAT_EXCEPTION 0x04 /* active low */ -#define MTN_QIC02_STAT_MASK (MTN_QIC02_STAT_READY|MTN_QIC02_STAT_EXCEPTION) -#define MTN_STAT_DMADONE 0x01 /* active high, DMA done */ - -#define MTN_QIC02_STAT_RESETMASK 0x07 /* check RDY,EXC,DMADONE */ -#define MTN_QIC02_STAT_RESETVAL ((MTN_QIC02_STAT_RESETMASK & ~MTN_QIC02_STAT_EXCEPTION) | MTN_STAT_DMADONE) - -/* CTL port bits */ -#define MTN_QIC02_CTL_RESET_NOT 0x80 /* drive reset, active low */ -#define MTN_QIC02_CTL_RESET 0x80 /* Fodder #definition to keep gcc happy */ - -#define MTN_QIC02_CTL_ONLINE 0x40 /* Put drive on line */ -#define MTN_QIC02_CTL_REQUEST 0x20 /* notify of new command */ -#define MTN_QIC02_CTL_IRQ_DRIVER 0x10 /* Enable IRQ tristate driver */ -#define MTN_QIC02_CTL_DMA_DRIVER 0x08 /* Enable DMA tristate driver */ -#define MTN_CTL_EXC_IEN 0x04 /* Exception interrupt enable */ -#define MTN_CTL_RDY_IEN 0x02 /* Ready interrupt enable */ -#define MTN_CTL_DNIEN 0x01 /* done-interrupt enable */ - -#define MTN_CTL_ONLINE (MTN_QIC02_CTL_RESET_NOT | MTN_QIC02_CTL_IRQ_DRIVER | MTN_QIC02_CTL_DMA_DRIVER) - - -#ifndef CONFIG_QIC02_DYNCONF - -# define QIC02_TAPE_DEBUG (qic02_tape_debug) - -# if QIC02_TAPE_IFC == WANGTEK -# define QIC02_STAT_POLARITY WT_QIC02_STAT_POLARITY -# define QIC02_STAT_PORT WT_QIC02_STAT_PORT -# define QIC02_CTL_PORT WT_QIC02_CTL_PORT -# define QIC02_CMD_PORT WT_QIC02_CMD_PORT -# define QIC02_DATA_PORT WT_QIC02_DATA_PORT - -# define QIC02_STAT_READY WT_QIC02_STAT_READY -# define 
QIC02_STAT_EXCEPTION WT_QIC02_STAT_EXCEPTION -# define QIC02_STAT_MASK WT_QIC02_STAT_MASK -# define QIC02_STAT_RESETMASK WT_QIC02_STAT_RESETMASK -# define QIC02_STAT_RESETVAL WT_QIC02_STAT_RESETVAL - -# define QIC02_CTL_RESET WT_QIC02_CTL_RESET -# define QIC02_CTL_REQUEST WT_QIC02_CTL_REQUEST - -# if QIC02_TAPE_DMA == 3 -# ifdef QIC02_TAPE_DMA3_FIX -# define WT_CTL_DMA WT_CTL_DMA1 -# else -# define WT_CTL_DMA WT_CTL_DMA3 -# endif -# elif QIC02_TAPE_DMA == 1 -# define WT_CTL_DMA WT_CTL_DMA1 -# else -# error Unsupported or incorrect DMA configuration. -# endif - -# elif QIC02_TAPE_IFC == EMERALD -# define QIC02_STAT_POLARITY EMR_QIC02_STAT_POLARITY -# define QIC02_STAT_PORT EMR_QIC02_STAT_PORT -# define QIC02_CTL_PORT EMR_QIC02_CTL_PORT -# define QIC02_CMD_PORT EMR_QIC02_CMD_PORT -# define QIC02_DATA_PORT EMR_QIC02_DATA_PORT - -# define QIC02_STAT_READY EMR_QIC02_STAT_READY -# define QIC02_STAT_EXCEPTION EMR_QIC02_STAT_EXCEPTION -# define QIC02_STAT_MASK EMR_QIC02_STAT_MASK -# define QIC02_STAT_RESETMASK EMR_QIC02_STAT_RESETMASK -# define QIC02_STAT_RESETVAL EMR_QIC02_STAT_RESETVAL - -# define QIC02_CTL_RESET EMR_QIC02_CTL_RESET -# define QIC02_CTL_REQUEST EMR_QIC02_CTL_REQUEST - -# if QIC02_TAPE_DMA == 3 -# ifdef QIC02_TAPE_DMA3_FIX -# define EMR_CTL_DMA EMR_CTL_DMA1 -# else -# define EMR_CTL_DMA EMR_CTL_DMA3 -# endif -# elif QIC02_TAPE_DMA == 1 -# define EMR_CTL_DMA EMR_CTL_DMA1 -# else -# error Unsupported or incorrect DMA configuration. 
-# endif - -# elif QIC02_TAPE_IFC == ARCHIVE -# define QIC02_STAT_POLARITY AR_QIC02_STAT_POLARITY -# define QIC02_STAT_PORT AR_QIC02_STAT_PORT -# define QIC02_CTL_PORT AR_QIC02_CTL_PORT -# define QIC02_CMD_PORT AR_QIC02_CMD_PORT -# define QIC02_DATA_PORT AR_QIC02_DATA_PORT - -# define QIC02_STAT_READY AR_QIC02_STAT_READY -# define QIC02_STAT_EXCEPTION AR_QIC02_STAT_EXCEPTION -# define QIC02_STAT_MASK AR_QIC02_STAT_MASK -# define QIC02_STAT_RESETMASK AR_QIC02_STAT_RESETMASK -# define QIC02_STAT_RESETVAL AR_QIC02_STAT_RESETVAL - -# define QIC02_CTL_RESET AR_QIC02_CTL_RESET -# define QIC02_CTL_REQUEST AR_QIC02_CTL_REQUEST - -# if QIC02_TAPE_DMA > 3 /* channel 2 is used by the floppy driver */ -# error DMA channels other than 1 and 3 are not supported. -# endif - -# elif QIC02_TAPE_IFC == MOUNTAIN -# define QIC02_STAT_POLARITY MTN_QIC02_STAT_POLARITY -# define QIC02_STAT_PORT MTN_QIC02_STAT_PORT -# define QIC02_CTL_PORT MTN_QIC02_CTL_PORT -# define QIC02_CMD_PORT MTN_QIC02_CMD_PORT -# define QIC02_DATA_PORT MTN_QIC02_DATA_PORT - -# define QIC02_STAT_READY MTN_QIC02_STAT_READY -# define QIC02_STAT_EXCEPTION MTN_QIC02_STAT_EXCEPTION -# define QIC02_STAT_MASK MTN_QIC02_STAT_MASK -# define QIC02_STAT_RESETMASK MTN_QIC02_STAT_RESETMASK -# define QIC02_STAT_RESETVAL MTN_QIC02_STAT_RESETVAL - -# define QIC02_CTL_RESET MTN_QIC02_CTL_RESET -# define QIC02_CTL_REQUEST MTN_QIC02_CTL_REQUEST - -# if QIC02_TAPE_DMA > 3 /* channel 2 is used by the floppy driver */ -# error DMA channels other than 1 and 3 are not supported. -# endif - -# else -# error No valid interface card specified! -# endif /* QIC02_TAPE_IFC */ - - - /* An ugly hack to make sure WT_CTL_DMA is defined even for the - * static, non-Wangtek case. The alternative was even worse. - */ -# ifndef WT_CTL_DMA -# define WT_CTL_DMA WT_CTL_DMA1 -# endif - -/*******************/ - -#else /* !CONFIG_QIC02_DYNCONF */ - -/* Now the runtime config version, using variables instead of constants. 
- * - * qic02_tape_dynconf is R/O for the kernel, set from userspace. - * qic02_tape_ccb is private to the driver, R/W. - */ - -# define QIC02_TAPE_DRIVE (qic02_tape_dynconf.mt_type) -# define QIC02_TAPE_IFC (qic02_tape_ccb.ifc_type) -# define QIC02_TAPE_IRQ (qic02_tape_dynconf.irqnr) -# define QIC02_TAPE_DMA (qic02_tape_dynconf.dmanr) -# define QIC02_TAPE_PORT (qic02_tape_dynconf.port) -# define WT_CTL_DMA (qic02_tape_ccb.dma_enable_value) -# define QIC02_TAPE_DEBUG (qic02_tape_dynconf.debug) - -# define QIC02_STAT_PORT (qic02_tape_ccb.port_stat) -# define QIC02_CTL_PORT (qic02_tape_ccb.port_ctl) -# define QIC02_CMD_PORT (qic02_tape_ccb.port_cmd) -# define QIC02_DATA_PORT (qic02_tape_ccb.port_data) - -# define QIC02_STAT_POLARITY (qic02_tape_ccb.stat_polarity) -# define QIC02_STAT_READY (qic02_tape_ccb.stat_ready) -# define QIC02_STAT_EXCEPTION (qic02_tape_ccb.stat_exception) -# define QIC02_STAT_MASK (qic02_tape_ccb.stat_mask) - -# define QIC02_STAT_RESETMASK (qic02_tape_ccb.stat_resetmask) -# define QIC02_STAT_RESETVAL (qic02_tape_ccb.stat_resetval) - -# define QIC02_CTL_RESET (qic02_tape_ccb.ctl_reset) -# define QIC02_CTL_REQUEST (qic02_tape_ccb.ctl_request) - -# define TP_HAVE_DENS (qic02_tape_dynconf.have_dens) -# define TP_HAVE_BSF (qic02_tape_dynconf.have_bsf) -# define TP_HAVE_FSR (qic02_tape_dynconf.have_fsr) -# define TP_HAVE_BSR (qic02_tape_dynconf.have_bsr) -# define TP_HAVE_EOD (qic02_tape_dynconf.have_eod) -# define TP_HAVE_SEEK (qic02_tape_dynconf.have_seek) -# define TP_HAVE_TELL (qic02_tape_dynconf.have_tell) -# define TP_HAVE_RAS1 (qic02_tape_dynconf.have_ras1) -# define TP_HAVE_RAS2 (qic02_tape_dynconf.have_ras2) - -#endif /* CONFIG_QIC02_DYNCONF */ - - -/* "Vendor Unique" codes */ -/* Archive seek & tell stuff */ -#define AR_QCMDV_TELL_BLK 0xAE /* read current block address */ -#define AR_QCMDV_SEEK_BLK 0xAD /* seek to specific block */ -#define AR_SEEK_BUF_SIZE 3 /* address is 3 bytes */ - - - -/* - * Misc common stuff - */ - -/* Standard 
QIC-02 commands -- rev F. All QIC-02 drives must support these */ -#define QCMD_SEL_1 0x01 /* select drive 1 */ -#define QCMD_SEL_2 0x02 /* select drive 2 */ -#define QCMD_SEL_3 0x04 /* select drive 3 */ -#define QCMD_SEL_4 0x08 /* select drive 4 */ -#define QCMD_REWIND 0x21 /* rewind tape */ -#define QCMD_ERASE 0x22 /* erase tape */ -#define QCMD_RETEN 0x24 /* retension tape */ -#define QCMD_WRT_DATA 0x40 /* write data */ -#define QCMD_WRT_FM 0x60 /* write file mark */ -#define QCMD_RD_DATA 0x80 /* read data */ -#define QCMD_RD_FM 0xA0 /* read file mark (forward direction) */ -#define QCMD_RD_STAT 0xC0 /* read status */ - -/* Other (optional/vendor unique) commands */ - /* Density commands are only valid when TP_BOM is set! */ -#define QCMD_DENS_11 0x26 /* QIC-11 */ -#define QCMD_DENS_24 0x27 /* QIC-24: 9 track 60MB */ -#define QCMD_DENS_120 0x28 /* QIC-120: 15 track 120MB */ -#define QCMD_DENS_150 0x29 /* QIC-150: 18 track 150MB */ -#define QCMD_DENS_300 0x2A /* QIC-300/QIC-2100 */ -#define QCMD_DENS_600 0x2B /* QIC-600/QIC-2200 */ -/* don't know about QIC-1000 and QIC-1350 */ - -#define QCMD_WRTNU_DATA 0x40 /* write data, no underruns, insert filler. */ -#define QCMD_SPACE_FWD 0x81 /* skip next block */ -#define QCMD_SPACE_BCK 0x89 /* move tape head one block back -- very useful! */ -#define QCMD_RD_FM_BCK 0xA8 /* read filemark (backwards) */ -#define QCMD_SEEK_EOD 0xA3 /* skip to EOD */ -#define QCMD_RD_STAT_X1 0xC1 /* read extended status 1 */ -#define QCMD_RD_STAT_X2 0xC4 /* read extended status 2 */ -#define QCMD_RD_STAT_X3 0xE0 /* read extended status 3 */ -#define QCMD_SELF_TST1 0xC2 /* run self test 1 (nondestructive) */ -#define QCMD_SELF_TST2 0xCA /* run self test 2 (destructive) */ - - - -/* Optional, QFA (Quick File Access) commands. - * Not all drives support this, but those that do could use these commands - * to implement semi-non-sequential access. `mt fsf` would benefit from this. 
- * QFA divides the tape into 2 partitions, a data and a directory partition, - * causing some incompatibility problems wrt std QIC-02 data exchange. - * It would be useful to cache the directory info, but that might be tricky - * to do in kernel-space. [Size constraints.] - * Refer to the QIC-02 specs, appendix A for more information. - * I have no idea how other *nix variants implement QFA. - * I have no idea which drives support QFA and which don't. - */ -#define QFA_ENABLE 0x2D /* enter QFA mode, give @ BOT only */ -#define QFA_DATA 0x20 /* select data partition */ -#define QFA_DIR 0x23 /* select directory partition */ -#define QFA_RD_POS 0xCF /* read position+status bytes */ -#define QFA_SEEK_EOD 0xA1 /* seek EOD within current partition */ -#define QFA_SEEK_BLK 0xAF /* seek to a block within current partition */ - - - - -/* - * Debugging flags - */ -#define TPQD_SENSE_TEXT 0x0001 -#define TPQD_SENSE_CNTS 0x0002 -#define TPQD_REWIND 0x0004 -#define TPQD_TERM_CYCLE 0x0008 -#define TPQD_IOCTLS 0x0010 -#define TPQD_DMAX 0x0020 -#define TPQD_BLKSZ 0x0040 -#define TPQD_MISC 0x0080 - -#define TPQD_DEBUG 0x0100 - -#define TPQD_DIAGS 0x1000 - -#define TPQD_ALWAYS 0x8000 - -#define TPQD_DEFAULT_FLAGS 0x00fc - - -#define TPQDBG(f) ((QIC02_TAPE_DEBUG) & (TPQD_##f)) - - -/* Minor device codes for tapes: - * |7|6|5|4|3|2|1|0| - * | \ | / \ | / |_____ 1=rewind on close, 0=no rewind on close - * | \|/ |_________ Density: 000=none, 001=QIC-11, 010=24, 011=120, - * | | 100=QIC-150, 101..111 reserved. - * | |_______________ Reserved for unit numbers. - * |___________________ Reserved for diagnostics during debugging. 
- */ - -#define TP_REWCLOSE(d) ((d)&1) /* rewind bit */ - /* rewind is only done if data has been transferred */ -#define TP_DENS(d) (((d) >> 1) & 0x07) /* tape density */ -#define TP_UNIT(d) (((d) >> 4) & 0x07) /* unit number */ - -/* print excessive diagnostics */ -#define TP_DIAGS(dev) (QIC02_TAPE_DEBUG & TPQD_DIAGS) - -/* status codes returned by a WTS_RDSTAT call */ -struct tpstatus { /* sizeof(short)==2), LSB first */ - unsigned short exs; /* Drive exception flags */ - unsigned short dec; /* data error count: nr of blocks rewritten/soft read errors */ - unsigned short urc; /* underrun count: nr of times streaming was interrupted */ -}; -#define TPSTATSIZE sizeof(struct tpstatus) - - -/* defines for tpstatus.exs -- taken from 386BSD wt driver */ -#define TP_POR 0x100 /* Power on or reset occurred */ -#define TP_EOR 0x200 /* REServed for end of RECORDED media */ -#define TP_PAR 0x400 /* REServed for bus parity */ -#define TP_BOM 0x800 /* Beginning of media */ -#define TP_MBD 0x1000 /* Marginal block detected */ -#define TP_NDT 0x2000 /* No data detected */ -#define TP_ILL 0x4000 /* Illegal command */ -#define TP_ST1 0x8000 /* Status byte 1 flag */ -#define TP_FIL 0x01 /* File mark detected */ -#define TP_BNL 0x02 /* Bad block not located */ -#define TP_UDA 0x04 /* Unrecoverable data error */ -#define TP_EOM 0x08 /* End of media */ -#define TP_WRP 0x10 /* Write protected cartridge */ -#define TP_USL 0x20 /* Unselected drive */ -#define TP_CNI 0x40 /* Cartridge not in place */ -#define TP_ST0 0x80 /* Status byte 0 flag */ - -#define REPORT_ERR0 (TP_CNI|TP_USL|TP_WRP|TP_EOM|TP_UDA|TP_BNL|TP_FIL) -#define REPORT_ERR1 (TP_ILL|TP_NDT|TP_MBD|TP_PAR) - - -/* exception numbers */ -#define EXC_UNKNOWN 0 /* (extra) Unknown exception code */ -#define EXC_NDRV 1 /* No drive */ -#define EXC_NCART 2 /* No cartridge */ -#define EXC_WP 3 /* Write protected */ -#define EXC_EOM 4 /* EOM */ -#define EXC_RWA 5 /* read/write abort */ -#define EXC_XBAD 6 /* read error, bad block 
transferred */ -#define EXC_XFILLER 7 /* read error, filler block transferred */ -#define EXC_NDT 8 /* read error, no data */ -#define EXC_NDTEOM 9 /* read error, no data & EOM */ -#define EXC_NDTBOM 10 /* read error, no data & BOM */ -#define EXC_FM 11 /* Read a filemark */ -#define EXC_ILL 12 /* Illegal command */ -#define EXC_POR 13 /* Power on/reset */ -#define EXC_MARGINAL 14 /* Marginal block detected */ -#define EXC_EOR 15 /* (extra, for SEEKEOD) End Of Recorded data reached */ -#define EXC_BOM 16 /* (extra) BOM reached */ - - -#define TAPE_NOTIFY_TIMEOUT 1000000 - -/* internal function return codes */ -#define TE_OK 0 /* everything is fine */ -#define TE_EX 1 /* exception detected */ -#define TE_ERR 2 /* some error */ -#define TE_NS 3 /* can't read status */ -#define TE_TIM 4 /* timed out */ -#define TE_DEAD 5 /* tape drive doesn't respond */ -#define TE_END 6 /******** Archive hack *****/ - -/* timeout timer values -- check these! */ -#define TIM_S (4*HZ) /* 4 seconds (normal cmds) */ -#define TIM_M (30*HZ) /* 30 seconds (write FM) */ -#define TIM_R (8*60*HZ) /* 8 minutes (retensioning) */ -#define TIM_F (2*3600*HZ) /* est. 1.2hr for full tape read/write+2 retens */ - -#define TIMERON(t) mod_timer(&tp_timer, jiffies + (t)) -#define TIMEROFF del_timer_sync(&tp_timer); -#define TIMERCONT add_timer(&tp_timer); - - -typedef char flag; -#define NO 0 /* NO must be 0 */ -#define YES 1 /* YES must be != 0 */ - - -#ifdef TDEBUG -# define TPQDEB(s) s -# define TPQPUTS(s) tpqputs(s) -#else -# define TPQDEB(s) -# define TPQPUTS(s) -#endif - - -/* NR_BLK_BUF is a `tuneable parameter'. If you're really low on - * kernel space, you could decrease it to 1, or if you got a very - * slow machine, you could increase it up to 127 blocks. Less kernel - * buffer blocks result in more context-switching. 
- */ -#define NR_BLK_BUF 20 /* max 127 blocks */ -#define TAPE_BLKSIZE 512 /* streamer tape block size (fixed) */ -#define TPQBUF_SIZE (TAPE_BLKSIZE*NR_BLK_BUF) /* buffer size */ - - -#define BLOCKS_BEYOND_EW 2 /* nr of blocks after Early Warning hole */ -#define BOGUS_IRQ 32009 - - -/* This is internal data, filled in based on the ifc_type field given - * by the user. Everex is mapped to Wangtek with a different - * `dma_enable_value', if dmanr==3. - */ -struct qic02_ccb { - long ifc_type; - - unsigned short port_stat; /* Status port address */ - unsigned short port_ctl; /* Control port address */ - unsigned short port_cmd; /* Command port address */ - unsigned short port_data; /* Data port address */ - - /* status register bits */ - unsigned short stat_polarity; /* invert status bits or not */ - unsigned short stat_ready; /* drive ready */ - unsigned short stat_exception; /* drive signals exception */ - unsigned short stat_mask; - unsigned short stat_resetmask; - unsigned short stat_resetval; - - /* control register bits */ - unsigned short ctl_reset; /* reset drive */ - unsigned short ctl_request; /* latch command */ - - /* This is used to change the DMA3 behaviour */ - unsigned short dma_enable_value; -}; - -#if MODULE -static int qic02_tape_init(void); -#else -extern int qic02_tape_init(void); /* for mem.c */ -#endif - - - -#endif /* CONFIG_QIC02_TAPE */ - -#endif /* _LINUX_TPQIC02_H */ - -- cgit v1.2.3 From 26eecbf3543b7a57699ce5b0bed82b84d3b61705 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Mon, 7 Mar 2005 17:25:19 -0800 Subject: [PATCH] vm: pageout throttling With silly pageout testcases it is possible to place huge amounts of memory under I/O. With a large request queue (CFQ uses 8192 requests) it is possible to place _all_ memory under I/O at the same time. This means that all memory is pinned and unreclaimable and the VM gets upset and goes oom. 
The patch limits the amount of memory which is under pageout writeout to be a little more than the amount of memory at which balance_dirty_pages() callers will synchronously throttle. This means that heavy pageout activity can starve heavy writeback activity completely, but heavy writeback activity will not cause starvation of pageout. Because we don't want a simple `dd' to be causing excessive latencies in page reclaim. Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/writeback.h | 1 + mm/page-writeback.c | 22 ++++++++++++++++++++++ mm/vmscan.c | 2 ++ 3 files changed, 25 insertions(+) (limited to 'include/linux') diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 4ab519ad9f55..1262cb43c3ab 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -86,6 +86,7 @@ static inline void wait_on_inode(struct inode *inode) int wakeup_bdflush(long nr_pages); void laptop_io_completion(void); void laptop_sync_completion(void); +void throttle_vm_writeout(void); /* These are exported to sysctl. */ extern int dirty_background_ratio; diff --git a/mm/page-writeback.c b/mm/page-writeback.c index f80248a53a08..c5943e7e810b 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -289,6 +289,28 @@ void balance_dirty_pages_ratelimited(struct address_space *mapping) } EXPORT_SYMBOL(balance_dirty_pages_ratelimited); +void throttle_vm_writeout(void) +{ + struct writeback_state wbs; + long background_thresh; + long dirty_thresh; + + for ( ; ; ) { + get_dirty_limits(&wbs, &background_thresh, &dirty_thresh, NULL); + + /* + * Boost the allowable dirty threshold a bit for page + * allocators so they don't get DoS'ed by heavy writers + */ + dirty_thresh += dirty_thresh / 10; /* wheeee... 
*/ + + if (wbs.nr_unstable + wbs.nr_writeback <= dirty_thresh) + break; + blk_congestion_wait(WRITE, HZ/10); + } +} + + /* * writeback at least _min_pages, and keep writing until the amount of dirty * memory is less than the background threshold, or until we're all clean. diff --git a/mm/vmscan.c b/mm/vmscan.c index 62f1ad405eb6..5babdd7b3f20 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -828,6 +828,8 @@ shrink_zone(struct zone *zone, struct scan_control *sc) break; } } + + throttle_vm_writeout(); } /* -- cgit v1.2.3 From 415b536d1338a7b636f6c3866ba208c402168f47 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Mon, 7 Mar 2005 17:29:39 -0800 Subject: [PATCH] vfs: adds the S_PRIVATE flag and adds use to security This patch series adds SELinux support to reiserfs. This patch adds an S_PRIVATE flag to inode->i_flags to mark an inode as filesystem-internal. As such, it should be excepted from the security infrastructure to allow the filesystem to perform its own access control. Signed-off-by: Jeff Mahoney Acked-by: Stephen Smalley Acked-by: Chris Wright Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/fs.h | 2 ++ include/linux/security.h | 64 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 66 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index c4081935da26..3ea266972f06 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -129,6 +129,7 @@ extern int dir_notify_enable; #define S_DIRSYNC 64 /* Directory modifications are synchronous */ #define S_NOCMTIME 128 /* Do not update file c/mtime */ #define S_SWAPFILE 256 /* Do not truncate: swapon got its bmaps */ +#define S_PRIVATE 512 /* Inode is fs-internal */ /* * Note that nosuid etc flags are inode-specific: setting some file-system @@ -162,6 +163,7 @@ extern int dir_notify_enable; #define IS_DEADDIR(inode) ((inode)->i_flags & S_DEAD) #define IS_NOCMTIME(inode) ((inode)->i_flags & S_NOCMTIME) #define IS_SWAPFILE(inode) 
((inode)->i_flags & S_SWAPFILE) +#define IS_PRIVATE(inode) ((inode)->i_flags & S_PRIVATE) /* the read-only stuff doesn't really belong here, but any other place is probably as bad and I don't want to create yet another include file. */ diff --git a/include/linux/security.h b/include/linux/security.h index 2b048ec62e9c..78d91972c4dc 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -1426,11 +1426,15 @@ static inline void security_sb_post_pivotroot (struct nameidata *old_nd, static inline int security_inode_alloc (struct inode *inode) { + if (unlikely (IS_PRIVATE (inode))) + return 0; return security_ops->inode_alloc_security (inode); } static inline void security_inode_free (struct inode *inode) { + if (unlikely (IS_PRIVATE (inode))) + return; security_ops->inode_free_security (inode); } @@ -1438,6 +1442,8 @@ static inline int security_inode_create (struct inode *dir, struct dentry *dentry, int mode) { + if (unlikely (IS_PRIVATE (dir))) + return 0; return security_ops->inode_create (dir, dentry, mode); } @@ -1445,6 +1451,8 @@ static inline void security_inode_post_create (struct inode *dir, struct dentry *dentry, int mode) { + if (dentry->d_inode && unlikely (IS_PRIVATE (dentry->d_inode))) + return; security_ops->inode_post_create (dir, dentry, mode); } @@ -1452,6 +1460,8 @@ static inline int security_inode_link (struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry) { + if (unlikely (IS_PRIVATE (old_dentry->d_inode))) + return 0; return security_ops->inode_link (old_dentry, dir, new_dentry); } @@ -1459,12 +1469,16 @@ static inline void security_inode_post_link (struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry) { + if (new_dentry->d_inode && unlikely (IS_PRIVATE (new_dentry->d_inode))) + return; security_ops->inode_post_link (old_dentry, dir, new_dentry); } static inline int security_inode_unlink (struct inode *dir, struct dentry *dentry) { + if (unlikely (IS_PRIVATE (dentry->d_inode))) + return 0; 
return security_ops->inode_unlink (dir, dentry); } @@ -1472,6 +1486,8 @@ static inline int security_inode_symlink (struct inode *dir, struct dentry *dentry, const char *old_name) { + if (unlikely (IS_PRIVATE (dir))) + return 0; return security_ops->inode_symlink (dir, dentry, old_name); } @@ -1479,6 +1495,8 @@ static inline void security_inode_post_symlink (struct inode *dir, struct dentry *dentry, const char *old_name) { + if (dentry->d_inode && unlikely (IS_PRIVATE (dentry->d_inode))) + return; security_ops->inode_post_symlink (dir, dentry, old_name); } @@ -1486,6 +1504,8 @@ static inline int security_inode_mkdir (struct inode *dir, struct dentry *dentry, int mode) { + if (unlikely (IS_PRIVATE (dir))) + return 0; return security_ops->inode_mkdir (dir, dentry, mode); } @@ -1493,12 +1513,16 @@ static inline void security_inode_post_mkdir (struct inode *dir, struct dentry *dentry, int mode) { + if (dentry->d_inode && unlikely (IS_PRIVATE (dentry->d_inode))) + return; security_ops->inode_post_mkdir (dir, dentry, mode); } static inline int security_inode_rmdir (struct inode *dir, struct dentry *dentry) { + if (unlikely (IS_PRIVATE (dentry->d_inode))) + return 0; return security_ops->inode_rmdir (dir, dentry); } @@ -1506,6 +1530,8 @@ static inline int security_inode_mknod (struct inode *dir, struct dentry *dentry, int mode, dev_t dev) { + if (unlikely (IS_PRIVATE (dir))) + return 0; return security_ops->inode_mknod (dir, dentry, mode, dev); } @@ -1513,6 +1539,8 @@ static inline void security_inode_post_mknod (struct inode *dir, struct dentry *dentry, int mode, dev_t dev) { + if (dentry->d_inode && unlikely (IS_PRIVATE (dentry->d_inode))) + return; security_ops->inode_post_mknod (dir, dentry, mode, dev); } @@ -1521,6 +1549,9 @@ static inline int security_inode_rename (struct inode *old_dir, struct inode *new_dir, struct dentry *new_dentry) { + if (unlikely (IS_PRIVATE (old_dentry->d_inode) || + (new_dentry->d_inode && IS_PRIVATE (new_dentry->d_inode)))) + return 0; 
return security_ops->inode_rename (old_dir, old_dentry, new_dir, new_dentry); } @@ -1530,83 +1561,114 @@ static inline void security_inode_post_rename (struct inode *old_dir, struct inode *new_dir, struct dentry *new_dentry) { + if (unlikely (IS_PRIVATE (old_dentry->d_inode) || + (new_dentry->d_inode && IS_PRIVATE (new_dentry->d_inode)))) + return; security_ops->inode_post_rename (old_dir, old_dentry, new_dir, new_dentry); } static inline int security_inode_readlink (struct dentry *dentry) { + if (unlikely (IS_PRIVATE (dentry->d_inode))) + return 0; return security_ops->inode_readlink (dentry); } static inline int security_inode_follow_link (struct dentry *dentry, struct nameidata *nd) { + if (unlikely (IS_PRIVATE (dentry->d_inode))) + return 0; return security_ops->inode_follow_link (dentry, nd); } static inline int security_inode_permission (struct inode *inode, int mask, struct nameidata *nd) { + if (unlikely (IS_PRIVATE (inode))) + return 0; return security_ops->inode_permission (inode, mask, nd); } static inline int security_inode_setattr (struct dentry *dentry, struct iattr *attr) { + if (unlikely (IS_PRIVATE (dentry->d_inode))) + return 0; return security_ops->inode_setattr (dentry, attr); } static inline int security_inode_getattr (struct vfsmount *mnt, struct dentry *dentry) { + if (unlikely (IS_PRIVATE (dentry->d_inode))) + return 0; return security_ops->inode_getattr (mnt, dentry); } static inline void security_inode_delete (struct inode *inode) { + if (unlikely (IS_PRIVATE (inode))) + return; security_ops->inode_delete (inode); } static inline int security_inode_setxattr (struct dentry *dentry, char *name, void *value, size_t size, int flags) { + if (unlikely (IS_PRIVATE (dentry->d_inode))) + return 0; return security_ops->inode_setxattr (dentry, name, value, size, flags); } static inline void security_inode_post_setxattr (struct dentry *dentry, char *name, void *value, size_t size, int flags) { + if (unlikely (IS_PRIVATE (dentry->d_inode))) + return; 
security_ops->inode_post_setxattr (dentry, name, value, size, flags); } static inline int security_inode_getxattr (struct dentry *dentry, char *name) { + if (unlikely (IS_PRIVATE (dentry->d_inode))) + return 0; return security_ops->inode_getxattr (dentry, name); } static inline int security_inode_listxattr (struct dentry *dentry) { + if (unlikely (IS_PRIVATE (dentry->d_inode))) + return 0; return security_ops->inode_listxattr (dentry); } static inline int security_inode_removexattr (struct dentry *dentry, char *name) { + if (unlikely (IS_PRIVATE (dentry->d_inode))) + return 0; return security_ops->inode_removexattr (dentry, name); } static inline int security_inode_getsecurity(struct inode *inode, const char *name, void *buffer, size_t size) { + if (unlikely (IS_PRIVATE (inode))) + return 0; return security_ops->inode_getsecurity(inode, name, buffer, size); } static inline int security_inode_setsecurity(struct inode *inode, const char *name, const void *value, size_t size, int flags) { + if (unlikely (IS_PRIVATE (inode))) + return 0; return security_ops->inode_setsecurity(inode, name, value, size, flags); } static inline int security_inode_listsecurity(struct inode *inode, char *buffer, size_t buffer_size) { + if (unlikely (IS_PRIVATE (inode))) + return 0; return security_ops->inode_listsecurity(inode, buffer, buffer_size); } @@ -1883,6 +1945,8 @@ static inline int security_sem_semop (struct sem_array * sma, static inline void security_d_instantiate (struct dentry *dentry, struct inode *inode) { + if (unlikely (inode && IS_PRIVATE (inode))) + return; security_ops->d_instantiate (dentry, inode); } -- cgit v1.2.3 From 0bf0b7bc87b078e1cd9f0baa2db30a991c4a285c Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Mon, 7 Mar 2005 17:30:08 -0800 Subject: [PATCH] reiserfs: private inode abstracted to static inline This patch moves the assignment of i_priv_object to a static inline. This is in preparation for selinux support in reiserfs. 
Signed-off-by: Jeff Mahoney Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/reiserfs/inode.c | 2 ++ fs/reiserfs/namei.c | 2 +- fs/reiserfs/xattr.c | 6 +----- fs/reiserfs/xattr_acl.c | 2 +- include/linux/reiserfs_xattr.h | 7 +++++++ 5 files changed, 12 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 6060b3d6e835..7543031396f4 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -1843,6 +1843,8 @@ int reiserfs_new_inode (struct reiserfs_transaction_handle *th, } else if (inode->i_sb->s_flags & MS_POSIXACL) { reiserfs_warning (inode->i_sb, "ACLs aren't enabled in the fs, " "but vfs thinks they are!"); + } else if (is_reiserfs_priv_object (dir)) { + reiserfs_mark_inode_private (inode); } insert_inode_hash (inode); diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index eb292870b53f..963e9ef61158 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -353,7 +353,7 @@ static struct dentry * reiserfs_lookup (struct inode * dir, struct dentry * dent /* Propogate the priv_object flag so we know we're in the priv tree */ if (is_reiserfs_priv_object (dir)) - REISERFS_I(inode)->i_flags |= i_priv_object; + reiserfs_mark_inode_private (inode); } reiserfs_write_unlock(dir->i_sb); if ( retval == IO_ERROR ) { diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 0b1247c4750b..45582fe8b466 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -181,8 +181,6 @@ open_xa_dir (const struct inode *inode, int flags) dput (xadir); return ERR_PTR (-ENODATA); } - /* Newly created object.. Need to mark it private */ - REISERFS_I(xadir->d_inode)->i_flags |= i_priv_object; } dput (xaroot); @@ -230,8 +228,6 @@ get_xa_file_dentry (const struct inode *inode, const char *name, int flags) dput (xafile); goto out; } - /* Newly created object.. 
Need to mark it private */ - REISERFS_I(xafile->d_inode)->i_flags |= i_priv_object; } out: @@ -1316,7 +1312,7 @@ reiserfs_xattr_init (struct super_block *s, int mount_flags) if (!err && dentry) { s->s_root->d_op = &xattr_lookup_poison_ops; - REISERFS_I(dentry->d_inode)->i_flags |= i_priv_object; + reiserfs_mark_inode_private (dentry->d_inode); REISERFS_SB(s)->priv_root = dentry; } else if (!(mount_flags & MS_RDONLY)) { /* xattrs are unavailable */ /* If we're read-only it just means that the dir hasn't been diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c index df4592a4f107..e302071903a1 100644 --- a/fs/reiserfs/xattr_acl.c +++ b/fs/reiserfs/xattr_acl.c @@ -339,7 +339,7 @@ reiserfs_inherit_default_acl (struct inode *dir, struct dentry *dentry, struct i * would be useless since permissions are ignored, and a pain because * it introduces locking cycles */ if (is_reiserfs_priv_object (dir)) { - REISERFS_I(inode)->i_flags |= i_priv_object; + reiserfs_mark_inode_private (inode); goto apply_umask; } diff --git a/include/linux/reiserfs_xattr.h b/include/linux/reiserfs_xattr.h index 9c40c4e9ba03..f2f77fd44bd2 100644 --- a/include/linux/reiserfs_xattr.h +++ b/include/linux/reiserfs_xattr.h @@ -103,9 +103,16 @@ reiserfs_read_unlock_xattr_i(struct inode *inode) up_read (&REISERFS_I(inode)->xattr_sem); } +static inline void +reiserfs_mark_inode_private(struct inode *inode) +{ + REISERFS_I(inode)->i_flags |= i_priv_object; +} + #else #define is_reiserfs_priv_object(inode) 0 +#define reiserfs_mark_inode_private(inode) #define reiserfs_getxattr NULL #define reiserfs_setxattr NULL #define reiserfs_listxattr NULL -- cgit v1.2.3 From 24a4286fc41bfaf74460c90630a7ff9212bbf43a Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Mon, 7 Mar 2005 17:30:25 -0800 Subject: [PATCH] reiserfs: change reiserfs to use S_PRIVATE This patch changes reiserfs to use the VFS level private inode flags, and eliminates the old reiserfs private inode flag. 
Signed-off-by: Jeff Mahoney Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/reiserfs_fs_i.h | 5 ++--- include/linux/reiserfs_xattr.h | 4 ++-- 2 files changed, 4 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/reiserfs_fs_i.h b/include/linux/reiserfs_fs_i.h index e08142a75f03..e321eb050d65 100644 --- a/include/linux/reiserfs_fs_i.h +++ b/include/linux/reiserfs_fs_i.h @@ -23,9 +23,8 @@ typedef enum { space on crash with some files open, but unlinked. */ i_link_saved_unlink_mask = 0x0010, i_link_saved_truncate_mask = 0x0020, - i_priv_object = 0x0080, - i_has_xattr_dir = 0x0100, - i_data_log = 0x0200, + i_has_xattr_dir = 0x0040, + i_data_log = 0x0080, } reiserfs_inode_flags; diff --git a/include/linux/reiserfs_xattr.h b/include/linux/reiserfs_xattr.h index f2f77fd44bd2..1eaa48eca811 100644 --- a/include/linux/reiserfs_xattr.h +++ b/include/linux/reiserfs_xattr.h @@ -31,7 +31,7 @@ struct reiserfs_xattr_handler { #ifdef CONFIG_REISERFS_FS_XATTR -#define is_reiserfs_priv_object(inode) (REISERFS_I(inode)->i_flags & i_priv_object) +#define is_reiserfs_priv_object(inode) IS_PRIVATE(inode) #define has_xattr_dir(inode) (REISERFS_I(inode)->i_flags & i_has_xattr_dir) ssize_t reiserfs_getxattr (struct dentry *dentry, const char *name, void *buffer, size_t size); @@ -106,7 +106,7 @@ reiserfs_read_unlock_xattr_i(struct inode *inode) static inline void reiserfs_mark_inode_private(struct inode *inode) { - REISERFS_I(inode)->i_flags |= i_priv_object; + inode->i_flags |= S_PRIVATE; } #else -- cgit v1.2.3 From dccafd6e87b700ca429461c62466de2e1e9183d3 Mon Sep 17 00:00:00 2001 From: Henrik Brix Andersen Date: Mon, 7 Mar 2005 17:39:00 -0800 Subject: [PATCH] Determine SCx200 CB address at run-time The current SCx200 drivers use a fixed base address of 0x9000 for the Configuration Block, but some systems (at least the Soekris net4801) uses a base address of 0x6000. 
This patch first tries the fixed address then - if no configuration block could be found - tries the address written to the Configuration Block Address Scratchpad register by the BIOS. Signed-off-by: Henrik Brix Andersen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/scx200.c | 51 ++++++++++++++++++++++++++++---------- drivers/char/watchdog/Kconfig | 4 +-- drivers/char/watchdog/scx200_wdt.c | 42 +++++++++++-------------------- drivers/mtd/maps/Kconfig | 2 +- drivers/mtd/maps/scx200_docflash.c | 13 +++++----- include/linux/pci_ids.h | 2 ++ include/linux/scx200.h | 10 +++++--- 7 files changed, 69 insertions(+), 55 deletions(-) (limited to 'include/linux') diff --git a/arch/i386/kernel/scx200.c b/arch/i386/kernel/scx200.c index d02df17e4480..05918941f88a 100644 --- a/arch/i386/kernel/scx200.c +++ b/arch/i386/kernel/scx200.c @@ -13,6 +13,9 @@ #include +/* Verify that the configuration block really is there */ +#define scx200_cb_probe(base) (inw((base) + SCx200_CBA) == (base)) + #define NAME "scx200" MODULE_AUTHOR("Christer Weinigel "); @@ -22,9 +25,13 @@ MODULE_LICENSE("GPL"); unsigned scx200_gpio_base = 0; long scx200_gpio_shadow[2]; +unsigned scx200_cb_base = 0; + static struct pci_device_id scx200_tbl[] = { { PCI_DEVICE(PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SCx200_BRIDGE) }, { PCI_DEVICE(PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SC1100_BRIDGE) }, + { PCI_DEVICE(PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SCx200_XBUS) }, + { PCI_DEVICE(PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SC1100_XBUS) }, { }, }; MODULE_DEVICE_TABLE(pci,scx200_tbl); @@ -45,22 +52,39 @@ static int __devinit scx200_probe(struct pci_dev *pdev, const struct pci_device_ int bank; unsigned base; - base = pci_resource_start(pdev, 0); - printk(KERN_INFO NAME ": GPIO base 0x%x\n", base); - - if (request_region(base, SCx200_GPIO_SIZE, "NatSemi SCx200 GPIO") == 0) { - printk(KERN_ERR NAME ": can't allocate I/O for GPIOs\n"); - return -EBUSY; + if (pdev->device == PCI_DEVICE_ID_NS_SCx200_BRIDGE || + 
pdev->device == PCI_DEVICE_ID_NS_SC1100_BRIDGE) { + base = pci_resource_start(pdev, 0); + printk(KERN_INFO NAME ": GPIO base 0x%x\n", base); + + if (request_region(base, SCx200_GPIO_SIZE, "NatSemi SCx200 GPIO") == 0) { + printk(KERN_ERR NAME ": can't allocate I/O for GPIOs\n"); + return -EBUSY; + } + + scx200_gpio_base = base; + + /* read the current values driven on the GPIO signals */ + for (bank = 0; bank < 2; ++bank) + scx200_gpio_shadow[bank] = inl(scx200_gpio_base + 0x10 * bank); + + } else { + /* find the base of the Configuration Block */ + if (scx200_cb_probe(SCx200_CB_BASE_FIXED)) { + scx200_cb_base = SCx200_CB_BASE_FIXED; + } else { + pci_read_config_dword(pdev, SCx200_CBA_SCRATCH, &base); + if (scx200_cb_probe(base)) { + scx200_cb_base = base; + } else { + printk(KERN_WARNING NAME ": Configuration Block not found\n"); + return -ENODEV; + } + } + printk(KERN_INFO NAME ": Configuration Block base 0x%x\n", scx200_cb_base); } - scx200_gpio_base = base; - - /* read the current values driven on the GPIO signals */ - for (bank = 0; bank < 2; ++bank) - scx200_gpio_shadow[bank] = inl(scx200_gpio_base + 0x10 * bank); - return 0; - } u32 scx200_gpio_configure(int index, u32 mask, u32 bits) @@ -134,6 +158,7 @@ EXPORT_SYMBOL(scx200_gpio_shadow); EXPORT_SYMBOL(scx200_gpio_lock); EXPORT_SYMBOL(scx200_gpio_configure); EXPORT_SYMBOL(scx200_gpio_dump); +EXPORT_SYMBOL(scx200_cb_base); /* Local variables: diff --git a/drivers/char/watchdog/Kconfig b/drivers/char/watchdog/Kconfig index d117e2b33ef2..06a31da2381c 100644 --- a/drivers/char/watchdog/Kconfig +++ b/drivers/char/watchdog/Kconfig @@ -268,12 +268,12 @@ config SC1200_WDT config SCx200_WDT tristate "National Semiconductor SCx200 Watchdog" - depends on WATCHDOG && X86 && PCI + depends on WATCHDOG && SCx200 && PCI help Enable the built-in watchdog timer support on the National Semiconductor SCx200 processors. - If compiled as a module, it will be called scx200_watchdog. 
+ If compiled as a module, it will be called scx200_wdt. config 60XX_WDT tristate "SBC-60XX Watchdog Timer" diff --git a/drivers/char/watchdog/scx200_wdt.c b/drivers/char/watchdog/scx200_wdt.c index 0c47a43068e6..b569670e4ed5 100644 --- a/drivers/char/watchdog/scx200_wdt.c +++ b/drivers/char/watchdog/scx200_wdt.c @@ -4,7 +4,7 @@ Copyright (c) 2001,2002 Christer Weinigel - Som code taken from: + Some code taken from: National Semiconductor PC87307/PC97307 (ala SC1200) WDT driver (c) Copyright 2002 Zwane Mwaikambo @@ -64,7 +64,7 @@ static char expect_close; static void scx200_wdt_ping(void) { - outw(wdto_restart, SCx200_CB_BASE + SCx200_WDT_WDTO); + outw(wdto_restart, scx200_cb_base + SCx200_WDT_WDTO); } static void scx200_wdt_update_margin(void) @@ -78,9 +78,9 @@ static void scx200_wdt_enable(void) printk(KERN_DEBUG NAME ": enabling watchdog timer, wdto_restart = %d\n", wdto_restart); - outw(0, SCx200_CB_BASE + SCx200_WDT_WDTO); - outb(SCx200_WDT_WDSTS_WDOVF, SCx200_CB_BASE + SCx200_WDT_WDSTS); - outw(W_ENABLE, SCx200_CB_BASE + SCx200_WDT_WDCNFG); + outw(0, scx200_cb_base + SCx200_WDT_WDTO); + outb(SCx200_WDT_WDSTS_WDOVF, scx200_cb_base + SCx200_WDT_WDSTS); + outw(W_ENABLE, scx200_cb_base + SCx200_WDT_WDCNFG); scx200_wdt_ping(); } @@ -89,9 +89,9 @@ static void scx200_wdt_disable(void) { printk(KERN_DEBUG NAME ": disabling watchdog timer\n"); - outw(0, SCx200_CB_BASE + SCx200_WDT_WDTO); - outb(SCx200_WDT_WDSTS_WDOVF, SCx200_CB_BASE + SCx200_WDT_WDSTS); - outw(W_DISABLE, SCx200_CB_BASE + SCx200_WDT_WDCNFG); + outw(0, scx200_cb_base + SCx200_WDT_WDTO); + outb(SCx200_WDT_WDSTS_WDOVF, scx200_cb_base + SCx200_WDT_WDSTS); + outw(W_DISABLE, scx200_cb_base + SCx200_WDT_WDCNFG); } static int scx200_wdt_open(struct inode *inode, struct file *file) @@ -217,28 +217,14 @@ static struct miscdevice scx200_wdt_miscdev = { static int __init scx200_wdt_init(void) { int r; - static struct pci_device_id ns_sc[] = { - { PCI_DEVICE(PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SCx200_BRIDGE) }, - { 
PCI_DEVICE(PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SC1100_BRIDGE) }, - { }, - }; printk(KERN_DEBUG NAME ": NatSemi SCx200 Watchdog Driver\n"); - /* - * First check that this really is a NatSemi SCx200 CPU or a Geode - * SC1100 processor - */ - if (!pci_dev_present(ns_sc)) - return -ENODEV; - - /* More sanity checks, verify that the configuration block is there */ - if (!scx200_cb_probe(SCx200_CB_BASE)) { - printk(KERN_WARNING NAME ": no configuration block found\n"); + /* check that we have found the configuration block */ + if (!scx200_cb_present()) return -ENODEV; - } - if (!request_region(SCx200_CB_BASE + SCx200_WDT_OFFSET, + if (!request_region(scx200_cb_base + SCx200_WDT_OFFSET, SCx200_WDT_SIZE, "NatSemi SCx200 Watchdog")) { printk(KERN_WARNING NAME ": watchdog I/O region busy\n"); @@ -252,7 +238,7 @@ static int __init scx200_wdt_init(void) r = misc_register(&scx200_wdt_miscdev); if (r) { - release_region(SCx200_CB_BASE + SCx200_WDT_OFFSET, + release_region(scx200_cb_base + SCx200_WDT_OFFSET, SCx200_WDT_SIZE); return r; } @@ -261,7 +247,7 @@ static int __init scx200_wdt_init(void) if (r) { printk(KERN_ERR NAME ": unable to register reboot notifier"); misc_deregister(&scx200_wdt_miscdev); - release_region(SCx200_CB_BASE + SCx200_WDT_OFFSET, + release_region(scx200_cb_base + SCx200_WDT_OFFSET, SCx200_WDT_SIZE); return r; } @@ -273,7 +259,7 @@ static void __exit scx200_wdt_cleanup(void) { unregister_reboot_notifier(&scx200_wdt_notifier); misc_deregister(&scx200_wdt_miscdev); - release_region(SCx200_CB_BASE + SCx200_WDT_OFFSET, + release_region(scx200_cb_base + SCx200_WDT_OFFSET, SCx200_WDT_SIZE); } diff --git a/drivers/mtd/maps/Kconfig b/drivers/mtd/maps/Kconfig index f00f24f23ee6..4e575f80734f 100644 --- a/drivers/mtd/maps/Kconfig +++ b/drivers/mtd/maps/Kconfig @@ -159,7 +159,7 @@ config MTD_VMAX config MTD_SCx200_DOCFLASH tristate "Flash device mapped with DOCCS on NatSemi SCx200" - depends on X86 && MTD_CFI && MTD_PARTITIONS + depends on SCx200 && MTD_CFI && 
MTD_PARTITIONS help Enable support for a flash chip mapped using the DOCCS signal on a National Semiconductor SCx200 processor. diff --git a/drivers/mtd/maps/scx200_docflash.c b/drivers/mtd/maps/scx200_docflash.c index 62df7e6f00e1..0ece3786d6ea 100644 --- a/drivers/mtd/maps/scx200_docflash.c +++ b/drivers/mtd/maps/scx200_docflash.c @@ -92,17 +92,16 @@ static int __init init_scx200_docflash(void) PCI_DEVICE_ID_NS_SCx200_BRIDGE, NULL)) == NULL) return -ENODEV; - - if (!scx200_cb_probe(SCx200_CB_BASE)) { - printk(KERN_WARNING NAME ": no configuration block found\n"); + + /* check that we have found the configuration block */ + if (!scx200_cb_present()) return -ENODEV; - } if (probe) { /* Try to use the present flash mapping if any */ pci_read_config_dword(bridge, SCx200_DOCCS_BASE, &base); pci_read_config_dword(bridge, SCx200_DOCCS_CTRL, &ctrl); - pmr = inl(SCx200_CB_BASE + SCx200_PMR); + pmr = inl(scx200_cb_base + SCx200_PMR); if (base == 0 || (ctrl & 0x07000000) != 0x07000000 @@ -155,14 +154,14 @@ static int __init init_scx200_docflash(void) pci_write_config_dword(bridge, SCx200_DOCCS_BASE, docmem.start); pci_write_config_dword(bridge, SCx200_DOCCS_CTRL, ctrl); - pmr = inl(SCx200_CB_BASE + SCx200_PMR); + pmr = inl(scx200_cb_base + SCx200_PMR); if (width == 8) { pmr &= ~(1<<6); } else { pmr |= (1<<6); } - outl(pmr, SCx200_CB_BASE + SCx200_PMR); + outl(pmr, scx200_cb_base + SCx200_PMR); } printk(KERN_INFO NAME ": DOCCS mapped at 0x%lx-0x%lx, width %d\n", diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 45c638b9fdef..b65f84ec1118 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -397,6 +397,8 @@ #define PCI_DEVICE_ID_NS_SCx200_VIDEO 0x0504 #define PCI_DEVICE_ID_NS_SCx200_XBUS 0x0505 #define PCI_DEVICE_ID_NS_SC1100_BRIDGE 0x0510 +#define PCI_DEVICE_ID_NS_SC1100_SMI 0x0511 +#define PCI_DEVICE_ID_NS_SC1100_XBUS 0x0515 #define PCI_DEVICE_ID_NS_87410 0xd001 #define PCI_VENDOR_ID_TSENG 0x100c diff --git a/include/linux/scx200.h 
b/include/linux/scx200.h index af7d53acad99..a22f9e173ad2 100644 --- a/include/linux/scx200.h +++ b/include/linux/scx200.h @@ -7,6 +7,10 @@ /* Interesting stuff for the National Semiconductor SCx200 CPU */ +extern unsigned scx200_cb_base; + +#define scx200_cb_present() (scx200_cb_base!=0) + /* F0 PCI Header/Bridge Configuration Registers */ #define SCx200_DOCCS_BASE 0x78 /* DOCCS Base Address Register */ #define SCx200_DOCCS_CTRL 0x7c /* DOCCS Control Register */ @@ -15,7 +19,7 @@ #define SCx200_GPIO_SIZE 0x2c /* Size of GPIO register block */ /* General Configuration Block */ -#define SCx200_CB_BASE 0x9000 /* Base fixed at 0x9000 according to errata */ +#define SCx200_CB_BASE_FIXED 0x9000 /* Base fixed at 0x9000 according to errata? */ /* Watchdog Timer */ #define SCx200_WDT_OFFSET 0x00 /* offset within configuration block */ @@ -44,9 +48,7 @@ #define SCx200_IID 0x3c /* IA On a Chip Identification Number Reg */ #define SCx200_REV 0x3d /* Revision Register */ #define SCx200_CBA 0x3e /* Configuration Base Address Register */ - -/* Verify that the configuration block really is there */ -#define scx200_cb_probe(base) (inw((base) + SCx200_CBA) == (base)) +#define SCx200_CBA_SCRATCH 0x64 /* Configuration Base Address Scratchpad */ /* Local variables: -- cgit v1.2.3 From 7c58d9bf44b89eac1971473636d573dfa4f5e578 Mon Sep 17 00:00:00 2001 From: Pat Gefre Date: Mon, 7 Mar 2005 17:41:29 -0800 Subject: [PATCH] Altix: ioc4 serial driver support Signed-off-by: Patrick Gefre Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/Makefile | 1 + drivers/ide/pci/sgiioc4.c | 41 +- drivers/serial/Makefile | 1 + drivers/serial/ioc4_serial.c | 2909 ++++++++++++++++++++++++++++++++++++++++++ drivers/sn/Makefile | 6 + drivers/sn/ioc4.c | 65 + include/linux/ioc4_common.h | 21 + 7 files changed, 3010 insertions(+), 34 deletions(-) create mode 100644 drivers/serial/ioc4_serial.c create mode 100644 drivers/sn/Makefile create mode 100644 drivers/sn/ioc4.c create mode 100644 
include/linux/ioc4_common.h (limited to 'include/linux') diff --git a/drivers/Makefile b/drivers/Makefile index 04fba9e5009b..d65be7138f12 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -61,5 +61,6 @@ obj-$(CONFIG_EISA) += eisa/ obj-$(CONFIG_CPU_FREQ) += cpufreq/ obj-$(CONFIG_MMC) += mmc/ obj-$(CONFIG_INFINIBAND) += infiniband/ +obj-$(CONFIG_BLK_DEV_SGIIOC4) += sn/ obj-y += firmware/ obj-$(CONFIG_CRYPTO) += crypto/ diff --git a/drivers/ide/pci/sgiioc4.c b/drivers/ide/pci/sgiioc4.c index 56d67fde55a4..4651a22bf12e 100644 --- a/drivers/ide/pci/sgiioc4.c +++ b/drivers/ide/pci/sgiioc4.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include @@ -684,15 +685,6 @@ pci_init_sgiioc4(struct pci_dev *dev, ide_pci_device_t * d) unsigned int class_rev; int ret; - ret = pci_enable_device(dev); - if (ret < 0) { - printk(KERN_ERR - "Failed to enable device %s at slot %s\n", - d->name, pci_name(dev)); - goto out; - } - pci_set_master(dev); - pci_read_config_dword(dev, PCI_CLASS_REVISION, &class_rev); class_rev &= 0xff; printk(KERN_INFO "%s: IDE controller at PCI slot %s, revision %d\n", @@ -722,34 +714,15 @@ static ide_pci_device_t sgiioc4_chipsets[] __devinitdata = { } }; -static int __devinit -sgiioc4_init_one(struct pci_dev *dev, const struct pci_device_id *id) +int +ioc4_ide_attach_one(struct pci_dev *dev, const struct pci_device_id *id) { - pci_init_sgiioc4(dev, &sgiioc4_chipsets[id->driver_data]); - return 0; + return pci_init_sgiioc4(dev, &sgiioc4_chipsets[id->driver_data]); } -static struct pci_device_id sgiioc4_pci_tbl[] = { - {PCI_VENDOR_ID_SGI, PCI_DEVICE_ID_SGI_IOC4, PCI_ANY_ID, - PCI_ANY_ID, 0x0b4000, 0xFFFFFF, 0}, - {0} -}; -MODULE_DEVICE_TABLE(pci, sgiioc4_pci_tbl); - -static struct pci_driver __devinitdata driver = { - .name = "SGI-IOC4_IDE", - .id_table = sgiioc4_pci_tbl, - .probe = sgiioc4_init_one, -}; - -static int __devinit -sgiioc4_ide_init(void) -{ - return ide_pci_register_driver(&driver); -} - -module_init(sgiioc4_ide_init); 
MODULE_AUTHOR("Aniket Malatpure - Silicon Graphics Inc. (SGI)"); -MODULE_DESCRIPTION("PCI driver module for SGI IOC4 Base-IO Card"); +MODULE_DESCRIPTION("IDE PCI driver module for SGI IOC4 Base-IO Card"); MODULE_LICENSE("GPL"); + +EXPORT_SYMBOL(ioc4_ide_attach_one); diff --git a/drivers/serial/Makefile b/drivers/serial/Makefile index 157808f2fa8d..ec04507a5969 100644 --- a/drivers/serial/Makefile +++ b/drivers/serial/Makefile @@ -49,3 +49,4 @@ obj-$(CONFIG_SERIAL_M32R_SIO) += m32r_sio.o obj-$(CONFIG_SERIAL_MPSC) += mpsc.o obj-$(CONFIG_ETRAX_SERIAL) += crisv10.o obj-$(CONFIG_SERIAL_TXX9) += serial_txx9.o +obj-$(CONFIG_BLK_DEV_SGIIOC4) += ioc4_serial.o diff --git a/drivers/serial/ioc4_serial.c b/drivers/serial/ioc4_serial.c new file mode 100644 index 000000000000..6a42a84bd0b7 --- /dev/null +++ b/drivers/serial/ioc4_serial.c @@ -0,0 +1,2909 @@ +/* + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * Copyright (C) 2003-2005 Silicon Graphics, Inc. All Rights Reserved. + */ + + +/* + * This file contains a module version of the ioc4 serial driver. This + * includes all the support functions needed (support functions, etc.) + * and the serial driver itself. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * interesting things about the ioc4 + */ + +#define IOC4_NUM_SERIAL_PORTS 4 /* max ports per card */ +#define IOC4_NUM_CARDS 8 /* max cards per partition */ + +#define GET_SIO_IR(_n) (_n == 0) ? (IOC4_SIO_IR_S0) : \ + (_n == 1) ? (IOC4_SIO_IR_S1) : \ + (_n == 2) ? (IOC4_SIO_IR_S2) : \ + (IOC4_SIO_IR_S3) + +#define GET_OTHER_IR(_n) (_n == 0) ? (IOC4_OTHER_IR_S0_MEMERR) : \ + (_n == 1) ? (IOC4_OTHER_IR_S1_MEMERR) : \ + (_n == 2) ? (IOC4_OTHER_IR_S2_MEMERR) : \ + (IOC4_OTHER_IR_S3_MEMERR) + + +/* + * All IOC4 registers are 32 bits wide. 
+ */ + +/* + * PCI Memory Space Map + */ +#define IOC4_PCI_ERR_ADDR_L 0x000 /* Low Error Address */ +#define IOC4_PCI_ERR_ADDR_VLD (0x1 << 0) +#define IOC4_PCI_ERR_ADDR_MST_ID_MSK (0xf << 1) +#define IOC4_PCI_ERR_ADDR_MST_NUM_MSK (0xe << 1) +#define IOC4_PCI_ERR_ADDR_MST_TYP_MSK (0x1 << 1) +#define IOC4_PCI_ERR_ADDR_MUL_ERR (0x1 << 5) +#define IOC4_PCI_ERR_ADDR_ADDR_MSK (0x3ffffff << 6) + +/* Interrupt types */ +#define IOC4_SIO_INTR_TYPE 0 +#define IOC4_OTHER_INTR_TYPE 1 +#define IOC4_NUM_INTR_TYPES 2 + +/* Bitmasks for IOC4_SIO_IR, IOC4_SIO_IEC, and IOC4_SIO_IES */ +#define IOC4_SIO_IR_S0_TX_MT 0x00000001 /* Serial port 0 TX empty */ +#define IOC4_SIO_IR_S0_RX_FULL 0x00000002 /* Port 0 RX buf full */ +#define IOC4_SIO_IR_S0_RX_HIGH 0x00000004 /* Port 0 RX hiwat */ +#define IOC4_SIO_IR_S0_RX_TIMER 0x00000008 /* Port 0 RX timeout */ +#define IOC4_SIO_IR_S0_DELTA_DCD 0x00000010 /* Port 0 delta DCD */ +#define IOC4_SIO_IR_S0_DELTA_CTS 0x00000020 /* Port 0 delta CTS */ +#define IOC4_SIO_IR_S0_INT 0x00000040 /* Port 0 pass-thru intr */ +#define IOC4_SIO_IR_S0_TX_EXPLICIT 0x00000080 /* Port 0 explicit TX thru */ +#define IOC4_SIO_IR_S1_TX_MT 0x00000100 /* Serial port 1 */ +#define IOC4_SIO_IR_S1_RX_FULL 0x00000200 /* */ +#define IOC4_SIO_IR_S1_RX_HIGH 0x00000400 /* */ +#define IOC4_SIO_IR_S1_RX_TIMER 0x00000800 /* */ +#define IOC4_SIO_IR_S1_DELTA_DCD 0x00001000 /* */ +#define IOC4_SIO_IR_S1_DELTA_CTS 0x00002000 /* */ +#define IOC4_SIO_IR_S1_INT 0x00004000 /* */ +#define IOC4_SIO_IR_S1_TX_EXPLICIT 0x00008000 /* */ +#define IOC4_SIO_IR_S2_TX_MT 0x00010000 /* Serial port 2 */ +#define IOC4_SIO_IR_S2_RX_FULL 0x00020000 /* */ +#define IOC4_SIO_IR_S2_RX_HIGH 0x00040000 /* */ +#define IOC4_SIO_IR_S2_RX_TIMER 0x00080000 /* */ +#define IOC4_SIO_IR_S2_DELTA_DCD 0x00100000 /* */ +#define IOC4_SIO_IR_S2_DELTA_CTS 0x00200000 /* */ +#define IOC4_SIO_IR_S2_INT 0x00400000 /* */ +#define IOC4_SIO_IR_S2_TX_EXPLICIT 0x00800000 /* */ +#define IOC4_SIO_IR_S3_TX_MT 0x01000000 /* Serial port 
3 */ +#define IOC4_SIO_IR_S3_RX_FULL 0x02000000 /* */ +#define IOC4_SIO_IR_S3_RX_HIGH 0x04000000 /* */ +#define IOC4_SIO_IR_S3_RX_TIMER 0x08000000 /* */ +#define IOC4_SIO_IR_S3_DELTA_DCD 0x10000000 /* */ +#define IOC4_SIO_IR_S3_DELTA_CTS 0x20000000 /* */ +#define IOC4_SIO_IR_S3_INT 0x40000000 /* */ +#define IOC4_SIO_IR_S3_TX_EXPLICIT 0x80000000 /* */ + +/* Per device interrupt masks */ +#define IOC4_SIO_IR_S0 (IOC4_SIO_IR_S0_TX_MT | \ + IOC4_SIO_IR_S0_RX_FULL | \ + IOC4_SIO_IR_S0_RX_HIGH | \ + IOC4_SIO_IR_S0_RX_TIMER | \ + IOC4_SIO_IR_S0_DELTA_DCD | \ + IOC4_SIO_IR_S0_DELTA_CTS | \ + IOC4_SIO_IR_S0_INT | \ + IOC4_SIO_IR_S0_TX_EXPLICIT) +#define IOC4_SIO_IR_S1 (IOC4_SIO_IR_S1_TX_MT | \ + IOC4_SIO_IR_S1_RX_FULL | \ + IOC4_SIO_IR_S1_RX_HIGH | \ + IOC4_SIO_IR_S1_RX_TIMER | \ + IOC4_SIO_IR_S1_DELTA_DCD | \ + IOC4_SIO_IR_S1_DELTA_CTS | \ + IOC4_SIO_IR_S1_INT | \ + IOC4_SIO_IR_S1_TX_EXPLICIT) +#define IOC4_SIO_IR_S2 (IOC4_SIO_IR_S2_TX_MT | \ + IOC4_SIO_IR_S2_RX_FULL | \ + IOC4_SIO_IR_S2_RX_HIGH | \ + IOC4_SIO_IR_S2_RX_TIMER | \ + IOC4_SIO_IR_S2_DELTA_DCD | \ + IOC4_SIO_IR_S2_DELTA_CTS | \ + IOC4_SIO_IR_S2_INT | \ + IOC4_SIO_IR_S2_TX_EXPLICIT) +#define IOC4_SIO_IR_S3 (IOC4_SIO_IR_S3_TX_MT | \ + IOC4_SIO_IR_S3_RX_FULL | \ + IOC4_SIO_IR_S3_RX_HIGH | \ + IOC4_SIO_IR_S3_RX_TIMER | \ + IOC4_SIO_IR_S3_DELTA_DCD | \ + IOC4_SIO_IR_S3_DELTA_CTS | \ + IOC4_SIO_IR_S3_INT | \ + IOC4_SIO_IR_S3_TX_EXPLICIT) + +/* Bitmasks for IOC4_OTHER_IR, IOC4_OTHER_IEC, and IOC4_OTHER_IES */ +#define IOC4_OTHER_IR_ATA_INT 0x00000001 /* ATAPI intr pass-thru */ +#define IOC4_OTHER_IR_ATA_MEMERR 0x00000002 /* ATAPI DMA PCI error */ +#define IOC4_OTHER_IR_S0_MEMERR 0x00000004 /* Port 0 PCI error */ +#define IOC4_OTHER_IR_S1_MEMERR 0x00000008 /* Port 1 PCI error */ +#define IOC4_OTHER_IR_S2_MEMERR 0x00000010 /* Port 2 PCI error */ +#define IOC4_OTHER_IR_S3_MEMERR 0x00000020 /* Port 3 PCI error */ + +/* Bitmasks for IOC4_SIO_CR */ +#define IOC4_SIO_CR_CMD_PULSE_SHIFT 0 /* byte bus strobe shift */ +#define 
IOC4_SIO_CR_ARB_DIAG_TX0 0x00000000 +#define IOC4_SIO_CR_ARB_DIAG_RX0 0x00000010 +#define IOC4_SIO_CR_ARB_DIAG_TX1 0x00000020 +#define IOC4_SIO_CR_ARB_DIAG_RX1 0x00000030 +#define IOC4_SIO_CR_ARB_DIAG_TX2 0x00000040 +#define IOC4_SIO_CR_ARB_DIAG_RX2 0x00000050 +#define IOC4_SIO_CR_ARB_DIAG_TX3 0x00000060 +#define IOC4_SIO_CR_ARB_DIAG_RX3 0x00000070 +#define IOC4_SIO_CR_SIO_DIAG_IDLE 0x00000080 /* 0 -> active request among + serial ports (ro) */ +/* Defs for some of the generic I/O pins */ +#define IOC4_GPCR_UART0_MODESEL 0x10 /* Pin is output to port 0 + mode sel */ +#define IOC4_GPCR_UART1_MODESEL 0x20 /* Pin is output to port 1 + mode sel */ +#define IOC4_GPCR_UART2_MODESEL 0x40 /* Pin is output to port 2 + mode sel */ +#define IOC4_GPCR_UART3_MODESEL 0x80 /* Pin is output to port 3 + mode sel */ + +#define IOC4_GPPR_UART0_MODESEL_PIN 4 /* GIO pin controlling + uart 0 mode select */ +#define IOC4_GPPR_UART1_MODESEL_PIN 5 /* GIO pin controlling + uart 1 mode select */ +#define IOC4_GPPR_UART2_MODESEL_PIN 6 /* GIO pin controlling + uart 2 mode select */ +#define IOC4_GPPR_UART3_MODESEL_PIN 7 /* GIO pin controlling + uart 3 mode select */ + +/* Bitmasks for serial RX status byte */ +#define IOC4_RXSB_OVERRUN 0x01 /* Char(s) lost */ +#define IOC4_RXSB_PAR_ERR 0x02 /* Parity error */ +#define IOC4_RXSB_FRAME_ERR 0x04 /* Framing error */ +#define IOC4_RXSB_BREAK 0x08 /* Break character */ +#define IOC4_RXSB_CTS 0x10 /* State of CTS */ +#define IOC4_RXSB_DCD 0x20 /* State of DCD */ +#define IOC4_RXSB_MODEM_VALID 0x40 /* DCD, CTS, and OVERRUN are valid */ +#define IOC4_RXSB_DATA_VALID 0x80 /* Data byte, FRAME_ERR PAR_ERR + * & BREAK valid */ + +/* Bitmasks for serial TX control byte */ +#define IOC4_TXCB_INT_WHEN_DONE 0x20 /* Interrupt after this byte is sent */ +#define IOC4_TXCB_INVALID 0x00 /* Byte is invalid */ +#define IOC4_TXCB_VALID 0x40 /* Byte is valid */ +#define IOC4_TXCB_MCR 0x80 /* Data<7:0> to modem control reg */ +#define IOC4_TXCB_DELAY 0xc0 /* Delay 
data<7:0> mSec */ + +/* Bitmasks for IOC4_SBBR_L */ +#define IOC4_SBBR_L_SIZE 0x00000001 /* 0 == 1KB rings, 1 == 4KB rings */ + +/* Bitmasks for IOC4_SSCR_<3:0> */ +#define IOC4_SSCR_RX_THRESHOLD 0x000001ff /* Hiwater mark */ +#define IOC4_SSCR_TX_TIMER_BUSY 0x00010000 /* TX timer in progress */ +#define IOC4_SSCR_HFC_EN 0x00020000 /* Hardware flow control enabled */ +#define IOC4_SSCR_RX_RING_DCD 0x00040000 /* Post RX record on delta-DCD */ +#define IOC4_SSCR_RX_RING_CTS 0x00080000 /* Post RX record on delta-CTS */ +#define IOC4_SSCR_DIAG 0x00200000 /* Bypass clock divider for sim */ +#define IOC4_SSCR_RX_DRAIN 0x08000000 /* Drain RX buffer to memory */ +#define IOC4_SSCR_DMA_EN 0x10000000 /* Enable ring buffer DMA */ +#define IOC4_SSCR_DMA_PAUSE 0x20000000 /* Pause DMA */ +#define IOC4_SSCR_PAUSE_STATE 0x40000000 /* Sets when PAUSE takes effect */ +#define IOC4_SSCR_RESET 0x80000000 /* Reset DMA channels */ + +/* All producer/comsumer pointers are the same bitfield */ +#define IOC4_PROD_CONS_PTR_4K 0x00000ff8 /* For 4K buffers */ +#define IOC4_PROD_CONS_PTR_1K 0x000003f8 /* For 1K buffers */ +#define IOC4_PROD_CONS_PTR_OFF 3 + +/* Bitmasks for IOC4_SRCIR_<3:0> */ +#define IOC4_SRCIR_ARM 0x80000000 /* Arm RX timer */ + +/* Bitmasks for IOC4_SHADOW_<3:0> */ +#define IOC4_SHADOW_DR 0x00000001 /* Data ready */ +#define IOC4_SHADOW_OE 0x00000002 /* Overrun error */ +#define IOC4_SHADOW_PE 0x00000004 /* Parity error */ +#define IOC4_SHADOW_FE 0x00000008 /* Framing error */ +#define IOC4_SHADOW_BI 0x00000010 /* Break interrupt */ +#define IOC4_SHADOW_THRE 0x00000020 /* Xmit holding register empty */ +#define IOC4_SHADOW_TEMT 0x00000040 /* Xmit shift register empty */ +#define IOC4_SHADOW_RFCE 0x00000080 /* Char in RX fifo has an error */ +#define IOC4_SHADOW_DCTS 0x00010000 /* Delta clear to send */ +#define IOC4_SHADOW_DDCD 0x00080000 /* Delta data carrier detect */ +#define IOC4_SHADOW_CTS 0x00100000 /* Clear to send */ +#define IOC4_SHADOW_DCD 0x00800000 /* Data 
carrier detect */ +#define IOC4_SHADOW_DTR 0x01000000 /* Data terminal ready */ +#define IOC4_SHADOW_RTS 0x02000000 /* Request to send */ +#define IOC4_SHADOW_OUT1 0x04000000 /* 16550 OUT1 bit */ +#define IOC4_SHADOW_OUT2 0x08000000 /* 16550 OUT2 bit */ +#define IOC4_SHADOW_LOOP 0x10000000 /* Loopback enabled */ + +/* Bitmasks for IOC4_SRTR_<3:0> */ +#define IOC4_SRTR_CNT 0x00000fff /* Reload value for RX timer */ +#define IOC4_SRTR_CNT_VAL 0x0fff0000 /* Current value of RX timer */ +#define IOC4_SRTR_CNT_VAL_SHIFT 16 +#define IOC4_SRTR_HZ 16000 /* SRTR clock frequency */ + +/* Serial port register map used for DMA and PIO serial I/O */ +struct ioc4_serialregs { + uint32_t sscr; + uint32_t stpir; + uint32_t stcir; + uint32_t srpir; + uint32_t srcir; + uint32_t srtr; + uint32_t shadow; +}; + +/* IOC4 UART register map */ +struct ioc4_uartregs { + char i4u_lcr; + union { + char iir; /* read only */ + char fcr; /* write only */ + } u3; + union { + char ier; /* DLAB == 0 */ + char dlm; /* DLAB == 1 */ + } u2; + union { + char rbr; /* read only, DLAB == 0 */ + char thr; /* write only, DLAB == 0 */ + char dll; /* DLAB == 1 */ + } u1; + char i4u_scr; + char i4u_msr; + char i4u_lsr; + char i4u_mcr; +}; + +/* short names */ +#define i4u_dll u1.dll +#define i4u_ier u2.ier +#define i4u_dlm u2.dlm +#define i4u_fcr u3.fcr + +/* PCI memory space register map addressed using pci_bar0 */ +struct ioc4_memregs { + struct ioc4_mem { + /* Miscellaneous IOC4 registers */ + uint32_t pci_err_addr_l; + uint32_t pci_err_addr_h; + uint32_t sio_ir; + uint32_t other_ir; + + /* These registers are read-only for general kernel code. 
*/ + uint32_t sio_ies_ro; + uint32_t other_ies_ro; + uint32_t sio_iec_ro; + uint32_t other_iec_ro; + uint32_t sio_cr; + uint32_t misc_fill1; + uint32_t int_out; + uint32_t misc_fill2; + uint32_t gpcr_s; + uint32_t gpcr_c; + uint32_t gpdr; + uint32_t misc_fill3; + uint32_t gppr_0; + uint32_t gppr_1; + uint32_t gppr_2; + uint32_t gppr_3; + uint32_t gppr_4; + uint32_t gppr_5; + uint32_t gppr_6; + uint32_t gppr_7; + } ioc4_mem; + + char misc_fill4[0x100 - 0x5C - 4]; + + /* ATA/ATAP registers */ + uint32_t ata_notused[9]; + char ata_fill1[0x140 - 0x120 - 4]; + uint32_t ata_notused1[8]; + char ata_fill2[0x200 - 0x15C - 4]; + + /* Keyboard and mouse registers */ + uint32_t km_notused[5];; + char km_fill1[0x300 - 0x210 - 4]; + + /* Serial port registers used for DMA serial I/O */ + struct ioc4_serial { + uint32_t sbbr01_l; + uint32_t sbbr01_h; + uint32_t sbbr23_l; + uint32_t sbbr23_h; + + struct ioc4_serialregs port_0; + struct ioc4_serialregs port_1; + struct ioc4_serialregs port_2; + struct ioc4_serialregs port_3; + struct ioc4_uartregs uart_0; + struct ioc4_uartregs uart_1; + struct ioc4_uartregs uart_2; + struct ioc4_uartregs uart_3; + } ioc4_serial; +}; + +/* UART clock speed */ +#define IOC4_SER_XIN_CLK IOC4_SER_XIN_CLK_66 +#define IOC4_SER_XIN_CLK_66 66666667 +#define IOC4_SER_XIN_CLK_33 33333333 + +#define IOC4_W_IES 0 +#define IOC4_W_IEC 1 + +typedef void ioc4_intr_func_f(void *, uint32_t); +typedef ioc4_intr_func_f *ioc4_intr_func_t; + +/* defining this will get you LOTS of great debug info */ +//#define DEBUG_INTERRUPTS +#define DPRINT_CONFIG(_x...) ; +//#define DPRINT_CONFIG(_x...) 
printk _x + +/* number of characters left in xmit buffer before we ask for more */ +#define WAKEUP_CHARS 256 + +/* number of characters we want to transmit to the lower level at a time */ +#define IOC4_MAX_CHARS 128 + +/* Device name we're using */ +#define DEVICE_NAME "ttyIOC" +#define DEVICE_MAJOR 204 +#define DEVICE_MINOR 50 + +/* register offsets */ +#define IOC4_SERIAL_OFFSET 0x300 + +/* flags for next_char_state */ +#define NCS_BREAK 0x1 +#define NCS_PARITY 0x2 +#define NCS_FRAMING 0x4 +#define NCS_OVERRUN 0x8 + +/* cause we need SOME parameters ... */ +#define MIN_BAUD_SUPPORTED 1200 +#define MAX_BAUD_SUPPORTED 115200 + +/* protocol types supported */ +enum sio_proto { + PROTO_RS232, + PROTO_RS422 +}; + +/* Notification types */ +#define N_DATA_READY 0x01 +#define N_OUTPUT_LOWAT 0x02 +#define N_BREAK 0x04 +#define N_PARITY_ERROR 0x08 +#define N_FRAMING_ERROR 0x10 +#define N_OVERRUN_ERROR 0x20 +#define N_DDCD 0x40 +#define N_DCTS 0x80 + +#define N_ALL_INPUT (N_DATA_READY | N_BREAK | \ + N_PARITY_ERROR | N_FRAMING_ERROR | \ + N_OVERRUN_ERROR | N_DDCD | N_DCTS) + +#define N_ALL_OUTPUT N_OUTPUT_LOWAT + +#define N_ALL_ERRORS (N_PARITY_ERROR | N_FRAMING_ERROR | N_OVERRUN_ERROR) + +#define N_ALL (N_DATA_READY | N_OUTPUT_LOWAT | N_BREAK | \ + N_PARITY_ERROR | N_FRAMING_ERROR | \ + N_OVERRUN_ERROR | N_DDCD | N_DCTS) + +#define SER_DIVISOR(_x, clk) (((clk) + (_x) * 8) / ((_x) * 16)) +#define DIVISOR_TO_BAUD(div, clk) ((clk) / 16 / (div)) + +/* Some masks */ +#define LCR_MASK_BITS_CHAR (UART_LCR_WLEN5 | UART_LCR_WLEN6 \ + | UART_LCR_WLEN7 | UART_LCR_WLEN8) +#define LCR_MASK_STOP_BITS (UART_LCR_STOP) + +#define PENDING(_p) (readl(&(_p)->ip_mem->sio_ir) & _p->ip_ienb) +#define READ_SIO_IR(_p) readl(&(_p)->ip_mem->sio_ir) + +/* Default to 4k buffers */ +#ifdef IOC4_1K_BUFFERS +#define RING_BUF_SIZE 1024 +#define IOC4_BUF_SIZE_BIT 0 +#define PROD_CONS_MASK IOC4_PROD_CONS_PTR_1K +#else +#define RING_BUF_SIZE 4096 +#define IOC4_BUF_SIZE_BIT IOC4_SBBR_L_SIZE +#define 
PROD_CONS_MASK IOC4_PROD_CONS_PTR_4K +#endif + +#define TOTAL_RING_BUF_SIZE (RING_BUF_SIZE * 4) + +/* + * This is the entry saved by the driver - one per card + */ +struct ioc4_control { + int ic_irq; + struct { + /* uart ports are allocated here */ + struct uart_port icp_uart_port; + /* Handy reference material */ + struct ioc4_port *icp_port; + } ic_port[IOC4_NUM_SERIAL_PORTS]; + struct ioc4_soft *ic_soft; +}; + +/* + * per-IOC4 data structure + */ +#define MAX_IOC4_INTR_ENTS (8 * sizeof(uint32_t)) +struct ioc4_soft { + struct ioc4_mem __iomem *is_ioc4_mem_addr; + struct ioc4_serial __iomem *is_ioc4_serial_addr; + + /* Each interrupt type has an entry in the array */ + struct ioc4_intr_type { + + /* + * Each in-use entry in this array contains at least + * one nonzero bit in sd_bits; no two entries in this + * array have overlapping sd_bits values. + */ + struct ioc4_intr_info { + uint32_t sd_bits; + ioc4_intr_func_f *sd_intr; + void *sd_info; + } is_intr_info[MAX_IOC4_INTR_ENTS]; + + /* Number of entries active in the above array */ + atomic_t is_num_intrs; + } is_intr_type[IOC4_NUM_INTR_TYPES]; + + /* is_ir_lock must be held while + * modifying sio_ie values, so + * we can be sure that sio_ie is + * not changing when we read it + * along with sio_ir. 
+ */ + spinlock_t is_ir_lock; /* SIO_IE[SC] mod lock */ +}; + +/* Local port info for each IOC4 serial ports */ +struct ioc4_port { + struct uart_port *ip_port; + /* Back ptrs for this port */ + struct ioc4_control *ip_control; + struct pci_dev *ip_pdev; + struct ioc4_soft *ip_ioc4_soft; + + /* pci mem addresses */ + struct ioc4_mem __iomem *ip_mem; + struct ioc4_serial __iomem *ip_serial; + struct ioc4_serialregs __iomem *ip_serial_regs; + struct ioc4_uartregs __iomem *ip_uart_regs; + + /* Ring buffer page for this port */ + dma_addr_t ip_dma_ringbuf; + /* vaddr of ring buffer */ + struct ring_buffer *ip_cpu_ringbuf; + + /* Rings for this port */ + struct ring *ip_inring; + struct ring *ip_outring; + + /* Hook to port specific values */ + struct hooks *ip_hooks; + + spinlock_t ip_lock; + + /* Various rx/tx parameters */ + int ip_baud; + int ip_tx_lowat; + int ip_rx_timeout; + + /* Copy of notification bits */ + int ip_notify; + + /* Shadow copies of various registers so we don't need to PIO + * read them constantly + */ + uint32_t ip_ienb; /* Enabled interrupts */ + uint32_t ip_sscr; + uint32_t ip_tx_prod; + uint32_t ip_rx_cons; + int ip_pci_bus_speed; + unsigned char ip_flags; +}; + +/* tx low water mark. We need to notify the driver whenever tx is getting + * close to empty so it can refill the tx buffer and keep things going. + * Let's assume that if we interrupt 1 ms before the tx goes idle, we'll + * have no trouble getting in more chars in time (I certainly hope so). + */ +#define TX_LOWAT_LATENCY 1000 +#define TX_LOWAT_HZ (1000000 / TX_LOWAT_LATENCY) +#define TX_LOWAT_CHARS(baud) (baud / 10 / TX_LOWAT_HZ) + +/* Flags per port */ +#define INPUT_HIGH 0x01 +#define DCD_ON 0x02 +#define LOWAT_WRITTEN 0x04 +#define READ_ABORTED 0x08 + +/* Since each port has different register offsets and bitmasks + * for everything, we'll store those that we need in tables so we + * don't have to be constantly checking the port we are dealing with. 
+ */ +struct hooks { + uint32_t intr_delta_dcd; + uint32_t intr_delta_cts; + uint32_t intr_tx_mt; + uint32_t intr_rx_timer; + uint32_t intr_rx_high; + uint32_t intr_tx_explicit; + uint32_t intr_dma_error; + uint32_t intr_clear; + uint32_t intr_all; + char rs422_select_pin; +}; + +static struct hooks hooks_array[IOC4_NUM_SERIAL_PORTS] = { + /* Values for port 0 */ + { + IOC4_SIO_IR_S0_DELTA_DCD, IOC4_SIO_IR_S0_DELTA_CTS, + IOC4_SIO_IR_S0_TX_MT, IOC4_SIO_IR_S0_RX_TIMER, + IOC4_SIO_IR_S0_RX_HIGH, IOC4_SIO_IR_S0_TX_EXPLICIT, + IOC4_OTHER_IR_S0_MEMERR, + (IOC4_SIO_IR_S0_TX_MT | IOC4_SIO_IR_S0_RX_FULL | + IOC4_SIO_IR_S0_RX_HIGH | IOC4_SIO_IR_S0_RX_TIMER | + IOC4_SIO_IR_S0_DELTA_DCD | IOC4_SIO_IR_S0_DELTA_CTS | + IOC4_SIO_IR_S0_INT | IOC4_SIO_IR_S0_TX_EXPLICIT), + IOC4_SIO_IR_S0, IOC4_GPPR_UART0_MODESEL_PIN, + }, + + /* Values for port 1 */ + { + IOC4_SIO_IR_S1_DELTA_DCD, IOC4_SIO_IR_S1_DELTA_CTS, + IOC4_SIO_IR_S1_TX_MT, IOC4_SIO_IR_S1_RX_TIMER, + IOC4_SIO_IR_S1_RX_HIGH, IOC4_SIO_IR_S1_TX_EXPLICIT, + IOC4_OTHER_IR_S1_MEMERR, + (IOC4_SIO_IR_S1_TX_MT | IOC4_SIO_IR_S1_RX_FULL | + IOC4_SIO_IR_S1_RX_HIGH | IOC4_SIO_IR_S1_RX_TIMER | + IOC4_SIO_IR_S1_DELTA_DCD | IOC4_SIO_IR_S1_DELTA_CTS | + IOC4_SIO_IR_S1_INT | IOC4_SIO_IR_S1_TX_EXPLICIT), + IOC4_SIO_IR_S1, IOC4_GPPR_UART1_MODESEL_PIN, + }, + + /* Values for port 2 */ + { + IOC4_SIO_IR_S2_DELTA_DCD, IOC4_SIO_IR_S2_DELTA_CTS, + IOC4_SIO_IR_S2_TX_MT, IOC4_SIO_IR_S2_RX_TIMER, + IOC4_SIO_IR_S2_RX_HIGH, IOC4_SIO_IR_S2_TX_EXPLICIT, + IOC4_OTHER_IR_S2_MEMERR, + (IOC4_SIO_IR_S2_TX_MT | IOC4_SIO_IR_S2_RX_FULL | + IOC4_SIO_IR_S2_RX_HIGH | IOC4_SIO_IR_S2_RX_TIMER | + IOC4_SIO_IR_S2_DELTA_DCD | IOC4_SIO_IR_S2_DELTA_CTS | + IOC4_SIO_IR_S2_INT | IOC4_SIO_IR_S2_TX_EXPLICIT), + IOC4_SIO_IR_S2, IOC4_GPPR_UART2_MODESEL_PIN, + }, + + /* Values for port 3 */ + { + IOC4_SIO_IR_S3_DELTA_DCD, IOC4_SIO_IR_S3_DELTA_CTS, + IOC4_SIO_IR_S3_TX_MT, IOC4_SIO_IR_S3_RX_TIMER, + IOC4_SIO_IR_S3_RX_HIGH, IOC4_SIO_IR_S3_TX_EXPLICIT, + IOC4_OTHER_IR_S3_MEMERR, + 
(IOC4_SIO_IR_S3_TX_MT | IOC4_SIO_IR_S3_RX_FULL | + IOC4_SIO_IR_S3_RX_HIGH | IOC4_SIO_IR_S3_RX_TIMER | + IOC4_SIO_IR_S3_DELTA_DCD | IOC4_SIO_IR_S3_DELTA_CTS | + IOC4_SIO_IR_S3_INT | IOC4_SIO_IR_S3_TX_EXPLICIT), + IOC4_SIO_IR_S3, IOC4_GPPR_UART3_MODESEL_PIN, + } +}; + +/* A ring buffer entry */ +struct ring_entry { + union { + struct { + uint32_t alldata; + uint32_t allsc; + } all; + struct { + char data[4]; /* data bytes */ + char sc[4]; /* status/control */ + } s; + } u; +}; + +/* Test the valid bits in any of the 4 sc chars using "allsc" member */ +#define RING_ANY_VALID \ + ((uint32_t)(IOC4_RXSB_MODEM_VALID | IOC4_RXSB_DATA_VALID) * 0x01010101) + +#define ring_sc u.s.sc +#define ring_data u.s.data +#define ring_allsc u.all.allsc + +/* Number of entries per ring buffer. */ +#define ENTRIES_PER_RING (RING_BUF_SIZE / (int) sizeof(struct ring_entry)) + +/* An individual ring */ +struct ring { + struct ring_entry entries[ENTRIES_PER_RING]; +}; + +/* The whole enchilada */ +struct ring_buffer { + struct ring TX_0_OR_2; + struct ring RX_0_OR_2; + struct ring TX_1_OR_3; + struct ring RX_1_OR_3; +}; + +/* Get a ring from a port struct */ +#define RING(_p, _wh) &(((struct ring_buffer *)((_p)->ip_cpu_ringbuf))->_wh) + +/* Infinite loop detection. 
+ */ +#define MAXITER 10000000 + +/* Prototypes */ +static void receive_chars(struct uart_port *); +static void handle_intr(void *arg, uint32_t sio_ir); + +/** + * write_ireg - write the interrupt regs + * @ioc4_soft: ptr to soft struct for this port + * @val: value to write + * @which: which register + * @type: which ireg set + */ +static inline void +write_ireg(struct ioc4_soft *ioc4_soft, uint32_t val, int which, int type) +{ + struct ioc4_mem __iomem *mem = ioc4_soft->is_ioc4_mem_addr; + unsigned long flags; + + spin_lock_irqsave(&ioc4_soft->is_ir_lock, flags); + + switch (type) { + case IOC4_SIO_INTR_TYPE: + switch (which) { + case IOC4_W_IES: + writel(val, &mem->sio_ies_ro); + break; + + case IOC4_W_IEC: + writel(val, &mem->sio_iec_ro); + break; + } + break; + + case IOC4_OTHER_INTR_TYPE: + switch (which) { + case IOC4_W_IES: + writel(val, &mem->other_ies_ro); + break; + + case IOC4_W_IEC: + writel(val, &mem->other_iec_ro); + break; + } + break; + + default: + break; + } + spin_unlock_irqrestore(&ioc4_soft->is_ir_lock, flags); +} + +/** + * set_baud - Baud rate setting code + * @port: port to set + * @baud: baud rate to use + */ +static int set_baud(struct ioc4_port *port, int baud) +{ + int actual_baud; + int diff; + int lcr; + unsigned short divisor; + struct ioc4_uartregs __iomem *uart; + + divisor = SER_DIVISOR(baud, port->ip_pci_bus_speed); + if (!divisor) + return 1; + actual_baud = DIVISOR_TO_BAUD(divisor, port->ip_pci_bus_speed); + + diff = actual_baud - baud; + if (diff < 0) + diff = -diff; + + /* If we're within 1%, we've found a match */ + if (diff * 100 > actual_baud) + return 1; + + uart = port->ip_uart_regs; + lcr = readb(&uart->i4u_lcr); + writeb(lcr | UART_LCR_DLAB, &uart->i4u_lcr); + writeb((unsigned char)divisor, &uart->i4u_dll); + writeb((unsigned char)(divisor >> 8), &uart->i4u_dlm); + writeb(lcr, &uart->i4u_lcr); + return 0; +} + + +/** + * get_ioc4_port - given a uart port, return the control structure + * @port: uart port + */ +static 
struct ioc4_port *get_ioc4_port(struct uart_port *the_port) +{ + struct ioc4_control *control = dev_get_drvdata(the_port->dev); + int ii; + + if (control) { + for ( ii = 0; ii < IOC4_NUM_SERIAL_PORTS; ii++ ) { + if (!control->ic_port[ii].icp_port) + continue; + if (the_port == control->ic_port[ii].icp_port->ip_port) + return control->ic_port[ii].icp_port; + } + } + return NULL; +} + +/* The IOC4 hardware provides no atomic way to determine if interrupts + * are pending since two reads are required to do so. The handler must + * read the SIO_IR and the SIO_IES, and take the logical and of the + * two. When this value is zero, all interrupts have been serviced and + * the handler may return. + * + * This has the unfortunate "hole" that, if some other CPU or + * some other thread or some higher level interrupt manages to + * modify SIO_IE between our reads of SIO_IR and SIO_IE, we may + * think we have observed SIO_IR&SIO_IE==0 when in fact this + * condition never really occurred. + * + * To solve this, we use a simple spinlock that must be held + * whenever modifying SIO_IE; holding this lock while observing + * both SIO_IR and SIO_IE guarantees that we do not falsely + * conclude that no enabled interrupts are pending. 
+ */ + +static inline uint32_t +pending_intrs(struct ioc4_soft *soft, int type) +{ + struct ioc4_mem __iomem *mem = soft->is_ioc4_mem_addr; + unsigned long flag; + uint32_t intrs = 0; + + BUG_ON(!((type == IOC4_SIO_INTR_TYPE) + || (type == IOC4_OTHER_INTR_TYPE))); + + spin_lock_irqsave(&soft->is_ir_lock, flag); + + switch (type) { + case IOC4_SIO_INTR_TYPE: + intrs = readl(&mem->sio_ir) & readl(&mem->sio_ies_ro); + break; + + case IOC4_OTHER_INTR_TYPE: + intrs = readl(&mem->other_ir) & readl(&mem->other_ies_ro); + + /* Don't process any ATA interrupte */ + intrs &= ~(IOC4_OTHER_IR_ATA_INT | IOC4_OTHER_IR_ATA_MEMERR); + break; + + default: + break; + } + spin_unlock_irqrestore(&soft->is_ir_lock, flag); + return intrs; +} + +/** + * port_init - Initialize the sio and ioc4 hardware for a given port + * called per port from attach... + * @port: port to initialize + */ +static int inline port_init(struct ioc4_port *port) +{ + uint32_t sio_cr; + struct hooks *hooks = port->ip_hooks; + struct ioc4_uartregs __iomem *uart; + + /* Idle the IOC4 serial interface */ + writel(IOC4_SSCR_RESET, &port->ip_serial_regs->sscr); + + /* Wait until any pending bus activity for this port has ceased */ + do + sio_cr = readl(&port->ip_mem->sio_cr); + while (!(sio_cr & IOC4_SIO_CR_SIO_DIAG_IDLE)); + + /* Finish reset sequence */ + writel(0, &port->ip_serial_regs->sscr); + + /* Once RESET is done, reload cached tx_prod and rx_cons values + * and set rings to empty by making prod == cons + */ + port->ip_tx_prod = readl(&port->ip_serial_regs->stcir) & PROD_CONS_MASK; + writel(port->ip_tx_prod, &port->ip_serial_regs->stpir); + port->ip_rx_cons = readl(&port->ip_serial_regs->srpir) & PROD_CONS_MASK; + writel(port->ip_rx_cons, &port->ip_serial_regs->srcir); + + /* Disable interrupts for this 16550 */ + uart = port->ip_uart_regs; + writeb(0, &uart->i4u_lcr); + writeb(0, &uart->i4u_ier); + + /* Set the default baud */ + set_baud(port, port->ip_baud); + + /* Set line control to 8 bits no parity */ + 
writeb(UART_LCR_WLEN8 | 0, &uart->i4u_lcr); + /* UART_LCR_STOP == 1 stop */ + + /* Enable the FIFOs */ + writeb(UART_FCR_ENABLE_FIFO, &uart->i4u_fcr); + /* then reset 16550 FIFOs */ + writeb(UART_FCR_ENABLE_FIFO | UART_FCR_CLEAR_RCVR | UART_FCR_CLEAR_XMIT, + &uart->i4u_fcr); + + /* Clear modem control register */ + writeb(0, &uart->i4u_mcr); + + /* Clear deltas in modem status register */ + readb(&uart->i4u_msr); + + /* Only do this once per port pair */ + if (port->ip_hooks == &hooks_array[0] + || port->ip_hooks == &hooks_array[2]) { + unsigned long ring_pci_addr; + uint32_t __iomem *sbbr_l; + uint32_t __iomem *sbbr_h; + + if (port->ip_hooks == &hooks_array[0]) { + sbbr_l = &port->ip_serial->sbbr01_l; + sbbr_h = &port->ip_serial->sbbr01_h; + } else { + sbbr_l = &port->ip_serial->sbbr23_l; + sbbr_h = &port->ip_serial->sbbr23_h; + } + + ring_pci_addr = (unsigned long __iomem)port->ip_dma_ringbuf; + DPRINT_CONFIG(("%s: ring_pci_addr 0x%lx\n", + __FUNCTION__, ring_pci_addr)); + + writel((unsigned int)((uint64_t)ring_pci_addr >> 32), sbbr_h); + writel((unsigned int)ring_pci_addr | IOC4_BUF_SIZE_BIT, sbbr_l); + } + + /* Set the receive timeout value to 10 msec */ + writel(IOC4_SRTR_HZ / 100, &port->ip_serial_regs->srtr); + + /* Set rx threshold, enable DMA */ + /* Set high water mark at 3/4 of full ring */ + port->ip_sscr = (ENTRIES_PER_RING * 3 / 4); + writel(port->ip_sscr, &port->ip_serial_regs->sscr); + + /* Disable and clear all serial related interrupt bits */ + write_ireg(port->ip_ioc4_soft, hooks->intr_clear, + IOC4_W_IEC, IOC4_SIO_INTR_TYPE); + port->ip_ienb &= ~hooks->intr_clear; + writel(hooks->intr_clear, &port->ip_mem->sio_ir); + return 0; +} + +/** + * handle_dma_error_intr - service any pending DMA error interrupts for the + * given port - 2nd level called via sd_intr + * @arg: handler arg + * @other_ir: ioc4regs + */ +static void handle_dma_error_intr(void *arg, uint32_t other_ir) +{ + struct ioc4_port *port = (struct ioc4_port *)arg; + struct hooks 
*hooks = port->ip_hooks; + unsigned int flags; + + spin_lock_irqsave(&port->ip_lock, flags); + + /* ACK the interrupt */ + writel(hooks->intr_dma_error, &port->ip_mem->other_ir); + + if (readl(&port->ip_mem->pci_err_addr_l) & IOC4_PCI_ERR_ADDR_VLD) { + printk(KERN_ERR + "PCI error address is 0x%lx, " + "master is serial port %c %s\n", + (((uint64_t)readl(&port->ip_mem->pci_err_addr_h) + << 32) + | readl(&port->ip_mem->pci_err_addr_l)) + & IOC4_PCI_ERR_ADDR_ADDR_MSK, '1' + + ((char)(readl(&port->ip_mem-> pci_err_addr_l) & + IOC4_PCI_ERR_ADDR_MST_NUM_MSK) >> 1), + (readl(&port->ip_mem->pci_err_addr_l) + & IOC4_PCI_ERR_ADDR_MST_TYP_MSK) + ? "RX" : "TX"); + + if (readl(&port->ip_mem->pci_err_addr_l) + & IOC4_PCI_ERR_ADDR_MUL_ERR) { + printk(KERN_ERR + "Multiple errors occurred\n"); + } + } + spin_unlock_irqrestore(&port->ip_lock, flags); + + /* Re-enable DMA error interrupts */ + write_ireg(port->ip_ioc4_soft, hooks->intr_dma_error, IOC4_W_IES, + IOC4_OTHER_INTR_TYPE); +} + +/** + * intr_connect - interrupt connect function + * @soft: soft struct for this card + * @type: interrupt type + * @intrbits: bit pattern to set + * @intr: handler function + * @info: handler arg + */ +static void +intr_connect(struct ioc4_soft *soft, int type, + uint32_t intrbits, ioc4_intr_func_f * intr, void *info) +{ + int i; + struct ioc4_intr_info *intr_ptr; + + BUG_ON(!((type == IOC4_SIO_INTR_TYPE) + || (type == IOC4_OTHER_INTR_TYPE))); + + i = atomic_inc(&soft-> is_intr_type[type].is_num_intrs) - 1; + BUG_ON(!(i < MAX_IOC4_INTR_ENTS || (printk("i %d\n", i), 0))); + + /* Save off the lower level interrupt handler */ + intr_ptr = &soft->is_intr_type[type].is_intr_info[i]; + intr_ptr->sd_bits = intrbits; + intr_ptr->sd_intr = intr; + intr_ptr->sd_info = info; +} + +/** + * ioc4_intr - Top level IOC4 interrupt handler. 
+ * @irq: irq value + * @arg: handler arg + * @regs: registers + */ +static irqreturn_t ioc4_intr(int irq, void *arg, struct pt_regs *regs) +{ + struct ioc4_soft *soft; + uint32_t this_ir, this_mir; + int xx, num_intrs = 0; + int intr_type; + int handled = 0; + struct ioc4_intr_info *ii; + + soft = arg; + for (intr_type = 0; intr_type < IOC4_NUM_INTR_TYPES; intr_type++) { + num_intrs = (int)atomic_read( + &soft->is_intr_type[intr_type].is_num_intrs); + + this_mir = this_ir = pending_intrs(soft, intr_type); + + /* Farm out the interrupt to the various drivers depending on + * which interrupt bits are set. + */ + for (xx = 0; xx < num_intrs; xx++) { + ii = &soft->is_intr_type[intr_type].is_intr_info[xx]; + if ((this_mir = this_ir & ii->sd_bits)) { + /* Disable owned interrupts, call handler */ + handled++; + write_ireg(soft, ii->sd_bits, IOC4_W_IEC, + intr_type); + ii->sd_intr(ii->sd_info, this_mir); + this_ir &= ~this_mir; + } + } + if (this_ir) { + printk(KERN_ERR + "unknown IOC4 %s interrupt 0x%x, sio_ir = 0x%x," + " sio_ies = 0x%x, other_ir = 0x%x :" + "other_ies = 0x%x\n", + (intr_type == IOC4_SIO_INTR_TYPE) ? "sio" : + "other", this_ir, + readl(&soft->is_ioc4_mem_addr->sio_ir), + readl(&soft->is_ioc4_mem_addr->sio_ies_ro), + readl(&soft->is_ioc4_mem_addr->other_ir), + readl(&soft->is_ioc4_mem_addr->other_ies_ro)); + } + } +#ifdef DEBUG_INTERRUPTS + { + struct ioc4_mem __iomem *mem = soft->is_ioc4_mem_addr; + spinlock_t *lp = &soft->is_ir_lock; + unsigned long flag; + + spin_lock_irqsave(&soft->is_ir_lock, flag); + printk ("%s : %d : mem 0x%p sio_ir 0x%x sio_ies_ro 0x%x " + "other_ir 0x%x other_ies_ro 0x%x mask 0x%x\n", + __FUNCTION__, __LINE__, + (void *)mem, readl(&mem->sio_ir), + readl(&mem->sio_ies_ro), + readl(&mem->other_ir), + readl(&mem->other_ies_ro), + IOC4_OTHER_IR_ATA_INT | IOC4_OTHER_IR_ATA_MEMERR); + spin_unlock_irqrestore(&soft->is_ir_lock, flag); + } +#endif + return handled ? 
IRQ_HANDLED : IRQ_NONE; +} + +/** + * ioc4_attach_local - Device initialization. + * Called at *_attach() time for each + * IOC4 with serial ports in the system. + * @control: ioc4_control ptr + * @pdev: PCI handle for this device + * @soft: soft struct for this device + * @ioc4: ioc4 mem space + */ +static int inline ioc4_attach_local(struct pci_dev *pdev, + struct ioc4_control *control, + struct ioc4_soft *soft, void __iomem *ioc4_mem, + void __iomem *ioc4_serial) +{ + struct ioc4_port *port; + struct ioc4_port *ports[IOC4_NUM_SERIAL_PORTS]; + int port_number; + uint16_t ioc4_revid_min = 62; + uint16_t ioc4_revid; + + /* IOC4 firmware must be at least rev 62 */ + pci_read_config_word(pdev, PCI_COMMAND_SPECIAL, &ioc4_revid); + + printk(KERN_INFO "IOC4 firmware revision %d\n", ioc4_revid); + if (ioc4_revid < ioc4_revid_min) { + printk(KERN_WARNING + "IOC4 serial not supported on firmware rev %d, " + "please upgrade to rev %d or higher\n", + ioc4_revid, ioc4_revid_min); + return -EPERM; + } + BUG_ON(ioc4_mem == NULL); + BUG_ON(ioc4_serial == NULL); + + /* Create port structures for each port */ + for (port_number = 0; port_number < IOC4_NUM_SERIAL_PORTS; + port_number++) { + port = kmalloc(sizeof(struct ioc4_port), GFP_KERNEL); + if (!port) { + printk(KERN_WARNING + "IOC4 serial memory not available for port\n"); + return -ENOMEM; + } + memset(port, 0, sizeof(struct ioc4_port)); + + /* we need to remember the previous ones, to point back to + * them farther down - setting up the ring buffers. + */ + ports[port_number] = port; + + /* Allocate buffers and jumpstart the hardware. 
*/ + control->ic_port[port_number].icp_port = port; + port->ip_ioc4_soft = soft; + port->ip_pdev = pdev; + port->ip_ienb = 0; + port->ip_pci_bus_speed = IOC4_SER_XIN_CLK; + port->ip_baud = 9600; + port->ip_control = control; + port->ip_mem = ioc4_mem; + port->ip_serial = ioc4_serial; + + /* point to the right hook */ + port->ip_hooks = &hooks_array[port_number]; + + /* Get direct hooks to the serial regs and uart regs + * for this port + */ + switch (port_number) { + case 0: + port->ip_serial_regs = &(port->ip_serial->port_0); + port->ip_uart_regs = &(port->ip_serial->uart_0); + break; + case 1: + port->ip_serial_regs = &(port->ip_serial->port_1); + port->ip_uart_regs = &(port->ip_serial->uart_1); + break; + case 2: + port->ip_serial_regs = &(port->ip_serial->port_2); + port->ip_uart_regs = &(port->ip_serial->uart_2); + break; + default: + case 3: + port->ip_serial_regs = &(port->ip_serial->port_3); + port->ip_uart_regs = &(port->ip_serial->uart_3); + break; + } + + /* ring buffers are 1 to a pair of ports */ + if (port_number && (port_number & 1)) { + /* odd use the evens buffer */ + port->ip_dma_ringbuf = + ports[port_number - 1]->ip_dma_ringbuf; + port->ip_cpu_ringbuf = + ports[port_number - 1]->ip_cpu_ringbuf; + port->ip_inring = RING(port, RX_1_OR_3); + port->ip_outring = RING(port, TX_1_OR_3); + + } else { + if (port->ip_dma_ringbuf == 0) { + port->ip_cpu_ringbuf = pci_alloc_consistent + (pdev, TOTAL_RING_BUF_SIZE, + &port->ip_dma_ringbuf); + + } + BUG_ON(!((((int64_t)port->ip_dma_ringbuf) & + (TOTAL_RING_BUF_SIZE - 1)) == 0)); + DPRINT_CONFIG(("%s : ip_cpu_ringbuf 0x%p " + "ip_dma_ringbuf 0x%p\n", + __FUNCTION__, + (void *)port->ip_cpu_ringbuf, + (void *)port->ip_dma_ringbuf)); + port->ip_inring = RING(port, RX_0_OR_2); + port->ip_outring = RING(port, TX_0_OR_2); + } + DPRINT_CONFIG(("%s : port %d [addr 0x%p] control 0x%p", + __FUNCTION__, + port_number, (void *)port, (void *)control)); + DPRINT_CONFIG((" ip_serial_regs 0x%p ip_uart_regs 0x%p\n", + (void 
*)port->ip_serial_regs, + (void *)port->ip_uart_regs)); + + /* Initialize the hardware for IOC4 */ + port_init(port); + + DPRINT_CONFIG(("%s: port_number %d port 0x%p inring 0x%p " + "outring 0x%p\n", + __FUNCTION__, + port_number, (void *)port, + (void *)port->ip_inring, + (void *)port->ip_outring)); + + /* Attach interrupt handlers */ + intr_connect(soft, IOC4_SIO_INTR_TYPE, + GET_SIO_IR(port_number), + handle_intr, port); + + intr_connect(soft, IOC4_OTHER_INTR_TYPE, + GET_OTHER_IR(port_number), + handle_dma_error_intr, port); + } + return 0; +} + +/** + * enable_intrs - enable interrupts + * @port: port to enable + * @mask: mask to use + */ +static void enable_intrs(struct ioc4_port *port, uint32_t mask) +{ + struct hooks *hooks = port->ip_hooks; + + if ((port->ip_ienb & mask) != mask) { + write_ireg(port->ip_ioc4_soft, mask, IOC4_W_IES, + IOC4_SIO_INTR_TYPE); + port->ip_ienb |= mask; + } + + if (port->ip_ienb) + write_ireg(port->ip_ioc4_soft, hooks->intr_dma_error, + IOC4_W_IES, IOC4_OTHER_INTR_TYPE); +} + +/** + * local_open - local open a port + * @port: port to open + */ +static inline int local_open(struct ioc4_port *port) +{ + int spiniter = 0; + + port->ip_flags = 0; + + /* Pause the DMA interface if necessary */ + if (port->ip_sscr & IOC4_SSCR_DMA_EN) { + writel(port->ip_sscr | IOC4_SSCR_DMA_PAUSE, + &port->ip_serial_regs->sscr); + while((readl(&port->ip_serial_regs-> sscr) + & IOC4_SSCR_PAUSE_STATE) == 0) { + spiniter++; + if (spiniter > MAXITER) { + return -1; + } + } + } + + /* Reset the input fifo. If the uart received chars while the port + * was closed and DMA is not enabled, the uart may have a bunch of + * chars hanging around in its rx fifo which will not be discarded + * by rclr in the upper layer. We must get rid of them here. 
+ */ + writeb(UART_FCR_ENABLE_FIFO | UART_FCR_CLEAR_RCVR, + &port->ip_uart_regs->i4u_fcr); + + writeb(UART_LCR_WLEN8, &port->ip_uart_regs->i4u_lcr); + /* UART_LCR_STOP == 1 stop */ + + /* Re-enable DMA, set default threshold to intr whenever there is + * data available. + */ + port->ip_sscr &= ~IOC4_SSCR_RX_THRESHOLD; + port->ip_sscr |= 1; /* default threshold */ + + /* Plug in the new sscr. This implicitly clears the DMA_PAUSE + * flag if it was set above + */ + writel(port->ip_sscr, &port->ip_serial_regs->sscr); + port->ip_tx_lowat = 1; + return 0; +} + +/** + * set_rx_timeout - Set rx timeout and threshold values. + * @port: port to use + * @timeout: timeout value in ticks + */ +static inline int set_rx_timeout(struct ioc4_port *port, int timeout) +{ + int threshold; + + port->ip_rx_timeout = timeout; + + /* Timeout is in ticks. Let's figure out how many chars we + * can receive at the current baud rate in that interval + * and set the rx threshold to that amount. There are 4 chars + * per ring entry, so we'll divide the number of chars that will + * arrive in timeout by 4. + */ + threshold = timeout * port->ip_baud / 10 / HZ / 4; + if (threshold == 0) + threshold = 1; /* otherwise we'll intr all the time! */ + + if ((unsigned)threshold > (unsigned)IOC4_SSCR_RX_THRESHOLD) + return 1; + + port->ip_sscr &= ~IOC4_SSCR_RX_THRESHOLD; + port->ip_sscr |= threshold; + + writel(port->ip_sscr, &port->ip_serial_regs->sscr); + + /* Now set the rx timeout to the given value */ + timeout = timeout * IOC4_SRTR_HZ / HZ; + if (timeout > IOC4_SRTR_CNT) + timeout = IOC4_SRTR_CNT; + + writel(timeout, &port->ip_serial_regs->srtr); + return 0; +} + +/** + * config_port - config the hardware + * @port: port to config + * @baud: baud rate for the port + * @byte_size: data size + * @stop_bits: number of stop bits + * @parenb: parity enable ? + * @parodd: odd parity ? 
+ */ +static inline int +config_port(struct ioc4_port *port, + int baud, int byte_size, int stop_bits, int parenb, int parodd) +{ + char lcr, sizebits; + int spiniter = 0; + + DPRINT_CONFIG(("%s: baud %d byte_size %d stop %d parenb %d parodd %d\n", + __FUNCTION__, baud, byte_size, stop_bits, parenb, parodd)); + + if (set_baud(port, baud)) + return 1; + + switch (byte_size) { + case 5: + sizebits = UART_LCR_WLEN5; + break; + case 6: + sizebits = UART_LCR_WLEN6; + break; + case 7: + sizebits = UART_LCR_WLEN7; + break; + case 8: + sizebits = UART_LCR_WLEN8; + break; + default: + return 1; + } + + /* Pause the DMA interface if necessary */ + if (port->ip_sscr & IOC4_SSCR_DMA_EN) { + writel(port->ip_sscr | IOC4_SSCR_DMA_PAUSE, + &port->ip_serial_regs->sscr); + while((readl(&port->ip_serial_regs->sscr) + & IOC4_SSCR_PAUSE_STATE) == 0) { + spiniter++; + if (spiniter > MAXITER) + return -1; + } + } + + /* Clear relevant fields in lcr */ + lcr = readb(&port->ip_uart_regs->i4u_lcr); + lcr &= ~(LCR_MASK_BITS_CHAR | UART_LCR_EPAR | + UART_LCR_PARITY | LCR_MASK_STOP_BITS); + + /* Set byte size in lcr */ + lcr |= sizebits; + + /* Set parity */ + if (parenb) { + lcr |= UART_LCR_PARITY; + if (!parodd) + lcr |= UART_LCR_EPAR; + } + + /* Set stop bits */ + if (stop_bits) + lcr |= UART_LCR_STOP /* 2 stop bits */ ; + + writeb(lcr, &port->ip_uart_regs->i4u_lcr); + + /* Re-enable the DMA interface if necessary */ + if (port->ip_sscr & IOC4_SSCR_DMA_EN) { + writel(port->ip_sscr, &port->ip_serial_regs->sscr); + } + port->ip_baud = baud; + + /* When we get within this number of ring entries of filling the + * entire ring on tx, place an EXPLICIT intr to generate a lowat + * notification when output has drained. + */ + port->ip_tx_lowat = (TX_LOWAT_CHARS(baud) + 3) / 4; + if (port->ip_tx_lowat == 0) + port->ip_tx_lowat = 1; + + set_rx_timeout(port, port->ip_rx_timeout); + + return 0; +} + +/** + * do_write - Write bytes to the port. Returns the number of bytes + * actually written. 
Called from transmit_chars + * @port: port to use + * @buf: the stuff to write + * @len: how many bytes in 'buf' + */ +static inline int do_write(struct ioc4_port *port, char *buf, int len) +{ + int prod_ptr, cons_ptr, total = 0; + struct ring *outring; + struct ring_entry *entry; + struct hooks *hooks = port->ip_hooks; + + BUG_ON(!(len >= 0)); + + prod_ptr = port->ip_tx_prod; + cons_ptr = readl(&port->ip_serial_regs->stcir) & PROD_CONS_MASK; + outring = port->ip_outring; + + /* Maintain a 1-entry red-zone. The ring buffer is full when + * (cons - prod) % ring_size is 1. Rather than do this subtraction + * in the body of the loop, I'll do it now. + */ + cons_ptr = (cons_ptr - (int)sizeof(struct ring_entry)) & PROD_CONS_MASK; + + /* Stuff the bytes into the output */ + while ((prod_ptr != cons_ptr) && (len > 0)) { + int xx; + + /* Get 4 bytes (one ring entry) at a time */ + entry = (struct ring_entry *)((caddr_t) outring + prod_ptr); + + /* Invalidate all entries */ + entry->ring_allsc = 0; + + /* Copy in some bytes */ + for (xx = 0; (xx < 4) && (len > 0); xx++) { + entry->ring_data[xx] = *buf++; + entry->ring_sc[xx] = IOC4_TXCB_VALID; + len--; + total++; + } + + /* If we are within some small threshold of filling up the + * entire ring buffer, we must place an EXPLICIT intr here + * to generate a lowat interrupt in case we subsequently + * really do fill up the ring and the caller goes to sleep. + * No need to place more than one though. 
+ */ + if (!(port->ip_flags & LOWAT_WRITTEN) && + ((cons_ptr - prod_ptr) & PROD_CONS_MASK) + <= port->ip_tx_lowat + * (int)sizeof(struct ring_entry)) { + port->ip_flags |= LOWAT_WRITTEN; + entry->ring_sc[0] |= IOC4_TXCB_INT_WHEN_DONE; + } + + /* Go on to next entry */ + prod_ptr += sizeof(struct ring_entry); + prod_ptr &= PROD_CONS_MASK; + } + + /* If we sent something, start DMA if necessary */ + if (total > 0 && !(port->ip_sscr & IOC4_SSCR_DMA_EN)) { + port->ip_sscr |= IOC4_SSCR_DMA_EN; + writel(port->ip_sscr, &port->ip_serial_regs->sscr); + } + + /* Store the new producer pointer. If tx is disabled, we stuff the + * data into the ring buffer, but we don't actually start tx. + */ + if (!uart_tx_stopped(port->ip_port)) { + writel(prod_ptr, &port->ip_serial_regs->stpir); + + /* If we are now transmitting, enable tx_mt interrupt so we + * can disable DMA if necessary when the tx finishes. + */ + if (total > 0) + enable_intrs(port, hooks->intr_tx_mt); + } + port->ip_tx_prod = prod_ptr; + return total; +} + +/** + * disable_intrs - disable interrupts + * @port: port to enable + * @mask: mask to use + */ +static void disable_intrs(struct ioc4_port *port, uint32_t mask) +{ + struct hooks *hooks = port->ip_hooks; + + if (port->ip_ienb & mask) { + write_ireg(port->ip_ioc4_soft, mask, IOC4_W_IEC, + IOC4_SIO_INTR_TYPE); + port->ip_ienb &= ~mask; + } + + if (!port->ip_ienb) + write_ireg(port->ip_ioc4_soft, hooks->intr_dma_error, + IOC4_W_IEC, IOC4_OTHER_INTR_TYPE); +} + +/** + * set_notification - Modify event notification + * @port: port to use + * @mask: events mask + * @set_on: set ? 
+ */ +static int set_notification(struct ioc4_port *port, int mask, int set_on) +{ + struct hooks *hooks = port->ip_hooks; + uint32_t intrbits, sscrbits; + + BUG_ON(!mask); + + intrbits = sscrbits = 0; + + if (mask & N_DATA_READY) + intrbits |= (hooks->intr_rx_timer | hooks->intr_rx_high); + if (mask & N_OUTPUT_LOWAT) + intrbits |= hooks->intr_tx_explicit; + if (mask & N_DDCD) { + intrbits |= hooks->intr_delta_dcd; + sscrbits |= IOC4_SSCR_RX_RING_DCD; + } + if (mask & N_DCTS) + intrbits |= hooks->intr_delta_cts; + + if (set_on) { + enable_intrs(port, intrbits); + port->ip_notify |= mask; + port->ip_sscr |= sscrbits; + } else { + disable_intrs(port, intrbits); + port->ip_notify &= ~mask; + port->ip_sscr &= ~sscrbits; + } + + /* We require DMA if either DATA_READY or DDCD notification is + * currently requested. If neither of these is requested and + * there is currently no tx in progress, DMA may be disabled. + */ + if (port->ip_notify & (N_DATA_READY | N_DDCD)) + port->ip_sscr |= IOC4_SSCR_DMA_EN; + else if (!(port->ip_ienb & hooks->intr_tx_mt)) + port->ip_sscr &= ~IOC4_SSCR_DMA_EN; + + writel(port->ip_sscr, &port->ip_serial_regs->sscr); + return 0; +} + +/** + * set_mcr - set the master control reg + * @the_port: port to use + * @set: set ? 
+ * @mask1: mcr mask + * @mask2: shadow mask + */ +static inline int set_mcr(struct uart_port *the_port, int set, + int mask1, int mask2) +{ + struct ioc4_port *port = get_ioc4_port(the_port); + uint32_t shadow; + int spiniter = 0; + char mcr; + + if (!port) + return -1; + + /* Pause the DMA interface if necessary */ + if (port->ip_sscr & IOC4_SSCR_DMA_EN) { + writel(port->ip_sscr | IOC4_SSCR_DMA_PAUSE, + &port->ip_serial_regs->sscr); + while ((readl(&port->ip_serial_regs->sscr) + & IOC4_SSCR_PAUSE_STATE) == 0) { + spiniter++; + if (spiniter > MAXITER) + return -1; + } + } + shadow = readl(&port->ip_serial_regs->shadow); + mcr = (shadow & 0xff000000) >> 24; + + /* Set new value */ + if (set) { + mcr |= mask1; + shadow |= mask2; + } else { + mcr &= ~mask1; + shadow &= ~mask2; + } + writeb(mcr, &port->ip_uart_regs->i4u_mcr); + writel(shadow, &port->ip_serial_regs->shadow); + + /* Re-enable the DMA interface if necessary */ + if (port->ip_sscr & IOC4_SSCR_DMA_EN) { + writel(port->ip_sscr, &port->ip_serial_regs->sscr); + } + return 0; +} + +/** + * ioc4_set_proto - set the protocol for the port + * @port: port to use + * @proto: protocol to use + */ +static int ioc4_set_proto(struct ioc4_port *port, enum sio_proto proto) +{ + struct hooks *hooks = port->ip_hooks; + + switch (proto) { + case PROTO_RS232: + /* Clear the appropriate GIO pin */ + writel(0, (&port->ip_mem->gppr_0 + + hooks->rs422_select_pin)); + break; + + case PROTO_RS422: + /* Set the appropriate GIO pin */ + writel(1, (&port->ip_mem->gppr_0 + + hooks->rs422_select_pin)); + break; + + default: + return 1; + } + return 0; +} + +/** + * transmit_chars - upper level write, called with ip_lock + * @the_port: port to write + */ +static void transmit_chars(struct uart_port *the_port) +{ + int xmit_count, tail, head; + int result; + char *start; + struct tty_struct *tty; + struct ioc4_port *port = get_ioc4_port(the_port); + struct uart_info *info; + + if (!the_port) + return; + if (!port) + return; + + info = 
the_port->info; + tty = info->tty; + + if (uart_circ_empty(&info->xmit) || uart_tx_stopped(the_port)) { + /* Nothing to do or hw stopped */ + set_notification(port, N_ALL_OUTPUT, 0); + return; + } + + head = info->xmit.head; + tail = info->xmit.tail; + start = (char *)&info->xmit.buf[tail]; + + /* write out all the data or until the end of the buffer */ + xmit_count = (head < tail) ? (UART_XMIT_SIZE - tail) : (head - tail); + if (xmit_count > 0) { + result = do_write(port, start, xmit_count); + if (result > 0) { + /* booking */ + xmit_count -= result; + the_port->icount.tx += result; + /* advance the pointers */ + tail += result; + tail &= UART_XMIT_SIZE - 1; + info->xmit.tail = tail; + start = (char *)&info->xmit.buf[tail]; + } + } + if (uart_circ_chars_pending(&info->xmit) < WAKEUP_CHARS) + uart_write_wakeup(the_port); + + if (uart_circ_empty(&info->xmit)) { + set_notification(port, N_OUTPUT_LOWAT, 0); + } else { + set_notification(port, N_OUTPUT_LOWAT, 1); + } +} + +/** + * ioc4_change_speed - change the speed of the port + * @the_port: port to change + * @new_termios: new termios settings + * @old_termios: old termios settings + */ +static void +ioc4_change_speed(struct uart_port *the_port, + struct termios *new_termios, struct termios *old_termios) +{ + struct ioc4_port *port = get_ioc4_port(the_port); + int baud, bits; + unsigned cflag, cval; + int new_parity = 0, new_parity_enable = 0, new_stop = 1, new_data = 8; + struct uart_info *info = the_port->info; + + cflag = new_termios->c_cflag; + + switch (cflag & CSIZE) { + case CS5: + new_data = 5; + cval = 0x00; + bits = 7; + break; + case CS6: + new_data = 6; + cval = 0x01; + bits = 8; + break; + case CS7: + new_data = 7; + cval = 0x02; + bits = 9; + break; + case CS8: + new_data = 8; + cval = 0x03; + bits = 10; + break; + default: + /* cuz we always need a default ... 
*/ + new_data = 5; + cval = 0x00; + bits = 7; + break; + } + if (cflag & CSTOPB) { + cval |= 0x04; + bits++; + new_stop = 1; + } + if (cflag & PARENB) { + cval |= UART_LCR_PARITY; + bits++; + new_parity_enable = 1; + } + if (cflag & PARODD) { + cval |= UART_LCR_EPAR; + new_parity = 1; + } + if (cflag & IGNPAR) { + cval &= ~UART_LCR_PARITY; + new_parity_enable = 0; + } + baud = uart_get_baud_rate(the_port, new_termios, old_termios, + MIN_BAUD_SUPPORTED, MAX_BAUD_SUPPORTED); + DPRINT_CONFIG(("%s: returned baud %d\n", __FUNCTION__, baud)); + + /* default is 9600 */ + if (!baud) + baud = 9600; + + if (!the_port->fifosize) + the_port->fifosize = IOC4_MAX_CHARS; + the_port->timeout = ((the_port->fifosize * HZ * bits) / (baud / 10)); + the_port->timeout += HZ / 50; /* Add .02 seconds of slop */ + + the_port->ignore_status_mask = N_ALL_INPUT; + + if (I_IGNPAR(info->tty)) + the_port->ignore_status_mask &= ~(N_PARITY_ERROR + | N_FRAMING_ERROR); + if (I_IGNBRK(info->tty)) { + the_port->ignore_status_mask &= ~N_BREAK; + if (I_IGNPAR(info->tty)) + the_port->ignore_status_mask &= ~N_OVERRUN_ERROR; + } + if (!(cflag & CREAD)) { + /* ignore everything */ + the_port->ignore_status_mask &= ~N_DATA_READY; + } + + if (cflag & CRTSCTS) + info->flags |= ASYNC_CTS_FLOW; + else + info->flags &= ~ASYNC_CTS_FLOW; + + /* Set the configuration and proper notification call */ + DPRINT_CONFIG(("%s : port 0x%p cflag 0%o " + "config_port(baud %d data %d stop %d p enable %d parity %d)," + " notification 0x%x\n", + __FUNCTION__, (void *)port, cflag, baud, new_data, new_stop, + new_parity_enable, new_parity, the_port->ignore_status_mask)); + + if ((config_port(port, baud, /* baud */ + new_data, /* byte size */ + new_stop, /* stop bits */ + new_parity_enable, /* set parity */ + new_parity)) >= 0) { /* parity 1==odd */ + set_notification(port, the_port->ignore_status_mask, 1); + } +} + +/** + * ic4_startup_local - Start up the serial port - returns >= 0 if no errors + * @the_port: Port to operate on + 
*/ +static inline int ic4_startup_local(struct uart_port *the_port) +{ + int retval = 0; + struct ioc4_port *port; + struct uart_info *info; + + if (!the_port) + return -1; + + port = get_ioc4_port(the_port); + if (!port) + return -1; + + info = the_port->info; + if (info->flags & UIF_INITIALIZED) { + return retval; + } + + if (info->tty) { + set_bit(TTY_IO_ERROR, &info->tty->flags); + clear_bit(TTY_IO_ERROR, &info->tty->flags); + if ((info->flags & ASYNC_SPD_MASK) == ASYNC_SPD_HI) + info->tty->alt_speed = 57600; + if ((info->flags & ASYNC_SPD_MASK) == ASYNC_SPD_VHI) + info->tty->alt_speed = 115200; + if ((info->flags & ASYNC_SPD_MASK) == ASYNC_SPD_SHI) + info->tty->alt_speed = 230400; + if ((info->flags & ASYNC_SPD_MASK) == ASYNC_SPD_WARP) + info->tty->alt_speed = 460800; + } + local_open(port); + + /* set the speed of the serial port */ + ioc4_change_speed(the_port, info->tty->termios, (struct termios *)0); + + /* enable hardware flow control - after ioc4_change_speed because + * ASYNC_CTS_FLOW is set there */ + if (info->flags & ASYNC_CTS_FLOW) { + port->ip_sscr |= IOC4_SSCR_HFC_EN; + writel(port->ip_sscr, &port->ip_serial_regs->sscr); + } + info->flags |= UIF_INITIALIZED; + return 0; +} + +/* + * ioc4_cb_output_lowat - called when the output low water mark is hit + * @port: port to output + */ +static void ioc4_cb_output_lowat(struct ioc4_port *port) +{ + /* ip_lock is set on the call here */ + if (port->ip_port) { + transmit_chars(port->ip_port); + } +} + + +/** + * handle_intr - service any interrupts for the given port - 2nd level + * called via sd_intr + * @arg: handler arg + * @sio_ir: ioc4regs + */ +static void handle_intr(void *arg, uint32_t sio_ir) +{ + struct ioc4_port *port = (struct ioc4_port *)arg; + struct hooks *hooks = port->ip_hooks; + unsigned int rx_high_rd_aborted = 0; + unsigned int flags; + struct uart_port *the_port; + int loop_counter; + + /* Possible race condition here: The tx_mt interrupt bit may be + * cleared without the intervention 
of the interrupt handler, + * e.g. by a write. If the top level interrupt handler reads a + * tx_mt, then some other processor does a write, starting up + * output, then we come in here, see the tx_mt and stop DMA, the + * output started by the other processor will hang. Thus we can + * only rely on tx_mt being legitimate if it is read while the + * port lock is held. Therefore this bit must be ignored in the + * passed in interrupt mask which was read by the top level + * interrupt handler since the port lock was not held at the time + * it was read. We can only rely on this bit being accurate if it + * is read while the port lock is held. So we'll clear it for now, + * and reload it later once we have the port lock. + */ + sio_ir &= ~(hooks->intr_tx_mt); + + spin_lock_irqsave(&port->ip_lock, flags); + + loop_counter = MAXITER; /* to avoid hangs */ + + do { + uint32_t shadow; + + if ( loop_counter-- <= 0 ) { + printk(KERN_WARNING "IOC4 serial: " + "possible hang condition/" + "port stuck on interrupt.\n"); + break; + } + + /* Handle a DCD change */ + if (sio_ir & hooks->intr_delta_dcd) { + /* ACK the interrupt */ + writel(hooks->intr_delta_dcd, + &port->ip_mem->sio_ir); + + shadow = readl(&port->ip_serial_regs->shadow); + + if ((port->ip_notify & N_DDCD) + && (shadow & IOC4_SHADOW_DCD) + && (port->ip_port)) { + the_port = port->ip_port; + the_port->icount.dcd = 1; + wake_up_interruptible + (&the_port-> info->delta_msr_wait); + } else if ((port->ip_notify & N_DDCD) + && !(shadow & IOC4_SHADOW_DCD)) { + /* Flag delta DCD/no DCD */ + port->ip_flags |= DCD_ON; + } + } + + /* Handle a CTS change */ + if (sio_ir & hooks->intr_delta_cts) { + /* ACK the interrupt */ + writel(hooks->intr_delta_cts, + &port->ip_mem->sio_ir); + + shadow = readl(&port->ip_serial_regs->shadow); + + if ((port->ip_notify & N_DCTS) + && (port->ip_port)) { + the_port = port->ip_port; + the_port->icount.cts = + (shadow & IOC4_SHADOW_CTS) ? 
1 : 0; + wake_up_interruptible + (&the_port->info->delta_msr_wait); + } + } + + /* rx timeout interrupt. Must be some data available. Put this + * before the check for rx_high since servicing this condition + * may cause that condition to clear. + */ + if (sio_ir & hooks->intr_rx_timer) { + /* ACK the interrupt */ + writel(hooks->intr_rx_timer, + &port->ip_mem->sio_ir); + + if ((port->ip_notify & N_DATA_READY) + && (port->ip_port)) { + /* ip_lock is set on call here */ + receive_chars(port->ip_port); + } + } + + /* rx high interrupt. Must be after rx_timer. */ + else if (sio_ir & hooks->intr_rx_high) { + /* Data available, notify upper layer */ + if ((port->ip_notify & N_DATA_READY) + && port->ip_port) { + /* ip_lock is set on call here */ + receive_chars(port->ip_port); + } + + /* We can't ACK this interrupt. If receive_chars didn't + * cause the condition to clear, we'll have to disable + * the interrupt until the data is drained. + * If the read was aborted, don't disable the interrupt + * as this may cause us to hang indefinitely. An + * aborted read generally means that this interrupt + * hasn't been delivered to the cpu yet anyway, even + * though we see it as asserted when we read the sio_ir. + */ + if ((sio_ir = PENDING(port)) & hooks->intr_rx_high) { + if ((port->ip_flags & READ_ABORTED) == 0) { + port->ip_ienb &= ~hooks->intr_rx_high; + port->ip_flags |= INPUT_HIGH; + } else { + rx_high_rd_aborted++; + } + } + } + + /* We got a low water interrupt: notify upper layer to + * send more data. Must come before tx_mt since servicing + * this condition may cause that condition to clear. + */ + if (sio_ir & hooks->intr_tx_explicit) { + port->ip_flags &= ~LOWAT_WRITTEN; + + /* ACK the interrupt */ + writel(hooks->intr_tx_explicit, + &port->ip_mem->sio_ir); + + if (port->ip_notify & N_OUTPUT_LOWAT) + ioc4_cb_output_lowat(port); + } + + /* Handle tx_mt. Must come after tx_explicit. 
*/ + else if (sio_ir & hooks->intr_tx_mt) { + /* If we are expecting a lowat notification + * and we get to this point it probably means that for + * some reason the tx_explicit didn't work as expected + * (that can legitimately happen if the output buffer is + * filled up in just the right way). + * So send the notification now. + */ + if (port->ip_notify & N_OUTPUT_LOWAT) { + ioc4_cb_output_lowat(port); + + /* We need to reload the sio_ir since the lowat + * call may have caused another write to occur, + * clearing the tx_mt condition. + */ + sio_ir = PENDING(port); + } + + /* If the tx_mt condition still persists even after the + * lowat call, we've got some work to do. + */ + if (sio_ir & hooks->intr_tx_mt) { + + /* If we are not currently expecting DMA input, + * and the transmitter has just gone idle, + * there is no longer any reason for DMA, so + * disable it. + */ + if (!(port->ip_notify + & (N_DATA_READY | N_DDCD))) { + BUG_ON(!(port->ip_sscr + & IOC4_SSCR_DMA_EN)); + port->ip_sscr &= ~IOC4_SSCR_DMA_EN; + writel(port->ip_sscr, + &port->ip_serial_regs->sscr); + } + + /* Prevent infinite tx_mt interrupt */ + port->ip_ienb &= ~hooks->intr_tx_mt; + } + } + sio_ir = PENDING(port); + + /* if the read was aborted and only hooks->intr_rx_high, + * clear hooks->intr_rx_high, so we do not loop forever. + */ + + if (rx_high_rd_aborted && (sio_ir == hooks->intr_rx_high)) { + sio_ir &= ~hooks->intr_rx_high; + } + } while (sio_ir & hooks->intr_all); + + spin_unlock_irqrestore(&port->ip_lock, flags); + + /* Re-enable interrupts before returning from interrupt handler. + * Getting interrupted here is okay. It'll just v() our semaphore, and + * we'll come through the loop again. 
+ */ + + write_ireg(port->ip_ioc4_soft, port->ip_ienb, IOC4_W_IES, + IOC4_SIO_INTR_TYPE); +} + +/* + * ioc4_cb_post_ncs - called for some basic errors + * @port: port to use + * @ncs: event + */ +static void ioc4_cb_post_ncs(struct uart_port *the_port, int ncs) +{ + struct uart_icount *icount; + + icount = &the_port->icount; + + if (ncs & NCS_BREAK) + icount->brk++; + if (ncs & NCS_FRAMING) + icount->frame++; + if (ncs & NCS_OVERRUN) + icount->overrun++; + if (ncs & NCS_PARITY) + icount->parity++; +} + +/** + * do_read - Read in bytes from the port. Return the number of bytes + * actually read. + * @the_port: port to use + * @buf: place to put the stuff we read + * @len: how big 'buf' is + */ + +static inline int do_read(struct uart_port *the_port, unsigned char *buf, + int len) +{ + int prod_ptr, cons_ptr, total; + struct ioc4_port *port = get_ioc4_port(the_port); + struct ring *inring; + struct ring_entry *entry; + struct hooks *hooks = port->ip_hooks; + int byte_num; + char *sc; + int loop_counter; + + BUG_ON(!(len >= 0)); + BUG_ON(!port); + + /* There is a nasty timing issue in the IOC4. When the rx_timer + * expires or the rx_high condition arises, we take an interrupt. + * At some point while servicing the interrupt, we read bytes from + * the ring buffer and re-arm the rx_timer. However the rx_timer is + * not started until the first byte is received *after* it is armed, + * and any bytes pending in the rx construction buffers are not drained + * to memory until either there are 4 bytes available or the rx_timer + * expires. This leads to a potential situation where data is left + * in the construction buffers forever - 1 to 3 bytes were received + * after the interrupt was generated but before the rx_timer was + * re-armed. At that point as long as no subsequent bytes are received + * the timer will never be started and the bytes will remain in the + * construction buffer forever. The solution is to execute a DRAIN + * command after rearming the timer. 
This way any bytes received before + * the DRAIN will be drained to memory, and any bytes received after + * the DRAIN will start the TIMER and be drained when it expires. + * Luckily, this only needs to be done when the DMA buffer is empty + * since there is no requirement that this function return all + * available data as long as it returns some. + */ + /* Re-arm the timer */ + writel(port->ip_rx_cons | IOC4_SRCIR_ARM, + &port->ip_serial_regs->srcir); + + prod_ptr = readl(&port->ip_serial_regs->srpir) & PROD_CONS_MASK; + cons_ptr = port->ip_rx_cons; + + if (prod_ptr == cons_ptr) { + int reset_dma = 0; + + /* Input buffer appears empty, do a flush. */ + + /* DMA must be enabled for this to work. */ + if (!(port->ip_sscr & IOC4_SSCR_DMA_EN)) { + port->ip_sscr |= IOC4_SSCR_DMA_EN; + reset_dma = 1; + } + + /* Potential race condition: we must reload the srpir after + * issuing the drain command, otherwise we could think the rx + * buffer is empty, then take a very long interrupt, and when + * we come back it's full and we wait forever for the drain to + * complete. + */ + writel(port->ip_sscr | IOC4_SSCR_RX_DRAIN, + &port->ip_serial_regs->sscr); + prod_ptr = readl(&port->ip_serial_regs->srpir) + & PROD_CONS_MASK; + + /* We must not wait for the DRAIN to complete unless there are + * at least 8 bytes (2 ring entries) available to receive the + * data otherwise the DRAIN will never complete and we'll + * deadlock here. + * In fact, to make things easier, I'll just ignore the flush if + * there is any data at all now available. + */ + if (prod_ptr == cons_ptr) { + loop_counter = 0; + while (readl(&port->ip_serial_regs->sscr) & + IOC4_SSCR_RX_DRAIN) { + loop_counter++; + if (loop_counter > MAXITER) + return -1; + } + + /* SIGH. 
We have to reload the prod_ptr *again* since + * the drain may have caused it to change + */ + prod_ptr = readl(&port->ip_serial_regs->srpir) + & PROD_CONS_MASK; + } + if (reset_dma) { + port->ip_sscr &= ~IOC4_SSCR_DMA_EN; + writel(port->ip_sscr, &port->ip_serial_regs->sscr); + } + } + inring = port->ip_inring; + port->ip_flags &= ~READ_ABORTED; + + total = 0; + loop_counter = 0xfffff; /* to avoid hangs */ + + /* Grab bytes from the hardware */ + while ((prod_ptr != cons_ptr) && (len > 0)) { + entry = (struct ring_entry *)((caddr_t)inring + cons_ptr); + + if ( loop_counter-- <= 0 ) { + printk(KERN_WARNING "IOC4 serial: " + "possible hang condition/" + "port stuck on read.\n"); + break; + } + + /* According to the producer pointer, this ring entry + * must contain some data. But if the PIO happened faster + * than the DMA, the data may not be available yet, so let's + * wait until it arrives. + */ + if ((entry->ring_allsc & RING_ANY_VALID) == 0) { + /* Indicate the read is aborted so we don't disable + * the interrupt thinking that the consumer is + * congested. + */ + port->ip_flags |= READ_ABORTED; + len = 0; + break; + } + + /* Load the bytes/status out of the ring entry */ + for (byte_num = 0; byte_num < 4 && len > 0; byte_num++) { + sc = &(entry->ring_sc[byte_num]); + + /* Check for change in modem state or overrun */ + if ((*sc & IOC4_RXSB_MODEM_VALID) + && (port->ip_notify & N_DDCD)) { + /* Notify upper layer if DCD dropped */ + + if ((port->ip_flags & DCD_ON) + && !(*sc & IOC4_RXSB_DCD)) { + + /* If we have already copied some data, + * return it. We'll pick up the carrier + * drop on the next pass. That way we + * don't throw away the data that has + * already been copied back to + * the caller's buffer. + */ + if (total > 0) { + len = 0; + break; + } + port->ip_flags &= ~DCD_ON; + + /* Turn off this notification so the + * carrier drop protocol won't see it + * again when it does a read. 
+ */ + *sc &= ~IOC4_RXSB_MODEM_VALID; + + /* To keep things consistent, we need + * to update the consumer pointer so + * the next reader won't come in and + * try to read the same ring entries + * again. This must be done here before + * the dcd change. + */ + + if ((entry->ring_allsc & RING_ANY_VALID) + == 0) { + cons_ptr += (int)sizeof + (struct ring_entry); + cons_ptr &= PROD_CONS_MASK; + } + writel(cons_ptr, + &port->ip_serial_regs->srcir); + port->ip_rx_cons = cons_ptr; + + /* Notify upper layer of carrier drop */ + if ((port->ip_notify & N_DDCD) + && port->ip_port) { + the_port->icount.dcd = 0; + wake_up_interruptible + (&the_port->info-> + delta_msr_wait); + } + + /* If we had any data to return, we + * would have returned it above. + */ + return 0; + } + } + if (*sc & IOC4_RXSB_MODEM_VALID) { + /* Notify that an input overrun occurred */ + if ((*sc & IOC4_RXSB_OVERRUN) + && (port->ip_notify & N_OVERRUN_ERROR)) { + ioc4_cb_post_ncs(the_port, NCS_OVERRUN); + } + /* Don't look at this byte again */ + *sc &= ~IOC4_RXSB_MODEM_VALID; + } + + /* Check for valid data or RX errors */ + if ((*sc & IOC4_RXSB_DATA_VALID) && + ((*sc & (IOC4_RXSB_PAR_ERR + | IOC4_RXSB_FRAME_ERR + | IOC4_RXSB_BREAK)) + && (port->ip_notify & (N_PARITY_ERROR + | N_FRAMING_ERROR + | N_BREAK)))) { + /* There is an error condition on the next byte. + * If we have already transferred some bytes, + * we'll stop here. Otherwise if this is the + * first byte to be read, we'll just transfer + * it alone after notifying the + * upper layer of its status. 
+ */ + if (total > 0) { + len = 0; + break; + } else { + if ((*sc & IOC4_RXSB_PAR_ERR) && + (port->ip_notify & N_PARITY_ERROR)) { + ioc4_cb_post_ncs(the_port, + NCS_PARITY); + } + if ((*sc & IOC4_RXSB_FRAME_ERR) && + (port->ip_notify & N_FRAMING_ERROR)){ + ioc4_cb_post_ncs(the_port, + NCS_FRAMING); + } + if ((*sc & IOC4_RXSB_BREAK) + && (port->ip_notify & N_BREAK)) { + ioc4_cb_post_ncs + (the_port, + NCS_BREAK); + } + len = 1; + } + } + if (*sc & IOC4_RXSB_DATA_VALID) { + *sc &= ~IOC4_RXSB_DATA_VALID; + *buf = entry->ring_data[byte_num]; + buf++; + len--; + total++; + } + } + + /* If we used up this entry entirely, go on to the next one, + * otherwise we must have run out of buffer space, so + * leave the consumer pointer here for the next read in case + * there are still unread bytes in this entry. + */ + if ((entry->ring_allsc & RING_ANY_VALID) == 0) { + cons_ptr += (int)sizeof(struct ring_entry); + cons_ptr &= PROD_CONS_MASK; + } + } + + /* Update consumer pointer and re-arm rx timer interrupt */ + writel(cons_ptr, &port->ip_serial_regs->srcir); + port->ip_rx_cons = cons_ptr; + + /* If we have now dipped below the rx high water mark and we have + * rx_high interrupt turned off, we can now turn it back on again. + */ + if ((port->ip_flags & INPUT_HIGH) && (((prod_ptr - cons_ptr) + & PROD_CONS_MASK) < ((port->ip_sscr & + IOC4_SSCR_RX_THRESHOLD) + << IOC4_PROD_CONS_PTR_OFF))) { + port->ip_flags &= ~INPUT_HIGH; + enable_intrs(port, hooks->intr_rx_high); + } + return total; +} +/** + * receive_chars - upper level read. Called with ip_lock. 
+ * @the_port: port to read from + */ +static void receive_chars(struct uart_port *the_port) +{ + struct tty_struct *tty; + unsigned char ch[IOC4_MAX_CHARS]; + int read_count, request_count; + struct uart_icount *icount; + struct uart_info *info = the_port->info; + + /* Make sure all the pointers are "good" ones */ + if (!info) + return; + if (!info->tty) + return; + + tty = info->tty; + + request_count = TTY_FLIPBUF_SIZE - tty->flip.count - 1; + + if (request_count > 0) { + if (request_count > IOC4_MAX_CHARS - 2) + request_count = IOC4_MAX_CHARS - 2; + icount = &the_port->icount; + read_count = do_read(the_port, ch, request_count); + if (read_count > 0) { + memcpy(tty->flip.char_buf_ptr, ch, read_count); + memset(tty->flip.flag_buf_ptr, TTY_NORMAL, read_count); + tty->flip.char_buf_ptr += read_count; + tty->flip.flag_buf_ptr += read_count; + tty->flip.count += read_count; + icount->rx += read_count; + } + } + tty_flip_buffer_push(tty); +} + +/** + * ic4_type - What type of console are we? + * @port: Port to operate with (we ignore since we only have one port) + * + */ +static const char *ic4_type(struct uart_port *the_port) +{ + return "SGI IOC4 Serial"; +} + +/** + * ic4_tx_empty - Is the transmitter empty? 
 We pretend we're always empty
+ * @port: Port to operate on (we ignore since we always return 1)
+ *
+ */
+static unsigned int ic4_tx_empty(struct uart_port *the_port)
+{
+	return 1;
+}
+
+/**
+ * ic4_stop_tx - stop the transmitter
+ * @port: Port to operate on
+ * @tty_stop: Set to 1 if called via uart_stop
+ *
+ */
+static void ic4_stop_tx(struct uart_port *the_port, unsigned int tty_stop)
+{
+}
+
+/**
+ * null_void_function -
+ * @port: Port to operate on
+ *
+ */
+static void null_void_function(struct uart_port *the_port)
+{
+}
+
+/**
+ * ic4_shutdown - shut down the port - free irq and disable
+ * @port: Port to shut down
+ *
+ */
+static void ic4_shutdown(struct uart_port *the_port)
+{
+	unsigned long port_flags;
+	struct ioc4_port *port;
+	struct uart_info *info;
+
+	port = get_ioc4_port(the_port);
+	if (!port)
+		return;
+
+	info = the_port->info;
+
+	if (!(info->flags & UIF_INITIALIZED))
+		return;
+
+	wake_up_interruptible(&info->delta_msr_wait);
+
+	if (info->tty)
+		set_bit(TTY_IO_ERROR, &info->tty->flags);
+
+	spin_lock_irqsave(&port->ip_lock, port_flags);
+	set_notification(port, N_ALL, 0);
+	info->flags &= ~UIF_INITIALIZED;
+	spin_unlock_irqrestore(&port->ip_lock, port_flags);
+}
+
+/**
+ * ic4_set_mctrl - set control lines (dtr, rts, etc)
+ * @port: Port to operate on
+ * @mctrl: Lines to set/unset
+ *
+ */
+static void ic4_set_mctrl(struct uart_port *the_port, unsigned int mctrl)
+{
+	unsigned char mcr = 0;
+
+	if (mctrl & TIOCM_RTS)
+		mcr |= UART_MCR_RTS;
+	if (mctrl & TIOCM_DTR)
+		mcr |= UART_MCR_DTR;
+	if (mctrl & TIOCM_OUT1)
+		mcr |= UART_MCR_OUT1;
+	if (mctrl & TIOCM_OUT2)
+		mcr |= UART_MCR_OUT2;
+	if (mctrl & TIOCM_LOOP)
+		mcr |= UART_MCR_LOOP;
+
+	set_mcr(the_port, 1, mcr, IOC4_SHADOW_DTR);
+}
+
+/**
+ * ic4_get_mctrl - get control line info
+ * @port: port to operate on
+ *
+ */
+static unsigned int ic4_get_mctrl(struct uart_port *the_port)
+{
+	struct ioc4_port *port = get_ioc4_port(the_port);
+	uint32_t shadow;
+	unsigned int ret = 0;
+
+	if
(!port) + return 0; + + shadow = readl(&port->ip_serial_regs->shadow); + if (shadow & IOC4_SHADOW_DCD) + ret |= TIOCM_CAR; + if (shadow & IOC4_SHADOW_DR) + ret |= TIOCM_DSR; + if (shadow & IOC4_SHADOW_CTS) + ret |= TIOCM_CTS; + return ret; +} + +/** + * ic4_start_tx - Start transmitter, flush any output + * @port: Port to operate on + * @tty_stop: Set to 1 if called via uart_start + * + */ +static void ic4_start_tx(struct uart_port *the_port, unsigned int tty_stop) +{ + struct ioc4_port *port = get_ioc4_port(the_port); + unsigned long flags; + + if (port) { + spin_lock_irqsave(&port->ip_lock, flags); + transmit_chars(the_port); + spin_unlock_irqrestore(&port->ip_lock, flags); + } +} + +/** + * ic4_break_ctl - handle breaks + * @port: Port to operate on + * @break_state: Break state + * + */ +static void ic4_break_ctl(struct uart_port *the_port, int break_state) +{ +} + +/** + * ic4_startup - Start up the serial port - always return 0 (We're always on) + * @port: Port to operate on + * + */ +static int ic4_startup(struct uart_port *the_port) +{ + int retval; + struct ioc4_port *port; + struct ioc4_control *control; + struct uart_info *info; + unsigned long port_flags; + + if (!the_port) { + return -ENODEV; + } + port = get_ioc4_port(the_port); + if (!port) { + return -ENODEV; + } + info = the_port->info; + + control = port->ip_control; + if (!control) { + return -ENODEV; + } + + /* Start up the serial port */ + spin_lock_irqsave(&port->ip_lock, port_flags); + retval = ic4_startup_local(the_port); + spin_unlock_irqrestore(&port->ip_lock, port_flags); + return retval; +} + +/** + * ic4_set_termios - set termios stuff + * @port: port to operate on + * @termios: New settings + * @termios: Old + * + */ +static void +ic4_set_termios(struct uart_port *the_port, + struct termios *termios, struct termios *old_termios) +{ + struct ioc4_port *port = get_ioc4_port(the_port); + unsigned long port_flags; + + spin_lock_irqsave(&port->ip_lock, port_flags); + 
ioc4_change_speed(the_port, termios, old_termios); + spin_unlock_irqrestore(&port->ip_lock, port_flags); +} + +/** + * ic4_request_port - allocate resources for port - no op.... + * @port: port to operate on + * + */ +static int ic4_request_port(struct uart_port *port) +{ + return 0; +} + +/* Associate the uart functions above - given to serial core */ + +static struct uart_ops ioc4_ops = { + .tx_empty = ic4_tx_empty, + .set_mctrl = ic4_set_mctrl, + .get_mctrl = ic4_get_mctrl, + .stop_tx = ic4_stop_tx, + .start_tx = ic4_start_tx, + .stop_rx = null_void_function, + .enable_ms = null_void_function, + .break_ctl = ic4_break_ctl, + .startup = ic4_startup, + .shutdown = ic4_shutdown, + .set_termios = ic4_set_termios, + .type = ic4_type, + .release_port = null_void_function, + .request_port = ic4_request_port, +}; + +/* + * Boot-time initialization code + */ + +static struct uart_driver ioc4_uart = { + .owner = THIS_MODULE, + .driver_name = "ioc4_serial", + .dev_name = DEVICE_NAME, + .major = DEVICE_MAJOR, + .minor = DEVICE_MINOR, + .nr = IOC4_NUM_CARDS * IOC4_NUM_SERIAL_PORTS, +}; + +/** + * ioc4_serial_core_attach - register with serial core + * This is done during pci probing + * @pdev: handle for this card + */ +static inline int +ioc4_serial_core_attach(struct pci_dev *pdev) +{ + struct ioc4_port *port; + struct uart_port *the_port; + struct ioc4_control *control = pci_get_drvdata(pdev); + int ii; + + DPRINT_CONFIG(("%s: attach pdev 0x%p - control 0x%p\n", + __FUNCTION__, pdev, (void *)control)); + + if (!control) + return -ENODEV; + + /* once around for each port on this card */ + for (ii = 0; ii < IOC4_NUM_SERIAL_PORTS; ii++) { + the_port = &control->ic_port[ii].icp_uart_port; + port = control->ic_port[ii].icp_port; + port->ip_port = the_port; + + DPRINT_CONFIG(("%s: attach the_port 0x%p / port 0x%p\n", + __FUNCTION__, (void *)the_port, + (void *)port)); + + the_port->lock = SPIN_LOCK_UNLOCKED; + /* membase, iobase and mapbase just need to be non-0 */ + 
the_port->membase = (unsigned char __iomem *)1; + the_port->line = the_port->iobase = ii; + the_port->mapbase = 1; + the_port->type = PORT_16550A; + the_port->fifosize = IOC4_MAX_CHARS; + the_port->ops = &ioc4_ops; + the_port->irq = control->ic_irq; + the_port->dev = &pdev->dev; + if (uart_add_one_port(&ioc4_uart, the_port) < 0) { + printk(KERN_WARNING + "%s: unable to add port %d\n", + __FUNCTION__, the_port->line); + } else { + DPRINT_CONFIG( + ("IOC4 serial driver port %d irq = %d\n", + the_port->line, the_port->irq)); + } + /* all ports are rs232 for now */ + ioc4_set_proto(port, PROTO_RS232); + } + return 0; +} + +/** + * ioc4_serial_attach_one - register attach function + * called per card found from ioc4_serial_detect as part + * of module_init(). + * @pdev: handle for this card + * @pci_id: pci id for this card + */ +int +ioc4_serial_attach_one(struct pci_dev *pdev, const struct pci_device_id *pci_id) +{ + struct ioc4_mem __iomem *mem; + unsigned long tmp_addr, tmp_addr1; + struct ioc4_serial __iomem *serial; + struct ioc4_soft *soft; + struct ioc4_control *control; + int tmp, ret = 0; + + + DPRINT_CONFIG(("%s (0x%p, 0x%p)\n", __FUNCTION__, pdev, pci_id)); + + /* Map in the ioc4 memory */ + tmp_addr = pci_resource_start(pdev, 0); + if (!tmp_addr) { + printk(KERN_WARNING + "ioc4 (%p) : unable to get PIO mapping for " + "MEM space\n", (void *)pdev); + return -ENODEV; + } + if (!request_region(tmp_addr, sizeof(struct ioc4_mem), "sioc4_mem")) { + printk(KERN_ALERT + "ioc4 (%p): unable to get request region for " + "MEM space\n", (void *)pdev); + return -ENODEV; + } + mem = ioremap(tmp_addr, sizeof(struct ioc4_mem)); + if (!mem) { + printk(KERN_WARNING + "ioc4 (%p) : unable to remap ioc4 memory\n", + (void *)pdev); + ret = -ENODEV; + goto out1; + } + + /* request serial registers */ + tmp_addr1 = pci_resource_start(pdev, 0) + IOC4_SERIAL_OFFSET; + + if (!request_region(tmp_addr1, sizeof(struct ioc4_serial), + "sioc4_uart")) { + printk(KERN_WARNING + "ioc4 (%p): 
unable to get request region for " + "uart space\n", (void *)pdev); + ret = -ENODEV; + goto out1; + } + serial = ioremap(tmp_addr1, sizeof(struct ioc4_serial)); + if (!serial) { + printk(KERN_WARNING + "ioc4 (%p) : unable to remap ioc4 serial register\n", + (void *)pdev); + ret = -ENODEV; + goto out2; + } + DPRINT_CONFIG(("%s : mem 0x%p, serial 0x%p\n", + __FUNCTION__, (void *)mem, (void *)serial)); + + /* Get memory for the new card */ + control = kmalloc(sizeof(struct ioc4_control) * IOC4_NUM_SERIAL_PORTS, + GFP_KERNEL); + + if (!control) { + printk(KERN_WARNING "ioc4_attach_one" + ": unable to get memory for the IOC4\n"); + ret = -ENOMEM; + goto out2; + } + memset(control, 0, sizeof(struct ioc4_control)); + pci_set_drvdata(pdev, control); + + /* Allocate the soft structure */ + soft = kmalloc(sizeof(struct ioc4_soft), GFP_KERNEL); + if (!soft) { + printk(KERN_WARNING + "ioc4 (%p): unable to get memory for the soft struct\n", + (void *)pdev); + ret = -ENOMEM; + goto out3; + } + memset(soft, 0, sizeof(struct ioc4_soft)); + + spin_lock_init(&soft->is_ir_lock); + soft->is_ioc4_mem_addr = mem; + soft->is_ioc4_serial_addr = serial; + + /* Init the IOC4 */ + pci_read_config_dword(pdev, PCI_COMMAND, &tmp); + pci_write_config_dword(pdev, PCI_COMMAND, + tmp | PCI_COMMAND_PARITY | PCI_COMMAND_SERR); + + writel(0xf << IOC4_SIO_CR_CMD_PULSE_SHIFT, &mem->sio_cr); + + /* Enable serial port mode select generic PIO pins as outputs */ + writel(IOC4_GPCR_UART0_MODESEL | IOC4_GPCR_UART1_MODESEL + | IOC4_GPCR_UART2_MODESEL | IOC4_GPCR_UART3_MODESEL, + &mem->gpcr_s); + + /* Clear and disable all interrupts */ + write_ireg(soft, ~0, IOC4_W_IEC, IOC4_SIO_INTR_TYPE); + writel(~0, &mem->sio_ir); + write_ireg(soft, ~(IOC4_OTHER_IR_ATA_INT | IOC4_OTHER_IR_ATA_MEMERR), + IOC4_W_IEC, IOC4_OTHER_INTR_TYPE); + writel(~(IOC4_OTHER_IR_ATA_MEMERR | IOC4_OTHER_IR_ATA_MEMERR), + &mem->other_ir); + control->ic_soft = soft; + if (!request_irq(pdev->irq, ioc4_intr, SA_SHIRQ, + "sgi-ioc4serial", (void 
*)soft)) { + control->ic_irq = pdev->irq; + } else { + printk(KERN_WARNING + "%s : request_irq fails for IRQ 0x%x\n ", + __FUNCTION__, pdev->irq); + } + if ((ret = ioc4_attach_local(pdev, control, soft, + soft->is_ioc4_mem_addr, + soft->is_ioc4_serial_addr))) + goto out4; + + /* register port with the serial core */ + + if ((ret = ioc4_serial_core_attach(pdev))) + goto out4; + + return ret; + + /* error exits that give back resources */ +out4: + kfree(soft); +out3: + kfree(control); +out2: + release_region(tmp_addr1, sizeof(struct ioc4_serial)); +out1: + release_region(tmp_addr, sizeof(struct ioc4_mem)); + + return ret; +} + + +/** + * ioc4_serial_remove_one - detach function + * + * @pdev: handle for this card + */ + +#if 0 +void ioc4_serial_remove_one(struct pci_dev *pdev) +{ + int ii; + struct ioc4_control *control; + struct uart_port *the_port; + struct ioc4_port *port; + struct ioc4_soft *soft; + + control = pci_get_drvdata(pdev); + + for (ii = 0; ii < IOC4_NUM_SERIAL_PORTS; ii++) { + the_port = &control->ic_port[ii].icp_uart_port; + if (the_port) { + uart_remove_one_port(&ioc4_uart, the_port); + } + port = control->ic_port[ii].icp_port; + if (!(ii & 1) && port) { + pci_free_consistent(port->ip_pdev, + TOTAL_RING_BUF_SIZE, + (void *)port->ip_cpu_ringbuf, + port->ip_dma_ringbuf); + kfree(port); + } + } + soft = control->ic_soft; + if (soft) { + free_irq(control->ic_irq, (void *)soft); + if (soft->is_ioc4_serial_addr) { + release_region((unsigned long) + soft->is_ioc4_serial_addr, + sizeof(struct ioc4_serial)); + } + kfree(soft); + } + kfree(control); + pci_set_drvdata(pdev, NULL); + uart_unregister_driver(&ioc4_uart); +} +#endif + +/** + * ioc4_serial_init - module init + */ +int ioc4_serial_init(void) +{ + int ret; + + /* register with serial core */ + if ((ret = uart_register_driver(&ioc4_uart)) < 0) { + printk(KERN_WARNING + "%s: Couldn't register IOC4 serial driver\n", + __FUNCTION__); + return ret; + } + return 0; +} + +MODULE_AUTHOR("Pat Gefre - Silicon 
Graphics Inc. (SGI) "); +MODULE_DESCRIPTION("Serial PCI driver module for SGI IOC4 Base-IO Card"); +MODULE_LICENSE("GPL"); + +EXPORT_SYMBOL(ioc4_serial_init); +EXPORT_SYMBOL(ioc4_serial_attach_one); diff --git a/drivers/sn/Makefile b/drivers/sn/Makefile new file mode 100644 index 000000000000..631e54958448 --- /dev/null +++ b/drivers/sn/Makefile @@ -0,0 +1,6 @@ +# +# Makefile for the Altix device drivers. +# +# + +obj-$(CONFIG_BLK_DEV_SGIIOC4) += ioc4.o diff --git a/drivers/sn/ioc4.c b/drivers/sn/ioc4.c new file mode 100644 index 000000000000..d9e4ee280e5f --- /dev/null +++ b/drivers/sn/ioc4.c @@ -0,0 +1,65 @@ +/* + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * Copyright (C) 2005 Silicon Graphics, Inc. All Rights Reserved. + */ + +/* + * This file contains a shim driver for the IOC4 IDE and serial drivers. + */ + +#include +#include +#include +#include +#include + + +static int __devinit +ioc4_probe_one(struct pci_dev *pdev, const struct pci_device_id *pci_id) +{ + int ret; + + if ((ret = pci_enable_device(pdev))) { + printk(KERN_WARNING + "%s: Failed to enable device with " + "pci_dev 0x%p... 
returning\n", + __FUNCTION__, (void *)pdev); + return ret; + } + pci_set_master(pdev); + + /* attach each sub-device */ + ret = ioc4_ide_attach_one(pdev, pci_id); + if (ret) + return ret; + return ioc4_serial_attach_one(pdev, pci_id); +} + +/* pci device struct */ +static struct pci_device_id ioc4_s_id_table[] = { + {PCI_VENDOR_ID_SGI, PCI_DEVICE_ID_SGI_IOC4, PCI_ANY_ID, + PCI_ANY_ID, 0x0b4000, 0xFFFFFF}, + {0} +}; +MODULE_DEVICE_TABLE(pci, ioc4_s_id_table); + +static struct pci_driver __devinitdata ioc4_s_driver = { + .name = "IOC4", + .id_table = ioc4_s_id_table, + .probe = ioc4_probe_one, +}; + +static int __devinit ioc4_detect(void) +{ + ioc4_serial_init(); + + return pci_register_driver(&ioc4_s_driver); +} +module_init(ioc4_detect); + +MODULE_AUTHOR("Pat Gefre - Silicon Graphics Inc. (SGI) "); +MODULE_DESCRIPTION("PCI driver module for SGI IOC4 Base-IO Card"); +MODULE_LICENSE("GPL"); diff --git a/include/linux/ioc4_common.h b/include/linux/ioc4_common.h new file mode 100644 index 000000000000..b03bcc46df55 --- /dev/null +++ b/include/linux/ioc4_common.h @@ -0,0 +1,21 @@ +/* + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * Copyright (c) 2005 Silicon Graphics, Inc. All Rights Reserved. + */ + +#ifndef _LINUX_IOC4_COMMON_H +#define _LINUX_IOC4_COMMON_H + +/* prototypes */ + +int ioc4_serial_init(void); + +int ioc4_serial_attach_one(struct pci_dev *pdev, const struct + pci_device_id *pci_id); +int ioc4_ide_attach_one(struct pci_dev *pdev, const struct + pci_device_id *pci_id); + +#endif /* _LINUX_IOC4_COMMON_H */ -- cgit v1.2.3 From 56a25bd2cd79b069e7ef33ae1b0edf931f9b2cb5 Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Mon, 7 Mar 2005 17:41:44 -0800 Subject: [PATCH] swsusp: do not use higher order memory allocations on suspend This is patch from Rafael, it eliminates order-5 (or worse) allocations during suspend. 
I did few style/whitespace modifications. It was tested by me, Rafael, and Stefan from SuSE. Signed-off-by: Rafael J. Wysocki Signed-off-by: Pavel Machek Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/suspend.h | 17 ++++- kernel/power/swsusp.c | 175 +++++++++++++++++++++++++----------------------- 2 files changed, 106 insertions(+), 86 deletions(-) (limited to 'include/linux') diff --git a/include/linux/suspend.h b/include/linux/suspend.h index 138f5b178f39..4b6993558ee2 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -15,11 +15,22 @@ typedef struct pbe { unsigned long address; /* address of the copy */ unsigned long orig_address; /* original address of page */ swp_entry_t swap_address; - swp_entry_t dummy; /* we need scratch space at - * end of page (see link, diskpage) - */ + + struct pbe *next; /* also used as scratch space at + * end of page (see link, diskpage) + */ } suspend_pagedir_t; +#define for_each_pbe(pbe, pblist) \ + for (pbe = pblist ; pbe ; pbe = pbe->next) + +#define PBES_PER_PAGE (PAGE_SIZE/sizeof(struct pbe)) +#define PB_PAGE_SKIP (PBES_PER_PAGE-1) + +#define for_each_pb_page(pbe, pblist) \ + for (pbe = pblist ; pbe ; pbe = (pbe+PB_PAGE_SKIP)->next) + + #define SWAP_FILENAME_MAXLENGTH 32 diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index 9ca3b1c38b46..133d18ab97db 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c @@ -76,7 +76,6 @@ extern const void __nosave_begin, __nosave_end; /* Variables to be preserved over suspend */ -static int pagedir_order_check; static int nr_copy_pages_check; extern char resume_file[]; @@ -225,8 +224,6 @@ static void lock_swapdevices(void) swap_list_unlock(); } - - /** * write_swap_page - Write one page to a fresh swap location. * @addr: Address we're writing. 
@@ -239,7 +236,6 @@ static void lock_swapdevices(void) * This is a partial improvement, since we will at least return other * errors, though we need to eventually fix the damn code. */ - static int write_page(unsigned long addr, swp_entry_t * loc) { swp_entry_t entry; @@ -259,14 +255,12 @@ static int write_page(unsigned long addr, swp_entry_t * loc) return error; } - /** * data_free - Free the swap entries used by the saved image. * * Walk the list of used swap entries and free each one. * This is only used for cleanup when suspend fails. */ - static void data_free(void) { swp_entry_t entry; @@ -282,28 +276,27 @@ static void data_free(void) } } - /** * data_write - Write saved image to swap. * * Walk the list of pages in the image and sync each one to swap. */ - static int data_write(void) { - int error = 0; - int i; + int error = 0, i = 0; unsigned int mod = nr_copy_pages / 100; + struct pbe *p; if (!mod) mod = 1; printk( "Writing data to swap (%d pages)... ", nr_copy_pages ); - for (i = 0; i < nr_copy_pages && !error; i++) { + for_each_pbe(p, pagedir_nosave) { if (!(i%mod)) printk( "\b\b\b\b%3d%%", i / mod ); - error = write_page((pagedir_nosave+i)->address, - &((pagedir_nosave+i)->swap_address)); + if ((error = write_page(p->address, &(p->swap_address)))) + return error; + i++; } printk("\b\b\b\bdone\n"); return error; @@ -326,15 +319,14 @@ static void dump_info(void) static void init_header(void) { - memset(&swsusp_info,0,sizeof(swsusp_info)); + memset(&swsusp_info, 0, sizeof(swsusp_info)); swsusp_info.version_code = LINUX_VERSION_CODE; swsusp_info.num_physpages = num_physpages; - memcpy(&swsusp_info.uts,&system_utsname,sizeof(system_utsname)); + memcpy(&swsusp_info.uts, &system_utsname, sizeof(system_utsname)); swsusp_info.suspend_pagedir = pagedir_nosave; swsusp_info.cpus = num_online_cpus(); swsusp_info.image_pages = nr_copy_pages; - dump_info(); } static int close_swap(void) @@ -342,7 +334,8 @@ static int close_swap(void) swp_entry_t entry; int error; - 
error = write_page((unsigned long)&swsusp_info,&entry); + dump_info(); + error = write_page((unsigned long)&swsusp_info, &entry); if (!error) { printk( "S" ); error = mark_swapfiles(entry); @@ -373,15 +366,18 @@ static void free_pagedir_entries(void) static int write_pagedir(void) { - unsigned long addr = (unsigned long)pagedir_nosave; int error = 0; - int n = SUSPEND_PD_PAGES(nr_copy_pages); - int i; + unsigned n = 0; + struct pbe * pbe; + + printk( "Writing pagedir..."); + for_each_pb_page(pbe, pagedir_nosave) { + if ((error = write_page((unsigned long)pbe, &swsusp_info.pagedir[n++]))) + return error; + } swsusp_info.pagedir_pages = n; - printk( "Writing pagedir (%d pages)\n", n); - for (i = 0; i < n && !error; i++, addr += PAGE_SIZE) - error = write_page(addr, &swsusp_info.pagedir[i]); + printk("done (%u pages)\n", n); return error; } @@ -567,8 +563,8 @@ static void copy_data_pages(void) struct zone *zone; unsigned long zone_pfn; struct pbe * pbe = pagedir_nosave; - int to_copy = nr_copy_pages; + pr_debug("copy_data_pages(): pages to copy: %d\n", nr_copy_pages); for_each_zone(zone) { if (is_highmem(zone)) continue; @@ -577,78 +573,94 @@ static void copy_data_pages(void) if (saveable(zone, &zone_pfn)) { struct page * page; page = pfn_to_page(zone_pfn + zone->zone_start_pfn); + BUG_ON(!pbe); pbe->orig_address = (long) page_address(page); /* copy_page is not usable for copying task structs. */ memcpy((void *)pbe->address, (void *)pbe->orig_address, PAGE_SIZE); - pbe++; - to_copy--; + pbe = pbe->next; } } } - BUG_ON(to_copy); + BUG_ON(pbe); } /** - * calc_order - Determine the order of allocation needed for pagedir_save. - * - * This looks tricky, but is just subtle. Please fix it some time. - * Since there are %nr_copy_pages worth of pages in the snapshot, we need - * to allocate enough contiguous space to hold - * (%nr_copy_pages * sizeof(struct pbe)), - * which has the saved/orig locations of the page.. 
- * - * SUSPEND_PD_PAGES() tells us how many pages we need to hold those - * structures, then we call get_bitmask_order(), which will tell us the - * last bit set in the number, starting with 1. (If we need 30 pages, that - * is 0x0000001e in hex. The last bit is the 5th, which is the order we - * would use to allocate 32 contiguous pages). - * - * Since we also need to save those pages, we add the number of pages that - * we need to nr_copy_pages, and in case of an overflow, do the - * calculation again to update the number of pages needed. - * - * With this model, we will tend to waste a lot of memory if we just cross - * an order boundary. Plus, the higher the order of allocation that we try - * to do, the more likely we are to fail in a low-memory situtation - * (though we're unlikely to get this far in such a case, since swsusp - * requires half of memory to be free anyway). + * calc_nr - Determine the number of pages needed for a pbe list. */ - -static void calc_order(void) +static int calc_nr(int nr_copy) { - int diff = 0; - int order = 0; + int extra = 0; + int mod = !!(nr_copy % PBES_PER_PAGE); + int diff = (nr_copy / PBES_PER_PAGE) + mod; do { - diff = get_bitmask_order(SUSPEND_PD_PAGES(nr_copy_pages)) - order; - if (diff) { - order += diff; - nr_copy_pages += 1 << diff; - } - } while(diff); - pagedir_order = order; + extra += diff; + nr_copy += diff; + mod = !!(nr_copy % PBES_PER_PAGE); + diff = (nr_copy / PBES_PER_PAGE) + mod - extra; + } while (diff > 0); + + return nr_copy; } +/** + * free_pagedir - free pages allocated with alloc_pagedir() + */ + +static inline void free_pagedir(struct pbe *pblist) +{ + struct pbe *pbe; + + while (pblist) { + pbe = (pblist + PB_PAGE_SKIP)->next; + free_page((unsigned long)pblist); + pblist = pbe; + } +} /** * alloc_pagedir - Allocate the page directory. * - * First, determine exactly how many contiguous pages we need and + * First, determine exactly how many pages we need and * allocate them. 
+ * + * We arrange the pages in a chain: each page is an array of PBES_PER_PAGE + * struct pbe elements (pbes) and the last element in the page points + * to the next page. + * + * On each page we set up a list of struct_pbe elements. */ -static int alloc_pagedir(void) +static struct pbe * alloc_pagedir(unsigned nr_pages) { - calc_order(); - pagedir_save = (suspend_pagedir_t *)__get_free_pages(GFP_ATOMIC | __GFP_COLD, - pagedir_order); - if (!pagedir_save) - return -ENOMEM; - memset(pagedir_save, 0, (1 << pagedir_order) * PAGE_SIZE); - pagedir_nosave = pagedir_save; - return 0; + unsigned num; + struct pbe *pblist, *pbe, *p; + + if (!nr_pages) + return NULL; + + pr_debug("alloc_pagedir(): nr_pages = %d\n", nr_pages); + pblist = (struct pbe *)get_zeroed_page(GFP_ATOMIC | __GFP_COLD); + for (pbe = pblist, num = PBES_PER_PAGE; pbe && num < nr_pages; + pbe = pbe->next, num += PBES_PER_PAGE) { + p = pbe; + pbe += PB_PAGE_SKIP; + do + p->next = p + 1; + while (p++ < pbe); + pbe->next = (struct pbe *)get_zeroed_page(GFP_ATOMIC | __GFP_COLD); + } + if (pbe) { + for (num -= PBES_PER_PAGE - 1, p = pbe; num < nr_pages; p++, num++) + p->next = p + 1; + } else { /* get_zeroed_page() failed */ + free_pagedir(pblist); + pblist = NULL; + } + pr_debug("alloc_pagedir(): allocated %d PBEs\n", num); + return pblist; } /** @@ -658,10 +670,8 @@ static int alloc_pagedir(void) static void free_image_pages(void) { struct pbe * p; - int i; - p = pagedir_save; - for (i = 0, p = pagedir_save; i < nr_copy_pages; i++, p++) { + for_each_pbe(p, pagedir_save) { if (p->address) { ClearPageNosave(virt_to_page(p->address)); free_page(p->address); @@ -672,15 +682,13 @@ static void free_image_pages(void) /** * alloc_image_pages - Allocate pages for the snapshot. 
- * */ static int alloc_image_pages(void) { struct pbe * p; - int i; - for (i = 0, p = pagedir_save; i < nr_copy_pages; i++, p++) { + for_each_pbe(p, pagedir_save) { p->address = get_zeroed_page(GFP_ATOMIC | __GFP_COLD); if (!p->address) return -ENOMEM; @@ -694,7 +702,7 @@ void swsusp_free(void) BUG_ON(PageNosave(virt_to_page(pagedir_save))); BUG_ON(PageNosaveFree(virt_to_page(pagedir_save))); free_image_pages(); - free_pages((unsigned long) pagedir_save, pagedir_order); + free_pagedir(pagedir_save); } @@ -752,10 +760,13 @@ static int swsusp_alloc(void) if (!enough_swap()) return -ENOSPC; - if ((error = alloc_pagedir())) { + nr_copy_pages = calc_nr(nr_copy_pages); + + if (!(pagedir_save = alloc_pagedir(nr_copy_pages))) { printk(KERN_ERR "suspend: Allocating pagedir failed.\n"); - return error; + return -ENOMEM; } + pagedir_nosave = pagedir_save; if ((error = alloc_image_pages())) { printk(KERN_ERR "suspend: Allocating image pages failed.\n"); swsusp_free(); @@ -763,7 +774,6 @@ static int swsusp_alloc(void) } nr_copy_pages_check = nr_copy_pages; - pagedir_order_check = pagedir_order; return 0; } @@ -780,7 +790,7 @@ static int suspend_prepare_image(void) drain_local_pages(); count_data_pages(); - printk("swsusp: Need to copy %u pages\n",nr_copy_pages); + printk("swsusp: Need to copy %u pages\n", nr_copy_pages); error = swsusp_alloc(); if (error) @@ -867,7 +877,6 @@ int swsusp_suspend(void) asmlinkage int swsusp_restore(void) { BUG_ON (nr_copy_pages_check != nr_copy_pages); - BUG_ON (pagedir_order_check != pagedir_order); /* Even mappings of "global" things (vmalloc) need to be fixed */ __flush_tlb_global(); -- cgit v1.2.3 From 716a66395fbcdf572e3dcc972730f09e038656f3 Mon Sep 17 00:00:00 2001 From: Roman Zippel Date: Mon, 7 Mar 2005 17:45:28 -0800 Subject: [PATCH] cleanup vc array access This removes as far as possible unneccessary vc_cons lookups by using a pointer to the vc_data structure instead of the index. 
The hidden currcons argument in console_macros.h is temporarily replaced with a hidden vc pointer. Signed-off-by: Roman Zippel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/char/console_macros.h | 132 +++--- drivers/char/consolemap.c | 97 ++--- drivers/char/keyboard.c | 6 +- drivers/char/selection.c | 6 +- drivers/char/sysrq.c | 5 +- drivers/char/tty_io.c | 4 +- drivers/char/vc_screen.c | 7 +- drivers/char/vt.c | 848 +++++++++++++++++++-------------------- drivers/char/vt_ioctl.c | 78 ++-- drivers/video/console/dummycon.c | 2 +- drivers/video/console/fbcon.c | 46 +-- drivers/video/console/mdacon.c | 7 +- drivers/video/console/promcon.c | 14 +- drivers/video/console/sticon.c | 2 +- drivers/video/console/vgacon.c | 10 +- fs/compat_ioctl.c | 10 +- include/linux/consolemap.h | 2 +- include/linux/vt_kern.h | 44 +- 18 files changed, 647 insertions(+), 673 deletions(-) (limited to 'include/linux') diff --git a/drivers/char/console_macros.h b/drivers/char/console_macros.h index 93b5d23fa137..d6062b6883d0 100644 --- a/drivers/char/console_macros.h +++ b/drivers/char/console_macros.h @@ -1,71 +1,71 @@ -#define cons_num (vc_cons[currcons].d->vc_num) -#define video_scan_lines (vc_cons[currcons].d->vc_scan_lines) -#define sw (vc_cons[currcons].d->vc_sw) -#define screenbuf (vc_cons[currcons].d->vc_screenbuf) -#define screenbuf_size (vc_cons[currcons].d->vc_screenbuf_size) -#define origin (vc_cons[currcons].d->vc_origin) -#define scr_top (vc_cons[currcons].d->vc_scr_top) -#define visible_origin (vc_cons[currcons].d->vc_visible_origin) -#define scr_end (vc_cons[currcons].d->vc_scr_end) -#define pos (vc_cons[currcons].d->vc_pos) -#define top (vc_cons[currcons].d->vc_top) -#define bottom (vc_cons[currcons].d->vc_bottom) -#define x (vc_cons[currcons].d->vc_x) -#define y (vc_cons[currcons].d->vc_y) -#define vc_state (vc_cons[currcons].d->vc_state) -#define npar (vc_cons[currcons].d->vc_npar) -#define par (vc_cons[currcons].d->vc_par) -#define ques 
(vc_cons[currcons].d->vc_ques) -#define attr (vc_cons[currcons].d->vc_attr) -#define saved_x (vc_cons[currcons].d->vc_saved_x) -#define saved_y (vc_cons[currcons].d->vc_saved_y) -#define translate (vc_cons[currcons].d->vc_translate) -#define G0_charset (vc_cons[currcons].d->vc_G0_charset) -#define G1_charset (vc_cons[currcons].d->vc_G1_charset) -#define saved_G0 (vc_cons[currcons].d->vc_saved_G0) -#define saved_G1 (vc_cons[currcons].d->vc_saved_G1) -#define utf (vc_cons[currcons].d->vc_utf) -#define utf_count (vc_cons[currcons].d->vc_utf_count) -#define utf_char (vc_cons[currcons].d->vc_utf_char) -#define video_erase_char (vc_cons[currcons].d->vc_video_erase_char) -#define disp_ctrl (vc_cons[currcons].d->vc_disp_ctrl) -#define toggle_meta (vc_cons[currcons].d->vc_toggle_meta) -#define decscnm (vc_cons[currcons].d->vc_decscnm) -#define decom (vc_cons[currcons].d->vc_decom) -#define decawm (vc_cons[currcons].d->vc_decawm) -#define deccm (vc_cons[currcons].d->vc_deccm) -#define decim (vc_cons[currcons].d->vc_decim) -#define deccolm (vc_cons[currcons].d->vc_deccolm) -#define need_wrap (vc_cons[currcons].d->vc_need_wrap) -#define kmalloced (vc_cons[currcons].d->vc_kmalloced) -#define report_mouse (vc_cons[currcons].d->vc_report_mouse) -#define color (vc_cons[currcons].d->vc_color) -#define s_color (vc_cons[currcons].d->vc_s_color) -#define def_color (vc_cons[currcons].d->vc_def_color) +#define cons_num (vc->vc_num) +#define video_scan_lines (vc->vc_scan_lines) +#define sw (vc->vc_sw) +#define screenbuf (vc->vc_screenbuf) +#define screenbuf_size (vc->vc_screenbuf_size) +#define origin (vc->vc_origin) +#define scr_top (vc->vc_scr_top) +#define visible_origin (vc->vc_visible_origin) +#define scr_end (vc->vc_scr_end) +#define pos (vc->vc_pos) +#define top (vc->vc_top) +#define bottom (vc->vc_bottom) +#define x (vc->vc_x) +#define y (vc->vc_y) +#define vc_state (vc->vc_state) +#define npar (vc->vc_npar) +#define par (vc->vc_par) +#define ques (vc->vc_ques) +#define attr 
(vc->vc_attr) +#define saved_x (vc->vc_saved_x) +#define saved_y (vc->vc_saved_y) +#define translate (vc->vc_translate) +#define G0_charset (vc->vc_G0_charset) +#define G1_charset (vc->vc_G1_charset) +#define saved_G0 (vc->vc_saved_G0) +#define saved_G1 (vc->vc_saved_G1) +#define utf (vc->vc_utf) +#define utf_count (vc->vc_utf_count) +#define utf_char (vc->vc_utf_char) +#define video_erase_char (vc->vc_video_erase_char) +#define disp_ctrl (vc->vc_disp_ctrl) +#define toggle_meta (vc->vc_toggle_meta) +#define decscnm (vc->vc_decscnm) +#define decom (vc->vc_decom) +#define decawm (vc->vc_decawm) +#define deccm (vc->vc_deccm) +#define decim (vc->vc_decim) +#define deccolm (vc->vc_deccolm) +#define need_wrap (vc->vc_need_wrap) +#define kmalloced (vc->vc_kmalloced) +#define report_mouse (vc->vc_report_mouse) +#define color (vc->vc_color) +#define s_color (vc->vc_s_color) +#define def_color (vc->vc_def_color) #define foreground (color & 0x0f) #define background (color & 0xf0) -#define charset (vc_cons[currcons].d->vc_charset) -#define s_charset (vc_cons[currcons].d->vc_s_charset) -#define intensity (vc_cons[currcons].d->vc_intensity) -#define underline (vc_cons[currcons].d->vc_underline) -#define blink (vc_cons[currcons].d->vc_blink) -#define reverse (vc_cons[currcons].d->vc_reverse) -#define s_intensity (vc_cons[currcons].d->vc_s_intensity) -#define s_underline (vc_cons[currcons].d->vc_s_underline) -#define s_blink (vc_cons[currcons].d->vc_s_blink) -#define s_reverse (vc_cons[currcons].d->vc_s_reverse) -#define ulcolor (vc_cons[currcons].d->vc_ulcolor) -#define halfcolor (vc_cons[currcons].d->vc_halfcolor) -#define tab_stop (vc_cons[currcons].d->vc_tab_stop) -#define palette (vc_cons[currcons].d->vc_palette) -#define bell_pitch (vc_cons[currcons].d->vc_bell_pitch) -#define bell_duration (vc_cons[currcons].d->vc_bell_duration) -#define cursor_type (vc_cons[currcons].d->vc_cursor_type) -#define display_fg (vc_cons[currcons].d->vc_display_fg) -#define complement_mask 
(vc_cons[currcons].d->vc_complement_mask) -#define s_complement_mask (vc_cons[currcons].d->vc_s_complement_mask) -#define hi_font_mask (vc_cons[currcons].d->vc_hi_font_mask) +#define charset (vc->vc_charset) +#define s_charset (vc->vc_s_charset) +#define intensity (vc->vc_intensity) +#define underline (vc->vc_underline) +#define blink (vc->vc_blink) +#define reverse (vc->vc_reverse) +#define s_intensity (vc->vc_s_intensity) +#define s_underline (vc->vc_s_underline) +#define s_blink (vc->vc_s_blink) +#define s_reverse (vc->vc_s_reverse) +#define ulcolor (vc->vc_ulcolor) +#define halfcolor (vc->vc_halfcolor) +#define tab_stop (vc->vc_tab_stop) +#define palette (vc->vc_palette) +#define bell_pitch (vc->vc_bell_pitch) +#define bell_duration (vc->vc_bell_duration) +#define cursor_type (vc->vc_cursor_type) +#define display_fg (vc->vc_display_fg) +#define complement_mask (vc->vc_complement_mask) +#define s_complement_mask (vc->vc_s_complement_mask) +#define hi_font_mask (vc->vc_hi_font_mask) -#define vcmode (vt_cons[currcons]->vc_mode) +#define vcmode (vt_cons[vc->vc_num]->vc_mode) #define structsize (sizeof(struct vc_data) + sizeof(struct vt_struct)) diff --git a/drivers/char/consolemap.c b/drivers/char/consolemap.c index 6f96e3ce081c..c31881db6002 100644 --- a/drivers/char/consolemap.c +++ b/drivers/char/consolemap.c @@ -208,9 +208,9 @@ static void set_inverse_transl(struct vc_data *conp, struct uni_pagedir *p, int } } -unsigned short *set_translate(int m,int currcons) +unsigned short *set_translate(int m, struct vc_data *vc) { - inv_translate[currcons] = m; + inv_translate[vc->vc_num] = m; return translations[m]; } @@ -362,15 +362,16 @@ static void con_release_unimap(struct uni_pagedir *p) } } -void con_free_unimap(int con) +void con_free_unimap(struct vc_data *vc) { struct uni_pagedir *p; - struct vc_data *conp = vc_cons[con].d; - - p = (struct uni_pagedir *)*conp->vc_uni_pagedir_loc; - if (!p) return; - *conp->vc_uni_pagedir_loc = 0; - if (--p->refcount) return; + + 
p = (struct uni_pagedir *)*vc->vc_uni_pagedir_loc; + if (!p) + return; + *vc->vc_uni_pagedir_loc = 0; + if (--p->refcount) + return; con_release_unimap(p); kfree(p); } @@ -442,12 +443,11 @@ con_insert_unipair(struct uni_pagedir *p, u_short unicode, u_short fontpos) } /* ui is a leftover from using a hashtable, but might be used again */ -int con_clear_unimap(int con, struct unimapinit *ui) +int con_clear_unimap(struct vc_data *vc, struct unimapinit *ui) { struct uni_pagedir *p, *q; - struct vc_data *conp = vc_cons[con].d; - p = (struct uni_pagedir *)*conp->vc_uni_pagedir_loc; + p = (struct uni_pagedir *)*vc->vc_uni_pagedir_loc; if (p && p->readonly) return -EIO; if (!p || --p->refcount) { q = (struct uni_pagedir *)kmalloc(sizeof(*p), GFP_KERNEL); @@ -457,7 +457,7 @@ int con_clear_unimap(int con, struct unimapinit *ui) } memset(q, 0, sizeof(*q)); q->refcount=1; - *conp->vc_uni_pagedir_loc = (unsigned long)q; + *vc->vc_uni_pagedir_loc = (unsigned long)q; } else { if (p == dflt) dflt = NULL; p->refcount++; @@ -467,14 +467,12 @@ int con_clear_unimap(int con, struct unimapinit *ui) return 0; } -int -con_set_unimap(int con, ushort ct, struct unipair __user *list) +int con_set_unimap(struct vc_data *vc, ushort ct, struct unipair __user *list) { int err = 0, err1, i; struct uni_pagedir *p, *q; - struct vc_data *conp = vc_cons[con].d; - p = (struct uni_pagedir *)*conp->vc_uni_pagedir_loc; + p = (struct uni_pagedir *)*vc->vc_uni_pagedir_loc; if (p->readonly) return -EIO; if (!ct) return 0; @@ -483,10 +481,10 @@ con_set_unimap(int con, ushort ct, struct unipair __user *list) int j, k; u16 **p1, *p2, l; - err1 = con_clear_unimap(con, NULL); + err1 = con_clear_unimap(vc, NULL); if (err1) return err1; - q = (struct uni_pagedir *)*conp->vc_uni_pagedir_loc; + q = (struct uni_pagedir *)*vc->vc_uni_pagedir_loc; for (i = 0, l = 0; i < 32; i++) if ((p1 = p->uni_pgdir[i])) for (j = 0; j < 32; j++) @@ -496,7 +494,7 @@ con_set_unimap(int con, ushort ct, struct unipair __user *list) err1 
= con_insert_unipair(q, l, p2[k]); if (err1) { p->refcount++; - *conp->vc_uni_pagedir_loc = (unsigned long)p; + *vc->vc_uni_pagedir_loc = (unsigned long)p; con_release_unimap(q); kfree(q); return err1; @@ -515,11 +513,11 @@ con_set_unimap(int con, ushort ct, struct unipair __user *list) list++; } - if (con_unify_unimap(conp, p)) + if (con_unify_unimap(vc, p)) return err; for (i = 0; i <= 3; i++) - set_inverse_transl(conp, p, i); /* Update all inverse translations */ + set_inverse_transl(vc, p, i); /* Update all inverse translations */ return err; } @@ -529,20 +527,18 @@ con_set_unimap(int con, ushort ct, struct unipair __user *list) with. This routine is executed at sys_setup time, and when the PIO_FONTRESET ioctl is called. */ -int -con_set_default_unimap(int con) +int con_set_default_unimap(struct vc_data *vc) { int i, j, err = 0, err1; u16 *q; struct uni_pagedir *p; - struct vc_data *conp = vc_cons[con].d; - + if (dflt) { - p = (struct uni_pagedir *)*conp->vc_uni_pagedir_loc; + p = (struct uni_pagedir *)*vc->vc_uni_pagedir_loc; if (p == dflt) return 0; dflt->refcount++; - *conp->vc_uni_pagedir_loc = (unsigned long)dflt; + *vc->vc_uni_pagedir_loc = (unsigned long)dflt; if (p && --p->refcount) { con_release_unimap(p); kfree(p); @@ -552,10 +548,10 @@ con_set_default_unimap(int con) /* The default font is always 256 characters */ - err = con_clear_unimap(con,NULL); + err = con_clear_unimap(vc, NULL); if (err) return err; - p = (struct uni_pagedir *)*conp->vc_uni_pagedir_loc; + p = (struct uni_pagedir *)*vc->vc_uni_pagedir_loc; q = dfont_unitable; for (i = 0; i < 256; i++) @@ -565,47 +561,42 @@ con_set_default_unimap(int con) err = err1; } - if (con_unify_unimap(conp, p)) { - dflt = (struct uni_pagedir *)*conp->vc_uni_pagedir_loc; + if (con_unify_unimap(vc, p)) { + dflt = (struct uni_pagedir *)*vc->vc_uni_pagedir_loc; return err; } for (i = 0; i <= 3; i++) - set_inverse_transl(conp, p, i); /* Update all inverse translations */ + set_inverse_transl(vc, p, i); /* 
Update all inverse translations */ dflt = p; return err; } EXPORT_SYMBOL(con_set_default_unimap); -int -con_copy_unimap(int dstcon, int srccon) +int con_copy_unimap(struct vc_data *dst_vc, struct vc_data *src_vc) { - struct vc_data *sconp = vc_cons[srccon].d; - struct vc_data *dconp = vc_cons[dstcon].d; struct uni_pagedir *q; - - if (!vc_cons_allocated(srccon) || !*sconp->vc_uni_pagedir_loc) + + if (!*src_vc->vc_uni_pagedir_loc) return -EINVAL; - if (*dconp->vc_uni_pagedir_loc == *sconp->vc_uni_pagedir_loc) + if (*dst_vc->vc_uni_pagedir_loc == *src_vc->vc_uni_pagedir_loc) return 0; - con_free_unimap(dstcon); - q = (struct uni_pagedir *)*sconp->vc_uni_pagedir_loc; + con_free_unimap(dst_vc); + q = (struct uni_pagedir *)*src_vc->vc_uni_pagedir_loc; q->refcount++; - *dconp->vc_uni_pagedir_loc = (long)q; + *dst_vc->vc_uni_pagedir_loc = (long)q; return 0; } -int -con_get_unimap(int con, ushort ct, ushort __user *uct, struct unipair __user *list) +int con_get_unimap(struct vc_data *vc, ushort ct, ushort __user *uct, struct unipair __user *list) { int i, j, k, ect; u16 **p1, *p2; struct uni_pagedir *p; - struct vc_data *conp = vc_cons[con].d; ect = 0; - if (*conp->vc_uni_pagedir_loc) { - p = (struct uni_pagedir *)*conp->vc_uni_pagedir_loc; + if (*vc->vc_uni_pagedir_loc) { + p = (struct uni_pagedir *)*vc->vc_uni_pagedir_loc; for (i = 0; i < 32; i++) if ((p1 = p->uni_pgdir[i])) for (j = 0; j < 32; j++) @@ -625,12 +616,12 @@ con_get_unimap(int con, ushort ct, ushort __user *uct, struct unipair __user *li return ((ect <= ct) ? 
0 : -ENOMEM); } -void con_protect_unimap(int con, int rdonly) +void con_protect_unimap(struct vc_data *vc, int rdonly) { - struct uni_pagedir *p = (struct uni_pagedir *) - *vc_cons[con].d->vc_uni_pagedir_loc; + struct uni_pagedir *p = (struct uni_pagedir *)*vc->vc_uni_pagedir_loc; - if (p) p->readonly = rdonly; + if (p) + p->readonly = rdonly; } int @@ -679,7 +670,7 @@ console_map_init(void) for (i = 0; i < MAX_NR_CONSOLES; i++) if (vc_cons_allocated(i) && !*vc_cons[i].d->vc_uni_pagedir_loc) - con_set_default_unimap(i); + con_set_default_unimap(vc_cons[i].d); } EXPORT_SYMBOL(con_copy_unimap); diff --git a/drivers/char/keyboard.c b/drivers/char/keyboard.c index 2e83c65edba1..0e9d3c220bbe 100644 --- a/drivers/char/keyboard.c +++ b/drivers/char/keyboard.c @@ -536,12 +536,12 @@ static void fn_send_intr(struct vc_data *vc, struct pt_regs *regs) static void fn_scroll_forw(struct vc_data *vc, struct pt_regs *regs) { - scrollfront(0); + scrollfront(vc, 0); } static void fn_scroll_back(struct vc_data *vc, struct pt_regs *regs) { - scrollback(0); + scrollback(vc, 0); } static void fn_show_mem(struct vc_data *vc, struct pt_regs *regs) @@ -581,7 +581,7 @@ static void fn_SAK(struct vc_data *vc, struct pt_regs *regs) */ if (tty) do_SAK(tty); - reset_vc(fg_console); + reset_vc(vc); } static void fn_null(struct vc_data *vc, struct pt_regs *regs) diff --git a/drivers/char/selection.c b/drivers/char/selection.c index 791b1fc1f4a0..b7f584f6a919 100644 --- a/drivers/char/selection.c +++ b/drivers/char/selection.c @@ -43,15 +43,13 @@ static char *sel_buffer; from interrupt (via scrollback/front) */ /* set reverse video on characters s-e of console with selection. 
*/ -inline static void -highlight(const int s, const int e) +static inline void highlight(const int s, const int e) { invert_screen(sel_cons, s, e-s+2, 1); } /* use complementary color to show the pointer */ -inline static void -highlight_pointer(const int where) +static inline void highlight_pointer(const int where) { complement_pos(sel_cons, where); } diff --git a/drivers/char/sysrq.c b/drivers/char/sysrq.c index 057eaf87b9b2..4b55da450f84 100644 --- a/drivers/char/sysrq.c +++ b/drivers/char/sysrq.c @@ -33,11 +33,10 @@ #include /* for fsync_bdev() */ #include +#include #include -extern void reset_vc(unsigned int); - /* Whether we react on sysrq keys or just ignore them */ int sysrq_enabled = 1; @@ -68,7 +67,7 @@ static void sysrq_handle_SAK(int key, struct pt_regs *pt_regs, { if (tty) do_SAK(tty); - reset_vc(fg_console); + reset_vc(vc_cons[fg_console].d); } static struct sysrq_key_op sysrq_SAK_op = { .handler = sysrq_handle_SAK, diff --git a/drivers/char/tty_io.c b/drivers/char/tty_io.c index 33637e313e70..932951c57f5c 100644 --- a/drivers/char/tty_io.c +++ b/drivers/char/tty_io.c @@ -1789,7 +1789,6 @@ retry_open: } #ifdef CONFIG_VT if (device == MKDEV(TTY_MAJOR,0)) { - extern int fg_console; extern struct tty_driver *console_driver; driver = console_driver; index = fg_console; @@ -2016,11 +2015,10 @@ static int tiocswinsz(struct tty_struct *tty, struct tty_struct *real_tty, return 0; #ifdef CONFIG_VT if (tty->driver->type == TTY_DRIVER_TYPE_CONSOLE) { - unsigned int currcons = tty->index; int rc; acquire_console_sem(); - rc = vc_resize(currcons, tmp_ws.ws_col, tmp_ws.ws_row); + rc = vc_resize(tty->driver_data, tmp_ws.ws_col, tmp_ws.ws_row); release_console_sem(); if (rc) return -ENXIO; diff --git a/drivers/char/vc_screen.c b/drivers/char/vc_screen.c index 55971a272ead..7abe405b8657 100644 --- a/drivers/char/vc_screen.c +++ b/drivers/char/vc_screen.c @@ -52,14 +52,17 @@ vcs_size(struct inode *inode) int size; int minor = iminor(inode); int currcons = minor & 127; 
+ struct vc_data *vc; + if (currcons == 0) currcons = fg_console; else currcons--; if (!vc_cons_allocated(currcons)) return -ENXIO; + vc = vc_cons[currcons].d; - size = vc_cons[currcons].d->vc_rows * vc_cons[currcons].d->vc_cols; + size = vc->vc_rows * vc->vc_cols; if (minor & 128) size = 2*size + HEADER_SIZE; @@ -442,7 +445,7 @@ vcs_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) buf += orig_count; pos += orig_count; if (org0) - update_region(currcons, (unsigned long)(org0), org-org0); + update_region(vc, (unsigned long)(org0), org - org0); } *ppos += written; ret = written; diff --git a/drivers/char/vt.c b/drivers/char/vt.c index 261599fd095a..6573da5bdb6c 100644 --- a/drivers/char/vt.c +++ b/drivers/char/vt.c @@ -135,11 +135,11 @@ static const struct consw *con_driver_map[MAX_NR_CONSOLES]; #endif static int con_open(struct tty_struct *, struct file *); -static void vc_init(unsigned int console, unsigned int rows, +static void vc_init(struct vc_data *vc, unsigned int rows, unsigned int cols, int do_clear); static void gotoxy(struct vc_data *vc, int new_x, int new_y); -static void save_cur(int currcons); -static void reset_terminal(int currcons, int do_clear); +static void save_cur(struct vc_data *vc); +static void reset_terminal(struct vc_data *vc, int do_clear); static void con_flush_chars(struct tty_struct *tty); static void set_vesa_blanking(char __user *p); static void set_cursor(struct vc_data *vc); @@ -214,17 +214,12 @@ enum { * Low-Level Functions */ -#define IS_FG (currcons == fg_console) -#define IS_FG_VC(vc) (vc == vc_cons[fg_console].d) - -#define IS_VISIBLE CON_IS_VISIBLE(vc_cons[currcons].d) +#define IS_FG(vc) ((vc)->vc_num == fg_console) #ifdef VT_BUF_VRAM_ONLY -#define DO_UPDATE 0 -#define DO_UPDATE_VC(vc) 0 +#define DO_UPDATE(vc) 0 #else -#define DO_UPDATE IS_VISIBLE -#define DO_UPDATE_VC(vc) CON_IS_VISIBLE(vc) +#define DO_UPDATE(vc) CON_IS_VISIBLE(vc) #endif static int pm_con_request(struct pm_dev *dev, pm_request_t 
rqst, void *data); @@ -254,38 +249,37 @@ void schedule_console_callback(void) schedule_work(&console_work); } -static void scrup(int currcons, unsigned int t, unsigned int b, int nr) +static void scrup(struct vc_data *vc, unsigned int t, unsigned int b, int nr) { unsigned short *d, *s; if (t+nr >= b) nr = b - t - 1; - if (b > vc_cons[currcons].d->vc_rows || t >= b || nr < 1) + if (b > vc->vc_rows || t >= b || nr < 1) return; - if (IS_VISIBLE && sw->con_scroll(vc_cons[currcons].d, t, b, SM_UP, nr)) + if (CON_IS_VISIBLE(vc) && sw->con_scroll(vc, t, b, SM_UP, nr)) return; - d = (unsigned short *) (origin+vc_cons[currcons].d->vc_size_row*t); - s = (unsigned short *) (origin+vc_cons[currcons].d->vc_size_row*(t+nr)); - scr_memmovew(d, s, (b-t-nr) * vc_cons[currcons].d->vc_size_row); - scr_memsetw(d + (b-t-nr) * vc_cons[currcons].d->vc_cols, video_erase_char, - vc_cons[currcons].d->vc_size_row * nr); + d = (unsigned short *)(origin + vc->vc_size_row * t); + s = (unsigned short *)(origin + vc->vc_size_row * (t + nr)); + scr_memmovew(d, s, (b - t - nr) * vc->vc_size_row); + scr_memsetw(d + (b - t - nr) * vc->vc_cols, video_erase_char, + vc->vc_size_row * nr); } -static void -scrdown(int currcons, unsigned int t, unsigned int b, int nr) +static void scrdown(struct vc_data *vc, unsigned int t, unsigned int b, int nr) { unsigned short *s; unsigned int step; if (t+nr >= b) nr = b - t - 1; - if (b > vc_cons[currcons].d->vc_rows || t >= b || nr < 1) + if (b > vc->vc_rows || t >= b || nr < 1) return; - if (IS_VISIBLE && sw->con_scroll(vc_cons[currcons].d, t, b, SM_DOWN, nr)) + if (CON_IS_VISIBLE(vc) && sw->con_scroll(vc, t, b, SM_DOWN, nr)) return; - s = (unsigned short *) (origin+vc_cons[currcons].d->vc_size_row*t); - step = vc_cons[currcons].d->vc_cols * nr; - scr_memmovew(s + step, s, (b-t-nr)*vc_cons[currcons].d->vc_size_row); + s = (unsigned short *)(origin + vc->vc_size_row * t); + step = vc->vc_cols * nr; + scr_memmovew(s + step, s, (b - t - nr) * vc->vc_size_row); 
scr_memsetw(s, video_erase_char, 2*step); } @@ -335,23 +329,23 @@ static void do_update_region(struct vc_data *vc, unsigned long start, int count) #endif } -void update_region(int currcons, unsigned long start, int count) +void update_region(struct vc_data *vc, unsigned long start, int count) { WARN_CONSOLE_UNLOCKED(); - if (DO_UPDATE) { - hide_cursor(vc_cons[currcons].d); - do_update_region(vc_cons[currcons].d, start, count); - set_cursor(vc_cons[currcons].d); + if (DO_UPDATE(vc)) { + hide_cursor(vc); + do_update_region(vc, start, count); + set_cursor(vc); } } /* Structure of attributes is hardware-dependent */ -static u8 build_attr(int currcons, u8 _color, u8 _intensity, u8 _blink, u8 _underline, u8 _reverse) +static u8 build_attr(struct vc_data *vc, u8 _color, u8 _intensity, u8 _blink, u8 _underline, u8 _reverse) { if (sw->con_build_attr) - return sw->con_build_attr(vc_cons[currcons].d, _color, _intensity, _blink, _underline, _reverse); + return sw->con_build_attr(vc, _color, _intensity, _blink, _underline, _reverse); #ifndef VT_BUF_VRAM_ONLY /* @@ -366,7 +360,7 @@ static u8 build_attr(int currcons, u8 _color, u8 _intensity, u8 _blink, u8 _unde */ { u8 a = color; - if (!vc_cons[currcons].d->vc_can_do_color) + if (!vc->vc_can_do_color) return _intensity | (_underline ? 4 : 0) | (_reverse ? 
8 : 0) | @@ -390,10 +384,10 @@ static u8 build_attr(int currcons, u8 _color, u8 _intensity, u8 _blink, u8 _unde #endif } -static void update_attr(int currcons) +static void update_attr(struct vc_data *vc) { - attr = build_attr(currcons, color, intensity, blink, underline, reverse ^ decscnm); - video_erase_char = (build_attr(currcons, color, 1, blink, 0, decscnm) << 8) | ' '; + attr = build_attr(vc, color, intensity, blink, underline, reverse ^ decscnm); + video_erase_char = (build_attr(vc, color, 1, blink, 0, decscnm) << 8) | ' '; } /* Note: inverting the screen twice should revert to the original state */ @@ -437,7 +431,7 @@ void invert_screen(struct vc_data *vc, int offset, int count, int viewed) } } #endif - if (DO_UPDATE_VC(vc)) + if (DO_UPDATE(vc)) do_update_region(vc, (unsigned long) p, count); } @@ -452,7 +446,7 @@ void complement_pos(struct vc_data *vc, int offset) if (p) { scr_writew(old, p); - if (DO_UPDATE_VC(vc)) + if (DO_UPDATE(vc)) vc->vc_sw->con_putc(vc, old, oldy, oldx); } if (offset == -1) @@ -463,7 +457,7 @@ void complement_pos(struct vc_data *vc, int offset) old = scr_readw(p); new = old ^ vc->vc_complement_mask; scr_writew(new, p); - if (DO_UPDATE_VC(vc)) { + if (DO_UPDATE(vc)) { oldx = (offset >> 1) % vc->vc_cols; oldy = (offset >> 1) / vc->vc_cols; vc->vc_sw->con_putc(vc, new, oldy, oldx); @@ -471,47 +465,45 @@ void complement_pos(struct vc_data *vc, int offset) } } -static void insert_char(int currcons, unsigned int nr) +static void insert_char(struct vc_data *vc, unsigned int nr) { unsigned short *p, *q = (unsigned short *) pos; - p = q + vc_cons[currcons].d->vc_cols - nr - x; + p = q + vc->vc_cols - nr - x; while (--p >= q) scr_writew(scr_readw(p), p + nr); scr_memsetw(q, video_erase_char, nr*2); need_wrap = 0; - if (DO_UPDATE) { + if (DO_UPDATE(vc)) { unsigned short oldattr = attr; - sw->con_bmove(vc_cons[currcons].d,y,x,y,x+nr,1, - vc_cons[currcons].d->vc_cols-x-nr); + sw->con_bmove(vc, y, x, y, x + nr, 1, + vc->vc_cols - x - nr); attr = 
video_erase_char >> 8; while (nr--) - sw->con_putc(vc_cons[currcons].d, - video_erase_char,y,x+nr); + sw->con_putc(vc, video_erase_char, y, x + nr); attr = oldattr; } } -static void delete_char(int currcons, unsigned int nr) +static void delete_char(struct vc_data *vc, unsigned int nr) { unsigned int i = x; unsigned short *p = (unsigned short *) pos; - while (++i <= vc_cons[currcons].d->vc_cols - nr) { + while (++i <= vc->vc_cols - nr) { scr_writew(scr_readw(p+nr), p); p++; } scr_memsetw(p, video_erase_char, nr*2); need_wrap = 0; - if (DO_UPDATE) { + if (DO_UPDATE(vc)) { unsigned short oldattr = attr; - sw->con_bmove(vc_cons[currcons].d, y, x+nr, y, x, 1, - vc_cons[currcons].d->vc_cols-x-nr); + sw->con_bmove(vc, y, x + nr, y, x, 1, + vc->vc_cols - x - nr); attr = video_erase_char >> 8; while (nr--) - sw->con_putc(vc_cons[currcons].d, - video_erase_char, y, - vc_cons[currcons].d->vc_cols-1-nr); + sw->con_putc(vc, video_erase_char, y, + vc->vc_cols - 1 - nr); attr = oldattr; } } @@ -531,7 +523,7 @@ static void add_softcursor(struct vc_data *vc) if ((type & 0x20) && ((softcursor_original & 0x7000) == (i & 0x7000))) i ^= 0x7000; if ((type & 0x40) && ((i & 0x700) == ((i & 0x7000) >> 4))) i ^= 0x0700; scr_writew(i, (u16 *) vc->vc_pos); - if (DO_UPDATE_VC(vc)) + if (DO_UPDATE(vc)) vc->vc_sw->con_putc(vc, i, vc->vc_y, vc->vc_x); } @@ -539,7 +531,7 @@ static void hide_softcursor(struct vc_data *vc) { if (softcursor_original != -1) { scr_writew(softcursor_original, (u16 *)vc->vc_pos); - if (DO_UPDATE_VC(vc)) + if (DO_UPDATE(vc)) vc->vc_sw->con_putc(vc, softcursor_original, vc->vc_y, vc->vc_x); softcursor_original = -1; @@ -556,7 +548,7 @@ static void hide_cursor(struct vc_data *vc) static void set_cursor(struct vc_data *vc) { - if (!IS_FG_VC(vc) || console_blanked || + if (!IS_FG(vc) || console_blanked || vc->vc_vt->vc_mode == KD_GRAPHICS) return; if (vc->vc_deccm) { @@ -569,32 +561,32 @@ static void set_cursor(struct vc_data *vc) hide_cursor(vc); } -static void 
set_origin(int currcons) +static void set_origin(struct vc_data *vc) { WARN_CONSOLE_UNLOCKED(); - if (!IS_VISIBLE || + if (!CON_IS_VISIBLE(vc) || !sw->con_set_origin || - !sw->con_set_origin(vc_cons[currcons].d)) + !sw->con_set_origin(vc)) origin = (unsigned long) screenbuf; visible_origin = origin; scr_end = origin + screenbuf_size; - pos = origin + vc_cons[currcons].d->vc_size_row*y + 2*x; + pos = origin + vc->vc_size_row * y + 2 * x; } -static inline void save_screen(int currcons) +static inline void save_screen(struct vc_data *vc) { WARN_CONSOLE_UNLOCKED(); if (sw->con_save_screen) - sw->con_save_screen(vc_cons[currcons].d); + sw->con_save_screen(vc); } /* * Redrawing of screen */ -static void clear_buffer_attributes(int currcons) +static void clear_buffer_attributes(struct vc_data *vc) { unsigned short *p = (unsigned short *) origin; int count = screenbuf_size/2; @@ -605,62 +597,57 @@ static void clear_buffer_attributes(int currcons) } } -void redraw_screen(int new_console, int is_switch) +void redraw_screen(struct vc_data *vc, int is_switch) { - int redraw = 1; - int currcons, old_console; + int redraw = 0; WARN_CONSOLE_UNLOCKED(); - if (!vc_cons_allocated(new_console)) { + if (!vc) { /* strange ... */ /* printk("redraw_screen: tty %d not allocated ??\n", new_console+1); */ return; } if (is_switch) { - currcons = fg_console; - hide_cursor(vc_cons[currcons].d); - if (fg_console != new_console) { - struct vc_data **display = vc_cons[new_console].d->vc_display_fg; - old_console = (*display) ? 
(*display)->vc_num : fg_console; - *display = vc_cons[new_console].d; - fg_console = new_console; - currcons = old_console; - if (!IS_VISIBLE) { - save_screen(currcons); - set_origin(currcons); - } - currcons = new_console; - if (old_console == new_console) - redraw = 0; + struct vc_data *old_vc = vc_cons[fg_console].d; + if (old_vc == vc) + return; + if (!CON_IS_VISIBLE(vc)) + redraw = 1; + *vc->vc_display_fg = vc; + fg_console = vc->vc_num; + hide_cursor(old_vc); + if (!CON_IS_VISIBLE(old_vc)) { + save_screen(old_vc); + set_origin(old_vc); } } else { - currcons = new_console; - hide_cursor(vc_cons[currcons].d); + hide_cursor(vc); + redraw = 1; } if (redraw) { int update; - int old_was_color = vc_cons[currcons].d->vc_can_do_color; + int old_was_color = vc->vc_can_do_color; - set_origin(currcons); - update = sw->con_switch(vc_cons[currcons].d); - set_palette(currcons); + set_origin(vc); + update = sw->con_switch(vc); + set_palette(vc); /* * If console changed from mono<->color, the best we can do * is to clear the buffer attributes. As it currently stands, * rebuilding new attributes from the old buffer is not doable * without overly complex code. 
*/ - if (old_was_color != vc_cons[currcons].d->vc_can_do_color) { - update_attr(currcons); - clear_buffer_attributes(currcons); + if (old_was_color != vc->vc_can_do_color) { + update_attr(vc); + clear_buffer_attributes(vc); } if (update && vcmode != KD_GRAPHICS) - do_update_region(vc_cons[currcons].d, origin, screenbuf_size/2); + do_update_region(vc, origin, screenbuf_size / 2); } - set_cursor(vc_cons[currcons].d); + set_cursor(vc); if (is_switch) { set_leds(); compute_shiftstate(); @@ -676,31 +663,30 @@ int vc_cons_allocated(unsigned int i) return (i < MAX_NR_CONSOLES && vc_cons[i].d); } -static void visual_init(int currcons, int init) +static void visual_init(struct vc_data *vc, int num, int init) { - /* ++Geert: sw->con_init determines console size */ - if (sw) - module_put(sw->owner); - sw = conswitchp; + /* ++Geert: sw->con_init determines console size */ + if (sw) + module_put(sw->owner); + sw = conswitchp; #ifndef VT_SINGLE_DRIVER - if (con_driver_map[currcons]) - sw = con_driver_map[currcons]; + if (con_driver_map[num]) + sw = con_driver_map[num]; #endif - __module_get(sw->owner); - cons_num = currcons; - display_fg = &master_display_fg; - vc_cons[currcons].d->vc_uni_pagedir_loc = &vc_cons[currcons].d->vc_uni_pagedir; - vc_cons[currcons].d->vc_uni_pagedir = 0; - hi_font_mask = 0; - complement_mask = 0; - vc_cons[currcons].d->vc_can_do_color = 0; - sw->con_init(vc_cons[currcons].d, init); - if (!complement_mask) - complement_mask = - vc_cons[currcons].d->vc_can_do_color ? 
0x7700 : 0x0800; - s_complement_mask = complement_mask; - vc_cons[currcons].d->vc_size_row = vc_cons[currcons].d->vc_cols<<1; - screenbuf_size = vc_cons[currcons].d->vc_rows * vc_cons[currcons].d->vc_size_row; + __module_get(sw->owner); + vc->vc_num = num; + display_fg = &master_display_fg; + vc->vc_uni_pagedir_loc = &vc->vc_uni_pagedir; + vc->vc_uni_pagedir = 0; + hi_font_mask = 0; + complement_mask = 0; + vc->vc_can_do_color = 0; + sw->con_init(vc, init); + if (!complement_mask) + complement_mask = vc->vc_can_do_color ? 0x7700 : 0x0800; + s_complement_mask = complement_mask; + vc->vc_size_row = vc->vc_cols << 1; + screenbuf_size = vc->vc_rows * vc->vc_size_row; } int vc_allocate(unsigned int currcons) /* return 0 on success */ @@ -710,6 +696,7 @@ int vc_allocate(unsigned int currcons) /* return 0 on success */ if (currcons >= MAX_NR_CONSOLES) return -ENXIO; if (!vc_cons[currcons].d) { + struct vc_data *vc; long p, q; /* prevent users from taking too much memory */ @@ -726,12 +713,12 @@ int vc_allocate(unsigned int currcons) /* return 0 on success */ if (!p) return -ENOMEM; memset((void *)p, 0, structsize); - vc_cons[currcons].d = (struct vc_data *)p; + vc_cons[currcons].d = vc = (struct vc_data *)p; vt_cons[currcons] = (struct vt_struct *)(p+sizeof(struct vc_data)); vc_cons[currcons].d->vc_vt = vt_cons[currcons]; - visual_init(currcons, 1); - if (!*vc_cons[currcons].d->vc_uni_pagedir_loc) - con_set_default_unimap(currcons); + visual_init(vc, currcons, 1); + if (!*vc->vc_uni_pagedir_loc) + con_set_default_unimap(vc); q = (long)kmalloc(screenbuf_size, GFP_KERNEL); if (!q) { kfree((char *) p); @@ -741,7 +728,7 @@ int vc_allocate(unsigned int currcons) /* return 0 on success */ } screenbuf = (unsigned short *) q; kmalloced = 1; - vc_init(currcons, vc_cons[currcons].d->vc_rows, vc_cons[currcons].d->vc_cols, 1); + vc_init(vc, vc->vc_rows, vc->vc_cols, 1); if (!pm_con) { pm_con = pm_register(PM_SYS_DEV, @@ -752,13 +739,13 @@ int vc_allocate(unsigned int currcons) /* 
return 0 on success */ return 0; } -inline int resize_screen(int currcons, int width, int height) +inline int resize_screen(struct vc_data *vc, int width, int height) { /* Resizes the resolution of the display adapater */ int err = 0; if (vcmode != KD_GRAPHICS && sw->con_resize) - err = sw->con_resize(vc_cons[currcons].d, width, height); + err = sw->con_resize(vc, width, height); return err; } @@ -769,7 +756,7 @@ inline int resize_screen(int currcons, int width, int height) */ #define VC_RESIZE_MAXCOL (32767) #define VC_RESIZE_MAXROW (32767) -int vc_resize(int currcons, unsigned int cols, unsigned int lines) +int vc_resize(struct vc_data *vc, unsigned int cols, unsigned int lines) { unsigned long old_origin, new_origin, new_scr_end, rlth, rrem, err = 0; unsigned int old_cols, old_rows, old_row_size, old_screen_size; @@ -778,38 +765,38 @@ int vc_resize(int currcons, unsigned int cols, unsigned int lines) WARN_CONSOLE_UNLOCKED(); - if (!vc_cons_allocated(currcons)) + if (!vc) return -ENXIO; if (cols > VC_RESIZE_MAXCOL || lines > VC_RESIZE_MAXROW) return -EINVAL; - new_cols = (cols ? cols : vc_cons[currcons].d->vc_cols); - new_rows = (lines ? lines : vc_cons[currcons].d->vc_rows); + new_cols = (cols ? cols : vc->vc_cols); + new_rows = (lines ? 
lines : vc->vc_rows); new_row_size = new_cols << 1; new_screen_size = new_row_size * new_rows; - if (new_cols == vc_cons[currcons].d->vc_cols && new_rows == vc_cons[currcons].d->vc_rows) + if (new_cols == vc->vc_cols && new_rows == vc->vc_rows) return 0; newscreen = (unsigned short *) kmalloc(new_screen_size, GFP_USER); if (!newscreen) return -ENOMEM; - old_rows = vc_cons[currcons].d->vc_rows; - old_cols = vc_cons[currcons].d->vc_cols; - old_row_size = vc_cons[currcons].d->vc_size_row; + old_rows = vc->vc_rows; + old_cols = vc->vc_cols; + old_row_size = vc->vc_size_row; old_screen_size = screenbuf_size; - err = resize_screen(currcons, new_cols, new_rows); + err = resize_screen(vc, new_cols, new_rows); if (err) { kfree(newscreen); return err; } - vc_cons[currcons].d->vc_rows = new_rows; - vc_cons[currcons].d->vc_cols = new_cols; - vc_cons[currcons].d->vc_size_row = new_row_size; + vc->vc_rows = new_rows; + vc->vc_cols = new_cols; + vc->vc_size_row = new_row_size; screenbuf_size = new_screen_size; rlth = min(old_row_size, new_row_size); @@ -820,7 +807,7 @@ int vc_resize(int currcons, unsigned int cols, unsigned int lines) if (new_rows < old_rows) old_origin += (old_rows - new_rows) * old_row_size; - update_attr(currcons); + update_attr(vc); while (old_origin < scr_end) { scr_memcpyw((unsigned short *) new_origin, (unsigned short *) old_origin, rlth); @@ -836,29 +823,29 @@ int vc_resize(int currcons, unsigned int cols, unsigned int lines) screenbuf = newscreen; kmalloced = 1; screenbuf_size = new_screen_size; - set_origin(currcons); + set_origin(vc); /* do part of a reset_terminal() */ top = 0; - bottom = vc_cons[currcons].d->vc_rows; - gotoxy(vc_cons[currcons].d, x, y); - save_cur(currcons); + bottom = vc->vc_rows; + gotoxy(vc, x, y); + save_cur(vc); - if (vc_cons[currcons].d->vc_tty) { - struct winsize ws, *cws = &vc_cons[currcons].d->vc_tty->winsize; + if (vc->vc_tty) { + struct winsize ws, *cws = &vc->vc_tty->winsize; memset(&ws, 0, sizeof(ws)); - ws.ws_row = 
vc_cons[currcons].d->vc_rows; - ws.ws_col = vc_cons[currcons].d->vc_cols; + ws.ws_row = vc->vc_rows; + ws.ws_col = vc->vc_cols; ws.ws_ypixel = video_scan_lines; if ((ws.ws_row != cws->ws_row || ws.ws_col != cws->ws_col) && - vc_cons[currcons].d->vc_tty->pgrp > 0) - kill_pg(vc_cons[currcons].d->vc_tty->pgrp, SIGWINCH, 1); + vc->vc_tty->pgrp > 0) + kill_pg(vc->vc_tty->pgrp, SIGWINCH, 1); *cws = ws; } - if (IS_VISIBLE) - update_screen(currcons); + if (CON_IS_VISIBLE(vc)) + update_screen(vc); return err; } @@ -868,12 +855,13 @@ void vc_disallocate(unsigned int currcons) WARN_CONSOLE_UNLOCKED(); if (vc_cons_allocated(currcons)) { - sw->con_deinit(vc_cons[currcons].d); - if (kmalloced) - kfree(screenbuf); - if (currcons >= MIN_NR_CONSOLES) - kfree(vc_cons[currcons].d); - vc_cons[currcons].d = NULL; + struct vc_data *vc = vc_cons[currcons].d; + sw->con_deinit(vc); + if (kmalloced) + kfree(screenbuf); + if (currcons >= MIN_NR_CONSOLES) + kfree(vc); + vc_cons[currcons].d = NULL; } } @@ -881,9 +869,9 @@ void vc_disallocate(unsigned int currcons) * VT102 emulator */ -#define set_kbd(x) set_vc_kbd_mode(kbd_table+currcons,x) -#define clr_kbd(x) clr_vc_kbd_mode(kbd_table+currcons,x) -#define is_kbd(x) vc_kbd_mode(kbd_table+currcons,x) +#define set_kbd(vc, x) set_vc_kbd_mode(kbd_table + (vc)->vc_num, (x)) +#define clr_kbd(vc, x) clr_vc_kbd_mode(kbd_table + (vc)->vc_num, (x)) +#define is_kbd(vc, x) vc_kbd_mode(kbd_table + (vc)->vc_num, (x)) #define decarm VC_REPEAT #define decckm VC_CKMODE @@ -943,64 +931,60 @@ static void gotoxy(struct vc_data *vc, int new_x, int new_y) } /* for absolute user moves, when decom is set */ -static void gotoxay(int currcons, int new_x, int new_y) +static void gotoxay(struct vc_data *vc, int new_x, int new_y) { - gotoxy(vc_cons[currcons].d, new_x, decom ? (top+new_y) : new_y); + gotoxy(vc, new_x, decom ? 
(top+new_y) : new_y); } -void scrollback(int lines) +void scrollback(struct vc_data *vc, int lines) { - int currcons = fg_console; - if (!lines) - lines = vc_cons[currcons].d->vc_rows/2; + lines = vc->vc_rows / 2; scrolldelta(-lines); } -void scrollfront(int lines) +void scrollfront(struct vc_data *vc, int lines) { - int currcons = fg_console; - if (!lines) - lines = vc_cons[currcons].d->vc_rows/2; + lines = vc->vc_rows / 2; scrolldelta(lines); } -static void lf(int currcons) +static void lf(struct vc_data *vc) { /* don't scroll if above bottom of scrolling region, or * if below scrolling region */ if (y+1 == bottom) - scrup(currcons,top,bottom,1); - else if (y < vc_cons[currcons].d->vc_rows-1) { + scrup(vc, top, bottom, 1); + else if (y < vc->vc_rows - 1) { y++; - pos += vc_cons[currcons].d->vc_size_row; + pos += vc->vc_size_row; } need_wrap = 0; } -static void ri(int currcons) +static void ri(struct vc_data *vc) { /* don't scroll if below top of scrolling region, or * if above scrolling region */ if (y == top) - scrdown(currcons,top,bottom,1); + scrdown(vc, top, bottom, 1); else if (y > 0) { y--; - pos -= vc_cons[currcons].d->vc_size_row; + pos -= vc->vc_size_row; } need_wrap = 0; } -static inline void cr(int currcons) +static inline void cr(struct vc_data *vc) { pos -= x<<1; need_wrap = x = 0; } -static inline void bs(int currcons) +static inline void bs(struct vc_data *vc) { if (x) { pos -= 2; @@ -1009,12 +993,12 @@ static inline void bs(int currcons) } } -static inline void del(int currcons) +static inline void del(struct vc_data *vc) { /* ignored */ } -static void csi_J(int currcons, int vpar) +static void csi_J(struct vc_data *vc, int vpar) { unsigned int count; unsigned short * start; @@ -1023,33 +1007,33 @@ static void csi_J(int currcons, int vpar) case 0: /* erase from cursor to end of display */ count = (scr_end-pos)>>1; start = (unsigned short *) pos; - if (DO_UPDATE) { + if (DO_UPDATE(vc)) { /* do in two stages */ - sw->con_clear(vc_cons[currcons].d, 
y, x, 1, - vc_cons[currcons].d->vc_cols-x); - sw->con_clear(vc_cons[currcons].d, y+1, 0, - vc_cons[currcons].d->vc_rows-y-1, - vc_cons[currcons].d->vc_cols); + sw->con_clear(vc, y, x, 1, + vc->vc_cols - x); + sw->con_clear(vc, y + 1, 0, + vc->vc_rows - y - 1, + vc->vc_cols); } break; case 1: /* erase from start to cursor */ count = ((pos-origin)>>1)+1; start = (unsigned short *) origin; - if (DO_UPDATE) { + if (DO_UPDATE(vc)) { /* do in two stages */ - sw->con_clear(vc_cons[currcons].d, 0, 0, y, - vc_cons[currcons].d->vc_cols); - sw->con_clear(vc_cons[currcons].d, y, 0, 1, + sw->con_clear(vc, 0, 0, y, + vc->vc_cols); + sw->con_clear(vc, y, 0, 1, x + 1); } break; case 2: /* erase whole display */ - count = vc_cons[currcons].d->vc_cols * vc_cons[currcons].d->vc_rows; + count = vc->vc_cols * vc->vc_rows; start = (unsigned short *) origin; - if (DO_UPDATE) - sw->con_clear(vc_cons[currcons].d, 0, 0, - vc_cons[currcons].d->vc_rows, - vc_cons[currcons].d->vc_cols); + if (DO_UPDATE(vc)) + sw->con_clear(vc, 0, 0, + vc->vc_rows, + vc->vc_cols); break; default: return; @@ -1058,32 +1042,32 @@ static void csi_J(int currcons, int vpar) need_wrap = 0; } -static void csi_K(int currcons, int vpar) +static void csi_K(struct vc_data *vc, int vpar) { unsigned int count; unsigned short * start; switch (vpar) { case 0: /* erase from cursor to end of line */ - count = vc_cons[currcons].d->vc_cols-x; + count = vc->vc_cols - x; start = (unsigned short *) pos; - if (DO_UPDATE) - sw->con_clear(vc_cons[currcons].d, y, x, 1, - vc_cons[currcons].d->vc_cols-x); + if (DO_UPDATE(vc)) + sw->con_clear(vc, y, x, 1, + vc->vc_cols - x); break; case 1: /* erase from start of line to cursor */ start = (unsigned short *) (pos - (x<<1)); count = x+1; - if (DO_UPDATE) - sw->con_clear(vc_cons[currcons].d, y, 0, 1, + if (DO_UPDATE(vc)) + sw->con_clear(vc, y, 0, 1, x + 1); break; case 2: /* erase whole line */ start = (unsigned short *) (pos - (x<<1)); - count = vc_cons[currcons].d->vc_cols; - if (DO_UPDATE) 
- sw->con_clear(vc_cons[currcons].d, y, 0, 1, - vc_cons[currcons].d->vc_cols); + count = vc->vc_cols; + if (DO_UPDATE(vc)) + sw->con_clear(vc, y, 0, 1, + vc->vc_cols); break; default: return; @@ -1092,21 +1076,21 @@ static void csi_K(int currcons, int vpar) need_wrap = 0; } -static void csi_X(int currcons, int vpar) /* erase the following vpar positions */ +static void csi_X(struct vc_data *vc, int vpar) /* erase the following vpar positions */ { /* not vt100? */ int count; if (!vpar) vpar++; - count = (vpar > vc_cons[currcons].d->vc_cols-x) ? (vc_cons[currcons].d->vc_cols-x) : vpar; + count = (vpar > vc->vc_cols - x) ? (vc->vc_cols - x) : vpar; scr_memsetw((unsigned short *) pos, video_erase_char, 2 * count); - if (DO_UPDATE) - sw->con_clear(vc_cons[currcons].d, y, x, 1, count); + if (DO_UPDATE(vc)) + sw->con_clear(vc, y, x, 1, count); need_wrap = 0; } -static void default_attr(int currcons) +static void default_attr(struct vc_data *vc) { intensity = 1; underline = 0; @@ -1116,14 +1100,14 @@ static void default_attr(int currcons) } /* console_sem is held */ -static void csi_m(int currcons) +static void csi_m(struct vc_data *vc) { int i; for (i=0;i<=npar;i++) switch (par[i]) { case 0: /* all attributes off */ - default_attr(currcons); + default_attr(vc); break; case 1: intensity = 2; @@ -1147,7 +1131,7 @@ static void csi_m(int currcons) */ translate = set_translate(charset == 0 ? G0_charset - : G1_charset,currcons); + : G1_charset, vc); disp_ctrl = 0; toggle_meta = 0; break; @@ -1155,7 +1139,7 @@ static void csi_m(int currcons) * Select first alternate font, lets * chars < 32 be displayed as ROM chars. */ - translate = set_translate(IBMPC_MAP,currcons); + translate = set_translate(IBMPC_MAP, vc); disp_ctrl = 1; toggle_meta = 0; break; @@ -1163,7 +1147,7 @@ static void csi_m(int currcons) * Select second alternate font, toggle * high bit before displaying as ROM char. 
*/ - translate = set_translate(IBMPC_MAP,currcons); + translate = set_translate(IBMPC_MAP, vc); disp_ctrl = 1; toggle_meta = 1; break; @@ -1208,7 +1192,7 @@ static void csi_m(int currcons) | foreground; break; } - update_attr(currcons); + update_attr(vc); } static void respond_string(const char *p, struct tty_struct *tty) @@ -1220,7 +1204,7 @@ static void respond_string(const char *p, struct tty_struct *tty) con_schedule_flip(tty); } -static void cursor_report(int currcons, struct tty_struct *tty) +static void cursor_report(struct vc_data *vc, struct tty_struct *tty) { char buf[40]; @@ -1250,13 +1234,13 @@ void mouse_report(struct tty_struct *tty, int butt, int mrx, int mry) /* invoked via ioctl(TIOCLINUX) and through set_selection */ int mouse_reporting(void) { - int currcons = fg_console; + struct vc_data *vc = vc_cons[fg_console].d; return report_mouse; } /* console_sem is held */ -static void set_mode(int currcons, int on_off) +static void set_mode(struct vc_data *vc, int on_off) { int i; @@ -1264,14 +1248,14 @@ static void set_mode(int currcons, int on_off) if (ques) switch(par[i]) { /* DEC private modes set/reset */ case 1: /* Cursor keys send ^[Ox/^[[x */ if (on_off) - set_kbd(decckm); + set_kbd(vc, decckm); else - clr_kbd(decckm); + clr_kbd(vc, decckm); break; case 3: /* 80/132 mode switch unimplemented */ deccolm = on_off; #if 0 - (void) vc_resize(deccolm ? 132 : 80, vc_cons[currcons].d->vc_rows); + vc_resize(deccolm ? 
132 : 80, vc->vc_rows); /* this alone does not suffice; some user mode utility has to change the hardware regs */ #endif @@ -1279,22 +1263,22 @@ static void set_mode(int currcons, int on_off) case 5: /* Inverted screen on/off */ if (decscnm != on_off) { decscnm = on_off; - invert_screen(vc_cons[currcons].d, 0, screenbuf_size, 0); - update_attr(currcons); + invert_screen(vc, 0, screenbuf_size, 0); + update_attr(vc); } break; case 6: /* Origin relative/absolute */ decom = on_off; - gotoxay(currcons,0,0); + gotoxay(vc, 0, 0); break; case 7: /* Autowrap on/off */ decawm = on_off; break; case 8: /* Autorepeat on/off */ if (on_off) - set_kbd(decarm); + set_kbd(vc, decarm); else - clr_kbd(decarm); + clr_kbd(vc, decarm); break; case 9: report_mouse = on_off ? 1 : 0; @@ -1314,39 +1298,39 @@ static void set_mode(int currcons, int on_off) break; case 20: /* Lf, Enter == CrLf/Lf */ if (on_off) - set_kbd(lnm); + set_kbd(vc, lnm); else - clr_kbd(lnm); + clr_kbd(vc, lnm); break; } } /* console_sem is held */ -static void setterm_command(int currcons) +static void setterm_command(struct vc_data *vc) { switch(par[0]) { case 1: /* set color for underline mode */ - if (vc_cons[currcons].d->vc_can_do_color && + if (vc->vc_can_do_color && par[1] < 16) { ulcolor = color_table[par[1]]; if (underline) - update_attr(currcons); + update_attr(vc); } break; case 2: /* set color for half intensity mode */ - if (vc_cons[currcons].d->vc_can_do_color && + if (vc->vc_can_do_color && par[1] < 16) { halfcolor = color_table[par[1]]; if (intensity == 0) - update_attr(currcons); + update_attr(vc); } break; case 8: /* store colors as defaults */ def_color = attr; if (hi_font_mask == 0x100) def_color >>= 1; - default_attr(currcons); - update_attr(currcons); + default_attr(vc); + update_attr(vc); break; case 9: /* set blanking interval */ blankinterval = ((par[1] < 60) ? 
par[1] : 60) * 60 * HZ; @@ -1382,49 +1366,49 @@ static void setterm_command(int currcons) } /* console_sem is held */ -static void csi_at(int currcons, unsigned int nr) +static void csi_at(struct vc_data *vc, unsigned int nr) { - if (nr > vc_cons[currcons].d->vc_cols - x) - nr = vc_cons[currcons].d->vc_cols - x; + if (nr > vc->vc_cols - x) + nr = vc->vc_cols - x; else if (!nr) nr = 1; - insert_char(currcons, nr); + insert_char(vc, nr); } /* console_sem is held */ -static void csi_L(int currcons, unsigned int nr) +static void csi_L(struct vc_data *vc, unsigned int nr) { - if (nr > vc_cons[currcons].d->vc_rows - y) - nr = vc_cons[currcons].d->vc_rows - y; + if (nr > vc->vc_rows - y) + nr = vc->vc_rows - y; else if (!nr) nr = 1; - scrdown(currcons,y,bottom,nr); + scrdown(vc, y, bottom, nr); need_wrap = 0; } /* console_sem is held */ -static void csi_P(int currcons, unsigned int nr) +static void csi_P(struct vc_data *vc, unsigned int nr) { - if (nr > vc_cons[currcons].d->vc_cols - x) - nr = vc_cons[currcons].d->vc_cols - x; + if (nr > vc->vc_cols - x) + nr = vc->vc_cols - x; else if (!nr) nr = 1; - delete_char(currcons, nr); + delete_char(vc, nr); } /* console_sem is held */ -static void csi_M(int currcons, unsigned int nr) +static void csi_M(struct vc_data *vc, unsigned int nr) { - if (nr > vc_cons[currcons].d->vc_rows - y) - nr = vc_cons[currcons].d->vc_rows - y; + if (nr > vc->vc_rows - y) + nr = vc->vc_rows - y; else if (!nr) nr=1; - scrup(currcons,y,bottom,nr); + scrup(vc, y, bottom, nr); need_wrap = 0; } /* console_sem is held (except via vc_init->reset_terminal */ -static void save_cur(int currcons) +static void save_cur(struct vc_data *vc) { saved_x = x; saved_y = y; @@ -1439,9 +1423,9 @@ static void save_cur(int currcons) } /* console_sem is held */ -static void restore_cur(int currcons) +static void restore_cur(struct vc_data *vc) { - gotoxy(vc_cons[currcons].d,saved_x,saved_y); + gotoxy(vc, saved_x, saved_y); intensity = s_intensity; underline = s_underline; 
blink = s_blink; @@ -1450,8 +1434,8 @@ static void restore_cur(int currcons) color = s_color; G0_charset = saved_G0; G1_charset = saved_G1; - translate = set_translate(charset ? G1_charset : G0_charset,currcons); - update_attr(currcons); + translate = set_translate(charset ? G1_charset : G0_charset, vc); + update_attr(vc); need_wrap = 0; } @@ -1460,13 +1444,13 @@ enum { ESnormal, ESesc, ESsquare, ESgetpars, ESgotpars, ESfunckey, ESpalette }; /* console_sem is held (except via vc_init()) */ -static void reset_terminal(int currcons, int do_clear) +static void reset_terminal(struct vc_data *vc, int do_clear) { top = 0; - bottom = vc_cons[currcons].d->vc_rows; + bottom = vc->vc_rows; vc_state = ESnormal; ques = 0; - translate = set_translate(LAT1_MAP,currcons); + translate = set_translate(LAT1_MAP, vc); G0_charset = LAT1_MAP; G1_charset = GRAF_MAP; charset = 0; @@ -1484,22 +1468,22 @@ static void reset_terminal(int currcons, int do_clear) deccm = 1; decim = 0; - set_kbd(decarm); - clr_kbd(decckm); - clr_kbd(kbdapplic); - clr_kbd(lnm); - kbd_table[currcons].lockstate = 0; - kbd_table[currcons].slockstate = 0; - kbd_table[currcons].ledmode = LED_SHOW_FLAGS; - kbd_table[currcons].ledflagstate = kbd_table[currcons].default_ledflagstate; + set_kbd(vc, decarm); + clr_kbd(vc, decckm); + clr_kbd(vc, kbdapplic); + clr_kbd(vc, lnm); + kbd_table[vc->vc_num].lockstate = 0; + kbd_table[vc->vc_num].slockstate = 0; + kbd_table[vc->vc_num].ledmode = LED_SHOW_FLAGS; + kbd_table[vc->vc_num].ledflagstate = kbd_table[vc->vc_num].default_ledflagstate; /* do not do set_leds here because this causes an endless tasklet loop when the keyboard hasn't been initialized yet */ cursor_type = CUR_DEFAULT; complement_mask = s_complement_mask; - default_attr(currcons); - update_attr(currcons); + default_attr(vc); + update_attr(vc); tab_stop[0] = 0x01010100; tab_stop[1] = @@ -1510,14 +1494,14 @@ static void reset_terminal(int currcons, int do_clear) bell_pitch = DEFAULT_BELL_PITCH; bell_duration = 
DEFAULT_BELL_DURATION; - gotoxy(vc_cons[currcons].d, 0, 0); - save_cur(currcons); + gotoxy(vc, 0, 0); + save_cur(vc); if (do_clear) - csi_J(currcons,2); + csi_J(vc, 2); } /* console_sem is held */ -static void do_con_trol(struct tty_struct *tty, unsigned int currcons, int c) +static void do_con_trol(struct tty_struct *tty, struct vc_data *vc, int c) { /* * Control characters can be used in the _middle_ @@ -1531,11 +1515,11 @@ static void do_con_trol(struct tty_struct *tty, unsigned int currcons, int c) kd_mksound(bell_pitch, bell_duration); return; case 8: - bs(currcons); + bs(vc); return; case 9: pos -= (x << 1); - while (x < vc_cons[currcons].d->vc_cols - 1) { + while (x < vc->vc_cols - 1) { x++; if (tab_stop[x >> 5] & (1 << (x & 31))) break; @@ -1543,20 +1527,20 @@ static void do_con_trol(struct tty_struct *tty, unsigned int currcons, int c) pos += (x << 1); return; case 10: case 11: case 12: - lf(currcons); - if (!is_kbd(lnm)) + lf(vc); + if (!is_kbd(vc, lnm)) return; case 13: - cr(currcons); + cr(vc); return; case 14: charset = 1; - translate = set_translate(G1_charset,currcons); + translate = set_translate(G1_charset, vc); disp_ctrl = 1; return; case 15: charset = 0; - translate = set_translate(G0_charset,currcons); + translate = set_translate(G0_charset, vc); disp_ctrl = 0; return; case 24: case 26: @@ -1566,7 +1550,7 @@ static void do_con_trol(struct tty_struct *tty, unsigned int currcons, int c) vc_state = ESesc; return; case 127: - del(currcons); + del(vc); return; case 128+27: vc_state = ESsquare; @@ -1586,14 +1570,14 @@ static void do_con_trol(struct tty_struct *tty, unsigned int currcons, int c) vc_state = ESpercent; return; case 'E': - cr(currcons); - lf(currcons); + cr(vc); + lf(vc); return; case 'M': - ri(currcons); + ri(vc); return; case 'D': - lf(currcons); + lf(vc); return; case 'H': tab_stop[x >> 5] |= (1 << (x & 31)); @@ -1602,10 +1586,10 @@ static void do_con_trol(struct tty_struct *tty, unsigned int currcons, int c) respond_ID(tty); return; 
case '7': - save_cur(currcons); + save_cur(vc); return; case '8': - restore_cur(currcons); + restore_cur(vc); return; case '(': vc_state = ESsetG0; @@ -1617,13 +1601,13 @@ static void do_con_trol(struct tty_struct *tty, unsigned int currcons, int c) vc_state = EShash; return; case 'c': - reset_terminal(currcons,1); + reset_terminal(vc, 1); return; case '>': /* Numeric keypad */ - clr_kbd(kbdapplic); + clr_kbd(vc, kbdapplic); return; case '=': /* Appl. keypad */ - set_kbd(kbdapplic); + set_kbd(vc, kbdapplic); return; } return; @@ -1635,7 +1619,7 @@ static void do_con_trol(struct tty_struct *tty, unsigned int currcons, int c) vc_state = ESpalette; return; } else if (c=='R') { /* reset palette */ - reset_palette(currcons); + reset_palette(vc); vc_state = ESnormal; } else vc_state = ESnormal; @@ -1651,7 +1635,7 @@ static void do_con_trol(struct tty_struct *tty, unsigned int currcons, int c) palette[i++] += par[j++]; palette[i] = 16*par[j++]; palette[i] += par[j]; - set_palette(currcons); + set_palette(vc); vc_state = ESnormal; } } else @@ -1682,10 +1666,10 @@ static void do_con_trol(struct tty_struct *tty, unsigned int currcons, int c) vc_state = ESnormal; switch(c) { case 'h': - set_mode(currcons,1); + set_mode(vc, 1); return; case 'l': - set_mode(currcons,0); + set_mode(vc, 0); return; case 'c': if (ques) { @@ -1711,7 +1695,7 @@ static void do_con_trol(struct tty_struct *tty, unsigned int currcons, int c) if (par[0] == 5) status_report(tty); else if (par[0] == 6) - cursor_report(currcons,tty); + cursor_report(vc, tty); } return; } @@ -1722,55 +1706,55 @@ static void do_con_trol(struct tty_struct *tty, unsigned int currcons, int c) switch(c) { case 'G': case '`': if (par[0]) par[0]--; - gotoxy(vc_cons[currcons].d, par[0], y); + gotoxy(vc, par[0], y); return; case 'A': if (!par[0]) par[0]++; - gotoxy(vc_cons[currcons].d, x, y-par[0]); + gotoxy(vc, x, y - par[0]); return; case 'B': case 'e': if (!par[0]) par[0]++; - gotoxy(vc_cons[currcons].d, x, y+par[0]); + gotoxy(vc, 
x, y + par[0]); return; case 'C': case 'a': if (!par[0]) par[0]++; - gotoxy(vc_cons[currcons].d, x+par[0], y); + gotoxy(vc, x + par[0], y); return; case 'D': if (!par[0]) par[0]++; - gotoxy(vc_cons[currcons].d, x-par[0], y); + gotoxy(vc, x - par[0], y); return; case 'E': if (!par[0]) par[0]++; - gotoxy(vc_cons[currcons].d, 0, y+par[0]); + gotoxy(vc, 0, y + par[0]); return; case 'F': if (!par[0]) par[0]++; - gotoxy(vc_cons[currcons].d, 0, y-par[0]); + gotoxy(vc, 0, y - par[0]); return; case 'd': if (par[0]) par[0]--; - gotoxay(currcons,x,par[0]); + gotoxay(vc, x, par[0]); return; case 'H': case 'f': if (par[0]) par[0]--; if (par[1]) par[1]--; - gotoxay(currcons,par[1],par[0]); + gotoxay(vc, par[1], par[0]); return; case 'J': - csi_J(currcons,par[0]); + csi_J(vc, par[0]); return; case 'K': - csi_K(currcons,par[0]); + csi_K(vc, par[0]); return; case 'L': - csi_L(currcons,par[0]); + csi_L(vc, par[0]); return; case 'M': - csi_M(currcons,par[0]); + csi_M(vc, par[0]); return; case 'P': - csi_P(currcons,par[0]); + csi_P(vc, par[0]); return; case 'c': if (!par[0]) @@ -1788,41 +1772,41 @@ static void do_con_trol(struct tty_struct *tty, unsigned int currcons, int c) } return; case 'm': - csi_m(currcons); + csi_m(vc); return; case 'q': /* DECLL - but only 3 leds */ /* map 0,1,2,3 to 0,1,2,4 */ if (par[0] < 4) - setledstate(kbd_table + currcons, + setledstate(kbd_table + vc->vc_num, (par[0] < 3) ? 
par[0] : 4); return; case 'r': if (!par[0]) par[0]++; if (!par[1]) - par[1] = vc_cons[currcons].d->vc_rows; + par[1] = vc->vc_rows; /* Minimum allowed region is 2 lines */ if (par[0] < par[1] && - par[1] <= vc_cons[currcons].d->vc_rows) { + par[1] <= vc->vc_rows) { top=par[0]-1; bottom=par[1]; - gotoxay(currcons,0,0); + gotoxay(vc, 0, 0); } return; case 's': - save_cur(currcons); + save_cur(vc); return; case 'u': - restore_cur(currcons); + restore_cur(vc); return; case 'X': - csi_X(currcons, par[0]); + csi_X(vc, par[0]); return; case '@': - csi_at(currcons,par[0]); + csi_at(vc, par[0]); return; case ']': /* setterm functions */ - setterm_command(currcons); + setterm_command(vc); return; } return; @@ -1847,10 +1831,10 @@ static void do_con_trol(struct tty_struct *tty, unsigned int currcons, int c) /* DEC screen alignment test. kludge :-) */ video_erase_char = (video_erase_char & 0xff00) | 'E'; - csi_J(currcons, 2); + csi_J(vc, 2); video_erase_char = (video_erase_char & 0xff00) | ' '; - do_update_region(vc_cons[currcons].d, origin, screenbuf_size/2); + do_update_region(vc, origin, screenbuf_size / 2); } return; case ESsetG0: @@ -1863,7 +1847,7 @@ static void do_con_trol(struct tty_struct *tty, unsigned int currcons, int c) else if (c == 'K') G0_charset = USER_MAP; if (charset == 0) - translate = set_translate(G0_charset,currcons); + translate = set_translate(G0_charset, vc); vc_state = ESnormal; return; case ESsetG1: @@ -1876,7 +1860,7 @@ static void do_con_trol(struct tty_struct *tty, unsigned int currcons, int c) else if (c == 'K') G1_charset = USER_MAP; if (charset == 1) - translate = set_translate(G1_charset,currcons); + translate = set_translate(G1_charset, vc); vc_state = ESnormal; return; default: @@ -1903,7 +1887,7 @@ static int do_con_write(struct tty_struct *tty, const unsigned char *buf, int co #define FLUSH do { } while(0); #else #define FLUSH if (draw_x >= 0) { \ - sw->con_putcs(vc_cons[currcons].d, (u16 *)draw_from, (u16 *)draw_to-(u16 *)draw_from, y, 
draw_x); \ + sw->con_putcs(vc, (u16 *)draw_from, (u16 *)draw_to-(u16 *)draw_from, y, draw_x); \ draw_x = -1; \ } #endif @@ -1912,6 +1896,7 @@ static int do_con_write(struct tty_struct *tty, const unsigned char *buf, int co unsigned int currcons; unsigned long draw_from = 0, draw_to = 0; struct vt_struct *vt; + struct vc_data *vc; u16 himask, charmask; const unsigned char *orig_buf = NULL; int orig_count; @@ -1940,6 +1925,7 @@ static int do_con_write(struct tty_struct *tty, const unsigned char *buf, int co release_console_sem(); return 0; } + vc = vc_cons[currcons].d; release_console_sem(); orig_buf = buf; @@ -1965,8 +1951,8 @@ static int do_con_write(struct tty_struct *tty, const unsigned char *buf, int co charmask = himask ? 0x1ff : 0xff; /* undraw cursor first */ - if (IS_FG) - hide_cursor(vc_cons[currcons].d); + if (IS_FG(vc)) + hide_cursor(vc); while (!tty->stopped && count) { int orig = *buf; @@ -2034,11 +2020,11 @@ static int do_con_write(struct tty_struct *tty, const unsigned char *buf, int co if (vc_state == ESnormal && ok) { /* Now try to find out how to display it */ - tc = conv_uni_to_pc(vc_cons[currcons].d, tc); + tc = conv_uni_to_pc(vc, tc); if ( tc == -4 ) { /* If we got -4 (not found) then see if we have defined a replacement character (U+FFFD) */ - tc = conv_uni_to_pc(vc_cons[currcons].d, 0xfffd); + tc = conv_uni_to_pc(vc, 0xfffd); /* One reason for the -4 can be that we just did a clear_unimap(); @@ -2055,20 +2041,20 @@ static int do_con_write(struct tty_struct *tty, const unsigned char *buf, int co if (need_wrap || decim) FLUSH if (need_wrap) { - cr(currcons); - lf(currcons); + cr(vc); + lf(vc); } if (decim) - insert_char(currcons, 1); + insert_char(vc, 1); scr_writew(himask ? ((attr << 8) & ~himask) + ((tc & 0x100) ? 
himask : 0) + (tc & 0xff) : (attr << 8) + tc, (u16 *) pos); - if (DO_UPDATE && draw_x < 0) { + if (DO_UPDATE(vc) && draw_x < 0) { draw_x = x; draw_from = pos; } - if (x == vc_cons[currcons].d->vc_cols - 1) { + if (x == vc->vc_cols - 1) { need_wrap = decawm; draw_to = pos+2; } else { @@ -2078,7 +2064,7 @@ static int do_con_write(struct tty_struct *tty, const unsigned char *buf, int co continue; } FLUSH - do_con_trol(tty, currcons, orig); + do_con_trol(tty, vc, orig); } FLUSH console_conditional_schedule(); @@ -2106,7 +2092,7 @@ static void console_callback(void *ignored) if (want_console != fg_console && vc_cons_allocated(want_console)) { hide_cursor(vc_cons[fg_console].d); - change_console(want_console); + change_console(vc_cons[want_console].d); /* we only changed when the console had already been allocated - a new console is not created in an interrupt routine */ @@ -2118,10 +2104,10 @@ static void console_callback(void *ignored) poke_blanked_console(); } if (scrollback_delta) { - int currcons = fg_console; + struct vc_data *vc = vc_cons[fg_console].d; clear_selection(); if (vcmode == KD_TEXT) - sw->con_scrolldelta(vc_cons[currcons].d, scrollback_delta); + sw->con_scrolldelta(vc, scrollback_delta); scrollback_delta = 0; } if (blank_timer_expired) { @@ -2150,7 +2136,7 @@ struct tty_driver *console_driver; void vt_console_print(struct console *co, const char *b, unsigned count) { - int currcons = fg_console; + struct vc_data *vc = vc_cons[fg_console].d; unsigned char c; static unsigned long printing; const ushort *start; @@ -2162,13 +2148,13 @@ void vt_console_print(struct console *co, const char *b, unsigned count) return; if (kmsg_redirect && vc_cons_allocated(kmsg_redirect - 1)) - currcons = kmsg_redirect - 1; + vc = vc_cons[kmsg_redirect - 1].d; /* read `x' only after setting currcons properly (otherwise the `x' macro will read the x of the foreground console). 
*/ myx = x; - if (!vc_cons_allocated(currcons)) { + if (!vc_cons_allocated(fg_console)) { /* impossible */ /* printk("vt_console_print: tty %d not allocated ??\n", currcons+1); */ goto quit; @@ -2178,8 +2164,8 @@ void vt_console_print(struct console *co, const char *b, unsigned count) goto quit; /* undraw cursor first */ - if (IS_FG) - hide_cursor(vc_cons[currcons].d); + if (IS_FG(vc)) + hide_cursor(vc); start = (ushort *)pos; @@ -2189,22 +2175,22 @@ void vt_console_print(struct console *co, const char *b, unsigned count) c = *b++; if (c == 10 || c == 13 || c == 8 || need_wrap) { if (cnt > 0) { - if (IS_VISIBLE) - sw->con_putcs(vc_cons[currcons].d, start, cnt, y, x); + if (CON_IS_VISIBLE(vc)) + sw->con_putcs(vc, start, cnt, y, x); x += cnt; if (need_wrap) x--; cnt = 0; } if (c == 8) { /* backspace */ - bs(currcons); + bs(vc); start = (ushort *)pos; myx = x; continue; } if (c != 13) - lf(currcons); - cr(currcons); + lf(vc); + cr(vc); start = (ushort *)pos; myx = x; if (c == 10 || c == 13) @@ -2212,7 +2198,7 @@ void vt_console_print(struct console *co, const char *b, unsigned count) } scr_writew((attr << 8) + c, (unsigned short *) pos); cnt++; - if (myx == vc_cons[currcons].d->vc_cols - 1) { + if (myx == vc->vc_cols - 1) { need_wrap = 1; continue; } @@ -2220,15 +2206,15 @@ void vt_console_print(struct console *co, const char *b, unsigned count) myx++; } if (cnt > 0) { - if (IS_VISIBLE) - sw->con_putcs(vc_cons[currcons].d, start, cnt, y, x); + if (CON_IS_VISIBLE(vc)) + sw->con_putcs(vc, start, cnt, y, x); x += cnt; - if (x == vc_cons[currcons].d->vc_cols) { + if (x == vc->vc_cols) { x--; need_wrap = 1; } } - set_cursor(vc_cons[currcons].d); + set_cursor(vc); if (!oops_in_progress) poke_blanked_console(); @@ -2333,7 +2319,7 @@ int tioclinux(struct tty_struct *tty, unsigned long arg) if (get_user(lines, (s32 __user *)(p+4))) { ret = -EFAULT; } else { - scrollfront(lines); + scrollfront(vc_cons[fg_console].d, lines); ret = 0; } break; @@ -2457,9 +2443,10 @@ static int 
con_open(struct tty_struct *tty, struct file *filp) if (tty->count == 1) { ret = vc_allocate(currcons); if (ret == 0) { + struct vc_data *vc = vc_cons[currcons].d; vt_cons[currcons]->vc_num = currcons; tty->driver_data = vt_cons[currcons]; - vc_cons[currcons].d->vc_tty = tty; + vc->vc_tty = tty; if (!tty->winsize.ws_row && !tty->winsize.ws_col) { tty->winsize.ws_row = vc_cons[currcons].d->vc_rows; @@ -2505,29 +2492,29 @@ static void con_close(struct tty_struct *tty, struct file *filp) up(&tty_sem); } -static void vc_init(unsigned int currcons, unsigned int rows, - unsigned int cols, int do_clear) +static void vc_init(struct vc_data *vc, unsigned int rows, + unsigned int cols, int do_clear) { int j, k ; - vc_cons[currcons].d->vc_cols = cols; - vc_cons[currcons].d->vc_rows = rows; - vc_cons[currcons].d->vc_size_row = cols<<1; - screenbuf_size = vc_cons[currcons].d->vc_rows * vc_cons[currcons].d->vc_size_row; + vc->vc_cols = cols; + vc->vc_rows = rows; + vc->vc_size_row = cols << 1; + screenbuf_size = vc->vc_rows * vc->vc_size_row; - set_origin(currcons); + set_origin(vc); pos = origin; - reset_vc(currcons); + reset_vc(vc); for (j=k=0; j<16; j++) { - vc_cons[currcons].d->vc_palette[k++] = default_red[j] ; - vc_cons[currcons].d->vc_palette[k++] = default_grn[j] ; - vc_cons[currcons].d->vc_palette[k++] = default_blu[j] ; + vc->vc_palette[k++] = default_red[j] ; + vc->vc_palette[k++] = default_grn[j] ; + vc->vc_palette[k++] = default_blu[j] ; } def_color = 0x07; /* white */ ulcolor = 0x0f; /* bold white */ halfcolor = 0x08; /* grey */ - init_waitqueue_head(&vt_cons[currcons]->paste_wait); - reset_terminal(currcons, do_clear); + init_waitqueue_head(&vt_cons[vc->vc_num]->paste_wait); + reset_terminal(vc, do_clear); } /* @@ -2539,6 +2526,7 @@ static void vc_init(unsigned int currcons, unsigned int rows, static int __init con_init(void) { const char *display_desc = NULL; + struct vc_data *vc; unsigned int currcons = 0; acquire_console_sem(); @@ -2562,27 +2550,27 @@ static 
int __init con_init(void) * kmalloc is not running yet - we use the bootmem allocator. */ for (currcons = 0; currcons < MIN_NR_CONSOLES; currcons++) { - vc_cons[currcons].d = (struct vc_data *) + vc_cons[currcons].d = vc = (struct vc_data *) alloc_bootmem(sizeof(struct vc_data)); vt_cons[currcons] = (struct vt_struct *) alloc_bootmem(sizeof(struct vt_struct)); vc_cons[currcons].d->vc_vt = vt_cons[currcons]; - visual_init(currcons, 1); + visual_init(vc, currcons, 1); screenbuf = (unsigned short *) alloc_bootmem(screenbuf_size); kmalloced = 0; - vc_init(currcons, vc_cons[currcons].d->vc_rows, vc_cons[currcons].d->vc_cols, + vc_init(vc, vc->vc_rows, vc->vc_cols, currcons || !sw->con_save_screen); } currcons = fg_console = 0; - master_display_fg = vc_cons[currcons].d; - set_origin(currcons); - save_screen(currcons); - gotoxy(vc_cons[currcons].d, x, y); - csi_J(currcons, 0); - update_screen(fg_console); + master_display_fg = vc = vc_cons[currcons].d; + set_origin(vc); + save_screen(vc); + gotoxy(vc, x, y); + csi_J(vc, 0); + update_screen(vc); printk("Console: %s %s %dx%d", - vc_cons[currcons].d->vc_can_do_color ? "colour" : "mono", - display_desc, vc_cons[currcons].d->vc_cols, vc_cons[currcons].d->vc_rows); + vc->vc_can_do_color ? 
"colour" : "mono", + display_desc, vc->vc_cols, vc->vc_rows); printable = 1; printk("\n"); @@ -2676,37 +2664,37 @@ int take_over_console(const struct consw *csw, int first, int last, int deflt) for (i = first; i <= last; i++) { int old_was_color; - int currcons = i; + struct vc_data *vc = vc_cons[i].d; if (con_driver_map[i]) module_put(con_driver_map[i]->owner); __module_get(owner); con_driver_map[i] = csw; - if (!vc_cons[i].d || !vc_cons[i].d->vc_sw) + if (!vc || !vc->vc_sw) continue; j = i; - if (IS_VISIBLE) - save_screen(i); - old_was_color = vc_cons[i].d->vc_can_do_color; - vc_cons[i].d->vc_sw->con_deinit(vc_cons[i].d); + if (CON_IS_VISIBLE(vc)) + save_screen(vc); + old_was_color = vc->vc_can_do_color; + vc->vc_sw->con_deinit(vc); origin = (unsigned long) screenbuf; visible_origin = origin; scr_end = origin + screenbuf_size; - pos = origin + vc_cons[currcons].d->vc_size_row*y + 2*x; - visual_init(i, 0); - update_attr(i); + pos = origin + vc->vc_size_row * y + 2 * x; + visual_init(vc, i, 0); + update_attr(vc); /* If the console changed between mono <-> color, then * the attributes in the screenbuf will be wrong. The * following resets all attributes to something sane. */ - if (old_was_color != vc_cons[i].d->vc_can_do_color) - clear_buffer_attributes(i); + if (old_was_color != vc->vc_can_do_color) + clear_buffer_attributes(vc); - if (IS_VISIBLE) - update_screen(i); + if (CON_IS_VISIBLE(vc)) + update_screen(vc); } printk("Console: switching "); if (!deflt) @@ -2773,7 +2761,7 @@ static void vesa_powerdown(void) void do_blank_screen(int entering_gfx) { - int currcons = fg_console; + struct vc_data *vc = vc_cons[fg_console].d; int i; WARN_CONSOLE_UNLOCKED(); @@ -2792,11 +2780,11 @@ void do_blank_screen(int entering_gfx) /* entering graphics mode? 
*/ if (entering_gfx) { - hide_cursor(vc_cons[currcons].d); - save_screen(currcons); - sw->con_blank(vc_cons[currcons].d, -1, 1); + hide_cursor(vc); + save_screen(vc); + sw->con_blank(vc, -1, 1); console_blanked = fg_console + 1; - set_origin(currcons); + set_origin(vc); return; } @@ -2806,16 +2794,16 @@ void do_blank_screen(int entering_gfx) return; } - hide_cursor(vc_cons[currcons].d); + hide_cursor(vc); del_timer_sync(&console_timer); blank_timer_expired = 0; - save_screen(currcons); + save_screen(vc); /* In case we need to reset origin, blanking hook returns 1 */ - i = sw->con_blank(vc_cons[currcons].d, 1, 0); + i = sw->con_blank(vc, 1, 0); console_blanked = fg_console + 1; if (i) - set_origin(currcons); + set_origin(vc); if (console_blank_hook && console_blank_hook(1)) return; @@ -2826,7 +2814,7 @@ void do_blank_screen(int entering_gfx) } if (vesa_blank_mode) - sw->con_blank(vc_cons[currcons].d, vesa_blank_mode + 1, 0); + sw->con_blank(vc, vesa_blank_mode + 1, 0); } EXPORT_SYMBOL(do_blank_screen); @@ -2835,7 +2823,7 @@ EXPORT_SYMBOL(do_blank_screen); */ void do_unblank_screen(int leaving_gfx) { - int currcons; + struct vc_data *vc; WARN_CONSOLE_UNLOCKED(); @@ -2847,7 +2835,7 @@ void do_unblank_screen(int leaving_gfx) printk("unblank_screen: tty %d not allocated ??\n", fg_console+1); return; } - currcons = fg_console; + vc = vc_cons[fg_console].d; if (vcmode != KD_TEXT) return; /* but leave console_blanked != 0 */ @@ -2857,13 +2845,13 @@ void do_unblank_screen(int leaving_gfx) } console_blanked = 0; - if (sw->con_blank(vc_cons[currcons].d, 0, leaving_gfx)) + if (sw->con_blank(vc, 0, leaving_gfx)) /* Low-level driver cannot restore -> do it ourselves */ - update_screen(fg_console); + update_screen(vc); if (console_blank_hook) console_blank_hook(0); - set_palette(currcons); - set_cursor(vc_cons[fg_console].d); + set_palette(vc); + set_cursor(vc); } EXPORT_SYMBOL(do_unblank_screen); @@ -2913,12 +2901,12 @@ void poke_blanked_console(void) * Palettes */ -void 
set_palette(int currcons) +void set_palette(struct vc_data *vc) { WARN_CONSOLE_UNLOCKED(); if (vcmode != KD_GRAPHICS) - sw->con_set_palette(vc_cons[currcons].d, color_table); + sw->con_set_palette(vc, color_table); } static int set_get_cmap(unsigned char __user *arg, int set) @@ -2945,7 +2933,7 @@ static int set_get_cmap(unsigned char __user *arg, int set) vc_cons[i].d->vc_palette[k++] = default_grn[j]; vc_cons[i].d->vc_palette[k++] = default_blu[j]; } - set_palette(i); + set_palette(vc_cons[i].d); } } return 0; @@ -2978,7 +2966,7 @@ int con_get_cmap(unsigned char __user *arg) return rc; } -void reset_palette(int currcons) +void reset_palette(struct vc_data *vc) { int j, k; for (j=k=0; j<16; j++) { @@ -2986,7 +2974,7 @@ void reset_palette(int currcons) palette[k++] = default_grn[j]; palette[k++] = default_blu[j]; } - set_palette(currcons); + set_palette(vc); } /* @@ -3004,13 +2992,13 @@ void reset_palette(int currcons) #define max_font_size 65536 -int con_font_get(int currcons, struct console_font_op *op) +int con_font_get(struct vc_data *vc, struct console_font_op *op) { struct console_font font; int rc = -EINVAL; int c; - if (vt_cons[currcons]->vc_mode != KD_TEXT) + if (vt_cons[vc->vc_num]->vc_mode != KD_TEXT) return -EINVAL; if (op->data) { @@ -3022,7 +3010,7 @@ int con_font_get(int currcons, struct console_font_op *op) acquire_console_sem(); if (sw->con_font_get) - rc = sw->con_font_get(vc_cons[currcons].d, &font); + rc = sw->con_font_get(vc, &font); else rc = -ENOSYS; release_console_sem(); @@ -3059,13 +3047,13 @@ out: return rc; } -int con_font_set(int currcons, struct console_font_op *op) +int con_font_set(struct vc_data *vc, struct console_font_op *op) { struct console_font font; int rc = -EINVAL; int size; - if (vt_cons[currcons]->vc_mode != KD_TEXT) + if (vt_cons[vc->vc_num]->vc_mode != KD_TEXT) return -EINVAL; if (!op->data) return -EINVAL; @@ -3108,7 +3096,7 @@ int con_font_set(int currcons, struct console_font_op *op) } acquire_console_sem(); if 
(sw->con_font_set) - rc = sw->con_font_set(vc_cons[currcons].d, &font, op->flags); + rc = sw->con_font_set(vc, &font, op->flags); else rc = -ENOSYS; release_console_sem(); @@ -3116,14 +3104,14 @@ int con_font_set(int currcons, struct console_font_op *op) return rc; } -int con_font_default(int currcons, struct console_font_op *op) +int con_font_default(struct vc_data *vc, struct console_font_op *op) { struct console_font font = {.width = op->width, .height = op->height}; char name[MAX_FONT_NAME]; char *s = name; int rc; - if (vt_cons[currcons]->vc_mode != KD_TEXT) + if (vt_cons[vc->vc_num]->vc_mode != KD_TEXT) return -EINVAL; if (!op->data) @@ -3135,7 +3123,7 @@ int con_font_default(int currcons, struct console_font_op *op) acquire_console_sem(); if (sw->con_font_default) - rc = sw->con_font_default(vc_cons[currcons].d, &font, s); + rc = sw->con_font_default(vc, &font, s); else rc = -ENOSYS; release_console_sem(); @@ -3146,17 +3134,15 @@ int con_font_default(int currcons, struct console_font_op *op) return rc; } -int con_font_copy(int currcons, struct console_font_op *op) +int con_font_copy(struct vc_data *vc, struct console_font_op *op) { int con = op->height; - struct vc_data *vc; int rc; - if (vt_cons[currcons]->vc_mode != KD_TEXT) + if (vt_cons[vc->vc_num]->vc_mode != KD_TEXT) return -EINVAL; acquire_console_sem(); - vc = vc_cons[currcons].d; if (!sw->con_font_copy) rc = -ENOSYS; else if (con < 0 || !vc_cons_allocated(con)) @@ -3169,17 +3155,17 @@ int con_font_copy(int currcons, struct console_font_op *op) return rc; } -int con_font_op(int currcons, struct console_font_op *op) +int con_font_op(struct vc_data *vc, struct console_font_op *op) { switch (op->op) { case KD_FONT_OP_SET: - return con_font_set(currcons, op); + return con_font_set(vc, op); case KD_FONT_OP_GET: - return con_font_get(currcons, op); + return con_font_get(vc, op); case KD_FONT_OP_SET_DEFAULT: - return con_font_default(currcons, op); + return con_font_default(vc, op); case KD_FONT_OP_COPY: - 
return con_font_copy(currcons, op); + return con_font_copy(vc, op); } return -ENOSYS; } diff --git a/drivers/char/vt_ioctl.c b/drivers/char/vt_ioctl.c index ad3a5d3d394c..3c55de997496 100644 --- a/drivers/char/vt_ioctl.c +++ b/drivers/char/vt_ioctl.c @@ -311,7 +311,7 @@ do_fontx_ioctl(int cmd, struct consolefontdesc __user *user_cfd, int perm, struc op->height = cfdarg.charheight; op->charcount = cfdarg.charcount; op->data = cfdarg.chardata; - return con_font_op(fg_console, op); + return con_font_op(vc_cons[fg_console].d, op); case GIO_FONTX: { op->op = KD_FONT_OP_GET; op->flags = KD_FONT_FLAG_OLD; @@ -319,7 +319,7 @@ do_fontx_ioctl(int cmd, struct consolefontdesc __user *user_cfd, int perm, struc op->height = cfdarg.charheight; op->charcount = cfdarg.charcount; op->data = cfdarg.chardata; - i = con_font_op(fg_console, op); + i = con_font_op(vc_cons[fg_console].d, op); if (i) return i; cfdarg.charheight = op->height; @@ -333,7 +333,7 @@ do_fontx_ioctl(int cmd, struct consolefontdesc __user *user_cfd, int perm, struc } static inline int -do_unimap_ioctl(int cmd, struct unimapdesc __user *user_ud, int perm, unsigned int console) +do_unimap_ioctl(int cmd, struct unimapdesc __user *user_ud, int perm, struct vc_data *vc) { struct unimapdesc tmp; int i = 0; @@ -349,11 +349,11 @@ do_unimap_ioctl(int cmd, struct unimapdesc __user *user_ud, int perm, unsigned i case PIO_UNIMAP: if (!perm) return -EPERM; - return con_set_unimap(console, tmp.entry_ct, tmp.entries); + return con_set_unimap(vc, tmp.entry_ct, tmp.entries); case GIO_UNIMAP: - if (!perm && fg_console != console) + if (!perm && fg_console != vc->vc_num) return -EPERM; - return con_get_unimap(console, tmp.entry_ct, &(user_ud->entry_ct), tmp.entries); + return con_get_unimap(vc, tmp.entry_ct, &(user_ud->entry_ct), tmp.entries); } return 0; } @@ -796,7 +796,7 @@ int vt_ioctl(struct tty_struct *tty, struct file * file, * make sure we are atomic with respect to * other console switches.. 
*/ - complete_change_console(newvt); + complete_change_console(vc_cons[newvt].d); release_console_sem(); } } @@ -852,7 +852,7 @@ int vt_ioctl(struct tty_struct *tty, struct file * file, return -EFAULT; for (i = 0; i < MAX_NR_CONSOLES; i++) { acquire_console_sem(); - vc_resize(i, cc, ll); + vc_resize(vc_cons[i].d, cc, ll); release_console_sem(); } return 0; @@ -900,7 +900,7 @@ int vt_ioctl(struct tty_struct *tty, struct file * file, vc_cons[i].d->vc_scan_lines = vlin; if (clin) vc_cons[i].d->vc_font.height = clin; - vc_resize(i, cc, ll); + vc_resize(vc_cons[i].d, cc, ll); release_console_sem(); } return 0; @@ -915,7 +915,7 @@ int vt_ioctl(struct tty_struct *tty, struct file * file, op.height = 0; op.charcount = 256; op.data = up; - return con_font_op(fg_console, &op); + return con_font_op(vc_cons[fg_console].d, &op); } case GIO_FONT: { @@ -925,7 +925,7 @@ int vt_ioctl(struct tty_struct *tty, struct file * file, op.height = 32; op.charcount = 256; op.data = up; - return con_font_op(fg_console, &op); + return con_font_op(vc_cons[fg_console].d, &op); } case PIO_CMAP: @@ -953,9 +953,10 @@ int vt_ioctl(struct tty_struct *tty, struct file * file, { op.op = KD_FONT_OP_SET_DEFAULT; op.data = NULL; - i = con_font_op(fg_console, &op); - if (i) return i; - con_set_default_unimap(fg_console); + i = con_font_op(vc_cons[fg_console].d, &op); + if (i) + return i; + con_set_default_unimap(vc_cons[fg_console].d); return 0; } #endif @@ -966,7 +967,7 @@ int vt_ioctl(struct tty_struct *tty, struct file * file, return -EFAULT; if (!perm && op.op != KD_FONT_OP_GET) return -EPERM; - i = con_font_op(console, &op); + i = con_font_op(vc, &op); if (i) return i; if (copy_to_user(up, &op, sizeof(op))) return -EFAULT; @@ -995,13 +996,13 @@ int vt_ioctl(struct tty_struct *tty, struct file * file, return -EPERM; i = copy_from_user(&ui, up, sizeof(struct unimapinit)); if (i) return -EFAULT; - con_clear_unimap(console, &ui); + con_clear_unimap(vc, &ui); return 0; } case PIO_UNIMAP: case GIO_UNIMAP: - 
return do_unimap_ioctl(cmd, up, perm, console); + return do_unimap_ioctl(cmd, up, perm, vc); case VT_LOCKSWITCH: if (!capable(CAP_SYS_TTY_CONFIG)) @@ -1054,26 +1055,29 @@ int vt_waitactive(int vt) #define vt_wake_waitactive() wake_up(&vt_activate_queue) -void reset_vc(unsigned int new_console) +void reset_vc(struct vc_data *vc) { - vt_cons[new_console]->vc_mode = KD_TEXT; - kbd_table[new_console].kbdmode = VC_XLATE; - vt_cons[new_console]->vt_mode.mode = VT_AUTO; - vt_cons[new_console]->vt_mode.waitv = 0; - vt_cons[new_console]->vt_mode.relsig = 0; - vt_cons[new_console]->vt_mode.acqsig = 0; - vt_cons[new_console]->vt_mode.frsig = 0; - vt_cons[new_console]->vt_pid = -1; - vt_cons[new_console]->vt_newvt = -1; + struct vt_struct *vt = vt_cons[vc->vc_num]; + + vt->vc_mode = KD_TEXT; + kbd_table[vc->vc_num].kbdmode = VC_XLATE; + vt->vt_mode.mode = VT_AUTO; + vt->vt_mode.waitv = 0; + vt->vt_mode.relsig = 0; + vt->vt_mode.acqsig = 0; + vt->vt_mode.frsig = 0; + vt->vt_pid = -1; + vt->vt_newvt = -1; if (!in_interrupt()) /* Via keyboard.c:SAK() - akpm */ - reset_palette(new_console) ; + reset_palette(vc); } /* * Performs the back end of a vt switch */ -void complete_change_console(unsigned int new_console) +void complete_change_console(struct vc_data *vc) { + unsigned int new_console = vc->vc_num; unsigned char old_vc_mode; last_console = fg_console; @@ -1084,7 +1088,7 @@ void complete_change_console(unsigned int new_console) * unblank the screen later. */ old_vc_mode = vt_cons[fg_console]->vc_mode; - switch_screen(new_console); + switch_screen(vc); /* * This can't appear below a successful kill_proc(). If it did, @@ -1129,7 +1133,7 @@ void complete_change_console(unsigned int new_console) * this outside of VT_PROCESS but there is no single process * to account for and tracking tty count may be undesirable. 
*/ - reset_vc(new_console); + reset_vc(vc); if (old_vc_mode != vt_cons[new_console]->vc_mode) { @@ -1151,11 +1155,9 @@ void complete_change_console(unsigned int new_console) /* * Performs the front-end of a vt switch */ -void change_console(unsigned int new_console) +void change_console(struct vc_data *new_vc) { - if ((new_console == fg_console) || (vt_dont_switch)) - return; - if (!vc_cons_allocated(new_console)) + if (!new_vc || new_vc->vc_num == fg_console || vt_dont_switch) return; /* @@ -1189,7 +1191,7 @@ void change_console(unsigned int new_console) * return. The process needs to send us a * VT_RELDISP ioctl to complete the switch. */ - vt_cons[fg_console]->vt_newvt = new_console; + vt_cons[fg_console]->vt_newvt = new_vc->vc_num; return; } @@ -1202,7 +1204,7 @@ void change_console(unsigned int new_console) * this outside of VT_PROCESS but there is no single process * to account for and tracking tty count may be undesirable. */ - reset_vc(fg_console); + reset_vc(vc_cons[fg_console].d); /* * Fall through to normal (VT_AUTO) handling of the switch... 
@@ -1215,5 +1217,5 @@ void change_console(unsigned int new_console) if (vt_cons[fg_console]->vc_mode == KD_GRAPHICS) return; - complete_change_console(new_console); + complete_change_console(new_vc); } diff --git a/drivers/video/console/dummycon.c b/drivers/video/console/dummycon.c index 3ac5d5842e72..1ecda91e5a9c 100644 --- a/drivers/video/console/dummycon.c +++ b/drivers/video/console/dummycon.c @@ -42,7 +42,7 @@ static void dummycon_init(struct vc_data *vc, int init) vc->vc_cols = DUMMY_COLUMNS; vc->vc_rows = DUMMY_ROWS; } else - vc_resize(vc->vc_num, DUMMY_COLUMNS, DUMMY_ROWS); + vc_resize(vc, DUMMY_COLUMNS, DUMMY_ROWS); } static int dummycon_dummy(void) diff --git a/drivers/video/console/fbcon.c b/drivers/video/console/fbcon.c index 661685cc823d..50b5b9dfb66a 100644 --- a/drivers/video/console/fbcon.c +++ b/drivers/video/console/fbcon.c @@ -458,7 +458,7 @@ static void fbcon_prepare_logo(struct vc_data *vc, struct fb_info *info, if (CON_IS_VISIBLE(vc) && vt_cons[vc->vc_num]->vc_mode == KD_TEXT) { fbcon_clear_margins(vc, 0); - update_screen(vc->vc_num); + update_screen(vc); } if (save) { @@ -609,7 +609,7 @@ static void con2fb_init_display(struct vc_data *vc, struct fb_info *info, fg_vc->vc_rows); } - switch_screen(fg_console); + update_screen(vc_cons[fg_console].d); } /** @@ -802,7 +802,7 @@ static const char *fbcon_startup(void) cols = info->var.xres / vc->vc_font.width; rows = info->var.yres / vc->vc_font.height; - vc_resize(vc->vc_num, cols, rows); + vc_resize(vc, cols, rows); DPRINTK("mode: %s\n", info->fix.id); DPRINTK("visual: %d\n", info->fix.visual); @@ -890,13 +890,12 @@ static void fbcon_init(struct vc_data *vc, int init) struct vc_data **default_mode = vc->vc_display_fg; struct vc_data *svc = *default_mode; struct display *t, *p = &fb_display[vc->vc_num]; - int display_fg = (*default_mode)->vc_num; int logo = 1, new_rows, new_cols, rows, cols, charcnt = 256; int cap = info->flags; if (info_idx == -1 || info == NULL) return; - if (vc->vc_num != 
display_fg || logo_shown == FBCON_LOGO_DONTSHOW || + if (vc != svc || logo_shown == FBCON_LOGO_DONTSHOW || (info->fix.type == FB_TYPE_TEXT)) logo = 0; @@ -907,7 +906,7 @@ static void fbcon_init(struct vc_data *vc, int init) /* If we are not the first console on this fb, copy the font from that console */ - t = &fb_display[display_fg]; + t = &fb_display[svc->vc_num]; if (!vc->vc_font.data) { vc->vc_font.data = p->fontdata = t->fontdata; vc->vc_font.width = (*default_mode)->vc_font.width; @@ -929,15 +928,15 @@ static void fbcon_init(struct vc_data *vc, int init) } if (!*svc->vc_uni_pagedir_loc) - con_set_default_unimap(display_fg); + con_set_default_unimap(svc); if (!*vc->vc_uni_pagedir_loc) - con_copy_unimap(vc->vc_num, display_fg); + con_copy_unimap(vc, svc); cols = vc->vc_cols; rows = vc->vc_rows; new_cols = info->var.xres / vc->vc_font.width; new_rows = info->var.yres / vc->vc_font.height; - vc_resize(vc->vc_num, new_cols, new_rows); + vc_resize(vc, new_cols, new_rows); /* * We must always set the mode. 
The mode of the previous console * driver could be in the same resolution but we are using different @@ -968,7 +967,7 @@ static void fbcon_init(struct vc_data *vc, int init) if (logo) fbcon_prepare_logo(vc, info, cols, rows, new_cols, new_rows); - if (vc->vc_num == display_fg && softback_buf) { + if (vc == svc && softback_buf) { int l = fbcon_softback_size / vc->vc_size_row; if (l > 5) softback_end = softback_buf + l * vc->vc_size_row; @@ -1144,13 +1143,12 @@ static void fbcon_set_disp(struct fb_info *info, struct vc_data *vc) struct display *p = &fb_display[vc->vc_num], *t; struct vc_data **default_mode = vc->vc_display_fg; struct vc_data *svc = *default_mode; - int display_fg = (*default_mode)->vc_num; int rows, cols, charcnt = 256; info->var.xoffset = info->var.yoffset = p->yscroll = 0; if (var_to_display(p, &info->var, info)) return; - t = &fb_display[display_fg]; + t = &fb_display[svc->vc_num]; if (!vc->vc_font.data) { vc->vc_font.data = p->fontdata = t->fontdata; vc->vc_font.width = (*default_mode)->vc_font.width; @@ -1173,15 +1171,15 @@ static void fbcon_set_disp(struct fb_info *info, struct vc_data *vc) } if (!*svc->vc_uni_pagedir_loc) - con_set_default_unimap(display_fg); + con_set_default_unimap(svc); if (!*vc->vc_uni_pagedir_loc) - con_copy_unimap(vc->vc_num, display_fg); + con_copy_unimap(vc, svc); cols = info->var.xres / vc->vc_font.width; rows = info->var.yres / vc->vc_font.height; - vc_resize(vc->vc_num, cols, rows); + vc_resize(vc, cols, rows); if (CON_IS_VISIBLE(vc)) { - update_screen(vc->vc_num); + update_screen(vc); if (softback_buf) { int l = fbcon_softback_size / vc->vc_size_row; @@ -1987,7 +1985,7 @@ static int fbcon_switch(struct vc_data *vc) logo_shown = fg_console; /* This is protected above by initmem_freed */ fb_show_logo(info); - update_region(fg_console, + update_region(vc, vc->vc_origin + vc->vc_size_row * vc->vc_top, vc->vc_size_row * (vc->vc_bottom - vc->vc_top) / 2); @@ -2048,7 +2046,7 @@ static int fbcon_blank(struct vc_data *vc, 
int blank, int mode_switch) } if (!blank) - update_screen(vc->vc_num); + update_screen(vc); } return 0; @@ -2198,7 +2196,7 @@ static int fbcon_do_set_font(struct vc_data *vc, int w, int h, if (resize) { /* reset wrap/pan */ info->var.xoffset = info->var.yoffset = p->yscroll = 0; - vc_resize(vc->vc_num, info->var.xres / w, info->var.yres / h); + vc_resize(vc, info->var.xres / w, info->var.yres / h); if (CON_IS_VISIBLE(vc) && softback_buf) { int l = fbcon_softback_size / vc->vc_size_row; if (l > 5) @@ -2213,7 +2211,7 @@ static int fbcon_do_set_font(struct vc_data *vc, int w, int h, } else if (CON_IS_VISIBLE(vc) && vt_cons[vc->vc_num]->vc_mode == KD_TEXT) { fbcon_clear_margins(vc, 0); - update_screen(vc->vc_num); + update_screen(vc); } if (old_data && (--REFCOUNT(old_data) == 0)) @@ -2464,7 +2462,7 @@ static int fbcon_scrolldelta(struct vc_data *vc, int lines) vc->vc_size_row); } softback_in = p; - update_region(vc->vc_num, vc->vc_origin, + update_region(vc, vc->vc_origin, logo_lines * vc->vc_cols); } logo_shown = FBCON_LOGO_CANSHOW; @@ -2545,7 +2543,7 @@ static void fbcon_resumed(struct fb_info *info) return; vc = vc_cons[ops->currcon].d; - update_screen(vc->vc_num); + update_screen(vc); } static void fbcon_modechanged(struct fb_info *info) @@ -2569,13 +2567,13 @@ static void fbcon_modechanged(struct fb_info *info) var_to_display(p, &info->var, info); cols = info->var.xres / vc->vc_font.width; rows = info->var.yres / vc->vc_font.height; - vc_resize(vc->vc_num, cols, rows); + vc_resize(vc, cols, rows); updatescrollmode(p, info, vc); scrollback_max = 0; scrollback_current = 0; update_var(vc->vc_num, info); fbcon_set_palette(vc, color_table); - update_screen(vc->vc_num); + update_screen(vc); if (softback_buf) { int l = fbcon_softback_size / vc->vc_size_row; if (l > 5) diff --git a/drivers/video/console/mdacon.c b/drivers/video/console/mdacon.c index be3d53596f8d..4a26c828b798 100644 --- a/drivers/video/console/mdacon.c +++ b/drivers/video/console/mdacon.c @@ -351,10 
+351,9 @@ static void mdacon_init(struct vc_data *c, int init) if (init) { c->vc_cols = mda_num_columns; c->vc_rows = mda_num_lines; - } else { - vc_resize(c->vc_num, mda_num_columns, mda_num_lines); - } - + } else + vc_resize(c, mda_num_columns, mda_num_lines); + /* make the first MDA console visible */ if (mda_display_fg == NULL) diff --git a/drivers/video/console/promcon.c b/drivers/video/console/promcon.c index fec664e61551..04f42fcaac59 100644 --- a/drivers/video/console/promcon.c +++ b/drivers/video/console/promcon.c @@ -155,9 +155,9 @@ promcon_init_unimap(struct vc_data *conp) k++; } set_fs(KERNEL_DS); - con_clear_unimap(conp->vc_num, NULL); - con_set_unimap(conp->vc_num, k, p); - con_protect_unimap(conp->vc_num, 1); + con_clear_unimap(conp, NULL); + con_set_unimap(conp, k, p); + con_protect_unimap(conp, 1); set_fs(old_fs); kfree(p); } @@ -175,7 +175,7 @@ promcon_init(struct vc_data *conp, int init) p = *conp->vc_uni_pagedir_loc; if (conp->vc_uni_pagedir_loc == &conp->vc_uni_pagedir || !--conp->vc_uni_pagedir_loc[1]) - con_free_unimap(conp->vc_num); + con_free_unimap(conp); conp->vc_uni_pagedir_loc = promcon_uni_pagedir; promcon_uni_pagedir[1]++; if (!promcon_uni_pagedir[0] && p) { @@ -183,7 +183,7 @@ promcon_init(struct vc_data *conp, int init) } if (!init) { if (conp->vc_cols != pw + 1 || conp->vc_rows != ph + 1) - vc_resize(conp->vc_num, pw + 1, ph + 1); + vc_resize(conp, pw + 1, ph + 1); } } @@ -192,9 +192,9 @@ promcon_deinit(struct vc_data *conp) { /* When closing the last console, reset video origin */ if (!--promcon_uni_pagedir[1]) - con_free_unimap(conp->vc_num); + con_free_unimap(conp); conp->vc_uni_pagedir_loc = &conp->vc_uni_pagedir; - con_set_default_unimap(conp->vc_num); + con_set_default_unimap(conp); } static int diff --git a/drivers/video/console/sticon.c b/drivers/video/console/sticon.c index 99262680fdd8..24f9715f7449 100644 --- a/drivers/video/console/sticon.c +++ b/drivers/video/console/sticon.c @@ -217,7 +217,7 @@ static void 
sticon_init(struct vc_data *c, int init) } else { /* vc_rows = (c->vc_rows > vc_rows) ? vc_rows : c->vc_rows; */ /* vc_cols = (c->vc_cols > vc_cols) ? vc_cols : c->vc_cols; */ - vc_resize(c->vc_num, vc_cols, vc_rows); + vc_resize(c, vc_cols, vc_rows); /* vc_resize_con(vc_rows, vc_cols, c->vc_num); */ } } diff --git a/drivers/video/console/vgacon.c b/drivers/video/console/vgacon.c index fb5d5943d8ac..7d1ae06667c6 100644 --- a/drivers/video/console/vgacon.c +++ b/drivers/video/console/vgacon.c @@ -340,11 +340,11 @@ static void vgacon_init(struct vc_data *c, int init) p = *c->vc_uni_pagedir_loc; if (c->vc_uni_pagedir_loc == &c->vc_uni_pagedir || !--c->vc_uni_pagedir_loc[1]) - con_free_unimap(c->vc_num); + con_free_unimap(c); c->vc_uni_pagedir_loc = vgacon_uni_pagedir; vgacon_uni_pagedir[1]++; if (!vgacon_uni_pagedir[0] && p) - con_set_default_unimap(c->vc_num); + con_set_default_unimap(c); } static inline void vga_set_mem_top(struct vc_data *c) @@ -358,10 +358,10 @@ static void vgacon_deinit(struct vc_data *c) if (!--vgacon_uni_pagedir[1]) { c->vc_visible_origin = vga_vram_base; vga_set_mem_top(c); - con_free_unimap(c->vc_num); + con_free_unimap(c); } c->vc_uni_pagedir_loc = &c->vc_uni_pagedir; - con_set_default_unimap(c->vc_num); + con_set_default_unimap(c); } static u8 vgacon_build_attr(struct vc_data *c, u8 color, u8 intensity, @@ -908,7 +908,7 @@ static int vgacon_adjust_height(struct vc_data *vc, unsigned fontheight) c->vc_sw->con_cursor(c, CM_DRAW); } c->vc_font.height = fontheight; - vc_resize(c->vc_num, 0, rows); /* Adjust console size */ + vc_resize(c, 0, rows); /* Adjust console size */ } } return 0; diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 130506f065c5..6b38446d4e52 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -1559,7 +1559,7 @@ static int do_fontx_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg, get_user(data, &user_cfd->chardata)) return -EFAULT; op.data = compat_ptr(data); - return con_font_op(fg_console, &op); + 
return con_font_op(vc_cons[fg_console].d, &op); case GIO_FONTX: op.op = KD_FONT_OP_GET; op.flags = 0; @@ -1571,7 +1571,7 @@ static int do_fontx_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg, if (!data) return 0; op.data = compat_ptr(data); - i = con_font_op(fg_console, &op); + i = con_font_op(vc_cons[fg_console].d, &op); if (i) return i; if (put_user(op.height, &user_cfd->charheight) || @@ -1608,7 +1608,7 @@ static int do_kdfontop_ioctl(unsigned int fd, unsigned int cmd, unsigned long ar op.data = compat_ptr(((struct console_font_op32 *)&op)->data); op.flags |= KD_FONT_FLAG_OLD; vt = (struct vt_struct *)((struct tty_struct *)file->private_data)->driver_data; - i = con_font_op(vt->vc_num, &op); + i = con_font_op(vc_cons[vt->vc_num].d, &op); if (i) return i; ((struct console_font_op32 *)&op)->data = (unsigned long)op.data; if (copy_to_user(fontop, &op, sizeof(struct console_font_op32))) @@ -1633,9 +1633,9 @@ static int do_unimap_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg, switch (cmd) { case PIO_UNIMAP: if (!perm) return -EPERM; - return con_set_unimap(fg_console, tmp.entry_ct, compat_ptr(tmp.entries)); + return con_set_unimap(vc_cons[fg_console].d, tmp.entry_ct, compat_ptr(tmp.entries)); case GIO_UNIMAP: - return con_get_unimap(fg_console, tmp.entry_ct, &(user_ud->entry_ct), compat_ptr(tmp.entries)); + return con_get_unimap(vc_cons[fg_console].d, tmp.entry_ct, &(user_ud->entry_ct), compat_ptr(tmp.entries)); } return 0; } diff --git a/include/linux/consolemap.h b/include/linux/consolemap.h index dee4b654c589..65842efc1b70 100644 --- a/include/linux/consolemap.h +++ b/include/linux/consolemap.h @@ -11,5 +11,5 @@ struct vc_data; extern unsigned char inverse_translate(struct vc_data *conp, int glyph); -extern unsigned short *set_translate(int m,int currcons); +extern unsigned short *set_translate(int m, struct vc_data *vc); extern int conv_uni_to_pc(struct vc_data *conp, long ucs); diff --git a/include/linux/vt_kern.h 
b/include/linux/vt_kern.h index 6dbef2f8445a..7c37844c31c0 100644 --- a/include/linux/vt_kern.h +++ b/include/linux/vt_kern.h @@ -41,25 +41,25 @@ extern int kbd_rate(struct kbd_repeat *rep); int vc_allocate(unsigned int console); int vc_cons_allocated(unsigned int console); -int vc_resize(int currcons, unsigned int cols, unsigned int lines); +int vc_resize(struct vc_data *vc, unsigned int cols, unsigned int lines); void vc_disallocate(unsigned int console); -void reset_palette(int currcons); -void set_palette(int currcons); +void reset_palette(struct vc_data *vc); +void set_palette(struct vc_data *vc); void do_blank_screen(int entering_gfx); void do_unblank_screen(int leaving_gfx); void unblank_screen(void); void poke_blanked_console(void); -int con_font_op(int currcons, struct console_font_op *op); -int con_font_set(int currcons, struct console_font_op *op); -int con_font_get(int currcons, struct console_font_op *op); -int con_font_default(int currcons, struct console_font_op *op); -int con_font_copy(int currcons, struct console_font_op *op); +int con_font_op(struct vc_data *vc, struct console_font_op *op); +int con_font_set(struct vc_data *vc, struct console_font_op *op); +int con_font_get(struct vc_data *vc, struct console_font_op *op); +int con_font_default(struct vc_data *vc, struct console_font_op *op); +int con_font_copy(struct vc_data *vc, struct console_font_op *op); int con_set_cmap(unsigned char __user *cmap); int con_get_cmap(unsigned char __user *cmap); -void scrollback(int); -void scrollfront(int); -void update_region(int currcons, unsigned long start, int count); -void redraw_screen(int new_console, int is_switch); +void scrollback(struct vc_data *vc, int lines); +void scrollfront(struct vc_data *vc, int lines); +void update_region(struct vc_data *vc, unsigned long start, int count); +void redraw_screen(struct vc_data *vc, int is_switch); #define update_screen(x) redraw_screen(x, 0) #define switch_screen(x) redraw_screen(x, 1) @@ -75,19 +75,19 @@ int 
con_set_trans_old(unsigned char __user * table); int con_get_trans_old(unsigned char __user * table); int con_set_trans_new(unsigned short __user * table); int con_get_trans_new(unsigned short __user * table); -int con_clear_unimap(int currcons, struct unimapinit *ui); -int con_set_unimap(int currcons, ushort ct, struct unipair __user *list); -int con_get_unimap(int currcons, ushort ct, ushort __user *uct, struct unipair __user *list); -int con_set_default_unimap(int currcons); -void con_free_unimap(int currcons); -void con_protect_unimap(int currcons, int rdonly); -int con_copy_unimap(int dstcons, int srccons); +int con_clear_unimap(struct vc_data *vc, struct unimapinit *ui); +int con_set_unimap(struct vc_data *vc, ushort ct, struct unipair __user *list); +int con_get_unimap(struct vc_data *vc, ushort ct, ushort __user *uct, struct unipair __user *list); +int con_set_default_unimap(struct vc_data *vc); +void con_free_unimap(struct vc_data *vc); +void con_protect_unimap(struct vc_data *vc, int rdonly); +int con_copy_unimap(struct vc_data *dst_vc, struct vc_data *src_vc); /* vt.c */ -void complete_change_console(unsigned int new_console); +void complete_change_console(struct vc_data *vc); int vt_waitactive(int vt); -void change_console(unsigned int); -void reset_vc(unsigned int new_console); +void change_console(struct vc_data *new_vc); +void reset_vc(struct vc_data *vc); /* * vc_screen.c shares this temporary buffer with the console write code so that -- cgit v1.2.3 From 0142e3ff96edcf707297cf81e54d917664951a28 Mon Sep 17 00:00:00 2001 From: Roman Zippel Date: Mon, 7 Mar 2005 17:45:57 -0800 Subject: [PATCH] merge vt_struct into vc_data The vt_struct and vc_data are always allocated together, so there is no need for a separate vt_struct structure. 
Signed-off-by: Roman Zippel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/char/selection.c | 6 +-- drivers/char/vt.c | 88 +++++++++++++++++---------------------- drivers/char/vt_ioctl.c | 93 ++++++++++++++++++------------------------ drivers/video/console/fbcon.c | 43 +++++++++---------- drivers/video/console/sticon.c | 10 ++--- drivers/video/sun3fb.c | 2 +- fs/compat_ioctl.c | 9 ++-- include/linux/console_struct.h | 7 +++- include/linux/vt_kern.h | 9 ---- 9 files changed, 115 insertions(+), 152 deletions(-) (limited to 'include/linux') diff --git a/drivers/char/selection.c b/drivers/char/selection.c index b7f584f6a919..15108f382510 100644 --- a/drivers/char/selection.c +++ b/drivers/char/selection.c @@ -275,7 +275,7 @@ int set_selection(const struct tiocl_selection __user *sel, struct tty_struct *t */ int paste_selection(struct tty_struct *tty) { - struct vt_struct *vt = (struct vt_struct *) tty->driver_data; + struct vc_data *vc = (struct vc_data *)tty->driver_data; int pasted = 0, count; struct tty_ldisc *ld; DECLARE_WAITQUEUE(wait, current); @@ -286,7 +286,7 @@ int paste_selection(struct tty_struct *tty) ld = tty_ldisc_ref_wait(tty); - add_wait_queue(&vt->paste_wait, &wait); + add_wait_queue(&vc->paste_wait, &wait); while (sel_buffer && sel_buffer_lth > pasted) { set_current_state(TASK_INTERRUPTIBLE); if (test_bit(TTY_THROTTLED, &tty->flags)) { @@ -298,7 +298,7 @@ int paste_selection(struct tty_struct *tty) tty->ldisc.receive_buf(tty, sel_buffer + pasted, NULL, count); pasted += count; } - remove_wait_queue(&vt->paste_wait, &wait); + remove_wait_queue(&vc->paste_wait, &wait); current->state = TASK_RUNNING; tty_ldisc_deref(ld); diff --git a/drivers/char/vt.c b/drivers/char/vt.c index 17869964f5e8..2f8c0ed22e82 100644 --- a/drivers/char/vt.c +++ b/drivers/char/vt.c @@ -547,7 +547,7 @@ static void hide_cursor(struct vc_data *vc) static void set_cursor(struct vc_data *vc) { if (!IS_FG(vc) || console_blanked || - vc->vc_vt->vc_mode == 
KD_GRAPHICS) + vc->vc_mode == KD_GRAPHICS) return; if (vc->vc_deccm) { if (vc == sel_cons) @@ -642,7 +642,7 @@ void redraw_screen(struct vc_data *vc, int is_switch) update_attr(vc); clear_buffer_attributes(vc); } - if (update && vt_cons[vc->vc_num]->vc_mode != KD_GRAPHICS) + if (update && vc->vc_mode != KD_GRAPHICS) do_update_region(vc, vc->vc_origin, vc->vc_screenbuf_size / 2); } set_cursor(vc); @@ -695,7 +695,6 @@ int vc_allocate(unsigned int currcons) /* return 0 on success */ return -ENXIO; if (!vc_cons[currcons].d) { struct vc_data *vc; - long p, q; /* prevent users from taking too much memory */ if (currcons >= MAX_NR_USER_CONSOLES && !capable(CAP_SYS_RESOURCE)) @@ -707,24 +706,20 @@ int vc_allocate(unsigned int currcons) /* return 0 on success */ /* although the numbers above are not valid since long ago, the point is still up-to-date and the comment still has its value even if only as a historical artifact. --mj, July 1998 */ - p = (long) kmalloc(sizeof(struct vc_data) + sizeof(struct vt_struct), GFP_KERNEL); - if (!p) + vc = kmalloc(sizeof(struct vc_data), GFP_KERNEL); + if (!vc) return -ENOMEM; - memset((void *)p, 0, sizeof(struct vc_data) + sizeof(struct vt_struct)); - vc_cons[currcons].d = vc = (struct vc_data *)p; - vt_cons[currcons] = (struct vt_struct *)(p+sizeof(struct vc_data)); - vc_cons[currcons].d->vc_vt = vt_cons[currcons]; + memset(vc, 0, sizeof(*vc)); + vc_cons[currcons].d = vc; visual_init(vc, currcons, 1); if (!*vc->vc_uni_pagedir_loc) con_set_default_unimap(vc); - q = (long)kmalloc(vc->vc_screenbuf_size, GFP_KERNEL); - if (!q) { - kfree((char *) p); + vc->vc_screenbuf = kmalloc(vc->vc_screenbuf_size, GFP_KERNEL); + if (!vc->vc_screenbuf) { + kfree(vc); vc_cons[currcons].d = NULL; - vt_cons[currcons] = NULL; return -ENOMEM; } - vc->vc_screenbuf = (unsigned short *)q; vc->vc_kmalloced = 1; vc_init(vc, vc->vc_rows, vc->vc_cols, 1); @@ -742,7 +737,7 @@ inline int resize_screen(struct vc_data *vc, int width, int height) /* Resizes the 
resolution of the display adapater */ int err = 0; - if (vt_cons[vc->vc_num]->vc_mode != KD_GRAPHICS && vc->vc_sw->con_resize) + if (vc->vc_mode != KD_GRAPHICS && vc->vc_sw->con_resize) err = vc->vc_sw->con_resize(vc, width, height); return err; } @@ -1906,7 +1901,6 @@ static int do_con_write(struct tty_struct *tty, const unsigned char *buf, int co int c, tc, ok, n = 0, draw_x = -1; unsigned int currcons; unsigned long draw_from = 0, draw_to = 0; - struct vt_struct *vt; struct vc_data *vc; u16 himask, charmask; const unsigned char *orig_buf = NULL; @@ -1918,14 +1912,14 @@ static int do_con_write(struct tty_struct *tty, const unsigned char *buf, int co might_sleep(); acquire_console_sem(); - vt = tty->driver_data; - if (vt == NULL) { + vc = tty->driver_data; + if (vc == NULL) { printk(KERN_ERR "vt: argh, driver_data is NULL !\n"); release_console_sem(); return 0; } - currcons = vt->vc_num; + currcons = vc->vc_num; if (!vc_cons_allocated(currcons)) { /* could this happen? */ static int error = 0; @@ -1936,7 +1930,6 @@ static int do_con_write(struct tty_struct *tty, const unsigned char *buf, int co release_console_sem(); return 0; } - vc = vc_cons[currcons].d; release_console_sem(); orig_buf = buf; @@ -1951,8 +1944,8 @@ static int do_con_write(struct tty_struct *tty, const unsigned char *buf, int co acquire_console_sem(); - vt = tty->driver_data; - if (vt == NULL) { + vc = tty->driver_data; + if (vc == NULL) { printk(KERN_ERR "vt: argh, driver_data _became_ NULL !\n"); release_console_sem(); goto out; @@ -2117,7 +2110,7 @@ static void console_callback(void *ignored) if (scrollback_delta) { struct vc_data *vc = vc_cons[fg_console].d; clear_selection(); - if (vt_cons[vc->vc_num]->vc_mode == KD_TEXT) + if (vc->vc_mode == KD_TEXT) vc->vc_sw->con_scrolldelta(vc, scrollback_delta); scrollback_delta = 0; } @@ -2171,7 +2164,7 @@ void vt_console_print(struct console *co, const char *b, unsigned count) goto quit; } - if (vt_cons[vc->vc_num]->vc_mode != KD_TEXT) + if 
(vc->vc_mode != KD_TEXT) goto quit; /* undraw cursor first */ @@ -2392,9 +2385,9 @@ static void con_throttle(struct tty_struct *tty) static void con_unthrottle(struct tty_struct *tty) { - struct vt_struct *vt = tty->driver_data; + struct vc_data *vc = tty->driver_data; - wake_up_interruptible(&vt->paste_wait); + wake_up_interruptible(&vc->paste_wait); } /* @@ -2429,16 +2422,16 @@ static void con_start(struct tty_struct *tty) static void con_flush_chars(struct tty_struct *tty) { - struct vt_struct *vt; + struct vc_data *vc; if (in_interrupt()) /* from flush_to_ldisc */ return; /* if we race with con_close(), vt may be null */ acquire_console_sem(); - vt = tty->driver_data; - if (vt) - set_cursor(vc_cons[vt->vc_num].d); + vc = tty->driver_data; + if (vc) + set_cursor(vc); release_console_sem(); } @@ -2455,8 +2448,7 @@ static int con_open(struct tty_struct *tty, struct file *filp) ret = vc_allocate(currcons); if (ret == 0) { struct vc_data *vc = vc_cons[currcons].d; - vt_cons[currcons]->vc_num = currcons; - tty->driver_data = vt_cons[currcons]; + tty->driver_data = vc; vc->vc_tty = tty; if (!tty->winsize.ws_row && !tty->winsize.ws_col) { @@ -2484,11 +2476,10 @@ static void con_close(struct tty_struct *tty, struct file *filp) down(&tty_sem); acquire_console_sem(); if (tty && tty->count == 1) { - struct vt_struct *vt; + struct vc_data *vc = tty->driver_data; - vt = tty->driver_data; - if (vt) - vc_cons[vt->vc_num].d->vc_tty = NULL; + if (vc) + vc->vc_tty = NULL; tty->driver_data = NULL; release_console_sem(); vcs_remove_devfs(tty); @@ -2524,7 +2515,7 @@ static void vc_init(struct vc_data *vc, unsigned int rows, vc->vc_def_color = 0x07; /* white */ vc->vc_ulcolor = 0x0f; /* bold white */ vc->vc_halfcolor = 0x08; /* grey */ - init_waitqueue_head(&vt_cons[vc->vc_num]->paste_wait); + init_waitqueue_head(&vc->paste_wait); reset_terminal(vc, do_clear); } @@ -2561,11 +2552,7 @@ static int __init con_init(void) * kmalloc is not running yet - we use the bootmem allocator. 
*/ for (currcons = 0; currcons < MIN_NR_CONSOLES; currcons++) { - vc_cons[currcons].d = vc = (struct vc_data *) - alloc_bootmem(sizeof(struct vc_data)); - vt_cons[currcons] = (struct vt_struct *) - alloc_bootmem(sizeof(struct vt_struct)); - vc_cons[currcons].d->vc_vt = vt_cons[currcons]; + vc_cons[currcons].d = vc = alloc_bootmem(sizeof(struct vc_data)); visual_init(vc, currcons, 1); vc->vc_screenbuf = (unsigned short *)alloc_bootmem(vc->vc_screenbuf_size); vc->vc_kmalloced = 0; @@ -2800,7 +2787,7 @@ void do_blank_screen(int entering_gfx) } /* don't blank graphics */ - if (vt_cons[vc->vc_num]->vc_mode != KD_TEXT) { + if (vc->vc_mode != KD_TEXT) { console_blanked = fg_console + 1; return; } @@ -2847,7 +2834,7 @@ void do_unblank_screen(int leaving_gfx) return; } vc = vc_cons[fg_console].d; - if (vt_cons[vc->vc_num]->vc_mode != KD_TEXT) + if (vc->vc_mode != KD_TEXT) return; /* but leave console_blanked != 0 */ if (blankinterval) { @@ -2898,7 +2885,7 @@ void poke_blanked_console(void) del_timer(&console_timer); blank_timer_expired = 0; - if (ignore_poke || !vt_cons[fg_console] || vt_cons[fg_console]->vc_mode == KD_GRAPHICS) + if (ignore_poke || !vc_cons[fg_console].d || vc_cons[fg_console].d->vc_mode == KD_GRAPHICS) return; if (console_blanked) unblank_screen(); @@ -2916,7 +2903,7 @@ void set_palette(struct vc_data *vc) { WARN_CONSOLE_UNLOCKED(); - if (vt_cons[vc->vc_num]->vc_mode != KD_GRAPHICS) + if (vc->vc_mode != KD_GRAPHICS) vc->vc_sw->con_set_palette(vc, color_table); } @@ -3009,7 +2996,7 @@ int con_font_get(struct vc_data *vc, struct console_font_op *op) int rc = -EINVAL; int c; - if (vt_cons[vc->vc_num]->vc_mode != KD_TEXT) + if (vc->vc_mode != KD_TEXT) return -EINVAL; if (op->data) { @@ -3064,7 +3051,7 @@ int con_font_set(struct vc_data *vc, struct console_font_op *op) int rc = -EINVAL; int size; - if (vt_cons[vc->vc_num]->vc_mode != KD_TEXT) + if (vc->vc_mode != KD_TEXT) return -EINVAL; if (!op->data) return -EINVAL; @@ -3122,7 +3109,7 @@ int 
con_font_default(struct vc_data *vc, struct console_font_op *op) char *s = name; int rc; - if (vt_cons[vc->vc_num]->vc_mode != KD_TEXT) + if (vc->vc_mode != KD_TEXT) return -EINVAL; if (!op->data) @@ -3150,7 +3137,7 @@ int con_font_copy(struct vc_data *vc, struct console_font_op *op) int con = op->height; int rc; - if (vt_cons[vc->vc_num]->vc_mode != KD_TEXT) + if (vc->vc_mode != KD_TEXT) return -EINVAL; acquire_console_sem(); @@ -3262,7 +3249,6 @@ EXPORT_SYMBOL(vc_resize); EXPORT_SYMBOL(fg_console); EXPORT_SYMBOL(console_blank_hook); EXPORT_SYMBOL(console_blanked); -EXPORT_SYMBOL(vt_cons); EXPORT_SYMBOL(vc_cons); #ifndef VT_SINGLE_DRIVER EXPORT_SYMBOL(take_over_console); diff --git a/drivers/char/vt_ioctl.c b/drivers/char/vt_ioctl.c index 3c55de997496..fa6ee5a7e625 100644 --- a/drivers/char/vt_ioctl.c +++ b/drivers/char/vt_ioctl.c @@ -52,8 +52,6 @@ extern struct tty_driver *console_driver; * to the current console is done by the main ioctl code. */ -struct vt_struct *vt_cons[MAX_NR_CONSOLES]; - /* Keyboard type: Default is KB_101, but can be set by machine * specific code. */ @@ -365,8 +363,7 @@ do_unimap_ioctl(int cmd, struct unimapdesc __user *user_ud, int perm, struct vc_ int vt_ioctl(struct tty_struct *tty, struct file * file, unsigned int cmd, unsigned long arg) { - struct vt_struct *vt = (struct vt_struct *)tty->driver_data; - struct vc_data *vc = vc_cons[vt->vc_num].d; + struct vc_data *vc = (struct vc_data *)tty->driver_data; struct console_font_op op; /* used in multiple places here */ struct kbd_struct * kbd; unsigned int console; @@ -374,7 +371,7 @@ int vt_ioctl(struct tty_struct *tty, struct file * file, void __user *up = (void __user *)arg; int i, perm; - console = vt->vc_num; + console = vc->vc_num; if (!vc_cons_allocated(console)) /* impossible? 
*/ return -ENOIOCTLCMD; @@ -487,9 +484,9 @@ int vt_ioctl(struct tty_struct *tty, struct file * file, default: return -EINVAL; } - if (vt_cons[console]->vc_mode == (unsigned char) arg) + if (vc->vc_mode == (unsigned char) arg) return 0; - vt_cons[console]->vc_mode = (unsigned char) arg; + vc->vc_mode = (unsigned char) arg; if (console != fg_console) return 0; /* @@ -504,7 +501,7 @@ int vt_ioctl(struct tty_struct *tty, struct file * file, return 0; case KDGETMODE: - ucval = vt_cons[console]->vc_mode; + ucval = vc->vc_mode; goto setint; case KDMAPDISP: @@ -667,12 +664,12 @@ int vt_ioctl(struct tty_struct *tty, struct file * file, if (tmp.mode != VT_AUTO && tmp.mode != VT_PROCESS) return -EINVAL; acquire_console_sem(); - vt_cons[console]->vt_mode = tmp; + vc->vt_mode = tmp; /* the frsig is ignored, so we set it to 0 */ - vt_cons[console]->vt_mode.frsig = 0; - vt_cons[console]->vt_pid = current->pid; + vc->vt_mode.frsig = 0; + vc->vt_pid = current->pid; /* no switch is required -- saw@shade.msu.ru */ - vt_cons[console]->vt_newvt = -1; + vc->vt_newvt = -1; release_console_sem(); return 0; } @@ -683,7 +680,7 @@ int vt_ioctl(struct tty_struct *tty, struct file * file, int rc; acquire_console_sem(); - memcpy(&tmp, &vt_cons[console]->vt_mode, sizeof(struct vt_mode)); + memcpy(&tmp, &vc->vt_mode, sizeof(struct vt_mode)); release_console_sem(); rc = copy_to_user(up, &tmp, sizeof(struct vt_mode)); @@ -761,31 +758,29 @@ int vt_ioctl(struct tty_struct *tty, struct file * file, case VT_RELDISP: if (!perm) return -EPERM; - if (vt_cons[console]->vt_mode.mode != VT_PROCESS) + if (vc->vt_mode.mode != VT_PROCESS) return -EINVAL; /* * Switching-from response */ - if (vt_cons[console]->vt_newvt >= 0) - { + if (vc->vt_newvt >= 0) { if (arg == 0) /* * Switch disallowed, so forget we were trying * to do it. */ - vt_cons[console]->vt_newvt = -1; + vc->vt_newvt = -1; - else - { + else { /* * The current vt has been released, so * complete the switch. 
*/ int newvt; acquire_console_sem(); - newvt = vt_cons[console]->vt_newvt; - vt_cons[console]->vt_newvt = -1; + newvt = vc->vt_newvt; + vc->vt_newvt = -1; i = vc_allocate(newvt); if (i) { release_console_sem(); @@ -1057,17 +1052,15 @@ int vt_waitactive(int vt) void reset_vc(struct vc_data *vc) { - struct vt_struct *vt = vt_cons[vc->vc_num]; - - vt->vc_mode = KD_TEXT; + vc->vc_mode = KD_TEXT; kbd_table[vc->vc_num].kbdmode = VC_XLATE; - vt->vt_mode.mode = VT_AUTO; - vt->vt_mode.waitv = 0; - vt->vt_mode.relsig = 0; - vt->vt_mode.acqsig = 0; - vt->vt_mode.frsig = 0; - vt->vt_pid = -1; - vt->vt_newvt = -1; + vc->vt_mode.mode = VT_AUTO; + vc->vt_mode.waitv = 0; + vc->vt_mode.relsig = 0; + vc->vt_mode.acqsig = 0; + vc->vt_mode.frsig = 0; + vc->vt_pid = -1; + vc->vt_newvt = -1; if (!in_interrupt()) /* Via keyboard.c:SAK() - akpm */ reset_palette(vc); } @@ -1077,7 +1070,6 @@ void reset_vc(struct vc_data *vc) */ void complete_change_console(struct vc_data *vc) { - unsigned int new_console = vc->vc_num; unsigned char old_vc_mode; last_console = fg_console; @@ -1087,7 +1079,7 @@ void complete_change_console(struct vc_data *vc) * KD_TEXT mode or vice versa, which means we need to blank or * unblank the screen later. */ - old_vc_mode = vt_cons[fg_console]->vc_mode; + old_vc_mode = vc_cons[fg_console].d->vc_mode; switch_screen(vc); /* @@ -1100,9 +1092,8 @@ void complete_change_console(struct vc_data *vc) * To account for this we duplicate this code below only if the * controlling process is gone and we've called reset_vc. */ - if (old_vc_mode != vt_cons[new_console]->vc_mode) - { - if (vt_cons[new_console]->vc_mode == KD_TEXT) + if (old_vc_mode != vc->vc_mode) { + if (vc->vc_mode == KD_TEXT) do_unblank_screen(1); else do_blank_screen(1); @@ -1113,17 +1104,13 @@ void complete_change_console(struct vc_data *vc) * telling it that it has acquired. 
Also check if it has died and * clean up (similar to logic employed in change_console()) */ - if (vt_cons[new_console]->vt_mode.mode == VT_PROCESS) - { + if (vc->vt_mode.mode == VT_PROCESS) { /* * Send the signal as privileged - kill_proc() will * tell us if the process has gone or something else * is awry */ - if (kill_proc(vt_cons[new_console]->vt_pid, - vt_cons[new_console]->vt_mode.acqsig, - 1) != 0) - { + if (kill_proc(vc->vt_pid, vc->vt_mode.acqsig, 1) != 0) { /* * The controlling process has died, so we revert back to * normal operation. In this case, we'll also change back @@ -1135,9 +1122,8 @@ void complete_change_console(struct vc_data *vc) */ reset_vc(vc); - if (old_vc_mode != vt_cons[new_console]->vc_mode) - { - if (vt_cons[new_console]->vc_mode == KD_TEXT) + if (old_vc_mode != vc->vc_mode) { + if (vc->vc_mode == KD_TEXT) do_unblank_screen(1); else do_blank_screen(1); @@ -1157,6 +1143,8 @@ void complete_change_console(struct vc_data *vc) */ void change_console(struct vc_data *new_vc) { + struct vc_data *vc; + if (!new_vc || new_vc->vc_num == fg_console || vt_dont_switch) return; @@ -1175,23 +1163,20 @@ void change_console(struct vc_data *new_vc) * the user waits just the right amount of time :-) and revert the * vt to auto control. */ - if (vt_cons[fg_console]->vt_mode.mode == VT_PROCESS) - { + vc = vc_cons[fg_console].d; + if (vc->vt_mode.mode == VT_PROCESS) { /* * Send the signal as privileged - kill_proc() will * tell us if the process has gone or something else * is awry */ - if (kill_proc(vt_cons[fg_console]->vt_pid, - vt_cons[fg_console]->vt_mode.relsig, - 1) == 0) - { + if (kill_proc(vc->vt_pid, vc->vt_mode.relsig, 1) == 0) { /* * It worked. Mark the vt to switch to and * return. The process needs to send us a * VT_RELDISP ioctl to complete the switch. 
*/ - vt_cons[fg_console]->vt_newvt = new_vc->vc_num; + vc->vt_newvt = new_vc->vc_num; return; } @@ -1204,7 +1189,7 @@ void change_console(struct vc_data *new_vc) * this outside of VT_PROCESS but there is no single process * to account for and tracking tty count may be undesirable. */ - reset_vc(vc_cons[fg_console].d); + reset_vc(vc); /* * Fall through to normal (VT_AUTO) handling of the switch... @@ -1214,7 +1199,7 @@ void change_console(struct vc_data *new_vc) /* * Ignore all switches in KD_GRAPHICS+VT_AUTO mode */ - if (vt_cons[fg_console]->vc_mode == KD_GRAPHICS) + if (vc->vc_mode == KD_GRAPHICS) return; complete_change_console(new_vc); diff --git a/drivers/video/console/fbcon.c b/drivers/video/console/fbcon.c index 50b5b9dfb66a..738008018adb 100644 --- a/drivers/video/console/fbcon.c +++ b/drivers/video/console/fbcon.c @@ -203,7 +203,7 @@ static irqreturn_t fb_vbl_detect(int irq, void *dummy, struct pt_regs *fp) static inline int fbcon_is_inactive(struct vc_data *vc, struct fb_info *info) { return (info->state != FBINFO_STATE_RUNNING || - vt_cons[vc->vc_num]->vc_mode != KD_TEXT); + vc->vc_mode != KD_TEXT); } static inline int get_color(struct vc_data *vc, struct fb_info *info, @@ -456,7 +456,7 @@ static void fbcon_prepare_logo(struct vc_data *vc, struct fb_info *info, erase, vc->vc_size_row * logo_lines); - if (CON_IS_VISIBLE(vc) && vt_cons[vc->vc_num]->vc_mode == KD_TEXT) { + if (CON_IS_VISIBLE(vc) && vc->vc_mode == KD_TEXT) { fbcon_clear_margins(vc, 0); update_screen(vc); } @@ -2209,7 +2209,7 @@ static int fbcon_do_set_font(struct vc_data *vc, int w, int h, } } } else if (CON_IS_VISIBLE(vc) - && vt_cons[vc->vc_num]->vc_mode == KD_TEXT) { + && vc->vc_mode == KD_TEXT) { fbcon_clear_margins(vc, 0); update_screen(vc); } @@ -2436,7 +2436,7 @@ static int fbcon_scrolldelta(struct vc_data *vc, int lines) if (softback_top) { if (vc->vc_num != fg_console) return 0; - if (vt_cons[vc->vc_num]->vc_mode != KD_TEXT || !lines) + if (vc->vc_mode != KD_TEXT || !lines) return 
0; if (logo_shown >= 0) { struct vc_data *conp2 = vc_cons[logo_shown].d; @@ -2553,11 +2553,11 @@ static void fbcon_modechanged(struct fb_info *info) struct display *p; int rows, cols; - if (!ops || ops->currcon < 0 || vt_cons[ops->currcon]->vc_mode != - KD_TEXT || registered_fb[con2fb_map[ops->currcon]] != info) + if (!ops || ops->currcon < 0) return; - vc = vc_cons[ops->currcon].d; + if (vc->vc_mode != KD_TEXT || registered_fb[con2fb_map[ops->currcon]] != info) + return; p = &fb_display[vc->vc_num]; @@ -2639,26 +2639,23 @@ static int fbcon_fb_registered(int idx) static void fbcon_fb_blanked(struct fb_info *info, int blank) { struct fbcon_ops *ops = info->fbcon_par; - int valid = 1; - - if (!ops || ops->currcon < 0 || - vt_cons[ops->currcon]->vc_mode != KD_TEXT || - registered_fb[con2fb_map[ops->currcon]] != info) - valid = 0; + struct vc_data *vc; - if (valid) { - struct vc_data *vc; + if (!ops || ops->currcon < 0) + return; - vc = vc_cons[ops->currcon].d; + vc = vc_cons[ops->currcon].d; + if (vc->vc_mode != KD_TEXT || + registered_fb[con2fb_map[ops->currcon]] != info) + return; - if (CON_IS_VISIBLE(vc)) { - if (blank) - do_blank_screen(0); - else - do_unblank_screen(0); - } - ops->blank_state = blank; + if (CON_IS_VISIBLE(vc)) { + if (blank) + do_blank_screen(0); + else + do_unblank_screen(0); } + ops->blank_state = blank; } static int fbcon_event_notify(struct notifier_block *self, diff --git a/drivers/video/console/sticon.c b/drivers/video/console/sticon.c index 24f9715f7449..fd5940f41271 100644 --- a/drivers/video/console/sticon.c +++ b/drivers/video/console/sticon.c @@ -87,13 +87,12 @@ static int sticon_set_palette(struct vc_data *c, unsigned char *table) static void sticon_putc(struct vc_data *conp, int c, int ypos, int xpos) { - int unit = conp->vc_num; int redraw_cursor = 0; if (vga_is_gfx || console_blanked) return; - - if (vt_cons[unit]->vc_mode != KD_TEXT) + + if (conp->vc_mode != KD_TEXT) return; #if 0 if ((p->cursor_x == xpos) && (p->cursor_y == 
ypos)) { @@ -111,15 +110,14 @@ static void sticon_putc(struct vc_data *conp, int c, int ypos, int xpos) static void sticon_putcs(struct vc_data *conp, const unsigned short *s, int count, int ypos, int xpos) { - int unit = conp->vc_num; int redraw_cursor = 0; if (vga_is_gfx || console_blanked) return; - if (vt_cons[unit]->vc_mode != KD_TEXT) + if (conp->vc_mode != KD_TEXT) return; - + #if 0 if ((p->cursor_y == ypos) && (xpos <= p->cursor_x) && (p->cursor_x < (xpos + count))) { diff --git a/drivers/video/sun3fb.c b/drivers/video/sun3fb.c index bfed02b7aa29..9b36b9df535f 100644 --- a/drivers/video/sun3fb.c +++ b/drivers/video/sun3fb.c @@ -505,7 +505,7 @@ void sun3fb_palette(int enter) if (fb->restore_palette) { if (enter) fb->restore_palette(fb); - else if (vt_cons[i]->vc_mode != KD_GRAPHICS) + else if (vc_cons[i].d->vc_mode != KD_GRAPHICS) vc_cons[i].d->vc_sw->con_set_palette(vc_cons[i].d, color_table); } } diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 6b38446d4e52..25ab41399ec4 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -1597,7 +1597,7 @@ static int do_kdfontop_ioctl(unsigned int fd, unsigned int cmd, unsigned long ar struct console_font_op op; struct console_font_op32 __user *fontop = compat_ptr(arg); int perm = vt_check(file), i; - struct vt_struct *vt; + struct vc_data *vc; if (perm < 0) return perm; @@ -1607,9 +1607,10 @@ static int do_kdfontop_ioctl(unsigned int fd, unsigned int cmd, unsigned long ar return -EPERM; op.data = compat_ptr(((struct console_font_op32 *)&op)->data); op.flags |= KD_FONT_FLAG_OLD; - vt = (struct vt_struct *)((struct tty_struct *)file->private_data)->driver_data; - i = con_font_op(vc_cons[vt->vc_num].d, &op); - if (i) return i; + vc = ((struct tty_struct *)file->private_data)->driver_data; + i = con_font_op(vc, &op); + if (i) + return i; ((struct console_font_op32 *)&op)->data = (unsigned long)op.data; if (copy_to_user(fontop, &op, sizeof(struct console_font_op32))) return -EFAULT; diff --git 
a/include/linux/console_struct.h b/include/linux/console_struct.h index 062049ca5c44..725be90ef55e 100644 --- a/include/linux/console_struct.h +++ b/include/linux/console_struct.h @@ -26,6 +26,7 @@ struct vc_data { const struct consw *vc_sw; unsigned short *vc_screenbuf; /* In-memory character/attribute buffer */ unsigned int vc_screenbuf_size; + unsigned char vc_mode; /* KD_TEXT, ... */ /* attributes for all characters on screen */ unsigned char vc_attr; /* Current attributes */ unsigned char vc_def_color; /* Default colors */ @@ -48,6 +49,11 @@ struct vc_data { unsigned int vc_state; /* Escape sequence parser state */ unsigned int vc_npar,vc_par[NPAR]; /* Parameters of current escape sequence */ struct tty_struct *vc_tty; /* TTY we are attached to */ + /* data for manual vt switching */ + struct vt_mode vt_mode; + int vt_pid; + int vt_newvt; + wait_queue_head_t paste_wait; /* mode flags */ unsigned int vc_charset : 1; /* Character set G0 / G1 */ unsigned int vc_s_charset : 1; /* Saved character set */ @@ -89,7 +95,6 @@ struct vc_data { struct vc_data **vc_display_fg; /* [!] Ptr to var holding fg console for this display */ unsigned long vc_uni_pagedir; unsigned long *vc_uni_pagedir_loc; /* [!] Location of uni_pagedir variable for this console */ - struct vt_struct *vc_vt; /* additional information is in vt_kern.h */ }; diff --git a/include/linux/vt_kern.h b/include/linux/vt_kern.h index 7c37844c31c0..cb18320adb16 100644 --- a/include/linux/vt_kern.h +++ b/include/linux/vt_kern.h @@ -25,15 +25,6 @@ #define BROKEN_GRAPHICS_PROGRAMS 1 #endif -extern struct vt_struct { - int vc_num; /* The console number */ - unsigned char vc_mode; /* KD_TEXT, ... 
*/ - struct vt_mode vt_mode; - int vt_pid; - int vt_newvt; - wait_queue_head_t paste_wait; -} *vt_cons[MAX_NR_CONSOLES]; - extern void kd_mksound(unsigned int hz, unsigned int ticks); extern int kbd_rate(struct kbd_repeat *rep); -- cgit v1.2.3 From 5025daf1ae75ab4d91f48cebb9af3d4a519e611b Mon Sep 17 00:00:00 2001 From: Alex Tomas Date: Mon, 7 Mar 2005 17:46:14 -0800 Subject: [PATCH] jbd: journal overflow fix #2 fix against credits leak in journal_release_buffer() The idea is to charge a buffer in journal_dirty_metadata(), not in journal_get_*_access()). Each buffer has flag call journal_dirty_metadata() sets on the buffer. Signed-off-by: Alex Tomas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext3/balloc.c | 7 +++--- fs/ext3/ialloc.c | 6 ++--- fs/jbd/commit.c | 16 +++++++++++++ fs/jbd/transaction.c | 56 ++++++++++++++++++-------------------------- include/linux/ext3_jbd.h | 21 +++++++---------- include/linux/jbd.h | 9 +++---- include/linux/journal-head.h | 7 ++++++ 7 files changed, 63 insertions(+), 59 deletions(-) (limited to 'include/linux') diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c index 03cd803e0b28..439df705dd84 100644 --- a/fs/ext3/balloc.c +++ b/fs/ext3/balloc.c @@ -342,7 +342,7 @@ do_more: */ /* @@@ check errors */ BUFFER_TRACE(bitmap_bh, "getting undo access"); - err = ext3_journal_get_undo_access(handle, bitmap_bh, NULL); + err = ext3_journal_get_undo_access(handle, bitmap_bh); if (err) goto error_return; @@ -991,7 +991,6 @@ ext3_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle, unsigned long group_first_block; int ret = 0; int fatal; - int credits = 0; *errp = 0; @@ -1001,7 +1000,7 @@ ext3_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle, * if the buffer is in BJ_Forget state in the committing transaction. 
*/ BUFFER_TRACE(bitmap_bh, "get undo access for new block"); - fatal = ext3_journal_get_undo_access(handle, bitmap_bh, &credits); + fatal = ext3_journal_get_undo_access(handle, bitmap_bh); if (fatal) { *errp = fatal; return -1; @@ -1092,7 +1091,7 @@ out: } BUFFER_TRACE(bitmap_bh, "journal_release_buffer"); - ext3_journal_release_buffer(handle, bitmap_bh, credits); + ext3_journal_release_buffer(handle, bitmap_bh); return ret; } diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c index 3ec7af54e51e..e36102d2cea9 100644 --- a/fs/ext3/ialloc.c +++ b/fs/ext3/ialloc.c @@ -474,11 +474,9 @@ repeat_in_this_group: ino = ext3_find_next_zero_bit((unsigned long *) bitmap_bh->b_data, EXT3_INODES_PER_GROUP(sb), ino); if (ino < EXT3_INODES_PER_GROUP(sb)) { - int credits = 0; BUFFER_TRACE(bitmap_bh, "get_write_access"); - err = ext3_journal_get_write_access_credits(handle, - bitmap_bh, &credits); + err = ext3_journal_get_write_access(handle, bitmap_bh); if (err) goto fail; @@ -494,7 +492,7 @@ repeat_in_this_group: goto got; } /* we lost it */ - journal_release_buffer(handle, bitmap_bh, credits); + journal_release_buffer(handle, bitmap_bh); if (++ino < EXT3_INODES_PER_GROUP(sb)) goto repeat_in_this_group; diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c index aa5f22435d0c..2069cbc9cb1b 100644 --- a/fs/jbd/commit.c +++ b/fs/jbd/commit.c @@ -228,6 +228,22 @@ void journal_commit_transaction(journal_t *journal) jbd_debug (3, "JBD: commit phase 2\n"); + /* + * First, drop modified flag: all accesses to the buffers + * will be tracked for a new trasaction only -bzzz + */ + spin_lock(&journal->j_list_lock); + if (commit_transaction->t_buffers) { + new_jh = jh = commit_transaction->t_buffers->b_tnext; + do { + J_ASSERT_JH(new_jh, new_jh->b_modified == 1 || + new_jh->b_modified == 0); + new_jh->b_modified = 0; + new_jh = new_jh->b_tnext; + } while (new_jh != jh); + } + spin_unlock(&journal->j_list_lock); + /* * Now start flushing things to disk, in the order they appear * on the transaction 
lists. Data blocks go first. diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c index 2cc2b38b2324..3646dc473242 100644 --- a/fs/jbd/transaction.c +++ b/fs/jbd/transaction.c @@ -522,7 +522,7 @@ static void jbd_unexpected_dirty_buffer(struct journal_head *jh) */ static int do_get_write_access(handle_t *handle, struct journal_head *jh, - int force_copy, int *credits) + int force_copy) { struct buffer_head *bh; transaction_t *transaction; @@ -604,11 +604,6 @@ repeat: JBUFFER_TRACE(jh, "has frozen data"); J_ASSERT_JH(jh, jh->b_next_transaction == NULL); jh->b_next_transaction = transaction; - - J_ASSERT_JH(jh, handle->h_buffer_credits > 0); - handle->h_buffer_credits--; - if (credits) - (*credits)++; goto done; } @@ -688,10 +683,6 @@ repeat: jh->b_next_transaction = transaction; } - J_ASSERT(handle->h_buffer_credits > 0); - handle->h_buffer_credits--; - if (credits) - (*credits)++; /* * Finally, if the buffer is not journaled right now, we need to make @@ -749,8 +740,7 @@ out: * because we're write()ing a buffer which is also part of a shared mapping. */ -int journal_get_write_access(handle_t *handle, - struct buffer_head *bh, int *credits) +int journal_get_write_access(handle_t *handle, struct buffer_head *bh) { struct journal_head *jh = journal_add_journal_head(bh); int rc; @@ -758,7 +748,7 @@ int journal_get_write_access(handle_t *handle, /* We do not want to get caught playing with fields which the * log thread also manipulates. Make sure that the buffer * completes any outstanding IO before proceeding. 
*/ - rc = do_get_write_access(handle, jh, 0, credits); + rc = do_get_write_access(handle, jh, 0); journal_put_journal_head(jh); return rc; } @@ -814,9 +804,6 @@ int journal_get_create_access(handle_t *handle, struct buffer_head *bh) J_ASSERT_JH(jh, jh->b_next_transaction == NULL); J_ASSERT_JH(jh, buffer_locked(jh2bh(jh))); - J_ASSERT_JH(jh, handle->h_buffer_credits > 0); - handle->h_buffer_credits--; - if (jh->b_transaction == NULL) { jh->b_transaction = transaction; JBUFFER_TRACE(jh, "file as BJ_Reserved"); @@ -869,8 +856,7 @@ out: * * Returns error number or 0 on success. */ -int journal_get_undo_access(handle_t *handle, struct buffer_head *bh, - int *credits) +int journal_get_undo_access(handle_t *handle, struct buffer_head *bh) { int err; struct journal_head *jh = journal_add_journal_head(bh); @@ -883,7 +869,7 @@ int journal_get_undo_access(handle_t *handle, struct buffer_head *bh, * make sure that obtaining the committed_data is done * atomically wrt. completion of any outstanding commits. */ - err = do_get_write_access(handle, jh, 1, credits); + err = do_get_write_access(handle, jh, 1); if (err) goto out; @@ -1111,6 +1097,17 @@ int journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) jbd_lock_bh_state(bh); + if (jh->b_modified == 0) { + /* + * This buffer's got modified and becoming part + * of the transaction. This needs to be done + * once a transaction -bzzz + */ + jh->b_modified = 1; + J_ASSERT_JH(jh, handle->h_buffer_credits > 0); + handle->h_buffer_credits--; + } + /* * fastpath, to avoid expensive locking. If this buffer is already * on the running transaction's metadata list there is nothing to do. @@ -1161,24 +1158,11 @@ out: * journal_release_buffer: undo a get_write_access without any buffer * updates, if the update decided in the end that it didn't need access. * - * The caller passes in the number of credits which should be put back for - * this buffer (zero or one). 
- * - * We leave the buffer attached to t_reserved_list because even though this - * handle doesn't want it, some other concurrent handle may want to journal - * this buffer. If that handle is curently in between get_write_access() and - * journal_dirty_metadata() then it expects the buffer to be reserved. If - * we were to rip it off t_reserved_list here, the other handle will explode - * when journal_dirty_metadata is presented with a non-reserved buffer. - * - * If nobody really wants to journal this buffer then it will be thrown - * away at the start of commit. */ void -journal_release_buffer(handle_t *handle, struct buffer_head *bh, int credits) +journal_release_buffer(handle_t *handle, struct buffer_head *bh) { BUFFER_TRACE(bh, "entry"); - handle->h_buffer_credits += credits; } /** @@ -1222,6 +1206,12 @@ int journal_forget (handle_t *handle, struct buffer_head *bh) goto not_jbd; } + /* + * The buffer's going from the transaction, we must drop + * all references -bzzz + */ + jh->b_modified = 0; + if (jh->b_transaction == handle->h_transaction) { J_ASSERT_JH(jh, !jh->b_frozen_data); diff --git a/include/linux/ext3_jbd.h b/include/linux/ext3_jbd.h index 47445f93d4f7..e8292af9033b 100644 --- a/include/linux/ext3_jbd.h +++ b/include/linux/ext3_jbd.h @@ -111,9 +111,9 @@ void ext3_journal_abort_handle(const char *caller, const char *err_fn, static inline int __ext3_journal_get_undo_access(const char *where, handle_t *handle, - struct buffer_head *bh, int *credits) + struct buffer_head *bh) { - int err = journal_get_undo_access(handle, bh, credits); + int err = journal_get_undo_access(handle, bh); if (err) ext3_journal_abort_handle(where, __FUNCTION__, bh, handle,err); return err; @@ -121,19 +121,18 @@ __ext3_journal_get_undo_access(const char *where, handle_t *handle, static inline int __ext3_journal_get_write_access(const char *where, handle_t *handle, - struct buffer_head *bh, int *credits) + struct buffer_head *bh) { - int err = journal_get_write_access(handle, 
bh, credits); + int err = journal_get_write_access(handle, bh); if (err) ext3_journal_abort_handle(where, __FUNCTION__, bh, handle,err); return err; } static inline void -ext3_journal_release_buffer(handle_t *handle, struct buffer_head *bh, - int credits) +ext3_journal_release_buffer(handle_t *handle, struct buffer_head *bh) { - journal_release_buffer(handle, bh, credits); + journal_release_buffer(handle, bh); } static inline int @@ -176,12 +175,10 @@ __ext3_journal_dirty_metadata(const char *where, } -#define ext3_journal_get_undo_access(handle, bh, credits) \ - __ext3_journal_get_undo_access(__FUNCTION__, (handle), (bh), (credits)) +#define ext3_journal_get_undo_access(handle, bh) \ + __ext3_journal_get_undo_access(__FUNCTION__, (handle), (bh)) #define ext3_journal_get_write_access(handle, bh) \ - __ext3_journal_get_write_access(__FUNCTION__, (handle), (bh), NULL) -#define ext3_journal_get_write_access_credits(handle, bh, credits) \ - __ext3_journal_get_write_access(__FUNCTION__, (handle), (bh), (credits)) + __ext3_journal_get_write_access(__FUNCTION__, (handle), (bh)) #define ext3_journal_revoke(handle, blocknr, bh) \ __ext3_journal_revoke(__FUNCTION__, (handle), (blocknr), (bh)) #define ext3_journal_get_create_access(handle, bh) \ diff --git a/include/linux/jbd.h b/include/linux/jbd.h index f857ff09921d..bf23599b9f47 100644 --- a/include/linux/jbd.h +++ b/include/linux/jbd.h @@ -867,15 +867,12 @@ static inline handle_t *journal_current_handle(void) extern handle_t *journal_start(journal_t *, int nblocks); extern int journal_restart (handle_t *, int nblocks); extern int journal_extend (handle_t *, int nblocks); -extern int journal_get_write_access(handle_t *, struct buffer_head *, - int *credits); +extern int journal_get_write_access(handle_t *, struct buffer_head *); extern int journal_get_create_access (handle_t *, struct buffer_head *); -extern int journal_get_undo_access(handle_t *, struct buffer_head *, - int *credits); +extern int 
journal_get_undo_access(handle_t *, struct buffer_head *); extern int journal_dirty_data (handle_t *, struct buffer_head *); extern int journal_dirty_metadata (handle_t *, struct buffer_head *); -extern void journal_release_buffer (handle_t *, struct buffer_head *, - int credits); +extern void journal_release_buffer (handle_t *, struct buffer_head *); extern int journal_forget (handle_t *, struct buffer_head *); extern void journal_sync_buffer (struct buffer_head *); extern int journal_invalidatepage(journal_t *, diff --git a/include/linux/journal-head.h b/include/linux/journal-head.h index 8751663d04cc..8a62d1e84b9b 100644 --- a/include/linux/journal-head.h +++ b/include/linux/journal-head.h @@ -31,6 +31,13 @@ struct journal_head { */ unsigned b_jlist; + /* + * This flag signals the buffer has been modified by + * the currently running transaction + * [jbd_lock_bh_state()] + */ + unsigned b_modified; + /* * Copy of the buffer data frozen for writing to the log. * [jbd_lock_bh_state()] -- cgit v1.2.3 From 96761507cc998cb9c1c06f9d74bfa47cb11ae254 Mon Sep 17 00:00:00 2001 From: Alex Tomas Date: Mon, 7 Mar 2005 17:46:28 -0800 Subject: [PATCH] JBD: reduce stack and number of journal descriptors Dynamically allocate the holding array for kjournald write patching rather than allocating it on the stack. 
Signed-off-by: Alex Tomas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/jbd/commit.c | 6 +++--- fs/jbd/journal.c | 25 +++++++++++++++++++++++++ include/linux/jbd.h | 6 ++++++ 3 files changed, 34 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c index 2069cbc9cb1b..5c99233ae92a 100644 --- a/fs/jbd/commit.c +++ b/fs/jbd/commit.c @@ -103,7 +103,7 @@ void journal_commit_transaction(journal_t *journal) { transaction_t *commit_transaction; struct journal_head *jh, *new_jh, *descriptor; - struct buffer_head *wbuf[64]; + struct buffer_head **wbuf = journal->j_wbuf; int bufs; int flags; int err; @@ -287,7 +287,7 @@ write_out_data: BUFFER_TRACE(bh, "start journal writeout"); get_bh(bh); wbuf[bufs++] = bh; - if (bufs == ARRAY_SIZE(wbuf)) { + if (bufs == journal->j_wbufsize) { jbd_debug(2, "submit %d writes\n", bufs); spin_unlock(&journal->j_list_lock); @@ -503,7 +503,7 @@ write_out_data: /* If there's no more to do, or if the descriptor is full, let the IO rip! 
*/ - if (bufs == ARRAY_SIZE(wbuf) || + if (bufs == journal->j_wbufsize || commit_transaction->t_buffers == NULL || space_left < sizeof(journal_block_tag_t) + 16) { diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index 71614e318465..014c1bc8e2ea 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c @@ -721,6 +721,7 @@ journal_t * journal_init_dev(struct block_device *bdev, { journal_t *journal = journal_init_common(); struct buffer_head *bh; + int n; if (!journal) return NULL; @@ -736,6 +737,17 @@ journal_t * journal_init_dev(struct block_device *bdev, journal->j_sb_buffer = bh; journal->j_superblock = (journal_superblock_t *)bh->b_data; + /* journal descriptor can store up to n blocks -bzzz */ + n = journal->j_blocksize / sizeof(journal_block_tag_t); + journal->j_wbufsize = n; + journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL); + if (!journal->j_wbuf) { + printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n", + __FUNCTION__); + kfree(journal); + journal = NULL; + } + return journal; } @@ -752,6 +764,7 @@ journal_t * journal_init_inode (struct inode *inode) struct buffer_head *bh; journal_t *journal = journal_init_common(); int err; + int n; unsigned long blocknr; if (!journal) @@ -768,6 +781,17 @@ journal_t * journal_init_inode (struct inode *inode) journal->j_maxlen = inode->i_size >> inode->i_sb->s_blocksize_bits; journal->j_blocksize = inode->i_sb->s_blocksize; + /* journal descriptor can store up to n blocks -bzzz */ + n = journal->j_blocksize / sizeof(journal_block_tag_t); + journal->j_wbufsize = n; + journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL); + if (!journal->j_wbuf) { + printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n", + __FUNCTION__); + kfree(journal); + return NULL; + } + err = journal_bmap(journal, 0, &blocknr); /* If that failed, give up */ if (err) { @@ -1141,6 +1165,7 @@ void journal_destroy(journal_t *journal) iput(journal->j_inode); if (journal->j_revoke) 
journal_destroy_revoke(journal); + kfree(journal->j_wbuf); kfree(journal); } diff --git a/include/linux/jbd.h b/include/linux/jbd.h index bf23599b9f47..e28f8b23558f 100644 --- a/include/linux/jbd.h +++ b/include/linux/jbd.h @@ -788,6 +788,12 @@ struct journal_s struct jbd_revoke_table_s *j_revoke; struct jbd_revoke_table_s *j_revoke_table[2]; + /* + * array of bhs for journal_commit_transaction + */ + struct buffer_head **j_wbuf; + int j_wbufsize; + /* * An opaque pointer to fs-private information. ext3 puts its * superblock pointer here -- cgit v1.2.3 From 69bfca0e64ca97d1a3063687e26fa0191ae3ddfd Mon Sep 17 00:00:00 2001 From: Liam Girdwood Date: Mon, 7 Mar 2005 17:48:14 -0800 Subject: [PATCH] OSS Support for AC97 low power codecs This is a resend of a patch that has been applied to 2.4. The low power codec functionality has also now been included in ALSA. It checks the codec ID before doing an AC97 register reset. This allows the kernel to support low power codecs that are powered down by a reset command. This patch also fixes some other minor issues. Changes:- - Added AC97_DEFAULT_POWER_OFF to ac97_codec_ids[] - ac97_probe now checks hardwired codec ID's before sending a reset - Added support for WM9713 - Moved the codec specific inits after the mixer setup as some init - tings were being clobbered. - Added extra check so that default_digital_ops doesn't overwrite a valid codec_ops. 
(SPDIF) Signed-off-by: Liam Girdwood Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ac97_codec.h | 1 + sound/oss/ac97_codec.c | 81 ++++++++++++++++++++++++++++++---------------- 2 files changed, 55 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ac97_codec.h b/include/linux/ac97_codec.h index c3970bb88c4d..c35833824e11 100644 --- a/include/linux/ac97_codec.h +++ b/include/linux/ac97_codec.h @@ -323,6 +323,7 @@ struct ac97_ops #define AC97_DELUDED_MODEM 1 /* Audio codec reports its a modem */ #define AC97_NO_PCM_VOLUME 2 /* Volume control is missing */ +#define AC97_DEFAULT_POWER_OFF 4 /* Needs warm reset to power up */ }; extern int ac97_read_proc (char *page_out, char **start, off_t off, diff --git a/sound/oss/ac97_codec.c b/sound/oss/ac97_codec.c index 20a09424a48a..124b1e10a13d 100644 --- a/sound/oss/ac97_codec.c +++ b/sound/oss/ac97_codec.c @@ -71,6 +71,7 @@ static int wolfson_init03(struct ac97_codec * codec); static int wolfson_init04(struct ac97_codec * codec); static int wolfson_init05(struct ac97_codec * codec); static int wolfson_init11(struct ac97_codec * codec); +static int wolfson_init13(struct ac97_codec * codec); static int tritech_init(struct ac97_codec * codec); static int tritech_maestro_init(struct ac97_codec * codec); static int sigmatel_9708_init(struct ac97_codec *codec); @@ -107,6 +108,7 @@ static struct ac97_ops wolfson_ops03 = { wolfson_init03, NULL, NULL }; static struct ac97_ops wolfson_ops04 = { wolfson_init04, NULL, NULL }; static struct ac97_ops wolfson_ops05 = { wolfson_init05, NULL, NULL }; static struct ac97_ops wolfson_ops11 = { wolfson_init11, NULL, NULL }; +static struct ac97_ops wolfson_ops13 = { wolfson_init13, NULL, NULL }; static struct ac97_ops tritech_ops = { tritech_init, NULL, NULL }; static struct ac97_ops tritech_m_ops = { tritech_maestro_init, NULL, NULL }; static struct ac97_ops sigmatel_9708_ops = { sigmatel_9708_init, NULL, NULL }; @@ -171,6 +173,7 
@@ static const struct { {0x574D4C05, "Wolfson WM9705/WM9710", &wolfson_ops05}, {0x574D4C09, "Wolfson WM9709", &null_ops}, {0x574D4C12, "Wolfson WM9711/9712", &wolfson_ops11}, + {0x574D4C13, "Wolfson WM9713", &wolfson_ops13, AC97_DEFAULT_POWER_OFF}, {0x83847600, "SigmaTel STAC????", &null_ops}, {0x83847604, "SigmaTel STAC9701/3/4/5", &null_ops}, {0x83847605, "SigmaTel STAC9704", &null_ops}, @@ -798,6 +801,9 @@ EXPORT_SYMBOL(ac97_release_codec); * Currently codec_wait is used to wait for AC97 codec * reset to complete. * + * Some codecs will power down when a register reset is + * performed. We now check for such codecs. + * * Returns 1 (true) on success, or 0 (false) on failure. */ @@ -811,34 +817,17 @@ int ac97_probe_codec(struct ac97_codec *codec) struct list_head *l; struct ac97_driver *d; - /* probing AC97 codec, AC97 2.0 says that bit 15 of register 0x00 (reset) should - * be read zero. - * - * FIXME: is the following comment outdated? -jgarzik - * Probing of AC97 in this way is not reliable, it is not even SAFE !! - */ - codec->codec_write(codec, AC97_RESET, 0L); - - /* also according to spec, we wait for codec-ready state */ + /* wait for codec-ready state */ if (codec->codec_wait) codec->codec_wait(codec); else udelay(10); - if ((audio = codec->codec_read(codec, AC97_RESET)) & 0x8000) { - printk(KERN_ERR "ac97_codec: %s ac97 codec not present\n", - (codec->id & 0x2) ? (codec->id&1 ? "4th" : "Tertiary") - : (codec->id&1 ? "Secondary": "Primary")); - return 0; - } - - /* probe for Modem Codec */ - codec->modem = ac97_check_modem(codec); - codec->name = NULL; - codec->codec_ops = &default_ops; - + /* will the codec power down if register reset ? 
*/ id1 = codec->codec_read(codec, AC97_VENDOR_ID1); id2 = codec->codec_read(codec, AC97_VENDOR_ID2); + codec->name = NULL; + codec->codec_ops = &null_ops; for (i = 0; i < ARRAY_SIZE(ac97_codec_ids); i++) { if (ac97_codec_ids[i].id == ((id1 << 16) | id2)) { codec->type = ac97_codec_ids[i].id; @@ -850,9 +839,34 @@ int ac97_probe_codec(struct ac97_codec *codec) } codec->model = (id1 << 16) | id2; + if ((codec->flags & AC97_DEFAULT_POWER_OFF) == 0) { + /* reset codec and wait for the ready bit before we continue */ + codec->codec_write(codec, AC97_RESET, 0L); + if (codec->codec_wait) + codec->codec_wait(codec); + else + udelay(10); + } + + /* probing AC97 codec, AC97 2.0 says that bit 15 of register 0x00 (reset) should + * be read zero. + * + * FIXME: is the following comment outdated? -jgarzik + * Probing of AC97 in this way is not reliable, it is not even SAFE !! + */ + if ((audio = codec->codec_read(codec, AC97_RESET)) & 0x8000) { + printk(KERN_ERR "ac97_codec: %s ac97 codec not present\n", + (codec->id & 0x2) ? (codec->id&1 ? "4th" : "Tertiary") + : (codec->id&1 ? 
"Secondary": "Primary")); + return 0; + } + /* probe for Modem Codec */ + codec->modem = ac97_check_modem(codec); + + /* enable SPDIF */ f = codec->codec_read(codec, AC97_EXTENDED_STATUS); - if(f & 4) + if((codec->codec_ops == &null_ops) && (f & 4)) codec->codec_ops = &default_digital_ops; /* A device which thinks its a modem but isnt */ @@ -921,11 +935,6 @@ static int ac97_init_mixer(struct ac97_codec *codec) codec->recmask_io = ac97_recmask_io; codec->mixer_ioctl = ac97_mixer_ioctl; - /* codec specific initialization for 4-6 channel output or secondary codec stuff */ - if (codec->codec_ops->init != NULL) { - codec->codec_ops->init(codec); - } - /* initialize mixer channel volumes */ for (i = 0; i < SOUND_MIXER_NRDEVICES; i++) { struct mixer_defaults *md = &mixer_defaults[i]; @@ -936,6 +945,11 @@ static int ac97_init_mixer(struct ac97_codec *codec) ac97_set_mixer(codec, md->mixer, md->value); } + /* codec specific initialization for 4-6 channel output or secondary codec stuff */ + if (codec->codec_ops->init != NULL) { + codec->codec_ops->init(codec); + } + /* * Volume is MUTE only on this device. We have to initialise * it but its useless beyond that. 
@@ -1091,6 +1105,19 @@ static int wolfson_init11(struct ac97_codec * codec) return 0; } +/* WM9713 */ +static int wolfson_init13(struct ac97_codec * codec) +{ + codec->codec_write(codec, AC97_RECORD_GAIN, 0x00a0); + codec->codec_write(codec, AC97_POWER_CONTROL, 0x0000); + codec->codec_write(codec, AC97_EXTENDED_MODEM_ID, 0xDA00); + codec->codec_write(codec, AC97_EXTEND_MODEM_STAT, 0x3810); + codec->codec_write(codec, AC97_PHONE_VOL, 0x0808); + codec->codec_write(codec, AC97_PCBEEP_VOL, 0x0808); + + return 0; +} + static int tritech_init(struct ac97_codec * codec) { codec->codec_write(codec, 0x26, 0x0300); -- cgit v1.2.3 From 02721572728cccd31b54d69e1dfb8124fa02407e Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 7 Mar 2005 17:48:29 -0800 Subject: [PATCH] Fix kallsyms/insmod/rmmod race The attached patch fixes a race between kallsyms and insmod/rmmod. The problem is this: (1) The various kallsyms functions poke around in the module list without any locking so that they can be called from the oops handler. (2) Although insmod and rmmod use locks to exclude each other, these have no effect on the kallsyms function. (3) Although rmmod modifies the module state with the machine "stopped", it hasn't removed the metadata from the module metadata list, meaning that as soon as the machine is "restarted", the metadata can be observed by kallsyms. It's not possible to say that an item in that list should be ignored if its state is marked as inactive - you can't get at the state information because you can't trust the metadata in which it is embedded. Furthermore, list linkage information is embedded in the metadata too, so you can't trust that either... (4) kallsyms may be walking the module list without a lock whilst either insmod or rmmod are busy changing it. insmod probably isn't a problem since nothing is going away, but rmmod is as it's deleting an entry. 
(5) Therefore nothing that uses these functions can in any way trust any pointers to "static" data (such as module symbol names or module names) that are returned. (6) On ppc64 the problems are exacerbated since the hypervisor may reschedule bits of the kernel, making operations that appear adjacent occur a long time apart. This patch fixes the race by only linking/unlinking modules into/from the master module list with the machine in the "stopped" state. This means that any "static" information can be trusted as far as the next kernel reschedule on any given CPU without the need to hold any locks. However, I'm not sure how this is affected by preemption. I suspect more work may need to be done in that case, but I'm not entirely sure. This also means that rmmod has to bump the machine into the stopped state twice... but since that shouldn't be a common operation, I don't think that's a problem. I've amended this patch to not get spinlocks whilst in the machine locked state - there's no point as nothing else can be holding spinlocks. 
Signed-Off-By: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/stop_machine.h | 2 +- kernel/kallsyms.c | 16 ++++++++++++++-- kernel/module.c | 33 +++++++++++++++++++++++++-------- 3 files changed, 40 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h index 6f43cb53f21b..151a803ed0ed 100644 --- a/include/linux/stop_machine.h +++ b/include/linux/stop_machine.h @@ -8,7 +8,7 @@ #include #include -#ifdef CONFIG_SMP +#if defined(CONFIG_STOP_MACHINE) && defined(CONFIG_SMP) /** * stop_machine_run: freeze the machine on all CPUs and run this function * @fn: the function to run diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index bd765adaacd6..449306f696c5 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -146,13 +146,20 @@ unsigned long kallsyms_lookup_name(const char *name) return module_kallsyms_lookup_name(name); } -/* Lookup an address. modname is set to NULL if it's in the kernel. */ +/* + * Lookup an address + * - modname is set to NULL if it's in the kernel + * - we guarantee that the returned name is valid until we reschedule even if + * it resides in a module + * - we also guarantee that modname will be valid until rescheduled + */ const char *kallsyms_lookup(unsigned long addr, unsigned long *symbolsize, unsigned long *offset, char **modname, char *namebuf) { unsigned long i, low, high, mid; + const char *msym; /* This kernel should never had been booted. */ BUG_ON(!kallsyms_addresses); @@ -204,7 +211,12 @@ const char *kallsyms_lookup(unsigned long addr, return namebuf; } - return module_address_lookup(addr, symbolsize, offset, modname); + /* see if it's in a module */ + msym = module_address_lookup(addr, symbolsize, offset, modname); + if (msym) + return strncpy(namebuf, msym, KSYM_NAME_LEN); + + return NULL; } /* Replace "%s" in format with address, or returns -errno. 
*/ diff --git a/kernel/module.c b/kernel/module.c index ce427b675b98..2dbfa0773faf 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -472,7 +472,7 @@ struct stopref }; /* Whole machine is stopped with interrupts off when this runs. */ -static inline int __try_stop_module(void *_sref) +static int __try_stop_module(void *_sref) { struct stopref *sref = _sref; @@ -1072,14 +1072,22 @@ static void mod_kobject_remove(struct module *mod) kobject_unregister(&mod->mkobj.kobj); } +/* + * unlink the module with the whole machine is stopped with interrupts off + * - this defends against kallsyms not taking locks + */ +static int __unlink_module(void *_mod) +{ + struct module *mod = _mod; + list_del(&mod->list); + return 0; +} + /* Free a module, remove from lists, etc (must hold module mutex). */ static void free_module(struct module *mod) { /* Delete from various lists */ - spin_lock_irq(&modlist_lock); - list_del(&mod->list); - spin_unlock_irq(&modlist_lock); - + stop_machine_run(__unlink_module, mod, NR_CPUS); remove_sect_attrs(mod); mod_kobject_remove(mod); @@ -1732,6 +1740,17 @@ static struct module *load_module(void __user *umod, goto free_hdr; } +/* + * link the module with the whole machine is stopped with interrupts off + * - this defends against kallsyms not taking locks + */ +static int __link_module(void *_mod) +{ + struct module *mod = _mod; + list_add(&mod->list, &modules); + return 0; +} + /* This is where the real work happens */ asmlinkage long sys_init_module(void __user *umod, @@ -1766,9 +1785,7 @@ sys_init_module(void __user *umod, /* Now sew it into the lists. They won't access us, since strong_try_module_get() will fail. 
*/ - spin_lock_irq(&modlist_lock); - list_add(&mod->list, &modules); - spin_unlock_irq(&modlist_lock); + stop_machine_run(__link_module, mod, NR_CPUS); /* Drop lock so they can recurse */ up(&module_mutex); -- cgit v1.2.3 From cd67725a0cede85ca80348333efdd7063e3ebfb7 Mon Sep 17 00:00:00 2001 From: Jan Blunck Date: Mon, 7 Mar 2005 17:48:44 -0800 Subject: [PATCH] d_drop should use per dentry lock d_drop() must use the dentry->d_lock spinlock. In some cases __d_drop() was used without holding the dentry->d_lock spinlock, too. This could end in a race with __d_lookup(). Signed-off-by: Jan Blunck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs4/root.c | 2 ++ fs/dcache.c | 3 +++ fs/namei.c | 14 +++++--------- fs/proc/base.c | 6 +++++- fs/sysfs/inode.c | 6 +++++- include/linux/dcache.h | 19 ++++++++++--------- 6 files changed, 30 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index 82ef8ed2fabc..3765c047f157 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -605,7 +605,9 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry) spin_unlock(&dcache_lock); return -ENOTEMPTY; } + spin_lock(&dentry->d_lock); __d_drop(dentry); + spin_unlock(&dentry->d_lock); spin_unlock(&dcache_lock); dput(ino->dentry); diff --git a/fs/dcache.c b/fs/dcache.c index ed90b724af78..e6acad700833 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -340,13 +340,16 @@ restart: tmp = head; while ((tmp = tmp->next) != head) { struct dentry *dentry = list_entry(tmp, struct dentry, d_alias); + spin_lock(&dentry->d_lock); if (!atomic_read(&dentry->d_count)) { __dget_locked(dentry); __d_drop(dentry); + spin_unlock(&dentry->d_lock); spin_unlock(&dcache_lock); dput(dentry); goto restart; } + spin_unlock(&dentry->d_lock); } spin_unlock(&dcache_lock); } diff --git a/fs/namei.c b/fs/namei.c index 281ca91fd1cc..63e3e6494f8d 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1685,17 +1685,13 @@ out: void 
dentry_unhash(struct dentry *dentry) { dget(dentry); - spin_lock(&dcache_lock); - switch (atomic_read(&dentry->d_count)) { - default: - spin_unlock(&dcache_lock); + if (atomic_read(&dentry->d_count)) shrink_dcache_parent(dentry); - spin_lock(&dcache_lock); - if (atomic_read(&dentry->d_count) != 2) - break; - case 2: + spin_lock(&dcache_lock); + spin_lock(&dentry->d_lock); + if (atomic_read(&dentry->d_count) == 2) __d_drop(dentry); - } + spin_unlock(&dentry->d_lock); spin_unlock(&dcache_lock); } diff --git a/fs/proc/base.c b/fs/proc/base.c index b30ee97c308f..9ab35875845d 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1630,11 +1630,15 @@ struct dentry *proc_pid_unhash(struct task_struct *p) if (proc_dentry != NULL) { spin_lock(&dcache_lock); + spin_lock(&proc_dentry->d_lock); if (!d_unhashed(proc_dentry)) { dget_locked(proc_dentry); __d_drop(proc_dentry); - } else + spin_unlock(&proc_dentry->d_lock); + } else { + spin_unlock(&proc_dentry->d_lock); proc_dentry = NULL; + } spin_unlock(&dcache_lock); } return proc_dentry; diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c index 204d071baa5c..97dc6db0870c 100644 --- a/fs/sysfs/inode.c +++ b/fs/sysfs/inode.c @@ -129,13 +129,17 @@ void sysfs_drop_dentry(struct sysfs_dirent * sd, struct dentry * parent) if (dentry) { spin_lock(&dcache_lock); + spin_lock(&dentry->d_lock); if (!(d_unhashed(dentry) && dentry->d_inode)) { dget_locked(dentry); __d_drop(dentry); + spin_unlock(&dentry->d_lock); spin_unlock(&dcache_lock); simple_unlink(parent->d_inode, dentry); - } else + } else { + spin_unlock(&dentry->d_lock); spin_unlock(&dcache_lock); + } } } diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 2da76867183c..50be290d24d2 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -162,17 +162,16 @@ extern spinlock_t dcache_lock; * d_drop - drop a dentry * @dentry: dentry to drop * - * d_drop() unhashes the entry from the parent - * dentry hashes, so that it won't be found through - * a VFS lookup any 
more. Note that this is different - * from deleting the dentry - d_delete will try to - * mark the dentry negative if possible, giving a - * successful _negative_ lookup, while d_drop will + * d_drop() unhashes the entry from the parent dentry hashes, so that it won't + * be found through a VFS lookup any more. Note that this is different from + * deleting the dentry - d_delete will try to mark the dentry negative if + * possible, giving a successful _negative_ lookup, while d_drop will * just make the cache lookup fail. * - * d_drop() is used mainly for stuff that wants - * to invalidate a dentry for some reason (NFS - * timeouts or autofs deletes). + * d_drop() is used mainly for stuff that wants to invalidate a dentry for some + * reason (NFS timeouts or autofs deletes). + * + * __d_drop requires dentry->d_lock. */ static inline void __d_drop(struct dentry *dentry) @@ -186,7 +185,9 @@ static inline void __d_drop(struct dentry *dentry) static inline void d_drop(struct dentry *dentry) { spin_lock(&dcache_lock); + spin_lock(&dentry->d_lock); __d_drop(dentry); + spin_unlock(&dentry->d_lock); spin_unlock(&dcache_lock); } -- cgit v1.2.3 From e8be9091faa43e75879ec238d0171b6f3484d61c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 7 Mar 2005 17:49:01 -0800 Subject: [PATCH] Add struct request end_io callback This is needed for several things, one in-tree user which I will introduce after this patch. This adds a ->end_io callback to struct request, so it can be used with async io of any sort. Right now users have to wait for completion in a blocking manner. In the next iteration, ->waiting can be folded into ->end_io_data since it is just a special case of that use. From: Peter Osterlund The problem is that the add-struct-request-end_io-callback patch forgot to update pktcdvd.c. This patch fixes it. 
Signed-off-by: Jens Axboe Signed-off-by: Peter Osterlund Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/ll_rw_blk.c | 36 ++++++++++++++++++++++++++++-------- drivers/block/paride/pd.c | 1 + drivers/block/pktcdvd.c | 1 + drivers/ide/ide-io.c | 1 + drivers/ide/ide-tape.c | 1 + include/linux/blkdev.h | 11 ++++++++++- 6 files changed, 42 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/drivers/block/ll_rw_blk.c b/drivers/block/ll_rw_blk.c index 3994af7e555b..d0a87b77cbb0 100644 --- a/drivers/block/ll_rw_blk.c +++ b/drivers/block/ll_rw_blk.c @@ -1753,6 +1753,8 @@ rq_starved: rq->data_len = 0; rq->data = NULL; rq->sense = NULL; + rq->end_io = NULL; + rq->end_io_data = NULL; out: put_io_context(ioc); @@ -2018,8 +2020,8 @@ int blk_execute_rq(request_queue_t *q, struct gendisk *bd_disk, } rq->flags |= REQ_NOMERGE; - if (!rq->waiting) - rq->waiting = &wait; + rq->waiting = &wait; + rq->end_io = blk_end_sync_rq; elv_add_request(q, rq, ELEVATOR_INSERT_BACK, 1); generic_unplug_device(q); wait_for_completion(rq->waiting); @@ -2171,7 +2173,7 @@ void disk_round_stats(struct gendisk *disk) /* * queue lock must be held */ -void __blk_put_request(request_queue_t *q, struct request *req) +static void __blk_put_request(request_queue_t *q, struct request *req) { struct request_list *rl = req->rl; @@ -2218,6 +2220,25 @@ void blk_put_request(struct request *req) EXPORT_SYMBOL(blk_put_request); +/** + * blk_end_sync_rq - executes a completion event on a request + * @rq: request to complete + */ +void blk_end_sync_rq(struct request *rq) +{ + struct completion *waiting = rq->waiting; + + rq->waiting = NULL; + __blk_put_request(rq->q, rq); + + /* + * complete last, if this is a stack request the process (and thus + * the rq pointer) could be invalid right after this complete() + */ + complete(waiting); +} +EXPORT_SYMBOL(blk_end_sync_rq); + /** * blk_congestion_wait - wait for a queue to become uncongested * @rw: READ or WRITE @@ -2978,7 
+2999,6 @@ EXPORT_SYMBOL(end_that_request_chunk); void end_that_request_last(struct request *req) { struct gendisk *disk = req->rq_disk; - struct completion *waiting = req->waiting; if (unlikely(laptop_mode) && blk_fs_request(req)) laptop_io_completion(); @@ -2998,10 +3018,10 @@ void end_that_request_last(struct request *req) disk_round_stats(disk); disk->in_flight--; } - __blk_put_request(req->q, req); - /* Do this LAST! The structure may be freed immediately afterwards */ - if (waiting) - complete(waiting); + if (req->end_io) + req->end_io(req); + else + __blk_put_request(req->q, req); } EXPORT_SYMBOL(end_that_request_last); diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c index 4390587b3413..35f2c1d38f1f 100644 --- a/drivers/block/paride/pd.c +++ b/drivers/block/paride/pd.c @@ -743,6 +743,7 @@ static int pd_special_command(struct pd_unit *disk, rq.rq_disk = disk->gd; rq.ref_count = 1; rq.waiting = &wait; + rq.end_io = blk_end_sync_rq; blk_insert_request(disk->gd->queue, &rq, 0, func, 0); wait_for_completion(&wait); rq.waiting = NULL; diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 7590000b9457..da8be8ead543 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -375,6 +375,7 @@ static int pkt_generic_packet(struct pktcdvd_device *pd, struct packet_command * rq->ref_count++; rq->flags |= REQ_NOMERGE; rq->waiting = &wait; + rq->end_io = blk_end_sync_rq; elv_add_request(q, rq, ELEVATOR_INSERT_BACK, 1); generic_unplug_device(q); wait_for_completion(&wait); diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index 1984fbe5e5c6..186a51eb9196 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -1607,6 +1607,7 @@ int ide_do_drive_cmd (ide_drive_t *drive, struct request *rq, ide_action_t actio if (must_wait) { rq->ref_count++; rq->waiting = &wait; + rq->end_io = blk_end_sync_rq; } spin_lock_irqsave(&ide_lock, flags); diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index 
509e90a19e8c..c73f053c79d1 100644 --- a/drivers/ide/ide-tape.c +++ b/drivers/ide/ide-tape.c @@ -2720,6 +2720,7 @@ static void idetape_wait_for_request (ide_drive_t *drive, struct request *rq) } #endif /* IDETAPE_DEBUG_BUGS */ rq->waiting = &wait; + rq->end_io = blk_end_sync_rq; spin_unlock_irq(&tape->spinlock); wait_for_completion(&wait); /* The stage and its struct request have been deallocated */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 5615a3c9e410..c7553066b917 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -93,6 +93,9 @@ struct io_context *get_io_context(int gfp_flags); void copy_io_context(struct io_context **pdst, struct io_context **psrc); void swap_io_context(struct io_context **ioc1, struct io_context **ioc2); +struct request; +typedef void (rq_end_io_fn)(struct request *); + struct request_list { int count[2]; int starved[2]; @@ -176,6 +179,12 @@ struct request { * For Power Management requests */ struct request_pm_state *pm; + + /* + * completion callback. 
end_io_data should be folded in with waiting + */ + rq_end_io_fn *end_io; + void *end_io_data; }; /* @@ -509,10 +518,10 @@ extern void blk_unregister_queue(struct gendisk *disk); extern void register_disk(struct gendisk *dev); extern void generic_make_request(struct bio *bio); extern void blk_put_request(struct request *); +extern void blk_end_sync_rq(struct request *rq); extern void blk_attempt_remerge(request_queue_t *, struct request *); extern void __blk_attempt_remerge(request_queue_t *, struct request *); extern struct request *blk_get_request(request_queue_t *, int, int); -extern void blk_put_request(struct request *); extern void blk_insert_request(request_queue_t *, struct request *, int, void *, int); extern void blk_requeue_request(request_queue_t *, struct request *); extern void blk_plug_device(request_queue_t *); -- cgit v1.2.3 From e18e923a3cbae899248430b52fc761ce6e7f6ef0 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 7 Mar 2005 17:49:17 -0800 Subject: [PATCH] rework core barrier support This reworks the core barrier support to be a lot nicer, so that all the nasty code resides outside of drivers/ide. It requires minimal changes to support in a driver, I've added SCSI support as an example. The ide code is adapted to the new code. With this patch, we support full barriers on sata now. Bart has acked the addition to -mm, I would like for this to be submitted as soon as 2.6.12 opens. 
Signed-off-by: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/elevator.c | 16 +++- drivers/block/ll_rw_blk.c | 228 ++++++++++++++++++++++++++++++++++++++++---- drivers/ide/ide-disk.c | 73 ++++++++++++-- drivers/ide/ide-io.c | 159 ++---------------------------- drivers/scsi/ahci.c | 1 + drivers/scsi/ata_piix.c | 1 + drivers/scsi/hosts.c | 10 ++ drivers/scsi/sata_nv.c | 1 + drivers/scsi/sata_promise.c | 1 + drivers/scsi/sata_sil.c | 1 + drivers/scsi/sata_sis.c | 1 + drivers/scsi/sata_svw.c | 1 + drivers/scsi/sata_sx4.c | 1 + drivers/scsi/sata_uli.c | 1 + drivers/scsi/sata_via.c | 1 + drivers/scsi/sata_vsc.c | 1 + drivers/scsi/scsi_lib.c | 46 +++++++++ drivers/scsi/sd.c | 31 ++++++ include/linux/blkdev.h | 24 ++++- include/linux/ide.h | 2 +- include/scsi/scsi_driver.h | 2 + include/scsi/scsi_host.h | 12 +++ 22 files changed, 430 insertions(+), 184 deletions(-) (limited to 'include/linux') diff --git a/drivers/block/elevator.c b/drivers/block/elevator.c index 35f4079b04d6..241fdbcb29bc 100644 --- a/drivers/block/elevator.c +++ b/drivers/block/elevator.c @@ -320,7 +320,21 @@ void elv_add_request(request_queue_t *q, struct request *rq, int where, static inline struct request *__elv_next_request(request_queue_t *q) { - return q->elevator->ops->elevator_next_req_fn(q); + struct request *rq = q->elevator->ops->elevator_next_req_fn(q); + + /* + * if this is a barrier write and the device has to issue a + * flush sequence to support it, check how far we are + */ + if (rq && blk_fs_request(rq) && blk_barrier_rq(rq)) { + BUG_ON(q->ordered == QUEUE_ORDERED_NONE); + + if (q->ordered == QUEUE_ORDERED_FLUSH && + !blk_barrier_preflush(rq)) + rq = blk_start_pre_flush(q, rq); + } + + return rq; } struct request *elv_next_request(request_queue_t *q) diff --git a/drivers/block/ll_rw_blk.c b/drivers/block/ll_rw_blk.c index d0a87b77cbb0..af47bc3bb0b3 100644 --- a/drivers/block/ll_rw_blk.c +++ b/drivers/block/ll_rw_blk.c @@ -267,6 +267,25 @@ void 
blk_queue_make_request(request_queue_t * q, make_request_fn * mfn) EXPORT_SYMBOL(blk_queue_make_request); +static inline void rq_init(request_queue_t *q, struct request *rq) +{ + INIT_LIST_HEAD(&rq->queuelist); + + rq->errors = 0; + rq->rq_status = RQ_ACTIVE; + rq->bio = rq->biotail = NULL; + rq->buffer = NULL; + rq->ref_count = 1; + rq->q = q; + rq->waiting = NULL; + rq->special = NULL; + rq->data_len = 0; + rq->data = NULL; + rq->sense = NULL; + rq->end_io = NULL; + rq->end_io_data = NULL; +} + /** * blk_queue_ordered - does this queue support ordered writes * @q: the request queue @@ -281,10 +300,26 @@ EXPORT_SYMBOL(blk_queue_make_request); **/ void blk_queue_ordered(request_queue_t *q, int flag) { - if (flag) - set_bit(QUEUE_FLAG_ORDERED, &q->queue_flags); - else - clear_bit(QUEUE_FLAG_ORDERED, &q->queue_flags); + switch (flag) { + case QUEUE_ORDERED_NONE: + if (q->flush_rq) + kmem_cache_free(request_cachep, q->flush_rq); + q->flush_rq = NULL; + q->ordered = flag; + break; + case QUEUE_ORDERED_TAG: + q->ordered = flag; + break; + case QUEUE_ORDERED_FLUSH: + q->ordered = flag; + if (!q->flush_rq) + q->flush_rq = kmem_cache_alloc(request_cachep, + GFP_KERNEL); + break; + default: + printk("blk_queue_ordered: bad value %d\n", flag); + break; + } } EXPORT_SYMBOL(blk_queue_ordered); @@ -306,6 +341,170 @@ void blk_queue_issue_flush_fn(request_queue_t *q, issue_flush_fn *iff) EXPORT_SYMBOL(blk_queue_issue_flush_fn); +/* + * Cache flushing for ordered writes handling + */ +static void blk_pre_flush_end_io(struct request *flush_rq) +{ + struct request *rq = flush_rq->end_io_data; + request_queue_t *q = rq->q; + + rq->flags |= REQ_BAR_PREFLUSH; + + if (!flush_rq->errors) + elv_requeue_request(q, rq); + else { + q->end_flush_fn(q, flush_rq); + clear_bit(QUEUE_FLAG_FLUSH, &q->queue_flags); + } +} + +static void blk_post_flush_end_io(struct request *flush_rq) +{ + struct request *rq = flush_rq->end_io_data; + request_queue_t *q = rq->q; + + rq->flags |= REQ_BAR_POSTFLUSH; + 
+ /* + * called from end_that_request_last(), so we know that the queue + * lock is held + */ + spin_unlock(q->queue_lock); + q->end_flush_fn(q, flush_rq); + spin_lock(q->queue_lock); + + clear_bit(QUEUE_FLAG_FLUSH, &q->queue_flags); +} + +struct request *blk_start_pre_flush(request_queue_t *q, struct request *rq) +{ + struct request *flush_rq = q->flush_rq; + + BUG_ON(!blk_barrier_rq(rq)); + + rq_init(q, flush_rq); + flush_rq->elevator_private = NULL; + flush_rq->flags = 0; + flush_rq->rq_disk = rq->rq_disk; + flush_rq->rl = NULL; + + /* + * prepare_flush returns 0 if no flush is needed, just mark both + * pre and post flush as done in that case + */ + if (!q->prepare_flush_fn(q, flush_rq)) { + rq->flags |= REQ_BAR_PREFLUSH | REQ_BAR_POSTFLUSH; + return rq; + } + + set_bit(QUEUE_FLAG_FLUSH, &q->queue_flags); + + /* + * some drivers dequeue requests right away, some only after io + * completion. make sure the request is dequeued. + */ + if (!list_empty(&rq->queuelist)) + blkdev_dequeue_request(rq); + + flush_rq->end_io_data = rq; + flush_rq->end_io = blk_pre_flush_end_io; + + __elv_add_request(q, flush_rq, ELEVATOR_INSERT_FRONT, 0); + return flush_rq; +} + +static void blk_start_post_flush(request_queue_t *q, struct request *rq) +{ + struct request *flush_rq = q->flush_rq; + + BUG_ON(!blk_barrier_rq(rq)); + + rq_init(q, flush_rq); + flush_rq->elevator_private = NULL; + flush_rq->flags = 0; + flush_rq->rq_disk = rq->rq_disk; + flush_rq->rl = NULL; + + if (q->prepare_flush_fn(q, flush_rq)) { + flush_rq->end_io_data = rq; + flush_rq->end_io = blk_post_flush_end_io; + + __elv_add_request(q, flush_rq, ELEVATOR_INSERT_FRONT, 0); + q->request_fn(q); + } +} + +static inline int blk_check_end_barrier(request_queue_t *q, struct request *rq, + int sectors) +{ + if (sectors > rq->nr_sectors) + sectors = rq->nr_sectors; + + rq->nr_sectors -= sectors; + return rq->nr_sectors; +} + +static int __blk_complete_barrier_rq(request_queue_t *q, struct request *rq, + int sectors, int 
queue_locked) +{ + if (q->ordered != QUEUE_ORDERED_FLUSH) + return 0; + if (!blk_fs_request(rq) || !blk_barrier_rq(rq)) + return 0; + if (blk_barrier_postflush(rq)) + return 0; + + if (!blk_check_end_barrier(q, rq, sectors)) { + unsigned long flags = 0; + + if (!queue_locked) + spin_lock_irqsave(q->queue_lock, flags); + + blk_start_post_flush(q, rq); + + if (!queue_locked) + spin_unlock_irqrestore(q->queue_lock, flags); + } + + return 1; +} + +/** + * blk_complete_barrier_rq - complete possible barrier request + * @q: the request queue for the device + * @rq: the request + * @sectors: number of sectors to complete + * + * Description: + * Used in driver end_io handling to determine whether to postpone + * completion of a barrier request until a post flush has been done. This + * is the unlocked variant, used if the caller doesn't already hold the + * queue lock. + **/ +int blk_complete_barrier_rq(request_queue_t *q, struct request *rq, int sectors) +{ + return __blk_complete_barrier_rq(q, rq, sectors, 0); +} +EXPORT_SYMBOL(blk_complete_barrier_rq); + +/** + * blk_complete_barrier_rq_locked - complete possible barrier request + * @q: the request queue for the device + * @rq: the request + * @sectors: number of sectors to complete + * + * Description: + * See blk_complete_barrier_rq(). This variant must be used if the caller + * holds the queue lock. 
+ **/ +int blk_complete_barrier_rq_locked(request_queue_t *q, struct request *rq, + int sectors) +{ + return __blk_complete_barrier_rq(q, rq, sectors, 1); +} +EXPORT_SYMBOL(blk_complete_barrier_rq_locked); + /** * blk_queue_bounce_limit - set bounce buffer limit for queue * @q: the request queue for the device @@ -1428,6 +1627,8 @@ void blk_cleanup_queue(request_queue_t * q) if (q->queue_tags) __blk_queue_free_tags(q); + blk_queue_ordered(q, QUEUE_ORDERED_NONE); + kmem_cache_free(requestq_cachep, q); } @@ -1739,23 +1940,8 @@ rq_starved: if (ioc_batching(q, ioc)) ioc->nr_batch_requests--; - INIT_LIST_HEAD(&rq->queuelist); - - rq->errors = 0; - rq->rq_status = RQ_ACTIVE; - rq->bio = rq->biotail = NULL; - rq->buffer = NULL; - rq->ref_count = 1; - rq->q = q; + rq_init(q, rq); rq->rl = rl; - rq->waiting = NULL; - rq->special = NULL; - rq->data_len = 0; - rq->data = NULL; - rq->sense = NULL; - rq->end_io = NULL; - rq->end_io_data = NULL; - out: put_io_context(ioc); return rq; @@ -2392,7 +2578,7 @@ static int __make_request(request_queue_t *q, struct bio *bio) spin_lock_prefetch(q->queue_lock); barrier = bio_barrier(bio); - if (barrier && !(q->queue_flags & (1 << QUEUE_FLAG_ORDERED))) { + if (barrier && (q->ordered == QUEUE_ORDERED_NONE)) { err = -EOPNOTSUPP; goto end_io; } diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c index db55f241f76f..36b90f85968c 100644 --- a/drivers/ide/ide-disk.c +++ b/drivers/ide/ide-disk.c @@ -698,18 +698,54 @@ static ide_proc_entry_t idedisk_proc[] = { #endif /* CONFIG_PROC_FS */ -static int idedisk_issue_flush(request_queue_t *q, struct gendisk *disk, - sector_t *error_sector) +static void idedisk_end_flush(request_queue_t *q, struct request *flush_rq) +{ + ide_drive_t *drive = q->queuedata; + struct request *rq = flush_rq->end_io_data; + int good_sectors = rq->hard_nr_sectors; + int bad_sectors; + sector_t sector; + + if (flush_rq->errors & ABRT_ERR) { + printk(KERN_ERR "%s: barrier support doesn't work\n", drive->name); + 
blk_queue_ordered(drive->queue, QUEUE_ORDERED_NONE); + blk_queue_issue_flush_fn(drive->queue, NULL); + good_sectors = 0; + } else if (flush_rq->errors) { + sector = ide_get_error_location(drive, flush_rq->buffer); + if ((sector >= rq->hard_sector) && + (sector < rq->hard_sector + rq->hard_nr_sectors)) + good_sectors = sector - rq->hard_sector; + else + good_sectors = 0; + } + + if (flush_rq->errors) + printk(KERN_ERR "%s: failed barrier write: " + "sector=%Lx(good=%d/bad=%d)\n", + drive->name, (unsigned long long)rq->sector, + good_sectors, + (int) (rq->hard_nr_sectors-good_sectors)); + + bad_sectors = rq->hard_nr_sectors - good_sectors; + + spin_lock(&ide_lock); + + if (good_sectors) + __ide_end_request(drive, rq, 1, good_sectors); + if (bad_sectors) + __ide_end_request(drive, rq, 0, bad_sectors); + + spin_unlock(&ide_lock); +} + +static int idedisk_prepare_flush(request_queue_t *q, struct request *rq) { ide_drive_t *drive = q->queuedata; - struct request *rq; - int ret; if (!drive->wcache) return 0; - rq = blk_get_request(q, WRITE, __GFP_WAIT); - memset(rq->cmd, 0, sizeof(rq->cmd)); if (ide_id_has_flush_cache_ext(drive->id) && @@ -721,6 +757,22 @@ static int idedisk_issue_flush(request_queue_t *q, struct gendisk *disk, rq->flags |= REQ_DRIVE_TASK | REQ_SOFTBARRIER; rq->buffer = rq->cmd; + return 1; +} + +static int idedisk_issue_flush(request_queue_t *q, struct gendisk *disk, + sector_t *error_sector) +{ + ide_drive_t *drive = q->queuedata; + struct request *rq; + int ret; + + if (!drive->wcache) + return 0; + + rq = blk_get_request(q, WRITE, __GFP_WAIT); + + idedisk_prepare_flush(q, rq); ret = blk_execute_rq(q, disk, rq); @@ -1098,10 +1150,15 @@ static void idedisk_setup (ide_drive_t *drive) barrier = 0; } - printk(KERN_DEBUG "%s: cache flushes %ssupported\n", + if (!strncmp(drive->name, "hdc", 3)) + barrier = 1; + + printk(KERN_INFO "%s: cache flushes %ssupported\n", drive->name, barrier ? 
"" : "not "); if (barrier) { - blk_queue_ordered(drive->queue, 1); + blk_queue_ordered(drive->queue, QUEUE_ORDERED_FLUSH); + drive->queue->prepare_flush_fn = idedisk_prepare_flush; + drive->queue->end_flush_fn = idedisk_end_flush; blk_queue_issue_flush_fn(drive->queue, idedisk_issue_flush); } } diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index 186a51eb9196..e24359fdb463 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -55,62 +55,8 @@ #include #include -static void ide_fill_flush_cmd(ide_drive_t *drive, struct request *rq) -{ - char *buf = rq->cmd; - - /* - * reuse cdb space for ata command - */ - memset(buf, 0, sizeof(rq->cmd)); - - rq->flags |= REQ_DRIVE_TASK | REQ_STARTED; - rq->buffer = buf; - rq->buffer[0] = WIN_FLUSH_CACHE; - - if (ide_id_has_flush_cache_ext(drive->id) && - (drive->capacity64 >= (1UL << 28))) - rq->buffer[0] = WIN_FLUSH_CACHE_EXT; -} - -/* - * preempt pending requests, and store this cache flush for immediate - * execution - */ -static struct request *ide_queue_flush_cmd(ide_drive_t *drive, - struct request *rq, int post) -{ - struct request *flush_rq = &HWGROUP(drive)->wrq; - - /* - * write cache disabled, clear the barrier bit and treat it like - * an ordinary write - */ - if (!drive->wcache) { - rq->flags |= REQ_BAR_PREFLUSH; - return rq; - } - - ide_init_drive_cmd(flush_rq); - ide_fill_flush_cmd(drive, flush_rq); - - flush_rq->special = rq; - flush_rq->nr_sectors = rq->nr_sectors; - - if (!post) { - drive->doing_barrier = 1; - flush_rq->flags |= REQ_BAR_PREFLUSH; - blkdev_dequeue_request(rq); - } else - flush_rq->flags |= REQ_BAR_POSTFLUSH; - - __elv_add_request(drive->queue, flush_rq, ELEVATOR_INSERT_FRONT, 0); - HWGROUP(drive)->rq = NULL; - return flush_rq; -} - -static int __ide_end_request(ide_drive_t *drive, struct request *rq, - int uptodate, int nr_sectors) +int __ide_end_request(ide_drive_t *drive, struct request *rq, int uptodate, + int nr_sectors) { int ret = 1; @@ -148,6 +94,7 @@ static int 
__ide_end_request(ide_drive_t *drive, struct request *rq, } return ret; } +EXPORT_SYMBOL(__ide_end_request); /** * ide_end_request - complete an IDE I/O @@ -172,17 +119,10 @@ int ide_end_request (ide_drive_t *drive, int uptodate, int nr_sectors) if (!nr_sectors) nr_sectors = rq->hard_cur_sectors; - if (!blk_barrier_rq(rq) || !drive->wcache) + if (blk_complete_barrier_rq_locked(drive->queue, rq, nr_sectors)) + ret = rq->nr_sectors != 0; + else ret = __ide_end_request(drive, rq, uptodate, nr_sectors); - else { - struct request *flush_rq = &HWGROUP(drive)->wrq; - - flush_rq->nr_sectors -= nr_sectors; - if (!flush_rq->nr_sectors) { - ide_queue_flush_cmd(drive, rq, 1); - ret = 0; - } - } spin_unlock_irqrestore(&ide_lock, flags); return ret; @@ -253,79 +193,6 @@ u64 ide_get_error_location(ide_drive_t *drive, char *args) } EXPORT_SYMBOL(ide_get_error_location); -static void ide_complete_barrier(ide_drive_t *drive, struct request *rq, - int error) -{ - struct request *real_rq = rq->special; - int good_sectors, bad_sectors; - sector_t sector; - - if (!error) { - if (blk_barrier_postflush(rq)) { - /* - * this completes the barrier write - */ - __ide_end_request(drive, real_rq, 1, real_rq->hard_nr_sectors); - drive->doing_barrier = 0; - } else { - /* - * just indicate that we did the pre flush - */ - real_rq->flags |= REQ_BAR_PREFLUSH; - elv_requeue_request(drive->queue, real_rq); - } - /* - * all is fine, return - */ - return; - } - - /* - * we need to end real_rq, but it's not on the queue currently. 
- * put it back on the queue, so we don't have to special case - * anything else for completing it - */ - if (!blk_barrier_postflush(rq)) - elv_requeue_request(drive->queue, real_rq); - - /* - * drive aborted flush command, assume FLUSH_CACHE_* doesn't - * work and disable barrier support - */ - if (error & ABRT_ERR) { - printk(KERN_ERR "%s: barrier support doesn't work\n", drive->name); - __ide_end_request(drive, real_rq, -EOPNOTSUPP, real_rq->hard_nr_sectors); - blk_queue_ordered(drive->queue, 0); - blk_queue_issue_flush_fn(drive->queue, NULL); - } else { - /* - * find out what part of the request failed - */ - good_sectors = 0; - if (blk_barrier_postflush(rq)) { - sector = ide_get_error_location(drive, rq->buffer); - - if ((sector >= real_rq->hard_sector) && - (sector < real_rq->hard_sector + real_rq->hard_nr_sectors)) - good_sectors = sector - real_rq->hard_sector; - } else - sector = real_rq->hard_sector; - - bad_sectors = real_rq->hard_nr_sectors - good_sectors; - if (good_sectors) - __ide_end_request(drive, real_rq, 1, good_sectors); - if (bad_sectors) - __ide_end_request(drive, real_rq, 0, bad_sectors); - - printk(KERN_ERR "%s: failed barrier write: " - "sector=%Lx(good=%d/bad=%d)\n", - drive->name, (unsigned long long)sector, - good_sectors, bad_sectors); - } - - drive->doing_barrier = 0; -} - /** * ide_end_drive_cmd - end an explicit drive command * @drive: command @@ -417,11 +284,8 @@ void ide_end_drive_cmd (ide_drive_t *drive, u8 stat, u8 err) spin_lock_irqsave(&ide_lock, flags); blkdev_dequeue_request(rq); - - if (blk_barrier_preflush(rq) || blk_barrier_postflush(rq)) - ide_complete_barrier(drive, rq, err); - HWGROUP(drive)->rq = NULL; + rq->errors = err; end_that_request_last(rq); spin_unlock_irqrestore(&ide_lock, flags); } @@ -963,7 +827,7 @@ repeat: * though that is 3 requests, it must be seen as a single transaction. 
* we must not preempt this drive until that is complete */ - if (drive->doing_barrier) { + if (blk_queue_flushing(drive->queue)) { /* * small race where queue could get replugged during * the 3-request flush cycle, just yank the plug since @@ -1127,13 +991,6 @@ static void ide_do_request (ide_hwgroup_t *hwgroup, int masked_irq) break; } - /* - * if rq is a barrier write, issue pre cache flush if not - * already done - */ - if (blk_barrier_rq(rq) && !blk_barrier_preflush(rq)) - rq = ide_queue_flush_cmd(drive, rq, 0); - /* * Sanity: don't accept a request that isn't a PM request * if we are currently power managed. This is very important as diff --git a/drivers/scsi/ahci.c b/drivers/scsi/ahci.c index 168f2bdc9db0..d11fc2c9709b 100644 --- a/drivers/scsi/ahci.c +++ b/drivers/scsi/ahci.c @@ -199,6 +199,7 @@ static Scsi_Host_Template ahci_sht = { .dma_boundary = AHCI_DMA_BOUNDARY, .slave_configure = ata_scsi_slave_config, .bios_param = ata_std_bios_param, + .ordered_flush = 1, }; static struct ata_port_operations ahci_ops = { diff --git a/drivers/scsi/ata_piix.c b/drivers/scsi/ata_piix.c index 9940968fd08b..ce19728aa8a4 100644 --- a/drivers/scsi/ata_piix.c +++ b/drivers/scsi/ata_piix.c @@ -121,6 +121,7 @@ static Scsi_Host_Template piix_sht = { .dma_boundary = ATA_DMA_BOUNDARY, .slave_configure = ata_scsi_slave_config, .bios_param = ata_std_bios_param, + .ordered_flush = 1, }; static struct ata_port_operations piix_pata_ops = { diff --git a/drivers/scsi/hosts.c b/drivers/scsi/hosts.c index 96eb23515e8b..9d2bba37dcec 100644 --- a/drivers/scsi/hosts.c +++ b/drivers/scsi/hosts.c @@ -247,6 +247,16 @@ struct Scsi_Host *scsi_host_alloc(struct scsi_host_template *sht, int privsize) shost->cmd_per_lun = sht->cmd_per_lun; shost->unchecked_isa_dma = sht->unchecked_isa_dma; shost->use_clustering = sht->use_clustering; + shost->ordered_flush = sht->ordered_flush; + shost->ordered_tag = sht->ordered_tag; + + /* + * hosts/devices that do queueing must support ordered tags + */ + if 
(shost->can_queue > 1 && shost->ordered_flush) { + printk(KERN_ERR "scsi: ordered flushes don't support queueing\n"); + shost->ordered_flush = 0; + } if (sht->max_host_blocked) shost->max_host_blocked = sht->max_host_blocked; diff --git a/drivers/scsi/sata_nv.c b/drivers/scsi/sata_nv.c index dc0c680cbfa8..69009f853a49 100644 --- a/drivers/scsi/sata_nv.c +++ b/drivers/scsi/sata_nv.c @@ -206,6 +206,7 @@ static Scsi_Host_Template nv_sht = { .dma_boundary = ATA_DMA_BOUNDARY, .slave_configure = ata_scsi_slave_config, .bios_param = ata_std_bios_param, + .ordered_flush = 1, }; static struct ata_port_operations nv_ops = { diff --git a/drivers/scsi/sata_promise.c b/drivers/scsi/sata_promise.c index 34faa5023688..19a13e3590f4 100644 --- a/drivers/scsi/sata_promise.c +++ b/drivers/scsi/sata_promise.c @@ -102,6 +102,7 @@ static Scsi_Host_Template pdc_ata_sht = { .dma_boundary = ATA_DMA_BOUNDARY, .slave_configure = ata_scsi_slave_config, .bios_param = ata_std_bios_param, + .ordered_flush = 1, }; static struct ata_port_operations pdc_ata_ops = { diff --git a/drivers/scsi/sata_sil.c b/drivers/scsi/sata_sil.c index 4cb82cba3084..672e413bf807 100644 --- a/drivers/scsi/sata_sil.c +++ b/drivers/scsi/sata_sil.c @@ -125,6 +125,7 @@ static Scsi_Host_Template sil_sht = { .dma_boundary = ATA_DMA_BOUNDARY, .slave_configure = ata_scsi_slave_config, .bios_param = ata_std_bios_param, + .ordered_flush = 1, }; static struct ata_port_operations sil_ops = { diff --git a/drivers/scsi/sata_sis.c b/drivers/scsi/sata_sis.c index c6d82789306d..5105ddd08447 100644 --- a/drivers/scsi/sata_sis.c +++ b/drivers/scsi/sata_sis.c @@ -90,6 +90,7 @@ static Scsi_Host_Template sis_sht = { .dma_boundary = ATA_DMA_BOUNDARY, .slave_configure = ata_scsi_slave_config, .bios_param = ata_std_bios_param, + .ordered_flush = 1, }; static struct ata_port_operations sis_ops = { diff --git a/drivers/scsi/sata_svw.c b/drivers/scsi/sata_svw.c index 60727c76bb02..8d1a5d25c053 100644 --- a/drivers/scsi/sata_svw.c +++ 
b/drivers/scsi/sata_svw.c @@ -288,6 +288,7 @@ static Scsi_Host_Template k2_sata_sht = { .proc_info = k2_sata_proc_info, #endif .bios_param = ata_std_bios_param, + .ordered_flush = 1, }; diff --git a/drivers/scsi/sata_sx4.c b/drivers/scsi/sata_sx4.c index 30d20e0fe737..70118650c461 100644 --- a/drivers/scsi/sata_sx4.c +++ b/drivers/scsi/sata_sx4.c @@ -188,6 +188,7 @@ static Scsi_Host_Template pdc_sata_sht = { .dma_boundary = ATA_DMA_BOUNDARY, .slave_configure = ata_scsi_slave_config, .bios_param = ata_std_bios_param, + .ordered_flush = 1, }; static struct ata_port_operations pdc_20621_ops = { diff --git a/drivers/scsi/sata_uli.c b/drivers/scsi/sata_uli.c index 5a5c13f6e282..0bff4f475f26 100644 --- a/drivers/scsi/sata_uli.c +++ b/drivers/scsi/sata_uli.c @@ -82,6 +82,7 @@ static Scsi_Host_Template uli_sht = { .dma_boundary = ATA_DMA_BOUNDARY, .slave_configure = ata_scsi_slave_config, .bios_param = ata_std_bios_param, + .ordered_flush = 1, }; static struct ata_port_operations uli_ops = { diff --git a/drivers/scsi/sata_via.c b/drivers/scsi/sata_via.c index 5442f5079c99..3a7830667277 100644 --- a/drivers/scsi/sata_via.c +++ b/drivers/scsi/sata_via.c @@ -102,6 +102,7 @@ static Scsi_Host_Template svia_sht = { .dma_boundary = ATA_DMA_BOUNDARY, .slave_configure = ata_scsi_slave_config, .bios_param = ata_std_bios_param, + .ordered_flush = 1, }; static struct ata_port_operations svia_sata_ops = { diff --git a/drivers/scsi/sata_vsc.c b/drivers/scsi/sata_vsc.c index e63f95ebad98..2c28f0ad73c2 100644 --- a/drivers/scsi/sata_vsc.c +++ b/drivers/scsi/sata_vsc.c @@ -205,6 +205,7 @@ static Scsi_Host_Template vsc_sata_sht = { .dma_boundary = ATA_DMA_BOUNDARY, .slave_configure = ata_scsi_slave_config, .bios_param = ata_std_bios_param, + .ordered_flush = 1, }; diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 20131251693e..7300f2f3c9af 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -697,6 +697,9 @@ void scsi_io_completion(struct scsi_cmnd *cmd, 
unsigned int good_bytes, int sense_valid = 0; int sense_deferred = 0; + if (blk_complete_barrier_rq(q, req, good_bytes << 9)) + return; + /* * Free up any indirection buffers we allocated for DMA purposes. * For the case of a READ, we need to copy the data out of the @@ -962,6 +965,38 @@ static int scsi_init_io(struct scsi_cmnd *cmd) return BLKPREP_KILL; } +static int scsi_prepare_flush_fn(request_queue_t *q, struct request *rq) +{ + struct scsi_device *sdev = q->queuedata; + struct scsi_driver *drv; + + if (sdev->sdev_state == SDEV_RUNNING) { + drv = *(struct scsi_driver **) rq->rq_disk->private_data; + + if (drv->prepare_flush) + return drv->prepare_flush(q, rq); + } + + return 0; +} + +static void scsi_end_flush_fn(request_queue_t *q, struct request *rq) +{ + struct scsi_device *sdev = q->queuedata; + struct request *flush_rq = rq->end_io_data; + struct scsi_driver *drv; + + if (flush_rq->errors) { + printk("scsi: barrier error, disabling flush support\n"); + blk_queue_ordered(q, QUEUE_ORDERED_NONE); + } + + if (sdev->sdev_state == SDEV_RUNNING) { + drv = *(struct scsi_driver **) rq->rq_disk->private_data; + drv->end_flush(q, rq); + } +} + static int scsi_issue_flush_fn(request_queue_t *q, struct gendisk *disk, sector_t *error_sector) { @@ -1366,6 +1401,17 @@ struct request_queue *scsi_alloc_queue(struct scsi_device *sdev) blk_queue_segment_boundary(q, shost->dma_boundary); blk_queue_issue_flush_fn(q, scsi_issue_flush_fn); + /* + * ordered tags are superior to flush ordering + */ + if (shost->ordered_tag) + blk_queue_ordered(q, QUEUE_ORDERED_TAG); + else if (shost->ordered_flush) { + blk_queue_ordered(q, QUEUE_ORDERED_FLUSH); + q->prepare_flush_fn = scsi_prepare_flush_fn; + q->end_flush_fn = scsi_end_flush_fn; + } + if (!shost->use_clustering) clear_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags); return q; diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 523d68ad047b..fe8a079b00fb 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -122,6 +122,8 @@ 
static void sd_shutdown(struct device *dev); static void sd_rescan(struct device *); static int sd_init_command(struct scsi_cmnd *); static int sd_issue_flush(struct device *, sector_t *); +static void sd_end_flush(request_queue_t *, struct request *); +static int sd_prepare_flush(request_queue_t *, struct request *); static void sd_read_capacity(struct scsi_disk *sdkp, char *diskname, struct scsi_request *SRpnt, unsigned char *buffer); @@ -136,6 +138,8 @@ static struct scsi_driver sd_template = { .rescan = sd_rescan, .init_command = sd_init_command, .issue_flush = sd_issue_flush, + .prepare_flush = sd_prepare_flush, + .end_flush = sd_end_flush, }; /* @@ -735,6 +739,33 @@ static int sd_issue_flush(struct device *dev, sector_t *error_sector) return sd_sync_cache(sdp); } +static void sd_end_flush(request_queue_t *q, struct request *flush_rq) +{ + struct request *rq = flush_rq->end_io_data; + struct scsi_cmnd *cmd = rq->special; + unsigned int bytes = rq->hard_nr_sectors << 9; + + if (!flush_rq->errors) + scsi_io_completion(cmd, bytes, 0); + else + scsi_io_completion(cmd, 0, bytes); +} + +static int sd_prepare_flush(request_queue_t *q, struct request *rq) +{ + struct scsi_device *sdev = q->queuedata; + struct scsi_disk *sdkp = dev_get_drvdata(&sdev->sdev_gendev); + + if (sdkp->WCE) { + memset(rq->cmd, 0, sizeof(rq->cmd)); + rq->flags = REQ_BLOCK_PC | REQ_SOFTBARRIER; + rq->cmd[0] = SYNCHRONIZE_CACHE; + return 1; + } + + return 0; +} + static void sd_rescan(struct device *dev) { struct scsi_disk *sdkp = dev_get_drvdata(dev); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index c7553066b917..83eef4fde873 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -275,6 +275,8 @@ struct bio_vec; typedef int (merge_bvec_fn) (request_queue_t *, struct bio *, struct bio_vec *); typedef void (activity_fn) (void *data, int rw); typedef int (issue_flush_fn) (request_queue_t *, struct gendisk *, sector_t *); +typedef int (prepare_flush_fn) 
(request_queue_t *, struct request *); +typedef void (end_flush_fn) (request_queue_t *, struct request *); enum blk_queue_state { Queue_down, @@ -318,6 +320,8 @@ struct request_queue merge_bvec_fn *merge_bvec_fn; activity_fn *activity_fn; issue_flush_fn *issue_flush_fn; + prepare_flush_fn *prepare_flush_fn; + end_flush_fn *end_flush_fn; /* * Auto-unplugging state @@ -389,6 +393,18 @@ struct request_queue unsigned int sg_reserved_size; struct list_head drain_list; + + /* + * reserved for flush operations + */ + struct request *flush_rq; + unsigned char ordered; +}; + +enum { + QUEUE_ORDERED_NONE, + QUEUE_ORDERED_TAG, + QUEUE_ORDERED_FLUSH, }; #define RQ_INACTIVE (-1) @@ -405,12 +421,13 @@ struct request_queue #define QUEUE_FLAG_DEAD 5 /* queue being torn down */ #define QUEUE_FLAG_REENTER 6 /* Re-entrancy avoidance */ #define QUEUE_FLAG_PLUGGED 7 /* queue is plugged */ -#define QUEUE_FLAG_ORDERED 8 /* supports ordered writes */ -#define QUEUE_FLAG_DRAIN 9 /* draining queue for sched switch */ +#define QUEUE_FLAG_DRAIN 8 /* draining queue for sched switch */ +#define QUEUE_FLAG_FLUSH 9 /* doing barrier flush sequence */ #define blk_queue_plugged(q) test_bit(QUEUE_FLAG_PLUGGED, &(q)->queue_flags) #define blk_queue_tagged(q) test_bit(QUEUE_FLAG_QUEUED, &(q)->queue_flags) #define blk_queue_stopped(q) test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags) +#define blk_queue_flushing(q) test_bit(QUEUE_FLAG_FLUSH, &(q)->queue_flags) #define blk_fs_request(rq) ((rq)->flags & REQ_CMD) #define blk_pc_request(rq) ((rq)->flags & REQ_BLOCK_PC) @@ -611,6 +628,9 @@ extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bd extern void blk_queue_ordered(request_queue_t *, int); extern void blk_queue_issue_flush_fn(request_queue_t *, issue_flush_fn *); extern int blkdev_scsi_issue_flush_fn(request_queue_t *, struct gendisk *, sector_t *); +extern struct request *blk_start_pre_flush(request_queue_t *,struct request *); +extern int 
blk_complete_barrier_rq(request_queue_t *, struct request *, int); +extern int blk_complete_barrier_rq_locked(request_queue_t *, struct request *, int); extern int blk_rq_map_sg(request_queue_t *, struct request *, struct scatterlist *); extern void blk_dump_rq_flags(struct request *, char *); diff --git a/include/linux/ide.h b/include/linux/ide.h index 28f35bc8ba7e..9c25adc6c28d 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -740,7 +740,6 @@ typedef struct ide_drive_s { u8 sect; /* "real" sectors per track */ u8 bios_head; /* BIOS/fdisk/LILO number of heads */ u8 bios_sect; /* BIOS/fdisk/LILO sectors per track */ - u8 doing_barrier; /* state, 1=currently doing flush */ unsigned int bios_cyl; /* BIOS/fdisk/LILO number of cyls */ unsigned int cyl; /* "real" number of cyls */ @@ -1130,6 +1129,7 @@ extern ide_hwif_t ide_hwifs[]; /* master data repository */ extern int noautodma; extern int ide_end_request (ide_drive_t *drive, int uptodate, int nrsecs); +extern int __ide_end_request (ide_drive_t *drive, struct request *rq, int uptodate, int nrsecs); /* * This is used on exit from the driver to designate the next irq handler diff --git a/include/scsi/scsi_driver.h b/include/scsi/scsi_driver.h index 98c2e33f6159..850dfa877fda 100644 --- a/include/scsi/scsi_driver.h +++ b/include/scsi/scsi_driver.h @@ -14,6 +14,8 @@ struct scsi_driver { int (*init_command)(struct scsi_cmnd *); void (*rescan)(struct device *); int (*issue_flush)(struct device *, sector_t *); + int (*prepare_flush)(struct request_queue *, struct request *); + void (*end_flush)(struct request_queue *, struct request *); }; #define to_scsi_driver(drv) \ container_of((drv), struct scsi_driver, gendrv) diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h index 191b8fced8ac..1d3e91542fa9 100644 --- a/include/scsi/scsi_host.h +++ b/include/scsi/scsi_host.h @@ -362,6 +362,12 @@ struct scsi_host_template { */ unsigned skip_settle_delay:1; + /* + * ordered write support + */ + unsigned 
ordered_flush:1; + unsigned ordered_tag:1; + /* * Countdown for host blocking with no commands outstanding */ @@ -501,6 +507,12 @@ struct Scsi_Host { */ unsigned reverse_ordering:1; + /* + * ordered write support + */ + unsigned ordered_flush:1; + unsigned ordered_tag:1; + /* * Host has rejected a command because it was busy. */ -- cgit v1.2.3 From 26e25c95e30faaa235f7d783adae797a21fa7392 Mon Sep 17 00:00:00 2001 From: Yoichi Yuasa Date: Mon, 7 Mar 2005 17:50:18 -0800 Subject: [PATCH] serial: add NEC VR4100 series serial support This patch adds serial driver for NEC VR4100 series serial interface unit. The new device numbers have been recorded by LANANA. Signed-off-by: Yoichi Yuasa Cc: Russell King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/devices.txt | 4 + arch/mips/vr41xx/common/cmu.c | 2 +- drivers/serial/Kconfig | 17 + drivers/serial/Makefile | 1 + drivers/serial/vr41xx_siu.c | 1009 ++++++++++++++++++++++++++++++++++++++ include/asm-mips/vr41xx/vr41xx.h | 2 - include/linux/serial_core.h | 4 + 7 files changed, 1036 insertions(+), 3 deletions(-) create mode 100644 drivers/serial/vr41xx_siu.c (limited to 'include/linux') diff --git a/Documentation/devices.txt b/Documentation/devices.txt index 9d0cdb431f96..bb67cf25010e 100644 --- a/Documentation/devices.txt +++ b/Documentation/devices.txt @@ -2754,6 +2754,8 @@ Your cooperation is appreciated. 50 = /dev/ttyIOC40 Altix serial card ... 81 = /dev/ttyIOC431 Altix serial card + 82 = /dev/ttyVR0 NEC VR4100 series SIU + 83 = /dev/ttyVR1 NEC VR4100 series DSIU 205 char Low-density serial ports (alternate device) 0 = /dev/culu0 Callout device for ttyLU0 @@ -2788,6 +2790,8 @@ Your cooperation is appreciated. 50 = /dev/cuioc40 Callout device for ttyIOC40 ... 
81 = /dev/cuioc431 Callout device for ttyIOC431 + 82 = /dev/cuvr0 Callout device for ttyVR0 + 83 = /dev/cuvr1 Callout device for ttyVR1 206 char OnStream SC-x0 tape devices diff --git a/arch/mips/vr41xx/common/cmu.c b/arch/mips/vr41xx/common/cmu.c index ef5197067eda..fcd3cb8cdd9d 100644 --- a/arch/mips/vr41xx/common/cmu.c +++ b/arch/mips/vr41xx/common/cmu.c @@ -170,7 +170,7 @@ void vr41xx_mask_clock(vr41xx_clock_t clock) current_cpu_data.cputype == CPU_VR4121) { cmuclkmsk &= ~MSKDSIU; } else { - if (cmuclkmsk & MSKSIU) + if (cmuclkmsk & MSKSSIU) cmuclkmsk &= ~MSKDSIU; else cmuclkmsk &= ~(MSKSIU | MSKDSIU); diff --git a/drivers/serial/Kconfig b/drivers/serial/Kconfig index ab8035437a7a..d3a69f6a2906 100644 --- a/drivers/serial/Kconfig +++ b/drivers/serial/Kconfig @@ -810,4 +810,21 @@ config SERIAL_TXX9_STDSERIAL bool "TX39XX/49XX SIO act as standard serial" depends on !SERIAL_8250 && SERIAL_TXX9 +config SERIAL_VR41XX + tristate "NEC VR4100 series Serial Interface Unit support" + depends on CPU_VR41XX + select SERIAL_CORE + help + If you have a NEC VR4100 series processor and you want to use + Serial Interface Unit(SIU) or Debug Serial Interface Unit(DSIU) + (not include VR4111/VR4121 DSIU), say Y. Otherwise, say N. + +config SERIAL_VR41XX_CONSOLE + bool "Enable NEC VR4100 series Serial Interface Unit console" + depends on SERIAL_VR41XX + select SERIAL_CORE_CONSOLE + help + If you have a NEC VR4100 series processor and you want to use + a console on a serial port, say Y. Otherwise, say N. 
+ endmenu diff --git a/drivers/serial/Makefile b/drivers/serial/Makefile index ec04507a5969..ed4425ce50a3 100644 --- a/drivers/serial/Makefile +++ b/drivers/serial/Makefile @@ -49,4 +49,5 @@ obj-$(CONFIG_SERIAL_M32R_SIO) += m32r_sio.o obj-$(CONFIG_SERIAL_MPSC) += mpsc.o obj-$(CONFIG_ETRAX_SERIAL) += crisv10.o obj-$(CONFIG_SERIAL_TXX9) += serial_txx9.o +obj-$(CONFIG_SERIAL_VR41XX) += vr41xx_siu.o obj-$(CONFIG_BLK_DEV_SGIIOC4) += ioc4_serial.o diff --git a/drivers/serial/vr41xx_siu.c b/drivers/serial/vr41xx_siu.c new file mode 100644 index 000000000000..331db40dc98e --- /dev/null +++ b/drivers/serial/vr41xx_siu.c @@ -0,0 +1,1009 @@ +/* + * Driver for NEC VR4100 series Serial Interface Unit. + * + * Copyright (C) 2004-2005 Yoichi Yuasa + * + * Based on drivers/serial/8250.c, by Russell King. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include + +#if defined(CONFIG_SERIAL_VR41XX_CONSOLE) && defined(CONFIG_MAGIC_SYSRQ) +#define SUPPORT_SYSRQ +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#define SIU_PORTS_MAX 2 +#define SIU_BAUD_BASE 1152000 +#define SIU_MAJOR 204 +#define SIU_MINOR_BASE 82 + +#define RX_MAX_COUNT 256 +#define TX_MAX_COUNT 15 + +struct siu_port { + unsigned int type; + unsigned int irq; + unsigned long start; + uint8_t flags; +}; + +#define SIU_HAS_IRDA_SUPPORT 0x01 +#define SIU_OUTPUT_IRDA 0x10 + +static const struct siu_port siu_type1_ports[] = { + { .type = PORT_VR41XX_SIU, + .irq = SIU_IRQ, + .start = 0x0c000000UL, + .flags = SIU_HAS_IRDA_SUPPORT, }, +}; + +#define SIU_TYPE1_NR_PORTS (sizeof(siu_type1_ports) / sizeof(struct siu_port)) + +static const struct siu_port siu_type2_ports[] = { + { .type = PORT_VR41XX_SIU, + .irq = SIU_IRQ, + .start = 0x0f000800UL, + .flags = SIU_HAS_IRDA_SUPPORT, }, + { .type = PORT_VR41XX_DSIU, + .irq = DSIU_IRQ, + .start = 0x0f000820UL, }, +}; + +#define SIU_TYPE2_NR_PORTS (sizeof(siu_type2_ports) / sizeof(struct siu_port)) + +static struct uart_port siu_uart_ports[SIU_PORTS_MAX]; +static uint8_t lsr_break_flag[SIU_PORTS_MAX]; + +#define siu_read(port, offset) readb((port)->membase + (offset)) +#define siu_write(port, offset, value) writeb((value), (port)->membase + (offset)) + +static inline void siu_clear_fifo(struct uart_port *port) +{ + siu_write(port, UART_FCR, UART_FCR_ENABLE_FIFO); + siu_write(port, UART_FCR, UART_FCR_ENABLE_FIFO | UART_FCR_CLEAR_RCVR | + UART_FCR_CLEAR_XMIT); + siu_write(port, UART_FCR, 0); +} + +static inline int siu_probe_ports(void) +{ + switch (current_cpu_data.cputype) { + case 
CPU_VR4111: + case CPU_VR4121: + return SIU_TYPE1_NR_PORTS; + case CPU_VR4122: + case CPU_VR4131: + case CPU_VR4133: + return SIU_TYPE2_NR_PORTS; + } + + return 0; +} + +static inline unsigned long siu_port_size(struct uart_port *port) +{ + switch (port->type) { + case PORT_VR41XX_SIU: + return 11UL; + case PORT_VR41XX_DSIU: + return 8UL; + } + + return 0; +} + +static inline unsigned int siu_check_type(struct uart_port *port) +{ + switch (current_cpu_data.cputype) { + case CPU_VR4111: + case CPU_VR4121: + if (port->line == 0) + return PORT_VR41XX_SIU; + break; + case CPU_VR4122: + case CPU_VR4131: + case CPU_VR4133: + if (port->line == 0) + return PORT_VR41XX_SIU; + else if (port->line == 1) + return PORT_VR41XX_DSIU; + break; + } + + return PORT_UNKNOWN; +} + +static inline const char *siu_type_name(struct uart_port *port) +{ + switch (port->type) { + case PORT_VR41XX_SIU: + return "SIU"; + case PORT_VR41XX_DSIU: + return "DSIU"; + } + + return "unknown"; +} + +static unsigned int siu_tx_empty(struct uart_port *port) +{ + uint8_t lsr; + + lsr = siu_read(port, UART_LSR); + if (lsr & UART_LSR_TEMT) + return TIOCSER_TEMT; + + return 0; +} + +static void siu_set_mctrl(struct uart_port *port, unsigned int mctrl) +{ + uint8_t mcr = 0; + + if (mctrl & TIOCM_DTR) + mcr |= UART_MCR_DTR; + if (mctrl & TIOCM_RTS) + mcr |= UART_MCR_RTS; + if (mctrl & TIOCM_OUT1) + mcr |= UART_MCR_OUT1; + if (mctrl & TIOCM_OUT2) + mcr |= UART_MCR_OUT2; + if (mctrl & TIOCM_LOOP) + mcr |= UART_MCR_LOOP; + + siu_write(port, UART_MCR, mcr); +} + +static unsigned int siu_get_mctrl(struct uart_port *port) +{ + uint8_t msr; + unsigned int mctrl = 0; + + msr = siu_read(port, UART_MSR); + if (msr & UART_MSR_DCD) + mctrl |= TIOCM_CAR; + if (msr & UART_MSR_RI) + mctrl |= TIOCM_RNG; + if (msr & UART_MSR_DSR) + mctrl |= TIOCM_DSR; + if (msr & UART_MSR_CTS) + mctrl |= TIOCM_CTS; + + return mctrl; +} + +static void siu_stop_tx(struct uart_port *port, unsigned int tty_stop) +{ + unsigned long flags; + 
uint8_t ier; + + spin_lock_irqsave(&port->lock, flags); + + ier = siu_read(port, UART_IER); + ier &= ~UART_IER_THRI; + siu_write(port, UART_IER, ier); + + spin_unlock_irqrestore(&port->lock, flags); +} + +static void siu_start_tx(struct uart_port *port, unsigned int tty_start) +{ + unsigned long flags; + uint8_t ier; + + spin_lock_irqsave(&port->lock, flags); + + ier = siu_read(port, UART_IER); + ier |= UART_IER_THRI; + siu_write(port, UART_IER, ier); + + spin_unlock_irqrestore(&port->lock, flags); +} + +static void siu_stop_rx(struct uart_port *port) +{ + unsigned long flags; + uint8_t ier; + + spin_lock_irqsave(&port->lock, flags); + + ier = siu_read(port, UART_IER); + ier &= ~UART_IER_RLSI; + siu_write(port, UART_IER, ier); + + port->read_status_mask &= ~UART_LSR_DR; + + spin_unlock_irqrestore(&port->lock, flags); +} + +static void siu_enable_ms(struct uart_port *port) +{ + unsigned long flags; + uint8_t ier; + + spin_lock_irqsave(&port->lock, flags); + + ier = siu_read(port, UART_IER); + ier |= UART_IER_MSI; + siu_write(port, UART_IER, ier); + + spin_unlock_irqrestore(&port->lock, flags); +} + +static void siu_break_ctl(struct uart_port *port, int ctl) +{ + unsigned long flags; + uint8_t lcr; + + spin_lock_irqsave(&port->lock, flags); + + lcr = siu_read(port, UART_LCR); + if (ctl == -1) + lcr |= UART_LCR_SBC; + else + lcr &= ~UART_LCR_SBC; + siu_write(port, UART_LCR, lcr); + + spin_unlock_irqrestore(&port->lock, flags); +} + +static inline void receive_chars(struct uart_port *port, uint8_t *status, + struct pt_regs *regs) +{ + struct tty_struct *tty; + uint8_t lsr, ch; + char flag; + int max_count = RX_MAX_COUNT; + + tty = port->info->tty; + lsr = *status; + + do { + if (unlikely(tty->flip.count >= TTY_FLIPBUF_SIZE)) { + if (tty->low_latency) + tty_flip_buffer_push(tty); + } + + ch = siu_read(port, UART_RX); + port->icount.rx++; + flag = TTY_NORMAL; + +#ifdef CONFIG_SERIAL_VR41XX_CONSOLE + lsr |= lsr_break_flag[port->line]; + lsr_break_flag[port->line] = 0; 
+#endif + if (unlikely(lsr & (UART_LSR_BI | UART_LSR_FE | + UART_LSR_PE | UART_LSR_OE))) { + if (lsr & UART_LSR_BI) { + lsr &= ~(UART_LSR_FE | UART_LSR_PE); + port->icount.brk++; + + if (uart_handle_break(port)) + goto ignore_char; + } + + if (lsr & UART_LSR_FE) + port->icount.frame++; + if (lsr & UART_LSR_PE) + port->icount.parity++; + if (lsr & UART_LSR_OE) + port->icount.overrun++; + + lsr &= port->read_status_mask; + if (lsr & UART_LSR_BI) + flag = TTY_BREAK; + if (lsr & UART_LSR_FE) + flag = TTY_FRAME; + if (lsr & UART_LSR_PE) + flag = TTY_PARITY; + } + + if (uart_handle_sysrq_char(port, ch, regs)) + goto ignore_char; + if ((lsr & port->ignore_status_mask) == 0) + tty_insert_flip_char(tty, ch, flag); + if ((lsr & UART_LSR_OE) && (tty->flip.count < TTY_FLIPBUF_SIZE)) + tty_insert_flip_char(tty, 0, TTY_OVERRUN); + + ignore_char: + lsr = siu_read(port, UART_LSR); + } while ((lsr & UART_LSR_DR) && (max_count-- > 0)); + + tty_flip_buffer_push(tty); + + *status = lsr; +} + +static inline void check_modem_status(struct uart_port *port) +{ + uint8_t msr; + + msr = siu_read(port, UART_MSR); + if ((msr & UART_MSR_ANY_DELTA) == 0) + return; + if (msr & UART_MSR_DDCD) + uart_handle_dcd_change(port, msr & UART_MSR_DCD); + if (msr & UART_MSR_TERI) + port->icount.rng++; + if (msr & UART_MSR_DDSR) + port->icount.dsr++; + if (msr & UART_MSR_DCTS) + uart_handle_cts_change(port, msr & UART_MSR_CTS); + + wake_up_interruptible(&port->info->delta_msr_wait); +} + +static inline void transmit_chars(struct uart_port *port) +{ + struct circ_buf *xmit; + int max_count = TX_MAX_COUNT; + + xmit = &port->info->xmit; + + if (port->x_char) { + siu_write(port, UART_TX, port->x_char); + port->icount.tx++; + port->x_char = 0; + return; + } + + if (uart_circ_empty(xmit) || uart_tx_stopped(port)) { + siu_stop_tx(port, 0); + return; + } + + do { + siu_write(port, UART_TX, xmit->buf[xmit->tail]); + xmit->tail = (xmit->tail + 1) & (UART_XMIT_SIZE - 1); + port->icount.tx++; + if 
(uart_circ_empty(xmit)) + break; + } while (max_count-- > 0); + + if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS) + uart_write_wakeup(port); + + if (uart_circ_empty(xmit)) + siu_stop_tx(port, 0); +} + +static irqreturn_t siu_interrupt(int irq, void *dev_id, struct pt_regs *regs) +{ + struct uart_port *port; + uint8_t iir, lsr; + + if (dev_id == NULL) + return IRQ_NONE; + + port = (struct uart_port *)dev_id; + + iir = siu_read(port, UART_IIR); + if (iir & UART_IIR_NO_INT) + return IRQ_NONE; + + lsr = siu_read(port, UART_LSR); + if (lsr & UART_LSR_DR) + receive_chars(port, &lsr, regs); + + check_modem_status(port); + + if (lsr & UART_LSR_THRE) + transmit_chars(port); + + return IRQ_HANDLED; +} + +static int siu_startup(struct uart_port *port) +{ + int retval; + + siu_clear_fifo(port); + + (void)siu_read(port, UART_LSR); + (void)siu_read(port, UART_RX); + (void)siu_read(port, UART_IIR); + (void)siu_read(port, UART_MSR); + + if (siu_read(port, UART_LSR) == 0xff) + return -ENODEV; + + retval = request_irq(port->irq, siu_interrupt, 0, siu_type_name(port), port); + if (retval) + return retval; + + if (port->type == PORT_VR41XX_DSIU) + vr41xx_enable_dsiuint(DSIUINT_ALL); + + siu_write(port, UART_LCR, UART_LCR_WLEN8); + + spin_lock_irq(&port->lock); + siu_set_mctrl(port, port->mctrl); + spin_unlock_irq(&port->lock); + + siu_write(port, UART_IER, UART_IER_RLSI | UART_IER_RDI); + + (void)siu_read(port, UART_LSR); + (void)siu_read(port, UART_RX); + (void)siu_read(port, UART_IIR); + (void)siu_read(port, UART_MSR); + + return 0; +} + +static void siu_shutdown(struct uart_port *port) +{ + unsigned long flags; + uint8_t lcr; + + if (port->membase == NULL) + return; + + siu_write(port, UART_IER, 0); + + spin_lock_irqsave(&port->lock, flags); + + port->mctrl &= ~TIOCM_OUT2; + siu_set_mctrl(port, port->mctrl); + + spin_unlock_irqrestore(&port->lock, flags); + + lcr = siu_read(port, UART_LCR); + lcr &= ~UART_LCR_SBC; + siu_write(port, UART_LCR, lcr); + + siu_clear_fifo(port); + + 
(void)siu_read(port, UART_RX); + + if (port->type == PORT_VR41XX_DSIU) + vr41xx_disable_dsiuint(DSIUINT_ALL); + + free_irq(port->irq, port); +} + +static void siu_set_termios(struct uart_port *port, struct termios *new, + struct termios *old) +{ + tcflag_t c_cflag, c_iflag; + uint8_t lcr, fcr, ier; + unsigned int baud, quot; + unsigned long flags; + + c_cflag = new->c_cflag; + switch (c_cflag & CSIZE) { + case CS5: + lcr = UART_LCR_WLEN5; + break; + case CS6: + lcr = UART_LCR_WLEN6; + break; + case CS7: + lcr = UART_LCR_WLEN7; + break; + default: + lcr = UART_LCR_WLEN8; + break; + } + + if (c_cflag & CSTOPB) + lcr |= UART_LCR_STOP; + if (c_cflag & PARENB) + lcr |= UART_LCR_PARITY; + if ((c_cflag & PARODD) != PARODD) + lcr |= UART_LCR_EPAR; + if (c_cflag & CMSPAR) + lcr |= UART_LCR_SPAR; + + baud = uart_get_baud_rate(port, new, old, 0, port->uartclk/16); + quot = uart_get_divisor(port, baud); + + fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10; + + spin_lock_irqsave(&port->lock, flags); + + uart_update_timeout(port, c_cflag, baud); + + c_iflag = new->c_iflag; + + port->read_status_mask = UART_LSR_THRE | UART_LSR_OE | UART_LSR_DR; + if (c_iflag & INPCK) + port->read_status_mask |= UART_LSR_FE | UART_LSR_PE; + if (c_iflag & (BRKINT | PARMRK)) + port->read_status_mask |= UART_LSR_BI; + + port->ignore_status_mask = 0; + if (c_iflag & IGNPAR) + port->ignore_status_mask |= UART_LSR_FE | UART_LSR_PE; + if (c_iflag & IGNBRK) { + port->ignore_status_mask |= UART_LSR_BI; + if (c_iflag & IGNPAR) + port->ignore_status_mask |= UART_LSR_OE; + } + + if ((c_cflag & CREAD) == 0) + port->ignore_status_mask |= UART_LSR_DR; + + ier = siu_read(port, UART_IER); + ier &= ~UART_IER_MSI; + if (UART_ENABLE_MS(port, c_cflag)) + ier |= UART_IER_MSI; + siu_write(port, UART_IER, ier); + + siu_write(port, UART_LCR, lcr | UART_LCR_DLAB); + + siu_write(port, UART_DLL, (uint8_t)quot); + siu_write(port, UART_DLM, (uint8_t)(quot >> 8)); + + siu_write(port, UART_LCR, lcr); + + siu_write(port, UART_FCR, 
fcr); + + siu_set_mctrl(port, port->mctrl); + + spin_unlock_irqrestore(&port->lock, flags); +} + +static void siu_pm(struct uart_port *port, unsigned int state, unsigned int oldstate) +{ + switch (state) { + case 0: + switch (port->type) { + case PORT_VR41XX_SIU: + vr41xx_supply_clock(SIU_CLOCK); + break; + case PORT_VR41XX_DSIU: + vr41xx_supply_clock(DSIU_CLOCK); + break; + } + break; + case 3: + switch (port->type) { + case PORT_VR41XX_SIU: + vr41xx_mask_clock(SIU_CLOCK); + break; + case PORT_VR41XX_DSIU: + vr41xx_mask_clock(DSIU_CLOCK); + break; + } + break; + } +} + +static const char *siu_type(struct uart_port *port) +{ + return siu_type_name(port); +} + +static void siu_release_port(struct uart_port *port) +{ + unsigned long size; + + if (port->flags & UPF_IOREMAP) { + iounmap(port->membase); + port->membase = NULL; + } + + size = siu_port_size(port); + release_mem_region(port->mapbase, size); +} + +static int siu_request_port(struct uart_port *port) +{ + unsigned long size; + + size = siu_port_size(port); + if (request_mem_region(port->mapbase, size, siu_type_name(port)) == NULL) + return -EBUSY; + + if (port->flags & UPF_IOREMAP) { + port->membase = ioremap(port->mapbase, size); + if (port->membase == NULL) { + release_mem_region(port->mapbase, size); + return -ENOMEM; + } + } + + return 0; +} + +static void siu_config_port(struct uart_port *port, int flags) +{ + if (flags & UART_CONFIG_TYPE) { + port->type = siu_check_type(port); + (void)siu_request_port(port); + } +} + +static int siu_verify_port(struct uart_port *port, struct serial_struct *serial) +{ + if (port->type != PORT_VR41XX_SIU && port->type != PORT_VR41XX_DSIU) + return -EINVAL; + + return 0; +} + +static struct uart_ops siu_uart_ops = { + .tx_empty = siu_tx_empty, + .set_mctrl = siu_set_mctrl, + .get_mctrl = siu_get_mctrl, + .stop_tx = siu_stop_tx, + .start_tx = siu_start_tx, + .stop_rx = siu_stop_rx, + .enable_ms = siu_enable_ms, + .break_ctl = siu_break_ctl, + .startup = siu_startup, + 
.shutdown = siu_shutdown, + .set_termios = siu_set_termios, + .pm = siu_pm, + .type = siu_type, + .release_port = siu_release_port, + .request_port = siu_request_port, + .config_port = siu_config_port, + .verify_port = siu_verify_port, +}; + +static int siu_init_ports(void) +{ + const struct siu_port *siu; + struct uart_port *port; + int i, num; + + switch (current_cpu_data.cputype) { + case CPU_VR4111: + case CPU_VR4121: + siu = siu_type1_ports; + break; + case CPU_VR4122: + case CPU_VR4131: + case CPU_VR4133: + siu = siu_type2_ports; + break; + default: + return 0; + } + + port = siu_uart_ports; + num = siu_probe_ports(); + for (i = 0; i < num; i++) { + spin_lock_init(&port->lock); + port->irq = siu->irq; + port->uartclk = SIU_BAUD_BASE * 16; + port->fifosize = 16; + port->regshift = 0; + port->iotype = UPIO_MEM; + port->flags = UPF_IOREMAP | UPF_BOOT_AUTOCONF; + port->type = siu->type; + port->line = i; + port->mapbase = siu->start; + siu++; + port++; + } + + return num; +} + +#ifdef CONFIG_SERIAL_VR41XX_CONSOLE + +static void early_set_termios(struct uart_port *port, struct termios *new, + struct termios *old) +{ + tcflag_t c_cflag; + uint8_t lcr; + unsigned int baud, quot; + + c_cflag = new->c_cflag; + switch (c_cflag & CSIZE) { + case CS5: + lcr = UART_LCR_WLEN5; + break; + case CS6: + lcr = UART_LCR_WLEN6; + break; + case CS7: + lcr = UART_LCR_WLEN7; + break; + default: + lcr = UART_LCR_WLEN8; + break; + } + + if (c_cflag & CSTOPB) + lcr |= UART_LCR_STOP; + if (c_cflag & PARENB) + lcr |= UART_LCR_PARITY; + if ((c_cflag & PARODD) != PARODD) + lcr |= UART_LCR_EPAR; + if (c_cflag & CMSPAR) + lcr |= UART_LCR_SPAR; + + baud = uart_get_baud_rate(port, new, old, 0, port->uartclk/16); + quot = uart_get_divisor(port, baud); + + siu_write(port, UART_LCR, lcr | UART_LCR_DLAB); + + siu_write(port, UART_DLL, (uint8_t)quot); + siu_write(port, UART_DLM, (uint8_t)(quot >> 8)); + + siu_write(port, UART_LCR, lcr); +} + +static struct uart_ops early_uart_ops = { + .set_termios 
= early_set_termios, +}; + +#define BOTH_EMPTY (UART_LSR_TEMT | UART_LSR_THRE) + +static void wait_for_xmitr(struct uart_port *port) +{ + int timeout = 10000; + uint8_t lsr, msr; + + do { + lsr = siu_read(port, UART_LSR); + if (lsr & UART_LSR_BI) + lsr_break_flag[port->line] = UART_LSR_BI; + + if ((lsr & BOTH_EMPTY) == BOTH_EMPTY) + break; + } while (timeout-- > 0); + + if (port->flags & UPF_CONS_FLOW) { + timeout = 1000000; + + do { + msr = siu_read(port, UART_MSR); + if ((msr & UART_MSR_CTS) != 0) + break; + } while (timeout-- > 0); + } +} + +static void siu_console_write(struct console *con, const char *s, unsigned count) +{ + struct uart_port *port; + uint8_t ier; + unsigned i; + + port = &siu_uart_ports[con->index]; + + ier = siu_read(port, UART_IER); + siu_write(port, UART_IER, 0); + + for (i = 0; i < count && *s != '\0'; i++, s++) { + wait_for_xmitr(port); + siu_write(port, UART_TX, *s); + if (*s == '\n') { + wait_for_xmitr(port); + siu_write(port, UART_TX, '\r'); + } + } + + wait_for_xmitr(port); + siu_write(port, UART_IER, ier); +} + +static int siu_console_setup(struct console *con, char *options) +{ + struct uart_port *port; + int baud = 9600; + int parity = 'n'; + int bits = 8; + int flow = 'n'; + + if (con->index >= SIU_PORTS_MAX) + con->index = 0; + + port = &siu_uart_ports[con->index]; + if (port->membase == NULL) { + if (port->mapbase == 0) + return -ENODEV; + port->membase = (unsigned char __iomem *)KSEG1ADDR(port->mapbase); + } + + if (options != NULL) + uart_parse_options(options, &baud, &parity, &bits, &flow); + + return uart_set_options(port, con, baud, parity, bits, flow); +} + +static struct uart_driver siu_uart_driver; + +static struct console siu_console = { + .name = "ttyVR", + .write = siu_console_write, + .device = uart_console_device, + .setup = siu_console_setup, + .flags = CON_PRINTBUFFER, + .index = -1, + .data = &siu_uart_driver, +}; + +static int __devinit siu_console_init(void) +{ + struct uart_port *port; + int num, i; + + num = 
siu_init_ports(); + if (num <= 0) + return -ENODEV; + + for (i = 0; i < num; i++) { + port = &siu_uart_ports[i]; + port->ops = &early_uart_ops; + } + + register_console(&siu_console); + + return 0; +} + +console_initcall(siu_console_init); + +#define SERIAL_VR41XX_CONSOLE &siu_console +#else +#define SERIAL_VR41XX_CONSOLE NULL +#endif + +static struct uart_driver siu_uart_driver = { + .owner = THIS_MODULE, + .driver_name = "SIU", + .dev_name = "ttyVR", + .devfs_name = "ttvr/", + .major = SIU_MAJOR, + .minor = SIU_MINOR_BASE, + .cons = SERIAL_VR41XX_CONSOLE, +}; + +static int siu_probe(struct device *dev) +{ + struct uart_port *port; + int num, i, retval; + + num = siu_init_ports(); + if (num <= 0) + return -ENODEV; + + siu_uart_driver.nr = num; + retval = uart_register_driver(&siu_uart_driver); + if (retval) + return retval; + + for (i = 0; i < num; i++) { + port = &siu_uart_ports[i]; + port->ops = &siu_uart_ops; + port->dev = dev; + + retval = uart_add_one_port(&siu_uart_driver, port); + if (retval) + break; + } + + if (i == 0 && retval < 0) { + uart_unregister_driver(&siu_uart_driver); + return retval; + } + + return 0; +} + +static int siu_remove(struct device *dev) +{ + struct uart_port *port; + int i; + + for (i = 0; i < siu_uart_driver.nr; i++) { + port = &siu_uart_ports[i]; + if (port->dev == dev) { + uart_remove_one_port(&siu_uart_driver, port); + port->dev = NULL; + } + } + + uart_unregister_driver(&siu_uart_driver); + + return 0; +} + +static int siu_suspend(struct device *dev, u32 state, u32 level) +{ + struct uart_port *port; + int i; + + if (level != SUSPEND_DISABLE) + return 0; + + for (i = 0; i < siu_uart_driver.nr; i++) { + port = &siu_uart_ports[i]; + if ((port->type == PORT_VR41XX_SIU || + port->type == PORT_VR41XX_DSIU) && port->dev == dev) + uart_suspend_port(&siu_uart_driver, port); + + } + + return 0; +} + +static int siu_resume(struct device *dev, u32 level) +{ + struct uart_port *port; + int i; + + if (level != RESUME_ENABLE) + return 0; + + 
for (i = 0; i < siu_uart_driver.nr; i++) { + port = &siu_uart_ports[i]; + if ((port->type == PORT_VR41XX_SIU || + port->type == PORT_VR41XX_DSIU) && port->dev == dev) + uart_resume_port(&siu_uart_driver, port); + } + + return 0; +} + +static struct platform_device *siu_platform_device; + +static struct device_driver siu_device_driver = { + .name = "SIU", + .bus = &platform_bus_type, + .probe = siu_probe, + .remove = siu_remove, + .suspend = siu_suspend, + .resume = siu_resume, +}; + +static int __devinit vr41xx_siu_init(void) +{ + int retval; + + siu_platform_device = platform_device_register_simple("SIU", -1, NULL, 0); + if (IS_ERR(siu_platform_device)) + return PTR_ERR(siu_platform_device); + + retval = driver_register(&siu_device_driver); + if (retval < 0) { + platform_device_unregister(siu_platform_device); + } + + return retval; +} + +static void __devexit vr41xx_siu_exit(void) +{ + driver_unregister(&siu_device_driver); + + platform_device_unregister(siu_platform_device); +} + +module_init(vr41xx_siu_init); +module_exit(vr41xx_siu_exit); diff --git a/include/asm-mips/vr41xx/vr41xx.h b/include/asm-mips/vr41xx/vr41xx.h index 3c7d56ece4cc..09186c9e3a6f 100644 --- a/include/asm-mips/vr41xx/vr41xx.h +++ b/include/asm-mips/vr41xx/vr41xx.h @@ -250,8 +250,6 @@ enum { /* * Serial Interface Unit */ -extern void vr41xx_siu_init(void); -extern int vr41xx_serial_ports; /* SIU interfaces */ typedef enum { diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h index c45598d0a846..b5c7b44da087 100644 --- a/include/linux/serial_core.h +++ b/include/linux/serial_core.h @@ -106,6 +106,10 @@ /* TXX9 type number */ #define PORT_TXX9 64 +/* NEC VR4100 series SIU/DSIU */ +#define PORT_VR41XX_SIU 65 +#define PORT_VR41XX_DSIU 66 + #ifdef __KERNEL__ #include -- cgit v1.2.3 From 3a667d2e3336310628d7465a3d48bbc539fa5f06 Mon Sep 17 00:00:00 2001 From: Arun Sharma Date: Mon, 7 Mar 2005 17:51:21 -0800 Subject: [PATCH] add TCSBRKP to compat_ioctl.h Move ioctl TCSBRKP support 
to compat layer. Same rationale as TCSBRK. - Remove corresponding code under ppc64, sparc64 and s390. - Use ULONG_IOCTL() instead of COMPATIBLE_IOCTL(), since the argument is int, not pointer. Signed-off-by: Gordon Jin Signed-off-by: Arun Sharma Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ppc64/kernel/ioctl32.c | 1 - arch/s390/kernel/compat_ioctl.c | 3 --- arch/sparc64/kernel/ioctl32.c | 1 - include/linux/compat_ioctl.h | 1 + 4 files changed, 1 insertion(+), 5 deletions(-) (limited to 'include/linux') diff --git a/arch/ppc64/kernel/ioctl32.c b/arch/ppc64/kernel/ioctl32.c index 94d1a05b7839..a8005db23ec5 100644 --- a/arch/ppc64/kernel/ioctl32.c +++ b/arch/ppc64/kernel/ioctl32.c @@ -39,7 +39,6 @@ IOCTL_TABLE_START #include #define DECLARES #include "compat_ioctl.c" -COMPATIBLE_IOCTL(TCSBRKP) COMPATIBLE_IOCTL(TIOCSTART) COMPATIBLE_IOCTL(TIOCSTOP) COMPATIBLE_IOCTL(TIOCSLTC) diff --git a/arch/s390/kernel/compat_ioctl.c b/arch/s390/kernel/compat_ioctl.c index 7c7ca191f7f4..96571ff7115d 100644 --- a/arch/s390/kernel/compat_ioctl.c +++ b/arch/s390/kernel/compat_ioctl.c @@ -65,9 +65,6 @@ COMPATIBLE_IOCTL(BIODASDSATTR) COMPATIBLE_IOCTL(TAPE390_DISPLAY) #endif -/* This one should be architecture independent */ -COMPATIBLE_IOCTL(TCSBRKP) - /* s390 doesn't need handlers here */ COMPATIBLE_IOCTL(TIOCGSERIAL) COMPATIBLE_IOCTL(TIOCSSERIAL) diff --git a/arch/sparc64/kernel/ioctl32.c b/arch/sparc64/kernel/ioctl32.c index af8eb0b59b88..43fc3173d480 100644 --- a/arch/sparc64/kernel/ioctl32.c +++ b/arch/sparc64/kernel/ioctl32.c @@ -475,7 +475,6 @@ IOCTL_TABLE_START #include #define DECLARES #include "compat_ioctl.c" -COMPATIBLE_IOCTL(TCSBRKP) COMPATIBLE_IOCTL(TIOCSTART) COMPATIBLE_IOCTL(TIOCSTOP) COMPATIBLE_IOCTL(TIOCSLTC) diff --git a/include/linux/compat_ioctl.h b/include/linux/compat_ioctl.h index 383275bf924e..56fa057bf0d7 100644 --- a/include/linux/compat_ioctl.h +++ b/include/linux/compat_ioctl.h @@ -16,6 +16,7 @@ COMPATIBLE_IOCTL(TCSETA) 
COMPATIBLE_IOCTL(TCSETAW) COMPATIBLE_IOCTL(TCSETAF) COMPATIBLE_IOCTL(TCSBRK) +ULONG_IOCTL(TCSBRKP) COMPATIBLE_IOCTL(TCXONC) COMPATIBLE_IOCTL(TCFLSH) COMPATIBLE_IOCTL(TCGETS) -- cgit v1.2.3 From 6740e938fe4eb30563c6fb040b3321ad7e546b61 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Mon, 7 Mar 2005 17:52:24 -0800 Subject: [PATCH] add compiler-gcc4.h With the release of gcc 4.0 being only a few months away and people already tring compiling with it, it's time for adding a compiler-gcc4.h . This patch contains the following changes: - remove compiler-gcc+.h - compiler-gcc4.h: new file based on a corrected compiler-gcc+.h - compiler.h: include compiler-gcc4.h for gcc 4 - compiler.h: #error for gcc > 4 - compiler-gcc3.h: remove __compiler_offsetof (there will never be a gcc 3.5) small indention corrections I've tested the compilation with both gcc 3.4.4 and a recent gcc 4.0 snapshot from Debian experimental. Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compiler-gcc+.h | 16 ---------------- include/linux/compiler-gcc3.h | 10 ++++------ include/linux/compiler-gcc4.h | 16 ++++++++++++++++ include/linux/compiler.h | 6 ++++-- 4 files changed, 24 insertions(+), 24 deletions(-) delete mode 100644 include/linux/compiler-gcc+.h create mode 100644 include/linux/compiler-gcc4.h (limited to 'include/linux') diff --git a/include/linux/compiler-gcc+.h b/include/linux/compiler-gcc+.h deleted file mode 100644 index 6b9308541dcd..000000000000 --- a/include/linux/compiler-gcc+.h +++ /dev/null @@ -1,16 +0,0 @@ -/* Never include this file directly. Include instead. */ - -/* - * These definitions are for Ueber-GCC: always newer than the latest - * version and hence sporting everything plus a kitchen-sink. 
- */ -#include - -#define inline inline __attribute__((always_inline)) -#define __inline__ __inline__ __attribute__((always_inline)) -#define __inline __inline __attribute__((always_inline)) -#define __deprecated __attribute__((deprecated)) -#define __attribute_used__ __attribute__((__used__)) -#define __attribute_pure__ __attribute__((pure)) -#define __attribute_const__ __attribute__((__const__)) -#define __must_check __attribute__((warn_unused_result)) diff --git a/include/linux/compiler-gcc3.h b/include/linux/compiler-gcc3.h index eec2f88c4301..a6fa615afab5 100644 --- a/include/linux/compiler-gcc3.h +++ b/include/linux/compiler-gcc3.h @@ -10,7 +10,7 @@ #endif #if __GNUC_MINOR__ > 0 -# define __deprecated __attribute__((deprecated)) +# define __deprecated __attribute__((deprecated)) #endif #if __GNUC_MINOR__ >= 3 @@ -23,12 +23,10 @@ #define __attribute_const__ __attribute__((__const__)) #if __GNUC_MINOR__ >= 1 -#define noinline __attribute__((noinline)) +#define noinline __attribute__((noinline)) #endif + #if __GNUC_MINOR__ >= 4 -#define __must_check __attribute__((warn_unused_result)) +#define __must_check __attribute__((warn_unused_result)) #endif -#if __GNUC_MINOR__ >= 5 -#define __compiler_offsetof(a,b) __builtin_offsetof(a,b) -#endif diff --git a/include/linux/compiler-gcc4.h b/include/linux/compiler-gcc4.h new file mode 100644 index 000000000000..53686c037a06 --- /dev/null +++ b/include/linux/compiler-gcc4.h @@ -0,0 +1,16 @@ +/* Never include this file directly. Include instead. */ + +/* These definitions are for GCC v4.x. 
*/ +#include + +#define inline inline __attribute__((always_inline)) +#define __inline__ __inline__ __attribute__((always_inline)) +#define __inline __inline __attribute__((always_inline)) +#define __deprecated __attribute__((deprecated)) +#define __attribute_used__ __attribute__((__used__)) +#define __attribute_pure__ __attribute__((pure)) +#define __attribute_const__ __attribute__((__const__)) +#define noinline __attribute__((noinline)) +#define __must_check __attribute__((warn_unused_result)) +#define __compiler_offsetof(a,b) __builtin_offsetof(a,b) + diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 0fd06b029847..b475fd608115 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -34,8 +34,10 @@ extern void __chk_io_ptr(void __iomem *); #ifdef __KERNEL__ -#if __GNUC__ > 3 -# include /* catch-all for GCC 4, 5, etc. */ +#if __GNUC__ > 4 +#error no compiler-gcc.h file for this gcc version +#elif __GNUC__ == 4 +# include #elif __GNUC__ == 3 # include #elif __GNUC__ == 2 -- cgit v1.2.3 From 63858f83796cc179010632f0f9a23b2524e79c17 Mon Sep 17 00:00:00 2001 From: Dave Olien Date: Mon, 7 Mar 2005 17:53:24 -0800 Subject: [PATCH] add local bio pool support and modify dm I've had this patch reviewed by Jens, and incorporated his recommended fixes. The patch adds new interfaces to bio.c that support the creation of local bio and bvec pools. This is important for layered drivers that need to allocate new bio and bvec structures in response to bio's submitted to it from higher up. The layered drivers can allocate local pools of bio structures to preclude deadlock under global bio pool exhaustion. The device mapper source files have been modified to remove duplicate bio code, and to use the new interfaces to create local bio pools. From: Dave Olien Change bio_clone() to use the global bio_set pool instead of the bio_set pool associated with the bio argument. 
This is because raid5 and raid6 bio's are not allocated from a bio_set and have no bio_set associated with them. This patch along with the patch Linux just accepted allows raid5 and raid6 to function. Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/dm-io.c | 219 ++------------------------------------------------- drivers/md/dm-zero.c | 17 ---- drivers/md/dm.c | 10 ++- fs/bio.c | 212 ++++++++++++++++++++++++++++++++++++------------- include/linux/bio.h | 7 ++ 5 files changed, 182 insertions(+), 283 deletions(-) (limited to 'include/linux') diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c index ac5f74766fa2..2c66b68cf237 100644 --- a/drivers/md/dm-io.c +++ b/drivers/md/dm-io.c @@ -12,191 +12,7 @@ #include #include -#define BIO_POOL_SIZE 256 - - -/*----------------------------------------------------------------- - * Bio set, move this to bio.c - *---------------------------------------------------------------*/ -#define BV_NAME_SIZE 16 -struct biovec_pool { - int nr_vecs; - char name[BV_NAME_SIZE]; - kmem_cache_t *slab; - mempool_t *pool; - atomic_t allocated; /* FIXME: debug */ -}; - -#define BIOVEC_NR_POOLS 6 -struct bio_set { - char name[BV_NAME_SIZE]; - kmem_cache_t *bio_slab; - mempool_t *bio_pool; - struct biovec_pool pools[BIOVEC_NR_POOLS]; -}; - -static void bio_set_exit(struct bio_set *bs) -{ - unsigned i; - struct biovec_pool *bp; - - if (bs->bio_pool) - mempool_destroy(bs->bio_pool); - - if (bs->bio_slab) - kmem_cache_destroy(bs->bio_slab); - - for (i = 0; i < BIOVEC_NR_POOLS; i++) { - bp = bs->pools + i; - if (bp->pool) - mempool_destroy(bp->pool); - - if (bp->slab) - kmem_cache_destroy(bp->slab); - } -} - -static void mk_name(char *str, size_t len, const char *prefix, unsigned count) -{ - snprintf(str, len, "%s-%u", prefix, count); -} - -static int bio_set_init(struct bio_set *bs, const char *slab_prefix, - unsigned pool_entries, unsigned scale) -{ - /* FIXME: this must match bvec_index(), why not go the - * whole hog and 
have a pool per power of 2 ? */ - static unsigned _vec_lengths[BIOVEC_NR_POOLS] = { - 1, 4, 16, 64, 128, BIO_MAX_PAGES - }; - - - unsigned i, size; - struct biovec_pool *bp; - - /* zero the bs so we can tear down properly on error */ - memset(bs, 0, sizeof(*bs)); - - /* - * Set up the bio pool. - */ - snprintf(bs->name, sizeof(bs->name), "%s-bio", slab_prefix); - - bs->bio_slab = kmem_cache_create(bs->name, sizeof(struct bio), 0, - SLAB_HWCACHE_ALIGN, NULL, NULL); - if (!bs->bio_slab) { - DMWARN("can't init bio slab"); - goto bad; - } - - bs->bio_pool = mempool_create(pool_entries, mempool_alloc_slab, - mempool_free_slab, bs->bio_slab); - if (!bs->bio_pool) { - DMWARN("can't init bio pool"); - goto bad; - } - - /* - * Set up the biovec pools. - */ - for (i = 0; i < BIOVEC_NR_POOLS; i++) { - bp = bs->pools + i; - bp->nr_vecs = _vec_lengths[i]; - atomic_set(&bp->allocated, 1); /* FIXME: debug */ - - - size = bp->nr_vecs * sizeof(struct bio_vec); - - mk_name(bp->name, sizeof(bp->name), slab_prefix, i); - bp->slab = kmem_cache_create(bp->name, size, 0, - SLAB_HWCACHE_ALIGN, NULL, NULL); - if (!bp->slab) { - DMWARN("can't init biovec slab cache"); - goto bad; - } - - if (i >= scale) - pool_entries >>= 1; - - bp->pool = mempool_create(pool_entries, mempool_alloc_slab, - mempool_free_slab, bp->slab); - if (!bp->pool) { - DMWARN("can't init biovec mempool"); - goto bad; - } - } - - return 0; - - bad: - bio_set_exit(bs); - return -ENOMEM; -} - -/* FIXME: blech */ -static inline unsigned bvec_index(unsigned nr) -{ - switch (nr) { - case 1: return 0; - case 2 ... 4: return 1; - case 5 ... 16: return 2; - case 17 ... 64: return 3; - case 65 ... 128:return 4; - case 129 ... 
BIO_MAX_PAGES: return 5; - } - - BUG(); - return 0; -} - -static unsigned _bio_count = 0; -struct bio *bio_set_alloc(struct bio_set *bs, int gfp_mask, int nr_iovecs) -{ - struct biovec_pool *bp; - struct bio_vec *bv = NULL; - unsigned long idx; - struct bio *bio; - - bio = mempool_alloc(bs->bio_pool, gfp_mask); - if (unlikely(!bio)) - return NULL; - - bio_init(bio); - - if (likely(nr_iovecs)) { - idx = bvec_index(nr_iovecs); - bp = bs->pools + idx; - bv = mempool_alloc(bp->pool, gfp_mask); - if (!bv) { - mempool_free(bio, bs->bio_pool); - return NULL; - } - - memset(bv, 0, bp->nr_vecs * sizeof(*bv)); - bio->bi_flags |= idx << BIO_POOL_OFFSET; - bio->bi_max_vecs = bp->nr_vecs; - atomic_inc(&bp->allocated); - } - - bio->bi_io_vec = bv; - return bio; -} - -static void bio_set_free(struct bio_set *bs, struct bio *bio) -{ - struct biovec_pool *bp = bs->pools + BIO_POOL_IDX(bio); - - if (atomic_dec_and_test(&bp->allocated)) - BUG(); - - mempool_free(bio->bi_io_vec, bp->pool); - mempool_free(bio, bs->bio_pool); -} - -/*----------------------------------------------------------------- - * dm-io proper - *---------------------------------------------------------------*/ -static struct bio_set _bios; +static struct bio_set *_bios; /* FIXME: can we shrink this ? */ struct io { @@ -240,7 +56,7 @@ static int resize_pool(unsigned int new_ios) /* free off the pool */ mempool_destroy(_io_pool); _io_pool = NULL; - bio_set_exit(&_bios); + bioset_free(_bios); } else { /* resize the pool */ @@ -253,10 +69,11 @@ static int resize_pool(unsigned int new_ios) if (!_io_pool) return -ENOMEM; - r = bio_set_init(&_bios, "dm-io", 512, 1); - if (r) { + _bios = bioset_create(16, 16, 4); + if (!_bios) { mempool_destroy(_io_pool); _io_pool = NULL; + return -ENOMEM; } } @@ -280,6 +97,7 @@ void dm_io_put(unsigned int num_pages) * We need to keep track of which region a bio is doing io for. * In order to save a memory allocation we store this the last * bvec which we know is unused (blech). 
+ * XXX This is ugly and can OOPS with some configs... find another way. *---------------------------------------------------------------*/ static inline void bio_set_region(struct bio *bio, unsigned region) { @@ -315,21 +133,6 @@ static void dec_count(struct io *io, unsigned int region, int error) } } -/* FIXME Move this to bio.h? */ -static void zero_fill_bio(struct bio *bio) -{ - unsigned long flags; - struct bio_vec *bv; - int i; - - bio_for_each_segment(bv, bio, i) { - char *data = bvec_kmap_irq(bv, &flags); - memset(data, 0, bv->bv_len); - flush_dcache_page(bv->bv_page); - bvec_kunmap_irq(data, &flags); - } -} - static int endio(struct bio *bio, unsigned int done, int error) { struct io *io = (struct io *) bio->bi_private; @@ -347,12 +150,6 @@ static int endio(struct bio *bio, unsigned int done, int error) return 0; } -static void bio_dtr(struct bio *bio) -{ - _bio_count--; - bio_set_free(&_bios, bio); -} - /*----------------------------------------------------------------- * These little objects provide an abstraction for getting a new * destination page for io. @@ -461,13 +258,11 @@ static void do_region(int rw, unsigned int region, struct io_region *where, * bvec for bio_get/set_region(). 
*/ num_bvecs = (remaining / (PAGE_SIZE >> 9)) + 2; - _bio_count++; - bio = bio_set_alloc(&_bios, GFP_NOIO, num_bvecs); + bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, _bios); bio->bi_sector = where->sector + (where->count - remaining); bio->bi_bdev = where->bdev; bio->bi_end_io = endio; bio->bi_private = io; - bio->bi_destructor = bio_dtr; bio_set_region(bio, region); /* diff --git a/drivers/md/dm-zero.c b/drivers/md/dm-zero.c index 725f2c812c16..7febc2cac73d 100644 --- a/drivers/md/dm-zero.c +++ b/drivers/md/dm-zero.c @@ -23,23 +23,6 @@ static int zero_ctr(struct dm_target *ti, unsigned int argc, char **argv) return 0; } -/* - * Fills the bio pages with zeros - */ -static void zero_fill_bio(struct bio *bio) -{ - unsigned long flags; - struct bio_vec *bv; - int i; - - bio_for_each_segment(bv, bio, i) { - char *data = bvec_kmap_irq(bv, &flags); - memset(data, 0, bv->bv_len); - flush_dcache_page(bv->bv_page); - bvec_kunmap_irq(data, &flags); - } -} - /* * Return zeros only on reads */ diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 7f1af5231ef7..15a8e5c1a0ae 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -96,10 +96,16 @@ struct mapped_device { static kmem_cache_t *_io_cache; static kmem_cache_t *_tio_cache; +static struct bio_set *dm_set; + static int __init local_init(void) { int r; + dm_set = bioset_create(16, 16, 4); + if (!dm_set) + return -ENOMEM; + /* allocate a slab for the dm_ios */ _io_cache = kmem_cache_create("dm_io", sizeof(struct dm_io), 0, 0, NULL, NULL); @@ -133,6 +139,8 @@ static void local_exit(void) kmem_cache_destroy(_tio_cache); kmem_cache_destroy(_io_cache); + bioset_free(dm_set); + if (unregister_blkdev(_major, _name) < 0) DMERR("devfs_unregister_blkdev failed"); @@ -393,7 +401,7 @@ static struct bio *split_bvec(struct bio *bio, sector_t sector, struct bio *clone; struct bio_vec *bv = bio->bi_io_vec + idx; - clone = bio_alloc(GFP_NOIO, 1); + clone = bio_alloc_bioset(GFP_NOIO, 1, dm_set); *clone->bi_io_vec = *bv; clone->bi_sector = 
sector; diff --git a/fs/bio.c b/fs/bio.c index d23d9782377e..1199a116302f 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -28,7 +28,6 @@ #define BIO_POOL_SIZE 256 -static mempool_t *bio_pool; static kmem_cache_t *bio_slab; #define BIOVEC_NR_POOLS 6 @@ -40,11 +39,10 @@ static kmem_cache_t *bio_slab; #define BIO_SPLIT_ENTRIES 8 mempool_t *bio_split_pool; -struct biovec_pool { +struct biovec_slab { int nr_vecs; char *name; kmem_cache_t *slab; - mempool_t *pool; }; /* @@ -54,15 +52,32 @@ struct biovec_pool { */ #define BV(x) { .nr_vecs = x, .name = "biovec-"__stringify(x) } -static struct biovec_pool bvec_array[BIOVEC_NR_POOLS] = { +static struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] = { BV(1), BV(4), BV(16), BV(64), BV(128), BV(BIO_MAX_PAGES), }; #undef BV -static inline struct bio_vec *bvec_alloc(int gfp_mask, int nr, unsigned long *idx) +/* + * bio_set is used to allow other portions of the IO system to + * allocate their own private memory pools for bio and iovec structures. + * These memory pools in turn all allocate from the bio_slab + * and the bvec_slabs[]. + */ +struct bio_set { + mempool_t *bio_pool; + mempool_t *bvec_pools[BIOVEC_NR_POOLS]; +}; + +/* + * fs_bio_set is the bio_set containing bio and iovec memory pools used by + * IO code that does not need private memory pools. + */ +static struct bio_set *fs_bio_set; + +static inline struct bio_vec *bvec_alloc_bs(int gfp_mask, int nr, unsigned long *idx, struct bio_set *bs) { - struct biovec_pool *bp; struct bio_vec *bvl; + struct biovec_slab *bp; /* * see comment near bvec_array define! 
@@ -80,26 +95,27 @@ static inline struct bio_vec *bvec_alloc(int gfp_mask, int nr, unsigned long *id /* * idx now points to the pool we want to allocate from */ - bp = bvec_array + *idx; - bvl = mempool_alloc(bp->pool, gfp_mask); + bp = bvec_slabs + *idx; + bvl = mempool_alloc(bs->bvec_pools[*idx], gfp_mask); if (bvl) memset(bvl, 0, bp->nr_vecs * sizeof(struct bio_vec)); + return bvl; } /* - * default destructor for a bio allocated with bio_alloc() + * default destructor for a bio allocated with bio_alloc_bioset() */ static void bio_destructor(struct bio *bio) { const int pool_idx = BIO_POOL_IDX(bio); - struct biovec_pool *bp = bvec_array + pool_idx; + struct bio_set *bs = bio->bi_set; BIO_BUG_ON(pool_idx >= BIOVEC_NR_POOLS); - mempool_free(bio->bi_io_vec, bp->pool); - mempool_free(bio, bio_pool); + mempool_free(bio->bi_io_vec, bs->bvec_pools[pool_idx]); + mempool_free(bio, bs->bio_pool); } inline void bio_init(struct bio *bio) @@ -121,18 +137,21 @@ inline void bio_init(struct bio *bio) } /** - * bio_alloc - allocate a bio for I/O + * bio_alloc_bioset - allocate a bio for I/O * @gfp_mask: the GFP_ mask given to the slab allocator * @nr_iovecs: number of iovecs to pre-allocate * * Description: - * bio_alloc will first try it's on mempool to satisfy the allocation. + * bio_alloc_bioset will first try it's on mempool to satisfy the allocation. * If %__GFP_WAIT is set then we will block on the internal pool waiting * for a &struct bio to become free. + * + * allocate bio and iovecs from the memory pools specified by the + * bio_set structure. 
**/ -struct bio *bio_alloc(int gfp_mask, int nr_iovecs) +struct bio *bio_alloc_bioset(int gfp_mask, int nr_iovecs, struct bio_set *bs) { - struct bio *bio = mempool_alloc(bio_pool, gfp_mask); + struct bio *bio = mempool_alloc(bs->bio_pool, gfp_mask); if (likely(bio)) { struct bio_vec *bvl = NULL; @@ -141,22 +160,43 @@ struct bio *bio_alloc(int gfp_mask, int nr_iovecs) if (likely(nr_iovecs)) { unsigned long idx; - bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx); + bvl = bvec_alloc_bs(gfp_mask, nr_iovecs, &idx, bs); if (unlikely(!bvl)) { - mempool_free(bio, bio_pool); + mempool_free(bio, bs->bio_pool); bio = NULL; goto out; } bio->bi_flags |= idx << BIO_POOL_OFFSET; - bio->bi_max_vecs = bvec_array[idx].nr_vecs; + bio->bi_max_vecs = bvec_slabs[idx].nr_vecs; } bio->bi_io_vec = bvl; bio->bi_destructor = bio_destructor; + bio->bi_set = bs; } out: return bio; } +struct bio *bio_alloc(int gfp_mask, int nr_iovecs) +{ + return bio_alloc_bioset(gfp_mask, nr_iovecs, fs_bio_set); +} + +void zero_fill_bio(struct bio *bio) +{ + unsigned long flags; + struct bio_vec *bv; + int i; + + bio_for_each_segment(bv, bio, i) { + char *data = bvec_kmap_irq(bv, &flags); + memset(data, 0, bv->bv_len); + flush_dcache_page(bv->bv_page); + bvec_kunmap_irq(data, &flags); + } +} +EXPORT_SYMBOL(zero_fill_bio); + /** * bio_put - release a reference to a bio * @bio: bio to release reference to @@ -233,7 +273,7 @@ inline void __bio_clone(struct bio *bio, struct bio *bio_src) */ struct bio *bio_clone(struct bio *bio, int gfp_mask) { - struct bio *b = bio_alloc(gfp_mask, bio->bi_max_vecs); + struct bio *b = bio_alloc_bioset(gfp_mask, bio->bi_max_vecs, fs_bio_set); if (b) __bio_clone(b, bio); @@ -904,11 +944,99 @@ static void bio_pair_free(void *bp, void *data) kfree(bp); } -static void __init biovec_init_pools(void) + +/* + * create memory pools for biovec's in a bio_set. + * use the global biovec slabs created for general use. 
+ */ +static int biovec_create_pools(struct bio_set *bs, int pool_entries, int scale) +{ + int i; + + for (i = 0; i < BIOVEC_NR_POOLS; i++) { + struct biovec_slab *bp = bvec_slabs + i; + mempool_t **bvp = bs->bvec_pools + i; + + if (i >= scale) + pool_entries >>= 1; + + *bvp = mempool_create(pool_entries, mempool_alloc_slab, + mempool_free_slab, bp->slab); + if (!*bvp) + return -ENOMEM; + } + return 0; +} + +static void biovec_free_pools(struct bio_set *bs) +{ + int i; + + for (i = 0; i < BIOVEC_NR_POOLS; i++) { + mempool_t *bvp = bs->bvec_pools[i]; + + if (bvp) + mempool_destroy(bvp); + } + +} + +void bioset_free(struct bio_set *bs) { - int i, size, megabytes, pool_entries = BIO_POOL_SIZE; + if (bs->bio_pool) + mempool_destroy(bs->bio_pool); + + biovec_free_pools(bs); + + kfree(bs); +} + +struct bio_set *bioset_create(int bio_pool_size, int bvec_pool_size, int scale) +{ + struct bio_set *bs = kmalloc(sizeof(*bs), GFP_KERNEL); + + if (!bs) + return NULL; + + memset(bs, 0, sizeof(*bs)); + bs->bio_pool = mempool_create(bio_pool_size, mempool_alloc_slab, + mempool_free_slab, bio_slab); + + if (!bs->bio_pool) + goto bad; + + if (!biovec_create_pools(bs, bvec_pool_size, scale)) + return bs; + +bad: + bioset_free(bs); + return NULL; +} + +static void __init biovec_init_slabs(void) +{ + int i; + + for (i = 0; i < BIOVEC_NR_POOLS; i++) { + int size; + struct biovec_slab *bvs = bvec_slabs + i; + + size = bvs->nr_vecs * sizeof(struct bio_vec); + bvs->slab = kmem_cache_create(bvs->name, size, 0, + SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); + } +} + +static int __init init_bio(void) +{ + int megabytes, bvec_pool_entries; int scale = BIOVEC_NR_POOLS; + bio_slab = kmem_cache_create("bio", sizeof(struct bio), 0, + SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); + + biovec_init_slabs(); + megabytes = nr_free_pages() >> (20 - PAGE_SHIFT); /* @@ -928,38 +1056,13 @@ static void __init biovec_init_pools(void) /* * scale number of entries */ - pool_entries = megabytes * 2; - if 
(pool_entries > 256) - pool_entries = 256; - - for (i = 0; i < BIOVEC_NR_POOLS; i++) { - struct biovec_pool *bp = bvec_array + i; - - size = bp->nr_vecs * sizeof(struct bio_vec); - - bp->slab = kmem_cache_create(bp->name, size, 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); - - if (i >= scale) - pool_entries >>= 1; - - bp->pool = mempool_create(pool_entries, mempool_alloc_slab, - mempool_free_slab, bp->slab); - if (!bp->pool) - panic("biovec: can't init mempool\n"); - } -} - -static int __init init_bio(void) -{ - bio_slab = kmem_cache_create("bio", sizeof(struct bio), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); - bio_pool = mempool_create(BIO_POOL_SIZE, mempool_alloc_slab, - mempool_free_slab, bio_slab); - if (!bio_pool) - panic("bio: can't create mempool\n"); + bvec_pool_entries = megabytes * 2; + if (bvec_pool_entries > 256) + bvec_pool_entries = 256; - biovec_init_pools(); + fs_bio_set = bioset_create(BIO_POOL_SIZE, bvec_pool_entries, scale); + if (!fs_bio_set) + panic("bio: can't allocate bios\n"); bio_split_pool = mempool_create(BIO_SPLIT_ENTRIES, bio_pair_alloc, bio_pair_free, NULL); @@ -988,3 +1091,6 @@ EXPORT_SYMBOL(bio_split); EXPORT_SYMBOL(bio_split_pool); EXPORT_SYMBOL(bio_copy_user); EXPORT_SYMBOL(bio_uncopy_user); +EXPORT_SYMBOL(bioset_create); +EXPORT_SYMBOL(bioset_free); +EXPORT_SYMBOL(bio_alloc_bioset); diff --git a/include/linux/bio.h b/include/linux/bio.h index cd8d47bf34b4..06925a788519 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -59,6 +59,7 @@ struct bio_vec { unsigned int bv_offset; }; +struct bio_set; struct bio; typedef int (bio_end_io_t) (struct bio *, unsigned int, int); typedef void (bio_destructor_t) (struct bio *); @@ -109,6 +110,7 @@ struct bio { void *bi_private; bio_destructor_t *bi_destructor; /* destructor */ + struct bio_set *bi_set; /* memory pools set */ }; /* @@ -258,7 +260,11 @@ extern struct bio_pair *bio_split(struct bio *bi, mempool_t *pool, extern mempool_t *bio_split_pool; extern void 
bio_pair_release(struct bio_pair *dbio); +extern struct bio_set *bioset_create(int, int, int); +extern void bioset_free(struct bio_set *); + extern struct bio *bio_alloc(int, int); +extern struct bio *bio_alloc_bioset(int, int, struct bio_set *); extern void bio_put(struct bio *); extern void bio_endio(struct bio *, unsigned int, int); @@ -280,6 +286,7 @@ extern void bio_set_pages_dirty(struct bio *bio); extern void bio_check_pages_dirty(struct bio *bio); extern struct bio *bio_copy_user(struct request_queue *, unsigned long, unsigned int, int); extern int bio_uncopy_user(struct bio *); +void zero_fill_bio(struct bio *bio); #ifdef CONFIG_HIGHMEM /* -- cgit v1.2.3 From f3f28e49a723a2ded9dedf61398c7f5f3bb8235c Mon Sep 17 00:00:00 2001 From: Anton Altaparmakov Date: Mon, 7 Mar 2005 17:54:10 -0800 Subject: [PATCH] a_ops-based loop I/O Implements fallback to file_operations->write in the case that aops->{prepare,commit}_write are not present on the backing filesystem. The fallback happens in two different ways: - For normal loop devices, i.e. ones which do not do transformation on the data but simply pass it along, we simply call fops->write. This should be pretty much just as fast as using aops->{prepare,commit}_write directly. - For all other loop devices (e.g. xor and cryptoloop), i.e. all the ones which may be doing transformations on the data, we allocate and map a page (once for each bio), then for each bio vec we copy the bio vec page data to our mapped page, apply the loop transformation, and use fops->write to write out the transformed data from our page. Once all bio vecs from the bio are done, we unmap and free the page. This approach is the absolute minimum of overhead I could come up with and for performance hungry people, as you can see I left the address space operations method in place for filesystems which implement aops->{prepare,commit}_write. 
I have tested this patch with normal loop devices using aops->{prepare,commit}_write on the backing filesystem, with normal loop devices using the fops->write code path and with cryptoloop devices using the double buffering + fops->write code path. Signed-off-by: Anton Altaparmakov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/loop.c | 159 ++++++++++++++++++++++++++++++++++++++++++--------- include/linux/loop.h | 5 +- 2 files changed, 136 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 3efc2ea7ad64..49bd42f55920 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -39,6 +39,11 @@ * Support up to 256 loop devices * Heinz Mauelshagen , Feb 2002 * + * Support for falling back on the write file operation when the address space + * operations prepare_write and/or commit_write are not available on the + * backing filesystem. + * Anton Altaparmakov, 16 Feb 2005 + * * Still To Fix: * - Advisory locking is ignored here. 
* - Should use an own CAP_* category instead of CAP_SYS_ADMIN @@ -67,6 +72,8 @@ #include #include /* for invalidate_bdev() */ #include +#include +#include #include @@ -127,7 +134,7 @@ static int transfer_xor(struct loop_device *lo, int cmd, static int xor_init(struct loop_device *lo, const struct loop_info64 *info) { - if (info->lo_encrypt_key_size <= 0) + if (unlikely(info->lo_encrypt_key_size <= 0)) return -EINVAL; return 0; } @@ -173,7 +180,7 @@ figure_loop_size(struct loop_device *lo) loff_t size = get_loop_size(lo, lo->lo_backing_file); sector_t x = (sector_t)size; - if ((loff_t)x != size) + if (unlikely((loff_t)x != size)) return -EFBIG; set_capacity(disks[lo->lo_number], x); @@ -186,23 +193,27 @@ lo_do_transfer(struct loop_device *lo, int cmd, struct page *lpage, unsigned loffs, int size, sector_t rblock) { - if (!lo->transfer) + if (unlikely(!lo->transfer)) return 0; return lo->transfer(lo, cmd, rpage, roffs, lpage, loffs, size, rblock); } -static int -do_lo_send(struct loop_device *lo, struct bio_vec *bvec, int bsize, loff_t pos) +/** + * do_lo_send_aops - helper for writing data to a loop device + * + * This is the fast version for backing filesystems which implement the address + * space operations prepare_write and commit_write. 
+ */ +static int do_lo_send_aops(struct loop_device *lo, struct bio_vec *bvec, + int bsize, loff_t pos, struct page *page) { struct file *file = lo->lo_backing_file; /* kudos to NFsckingS */ struct address_space *mapping = file->f_mapping; struct address_space_operations *aops = mapping->a_ops; - struct page *page; pgoff_t index; - unsigned size, offset, bv_offs; - int len; - int ret = 0; + unsigned offset, bv_offs; + int len, ret = 0; down(&mapping->host->i_sem); index = pos >> PAGE_CACHE_SHIFT; @@ -211,23 +222,22 @@ do_lo_send(struct loop_device *lo, struct bio_vec *bvec, int bsize, loff_t pos) len = bvec->bv_len; while (len > 0) { sector_t IV; + unsigned size; int transfer_result; IV = ((sector_t)index << (PAGE_CACHE_SHIFT - 9))+(offset >> 9); - size = PAGE_CACHE_SIZE - offset; if (size > len) size = len; - page = grab_cache_page(mapping, index); - if (!page) + if (unlikely(!page)) goto fail; - if (aops->prepare_write(file, page, offset, offset+size)) + if (unlikely(aops->prepare_write(file, page, offset, + offset + size))) goto unlock; transfer_result = lo_do_transfer(lo, WRITE, page, offset, - bvec->bv_page, bv_offs, - size, IV); - if (transfer_result) { + bvec->bv_page, bv_offs, size, IV); + if (unlikely(transfer_result)) { char *kaddr; /* @@ -241,9 +251,10 @@ do_lo_send(struct loop_device *lo, struct bio_vec *bvec, int bsize, loff_t pos) kunmap_atomic(kaddr, KM_USER0); } flush_dcache_page(page); - if (aops->commit_write(file, page, offset, offset+size)) + if (unlikely(aops->commit_write(file, page, offset, + offset + size))) goto unlock; - if (transfer_result) + if (unlikely(transfer_result)) goto unlock; bv_offs += size; len -= size; @@ -253,32 +264,125 @@ do_lo_send(struct loop_device *lo, struct bio_vec *bvec, int bsize, loff_t pos) unlock_page(page); page_cache_release(page); } - up(&mapping->host->i_sem); out: + up(&mapping->host->i_sem); return ret; - unlock: unlock_page(page); page_cache_release(page); fail: - up(&mapping->host->i_sem); ret = -1; goto 
out; } -static int -lo_send(struct loop_device *lo, struct bio *bio, int bsize, loff_t pos) +/** + * __do_lo_send_write - helper for writing data to a loop device + * + * This helper just factors out common code between do_lo_send_direct_write() + * and do_lo_send_write(). + */ +static inline int __do_lo_send_write(struct file *file, + u8 __user *buf, const int len, loff_t pos) { + ssize_t bw; + mm_segment_t old_fs = get_fs(); + + set_fs(get_ds()); + bw = file->f_op->write(file, buf, len, &pos); + set_fs(old_fs); + if (likely(bw == len)) + return 0; + printk(KERN_ERR "loop: Write error at byte offset %llu, length %i.\n", + (unsigned long long)pos, len); + if (bw >= 0) + bw = -EIO; + return bw; +} + +/** + * do_lo_send_direct_write - helper for writing data to a loop device + * + * This is the fast, non-transforming version for backing filesystems which do + * not implement the address space operations prepare_write and commit_write. + * It uses the write file operation which should be present on all writeable + * filesystems. + */ +static int do_lo_send_direct_write(struct loop_device *lo, + struct bio_vec *bvec, int bsize, loff_t pos, struct page *page) +{ + ssize_t bw = __do_lo_send_write(lo->lo_backing_file, + (u8 __user *)kmap(bvec->bv_page) + bvec->bv_offset, + bvec->bv_len, pos); + kunmap(bvec->bv_page); + cond_resched(); + return bw; +} + +/** + * do_lo_send_write - helper for writing data to a loop device + * + * This is the slow, transforming version for filesystems which do not + * implement the address space operations prepare_write and commit_write. It + * uses the write file operation which should be present on all writeable + * filesystems. + * + * Using fops->write is slower than using aops->{prepare,commit}_write in the + * transforming case because we need to double buffer the data as we cannot do + * the transformations in place as we do not have direct access to the + * destination pages of the backing file. 
+ */ +static int do_lo_send_write(struct loop_device *lo, struct bio_vec *bvec, + int bsize, loff_t pos, struct page *page) +{ + int ret = lo_do_transfer(lo, WRITE, page, 0, bvec->bv_page, + bvec->bv_offset, bvec->bv_len, pos >> 9); + if (likely(!ret)) + return __do_lo_send_write(lo->lo_backing_file, + (u8 __user *)page_address(page), bvec->bv_len, + pos); + printk(KERN_ERR "loop: Transfer error at byte offset %llu, " + "length %i.\n", (unsigned long long)pos, bvec->bv_len); + if (ret > 0) + ret = -EIO; + return ret; +} + +static int lo_send(struct loop_device *lo, struct bio *bio, int bsize, + loff_t pos) +{ + int (*do_lo_send)(struct loop_device *, struct bio_vec *, int, loff_t, + struct page *page); struct bio_vec *bvec; + struct page *page = NULL; int i, ret = 0; + do_lo_send = do_lo_send_aops; + if (!(lo->lo_flags & LO_FLAGS_USE_AOPS)) { + do_lo_send = do_lo_send_direct_write; + if (lo->transfer != transfer_none) { + page = alloc_page(GFP_NOIO | __GFP_HIGHMEM); + if (unlikely(!page)) + goto fail; + kmap(page); + do_lo_send = do_lo_send_write; + } + } bio_for_each_segment(bvec, bio, i) { - ret = do_lo_send(lo, bvec, bsize, pos); + ret = do_lo_send(lo, bvec, bsize, pos, page); if (ret < 0) break; pos += bvec->bv_len; } + if (page) { + kunmap(page); + __free_page(page); + } +out: return ret; +fail: + printk(KERN_ERR "loop: Failed to allocate temporary page for write.\n"); + ret = -ENOMEM; + goto out; } struct lo_read_data { @@ -584,7 +688,7 @@ static int loop_change_fd(struct loop_device *lo, struct file *lo_file, /* the loop device has to be read-only */ error = -EINVAL; - if (lo->lo_flags != LO_FLAGS_READ_ONLY) + if (!(lo->lo_flags & LO_FLAGS_READ_ONLY)) goto out; error = -EBADF; @@ -683,8 +787,9 @@ static int loop_set_fd(struct loop_device *lo, struct file *lo_file, */ if (!file->f_op->sendfile) goto out_putf; - - if (!aops->prepare_write || !aops->commit_write) + if (aops->prepare_write && aops->commit_write) + lo_flags |= LO_FLAGS_USE_AOPS; + if (!(lo_flags 
& LO_FLAGS_USE_AOPS) && !file->f_op->write) lo_flags |= LO_FLAGS_READ_ONLY; lo_blocksize = inode->i_blksize; diff --git a/include/linux/loop.h b/include/linux/loop.h index 652124463a24..8220d9c9da00 100644 --- a/include/linux/loop.h +++ b/include/linux/loop.h @@ -71,7 +71,10 @@ struct loop_device { /* * Loop flags */ -#define LO_FLAGS_READ_ONLY 1 +enum { + LO_FLAGS_READ_ONLY = 1, + LO_FLAGS_USE_AOPS = 2, +}; #include /* for __kernel_old_dev_t */ #include /* for __u64 */ -- cgit v1.2.3 From d949d0ec9c601f2b148bed3cdb5f87c052968554 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Mon, 7 Mar 2005 17:54:43 -0800 Subject: [PATCH] seccomp: secure computing support I'd need it merged into mainline at some point, unless anybody has strong arguments against it. All I can guarantee here, is that I'll back it out myself in the future, iff Cpushare will fail and nobody else started using it in the meantime for similar security purposes. (akpm: project details are at http://www.cpushare.com/technical. It seems like a good idea to me, and one which is worth supporting. I agree that for this to be successful, the added robustness of Andrea's simple and specific jail is worthwhile). 
Signed-off-by: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/Kconfig | 17 +++++++++ arch/i386/kernel/entry.S | 6 ++-- arch/i386/kernel/ptrace.c | 4 +++ arch/x86_64/Kconfig | 18 ++++++++++ arch/x86_64/ia32/ia32entry.S | 6 ++-- arch/x86_64/kernel/entry.S | 2 +- arch/x86_64/kernel/ptrace.c | 4 +++ fs/proc/base.c | 74 ++++++++++++++++++++++++++++++++++++++++ include/asm-i386/thread_info.h | 7 ++-- include/asm-x86_64/thread_info.h | 6 ++-- include/linux/sched.h | 2 ++ include/linux/seccomp.h | 33 ++++++++++++++++++ kernel/Makefile | 1 + kernel/seccomp.c | 74 ++++++++++++++++++++++++++++++++++++++++ 14 files changed, 244 insertions(+), 10 deletions(-) create mode 100644 include/linux/seccomp.h create mode 100644 kernel/seccomp.c (limited to 'include/linux') diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig index 62ab5804d874..0433c53713f5 100644 --- a/arch/i386/Kconfig +++ b/arch/i386/Kconfig @@ -896,6 +896,23 @@ config REGPARM generate incorrect output with certain kernel constructs when -mregparm=3 is used. +config SECCOMP + bool "Enable seccomp to safely compute untrusted bytecode" + depends on PROC_FS + default y + help + This kernel feature is useful for number crunching applications + that may need to compute untrusted bytecode during their + execution. By using pipes or other transports made available to + the process as file descriptors supporting the read/write + syscalls, it's possible to isolate those applications in + their own address space using seccomp. Once seccomp is + enabled via /proc//seccomp, it cannot be disabled + and the task is only allowed to execute a few safe syscalls + defined by each seccomp mode. + + If unsure, say Y. Only embedded should say N here. 
+ endmenu diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S index f15856e40534..87aad70d3730 100644 --- a/arch/i386/kernel/entry.S +++ b/arch/i386/kernel/entry.S @@ -219,7 +219,8 @@ sysenter_past_esp: SAVE_ALL GET_THREAD_INFO(%ebp) - testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%ebp) + /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ + testw $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),TI_flags(%ebp) jnz syscall_trace_entry cmpl $(nr_syscalls), %eax jae syscall_badsys @@ -243,7 +244,8 @@ ENTRY(system_call) SAVE_ALL GET_THREAD_INFO(%ebp) # system call tracing in operation - testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%ebp) + /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ + testw $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),TI_flags(%ebp) jnz syscall_trace_entry cmpl $(nr_syscalls), %eax jae syscall_badsys diff --git a/arch/i386/kernel/ptrace.c b/arch/i386/kernel/ptrace.c index 3985587e1cd2..b2f17640ceff 100644 --- a/arch/i386/kernel/ptrace.c +++ b/arch/i386/kernel/ptrace.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -678,6 +679,9 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code) __attribute__((regparm(3))) void do_syscall_trace(struct pt_regs *regs, int entryexit) { + /* do the secure computing check first */ + secure_computing(regs->orig_eax); + if (unlikely(current->audit_context)) { if (!entryexit) audit_syscall_entry(current, regs->orig_eax, diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig index f7a85824d6f5..1dd759759a8d 100644 --- a/arch/x86_64/Kconfig +++ b/arch/x86_64/Kconfig @@ -350,6 +350,24 @@ config X86_MCE_INTEL help Additional support for intel specific MCE features such as the thermal monitor. 
+ +config SECCOMP + bool "Enable seccomp to safely compute untrusted bytecode" + depends on PROC_FS + default y + help + This kernel feature is useful for number crunching applications + that may need to compute untrusted bytecode during their + execution. By using pipes or other transports made available to + the process as file descriptors supporting the read/write + syscalls, it's possible to isolate those applications in + their own address space using seccomp. Once seccomp is + enabled via /proc//seccomp, it cannot be disabled + and the task is only allowed to execute a few safe syscalls + defined by each seccomp mode. + + If unsure, say Y. Only embedded should say N here. + endmenu # diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S index 56ee2d8e0948..f3ca0db85b5b 100644 --- a/arch/x86_64/ia32/ia32entry.S +++ b/arch/x86_64/ia32/ia32entry.S @@ -78,7 +78,7 @@ ENTRY(ia32_sysenter_target) .quad 1b,ia32_badarg .previous GET_THREAD_INFO(%r10) - testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%r10) + testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10) jnz sysenter_tracesys sysenter_do_call: cmpl $(IA32_NR_syscalls),%eax @@ -163,7 +163,7 @@ ENTRY(ia32_cstar_target) .quad 1b,ia32_badarg .previous GET_THREAD_INFO(%r10) - testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%r10) + testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10) jnz cstar_tracesys cstar_do_call: cmpl $IA32_NR_syscalls,%eax @@ -236,7 +236,7 @@ ENTRY(ia32_syscall) this could be a problem. 
*/ SAVE_ARGS 0,0,1 GET_THREAD_INFO(%r10) - testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%r10) + testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10) jnz ia32_tracesys ia32_do_syscall: cmpl $(IA32_NR_syscalls),%eax diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S index c261fdc1079e..fb9e742d5baa 100644 --- a/arch/x86_64/kernel/entry.S +++ b/arch/x86_64/kernel/entry.S @@ -184,7 +184,7 @@ ENTRY(system_call) movq %rax,ORIG_RAX-ARGOFFSET(%rsp) movq %rcx,RIP-ARGOFFSET(%rsp) GET_THREAD_INFO(%rcx) - testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%rcx) + testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%rcx) jnz tracesys cmpq $__NR_syscall_max,%rax ja badsys diff --git a/arch/x86_64/kernel/ptrace.c b/arch/x86_64/kernel/ptrace.c index eaa7250152f5..0b7b101debdf 100644 --- a/arch/x86_64/kernel/ptrace.c +++ b/arch/x86_64/kernel/ptrace.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -521,6 +522,9 @@ static void syscall_trace(struct pt_regs *regs) asmlinkage void syscall_trace_enter(struct pt_regs *regs) { + /* do the secure computing check first */ + secure_computing(regs->orig_rax); + if (unlikely(current->audit_context)) audit_syscall_entry(current, regs->orig_rax, regs->rdi, regs->rsi, diff --git a/fs/proc/base.c b/fs/proc/base.c index 9ab35875845d..f863e4c7e628 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -32,6 +32,7 @@ #include #include #include +#include #include "internal.h" /* @@ -49,6 +50,9 @@ enum pid_directory_inos { PROC_TGID_TASK, PROC_TGID_STATUS, PROC_TGID_MEM, +#ifdef CONFIG_SECCOMP + PROC_TGID_SECCOMP, +#endif PROC_TGID_CWD, PROC_TGID_ROOT, PROC_TGID_EXE, @@ -80,6 +84,9 @@ enum pid_directory_inos { PROC_TID_INO, PROC_TID_STATUS, PROC_TID_MEM, +#ifdef CONFIG_SECCOMP + PROC_TID_SECCOMP, +#endif PROC_TID_CWD, PROC_TID_ROOT, PROC_TID_EXE, @@ -130,6 +137,9 @@ static struct pid_entry tgid_base_stuff[] = { 
E(PROC_TGID_STATM, "statm", S_IFREG|S_IRUGO), E(PROC_TGID_MAPS, "maps", S_IFREG|S_IRUGO), E(PROC_TGID_MEM, "mem", S_IFREG|S_IRUSR|S_IWUSR), +#ifdef CONFIG_SECCOMP + E(PROC_TGID_SECCOMP, "seccomp", S_IFREG|S_IRUSR|S_IWUSR), +#endif E(PROC_TGID_CWD, "cwd", S_IFLNK|S_IRWXUGO), E(PROC_TGID_ROOT, "root", S_IFLNK|S_IRWXUGO), E(PROC_TGID_EXE, "exe", S_IFLNK|S_IRWXUGO), @@ -160,6 +170,9 @@ static struct pid_entry tid_base_stuff[] = { E(PROC_TID_STATM, "statm", S_IFREG|S_IRUGO), E(PROC_TID_MAPS, "maps", S_IFREG|S_IRUGO), E(PROC_TID_MEM, "mem", S_IFREG|S_IRUSR|S_IWUSR), +#ifdef CONFIG_SECCOMP + E(PROC_TID_SECCOMP, "seccomp", S_IFREG|S_IRUSR|S_IWUSR), +#endif E(PROC_TID_CWD, "cwd", S_IFLNK|S_IRWXUGO), E(PROC_TID_ROOT, "root", S_IFLNK|S_IRWXUGO), E(PROC_TID_EXE, "exe", S_IFLNK|S_IRWXUGO), @@ -808,6 +821,61 @@ static struct file_operations proc_loginuid_operations = { }; #endif +#ifdef CONFIG_SECCOMP +static ssize_t seccomp_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct task_struct *tsk = proc_task(file->f_dentry->d_inode); + char __buf[20]; + loff_t __ppos = *ppos; + size_t len; + + /* no need to print the trailing zero, so use only len */ + len = sprintf(__buf, "%u\n", tsk->seccomp.mode); + if (__ppos >= len) + return 0; + if (count > len - __ppos) + count = len - __ppos; + if (copy_to_user(buf, __buf + __ppos, count)) + return -EFAULT; + *ppos = __ppos + count; + return count; +} + +static ssize_t seccomp_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct task_struct *tsk = proc_task(file->f_dentry->d_inode); + char __buf[20], *end; + unsigned int seccomp_mode; + + /* can set it only once to be even more secure */ + if (unlikely(tsk->seccomp.mode)) + return -EPERM; + + memset(__buf, 0, sizeof(__buf)); + count = min(count, sizeof(__buf) - 1); + if (copy_from_user(__buf, buf, count)) + return -EFAULT; + seccomp_mode = simple_strtoul(__buf, &end, 0); + if (*end == '\n') + end++; + if (seccomp_mode && 
seccomp_mode <= NR_SECCOMP_MODES) { + tsk->seccomp.mode = seccomp_mode; + set_tsk_thread_flag(tsk, TIF_SECCOMP); + } else + return -EINVAL; + if (unlikely(!(end - __buf))) + return -EIO; + return end - __buf; +} + +static struct file_operations proc_seccomp_operations = { + .read = seccomp_read, + .write = seccomp_write, +}; +#endif /* CONFIG_SECCOMP */ + static int proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd) { struct inode *inode = dentry->d_inode; @@ -1443,6 +1511,12 @@ static struct dentry *proc_pident_lookup(struct inode *dir, inode->i_op = &proc_mem_inode_operations; inode->i_fop = &proc_mem_operations; break; +#ifdef CONFIG_SECCOMP + case PROC_TID_SECCOMP: + case PROC_TGID_SECCOMP: + inode->i_fop = &proc_seccomp_operations; + break; +#endif /* CONFIG_SECCOMP */ case PROC_TID_MOUNTS: case PROC_TGID_MOUNTS: inode->i_fop = &proc_mounts_operations; diff --git a/include/asm-i386/thread_info.h b/include/asm-i386/thread_info.h index de75216b624b..2cd57271801d 100644 --- a/include/asm-i386/thread_info.h +++ b/include/asm-i386/thread_info.h @@ -140,6 +140,7 @@ register unsigned long current_stack_pointer asm("esp") __attribute_used__; #define TIF_SINGLESTEP 4 /* restore singlestep on return to user mode */ #define TIF_IRET 5 /* return with iret */ #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ +#define TIF_SECCOMP 8 /* secure computing */ #define TIF_POLLING_NRFLAG 16 /* true if poll_idle() is polling TIF_NEED_RESCHED */ #define TIF_MEMDIE 17 @@ -150,12 +151,14 @@ register unsigned long current_stack_pointer asm("esp") __attribute_used__; #define _TIF_SINGLESTEP (1< #include #include +#include struct exec_domain; @@ -643,6 +644,7 @@ struct task_struct { void *security; struct audit_context *audit_context; + seccomp_t seccomp; /* Thread group tracking */ u32 parent_exec_id; diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h new file mode 100644 index 000000000000..ee989b6ee22a --- /dev/null +++ b/include/linux/seccomp.h @@ 
-0,0 +1,33 @@ +#ifndef _LINUX_SECCOMP_H +#define _LINUX_SECCOMP_H + +#include + +#ifdef CONFIG_SECCOMP + +#define NR_SECCOMP_MODES 1 + +#include + +typedef struct { int mode; } seccomp_t; + +extern void __secure_computing(int); +static inline void secure_computing(int this_syscall) +{ + if (unlikely(test_thread_flag(TIF_SECCOMP))) + __secure_computing(this_syscall); +} + +#else /* CONFIG_SECCOMP */ + +#if (__GNUC__ > 2) + typedef struct { } seccomp_t; +#else + typedef struct { int gcc_is_buggy; } seccomp_t; +#endif + +#define secure_computing(x) do { } while (0) + +#endif /* CONFIG_SECCOMP */ + +#endif /* _LINUX_SECCOMP_H */ diff --git a/kernel/Makefile b/kernel/Makefile index db7281f1f087..d680ace0fdda 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -26,6 +26,7 @@ obj-$(CONFIG_AUDITSYSCALL) += auditsc.o obj-$(CONFIG_KPROBES) += kprobes.o obj-$(CONFIG_SYSFS) += ksysfs.o obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ +obj-$(CONFIG_SECCOMP) += seccomp.o ifneq ($(CONFIG_IA64),y) # According to Alan Modra , the -fno-omit-frame-pointer is diff --git a/kernel/seccomp.c b/kernel/seccomp.c new file mode 100644 index 000000000000..b6c5b35c737c --- /dev/null +++ b/kernel/seccomp.c @@ -0,0 +1,74 @@ +/* + * linux/kernel/seccomp.c + * + * Copyright 2004-2005 Andrea Arcangeli + * + * This defines a simple but solid secure-computing mode. + */ + +#include +#include +#include +#ifdef TIF_IA32 +#include +#endif + +/* #define SECCOMP_DEBUG 1 */ + +/* + * Secure computing mode 1 allows only read/write/exit/sigreturn. + * To be fully secure this must be combined with rlimit + * to limit the stack allocations too. + */ +static int mode1_syscalls[] = { + __NR_read, __NR_write, __NR_exit, + /* + * Allow either sigreturn or rt_sigreturn, newer archs + * like x86-64 only defines __NR_rt_sigreturn. 
+ */ +#ifdef __NR_sigreturn + __NR_sigreturn, +#else + __NR_rt_sigreturn, +#endif + 0, /* null terminated */ +}; + +#ifdef TIF_IA32 +static int mode1_syscalls_32bit[] = { + __NR_ia32_read, __NR_ia32_write, __NR_ia32_exit, + /* + * Allow either sigreturn or rt_sigreturn, newer archs + * like x86-64 only defines __NR_rt_sigreturn. + */ + __NR_ia32_sigreturn, + 0, /* null terminated */ +}; +#endif + +void __secure_computing(int this_syscall) +{ + int mode = current->seccomp.mode; + int * syscall; + + switch (mode) { + case 1: + syscall = mode1_syscalls; +#ifdef TIF_IA32 + if (test_thread_flag(TIF_IA32)) + syscall = mode1_syscalls_32bit; +#endif + do { + if (*syscall == this_syscall) + return; + } while (*++syscall); + break; + default: + BUG(); + } + +#ifdef SECCOMP_DEBUG + dump_stack(); +#endif + do_exit(SIGKILL); +} -- cgit v1.2.3 From a61faa62c497867e2e5ceb23f699b2a911338260 Mon Sep 17 00:00:00 2001 From: Badari Pulavarty Date: Mon, 7 Mar 2005 17:56:54 -0800 Subject: [PATCH] Add nobh_writepage() support Add nobh_wripage() support for the filesystems which uses nobh_prepare_write/nobh_commit_write(). Idea here is to reduce unnecessary bufferhead creation/attachment to the page through pageout()->block_write_full_page(). nobh_wripage() tries to operate by directly creating bios, but it falls back to __block_write_full_page() if it can't make progress. Note that this is not really generic routine and can't be used for filesystems which uses page->Private for anything other than buffer heads. 
Signed-off-by: Badari Pulavarty Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/buffer.c | 56 +++++++++++++++++++++++++++++++++++++++++++++ fs/ext2/inode.c | 8 ++++++- fs/jfs/inode.c | 2 +- fs/mpage.c | 34 +++++++++++++++++++++++---- include/linux/buffer_head.h | 3 +++ include/linux/mpage.h | 3 +++ 6 files changed, 99 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/fs/buffer.c b/fs/buffer.c index ed6458f00d64..470c28a6c946 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -39,6 +39,7 @@ #include #include #include +#include static int fsync_buffers_list(spinlock_t *lock, struct list_head *list); static void invalidate_bh_lrus(void); @@ -2508,6 +2509,61 @@ int nobh_commit_write(struct file *file, struct page *page, } EXPORT_SYMBOL(nobh_commit_write); +/* + * nobh_writepage() - based on block_full_write_page() except + * that it tries to operate without attaching bufferheads to + * the page. + */ +int nobh_writepage(struct page *page, get_block_t *get_block, + struct writeback_control *wbc) +{ + struct inode * const inode = page->mapping->host; + loff_t i_size = i_size_read(inode); + const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; + unsigned offset; + void *kaddr; + int ret; + + /* Is the page fully inside i_size? */ + if (page->index < end_index) + goto out; + + /* Is the page fully outside i_size? (truncate in progress) */ + offset = i_size & (PAGE_CACHE_SIZE-1); + if (page->index >= end_index+1 || !offset) { + /* + * The page may have dirty, unmapped buffers. For example, + * they may have been added in ext3_writepage(). Make them + * freeable here, so the page does not leak. + */ +#if 0 + /* Not really sure about this - do we need this ? */ + if (page->mapping->a_ops->invalidatepage) + page->mapping->a_ops->invalidatepage(page, offset); +#endif + unlock_page(page); + return 0; /* don't care */ + } + + /* + * The page straddles i_size. 
It must be zeroed out on each and every + * writepage invocation because it may be mmapped. "A file is mapped + * in multiples of the page size. For a file that is not a multiple of + * the page size, the remaining memory is zeroed when mapped, and + * writes to that region are not written out to the file." + */ + kaddr = kmap_atomic(page, KM_USER0); + memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset); + flush_dcache_page(page); + kunmap_atomic(kaddr, KM_USER0); +out: + ret = mpage_writepage(page, get_block, wbc); + if (ret == -EAGAIN) + ret = __block_write_full_page(inode, page, get_block, wbc); + return ret; +} +EXPORT_SYMBOL(nobh_writepage); + /* * This function assumes that ->prepare_write() uses nobh_prepare_write(). */ diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index e10be1afeacb..b890be022496 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -626,6 +626,12 @@ ext2_nobh_prepare_write(struct file *file, struct page *page, return nobh_prepare_write(page,from,to,ext2_get_block); } +static int ext2_nobh_writepage(struct page *page, + struct writeback_control *wbc) +{ + return nobh_writepage(page, ext2_get_block, wbc); +} + static sector_t ext2_bmap(struct address_space *mapping, sector_t block) { return generic_block_bmap(mapping,block,ext2_get_block); @@ -675,7 +681,7 @@ struct address_space_operations ext2_aops = { struct address_space_operations ext2_nobh_aops = { .readpage = ext2_readpage, .readpages = ext2_readpages, - .writepage = ext2_writepage, + .writepage = ext2_nobh_writepage, .sync_page = block_sync_page, .prepare_write = ext2_nobh_prepare_write, .commit_write = nobh_commit_write, diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index 69e397ef94f6..af7242f953ad 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -286,7 +286,7 @@ static int jfs_get_block(struct inode *ip, sector_t lblock, static int jfs_writepage(struct page *page, struct writeback_control *wbc) { - return block_write_full_page(page, jfs_get_block, wbc); + return 
nobh_writepage(page, jfs_get_block, wbc); } static int jfs_writepages(struct address_space *mapping, diff --git a/fs/mpage.c b/fs/mpage.c index 4bbf15ee91fe..d6161b0258e1 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -386,8 +386,9 @@ EXPORT_SYMBOL(mpage_readpage); * just allocate full-size (16-page) BIOs. */ static struct bio * -mpage_writepage(struct bio *bio, struct page *page, get_block_t get_block, - sector_t *last_block_in_bio, int *ret, struct writeback_control *wbc) +__mpage_writepage(struct bio *bio, struct page *page, get_block_t get_block, + sector_t *last_block_in_bio, int *ret, struct writeback_control *wbc, + writepage_t writepage_fn) { struct address_space *mapping = page->mapping; struct inode *inode = page->mapping->host; @@ -580,7 +581,13 @@ alloc_new: confused: if (bio) bio = mpage_bio_submit(WRITE, bio); - *ret = page->mapping->a_ops->writepage(page, wbc); + + if (writepage_fn) { + *ret = (*writepage_fn)(page, wbc); + } else { + *ret = -EAGAIN; + goto out; + } /* * The caller has a ref on the inode, so *mapping is stable */ @@ -706,8 +713,9 @@ retry: &mapping->flags); } } else { - bio = mpage_writepage(bio, page, get_block, - &last_block_in_bio, &ret, wbc); + bio = __mpage_writepage(bio, page, get_block, + &last_block_in_bio, &ret, wbc, + page->mapping->a_ops->writepage); } if (ret || (--(wbc->nr_to_write) <= 0)) done = 1; @@ -735,3 +743,19 @@ retry: return ret; } EXPORT_SYMBOL(mpage_writepages); + +int mpage_writepage(struct page *page, get_block_t get_block, + struct writeback_control *wbc) +{ + int ret = 0; + struct bio *bio; + sector_t last_block_in_bio = 0; + + bio = __mpage_writepage(NULL, page, get_block, + &last_block_in_bio, &ret, wbc, NULL); + if (bio) + mpage_bio_submit(WRITE, bio); + + return ret; +} +EXPORT_SYMBOL(mpage_writepage); diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index f388b513df75..e0d70070fabc 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -203,6 +203,9 @@ int 
file_fsync(struct file *, struct dentry *, int); int nobh_prepare_write(struct page*, unsigned, unsigned, get_block_t*); int nobh_commit_write(struct file *, struct page *, unsigned, unsigned); int nobh_truncate_page(struct address_space *, loff_t); +int nobh_writepage(struct page *page, get_block_t *get_block, + struct writeback_control *wbc); + /* * inline definitions diff --git a/include/linux/mpage.h b/include/linux/mpage.h index 86aa7b676274..3ca880463c47 100644 --- a/include/linux/mpage.h +++ b/include/linux/mpage.h @@ -11,12 +11,15 @@ */ struct writeback_control; +typedef int (writepage_t)(struct page *page, struct writeback_control *wbc); int mpage_readpages(struct address_space *mapping, struct list_head *pages, unsigned nr_pages, get_block_t get_block); int mpage_readpage(struct page *page, get_block_t get_block); int mpage_writepages(struct address_space *mapping, struct writeback_control *wbc, get_block_t get_block); +int mpage_writepage(struct page *page, get_block_t *get_block, + struct writeback_control *wbc); static inline int generic_writepages(struct address_space *mapping, struct writeback_control *wbc) -- cgit v1.2.3 From a5f17cb26b56417566cfba44fde34887601f4eba Mon Sep 17 00:00:00 2001 From: Corey Minyard Date: Mon, 7 Mar 2005 17:59:24 -0800 Subject: [PATCH] Minor cleanups to the IPMI driver This patch cleans up the DMI handling so that multiple interfaces can be reported from the DMI tables and so that the DMI slave address can be transferred up to the upper layer. It also adds an option to specify the slave address as an init parm and removes some unnecessary initializers. This patch also adds inc/dec usecount functions for the SMIs so they can modify the usecounts of modules they use (added because the SMB driver uses the I2C code). 
Signed-off-by: Corey Minyard Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/IPMI.txt | 6 +++ drivers/char/ipmi/ipmi_msghandler.c | 32 +++++++++--- drivers/char/ipmi/ipmi_si_intf.c | 100 ++++++++++++++++++++++-------------- include/linux/ipmi_smi.h | 11 +++- 4 files changed, 103 insertions(+), 46 deletions(-) (limited to 'include/linux') diff --git a/Documentation/IPMI.txt b/Documentation/IPMI.txt index d6dcb2769e0c..90d10e708ca3 100644 --- a/Documentation/IPMI.txt +++ b/Documentation/IPMI.txt @@ -342,6 +342,7 @@ You can change this at module load time (for a module) with: irqs=,... trydefaults=[0|1] regspacings=,,... regsizes=,,... regshifts=,,... + slave_addrs=,,... Each of these except si_trydefaults is a list, the first item for the first interface, second item for the second interface, etc. @@ -383,6 +384,10 @@ Since the register size may be larger than 32 bits, the IPMI data may not be in the lower 8 bits. The regshifts parameter give the amount to shift the data to get to the actual IPMI data. +The slave_addrs specifies the IPMI address of the local BMC. This is +usually 0x20 and the driver defaults to that, but in case it's not, it +can be specified when the driver starts up. + When compiled into the kernel, the addresses can be specified on the kernel command line as: @@ -392,6 +397,7 @@ kernel command line as: ipmi_si.regspacings=,,... ipmi_si.regsizes=,,... ipmi_si.regshifts=,,... + ipmi_si.slave_addrs=,,... It works the same as the module parameters of the same names. diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c index fe02300e621c..a6606a1aced7 100644 --- a/drivers/char/ipmi/ipmi_msghandler.c +++ b/drivers/char/ipmi/ipmi_msghandler.c @@ -612,6 +612,7 @@ int ipmi_create_user(unsigned int if_num, unsigned long flags; ipmi_user_t new_user; int rv = 0; + ipmi_smi_t intf; /* There is no module usecount here, because it's not required. 
Since this can only be used by and called from @@ -646,19 +647,29 @@ int ipmi_create_user(unsigned int if_num, goto out_unlock; } + intf = ipmi_interfaces[if_num]; + new_user->handler = handler; new_user->handler_data = handler_data; - new_user->intf = ipmi_interfaces[if_num]; + new_user->intf = intf; new_user->gets_events = 0; - if (!try_module_get(new_user->intf->handlers->owner)) { + if (!try_module_get(intf->handlers->owner)) { rv = -ENODEV; goto out_unlock; } - write_lock_irqsave(&new_user->intf->users_lock, flags); - list_add_tail(&new_user->link, &new_user->intf->users); - write_unlock_irqrestore(&new_user->intf->users_lock, flags); + if (intf->handlers->inc_usecount) { + rv = intf->handlers->inc_usecount(intf->send_info); + if (rv) { + module_put(intf->handlers->owner); + goto out_unlock; + } + } + + write_lock_irqsave(&intf->users_lock, flags); + list_add_tail(&new_user->link, &intf->users); + write_unlock_irqrestore(&intf->users_lock, flags); out_unlock: if (rv) { @@ -729,8 +740,11 @@ int ipmi_destroy_user(ipmi_user_t user) down_read(&interfaces_sem); write_lock_irqsave(&intf->users_lock, flags); rv = ipmi_destroy_user_nolock(user); - if (!rv) + if (!rv) { module_put(intf->handlers->owner); + if (intf->handlers->dec_usecount) + intf->handlers->dec_usecount(intf->send_info); + } write_unlock_irqrestore(&intf->users_lock, flags); up_read(&interfaces_sem); @@ -1629,6 +1643,7 @@ int ipmi_register_smi(struct ipmi_smi_handlers *handlers, void *send_info, unsigned char version_major, unsigned char version_minor, + unsigned char slave_addr, ipmi_smi_t *intf) { int i, j; @@ -1664,7 +1679,10 @@ int ipmi_register_smi(struct ipmi_smi_handlers *handlers, new_intf->intf_num = i; new_intf->version_major = version_major; new_intf->version_minor = version_minor; - new_intf->my_address = IPMI_BMC_SLAVE_ADDR; + if (slave_addr == 0) + new_intf->my_address = IPMI_BMC_SLAVE_ADDR; + else + new_intf->my_address = slave_addr; new_intf->my_lun = 2; /* the SMS LUN. 
*/ rwlock_init(&(new_intf->users_lock)); INIT_LIST_HEAD(&(new_intf->users)); diff --git a/drivers/char/ipmi/ipmi_si_intf.c b/drivers/char/ipmi/ipmi_si_intf.c index 82f0ed43ea32..2b5d0e084b17 100644 --- a/drivers/char/ipmi/ipmi_si_intf.c +++ b/drivers/char/ipmi/ipmi_si_intf.c @@ -176,6 +176,9 @@ struct smi_info unsigned char ipmi_version_major; unsigned char ipmi_version_minor; + /* Slave address, could be reported from DMI. */ + unsigned char slave_addr; + /* Counters and things for the proc filesystem. */ spinlock_t count_lock; unsigned long short_timeouts; @@ -407,7 +410,7 @@ static void handle_transaction_done(struct smi_info *smi_info) /* Error fetching flags, just give up for now. */ smi_info->si_state = SI_NORMAL; - } else if (len < 3) { + } else if (len < 4) { /* Hmm, no flags. That's technically illegal, but don't use uninitialized data. */ smi_info->si_state = SI_NORMAL; @@ -897,21 +900,23 @@ static struct smi_info *smi_infos[SI_MAX_DRIVERS] = #define DEFAULT_REGSPACING 1 static int si_trydefaults = 1; -static char *si_type[SI_MAX_PARMS] = { NULL, NULL, NULL, NULL }; +static char *si_type[SI_MAX_PARMS]; #define MAX_SI_TYPE_STR 30 static char si_type_str[MAX_SI_TYPE_STR]; -static unsigned long addrs[SI_MAX_PARMS] = { 0, 0, 0, 0 }; -static int num_addrs = 0; -static unsigned int ports[SI_MAX_PARMS] = { 0, 0, 0, 0 }; -static int num_ports = 0; -static int irqs[SI_MAX_PARMS] = { 0, 0, 0, 0 }; -static int num_irqs = 0; -static int regspacings[SI_MAX_PARMS] = { 0, 0, 0, 0 }; +static unsigned long addrs[SI_MAX_PARMS]; +static int num_addrs; +static unsigned int ports[SI_MAX_PARMS]; +static int num_ports; +static int irqs[SI_MAX_PARMS]; +static int num_irqs; +static int regspacings[SI_MAX_PARMS]; static int num_regspacings = 0; -static int regsizes[SI_MAX_PARMS] = { 0, 0, 0, 0 }; +static int regsizes[SI_MAX_PARMS]; static int num_regsizes = 0; -static int regshifts[SI_MAX_PARMS] = { 0, 0, 0, 0 }; +static int regshifts[SI_MAX_PARMS]; static int num_regshifts = 0; 
+static int slave_addrs[SI_MAX_PARMS]; +static int num_slave_addrs = 0; module_param_named(trydefaults, si_trydefaults, bool, 0); @@ -955,6 +960,12 @@ MODULE_PARM_DESC(regshifts, "The amount to shift the data read from the." " IPMI register, in bits. For instance, if the data" " is read from a 32-bit word and the IPMI data is in" " bit 8-15, then the shift would be 8"); +module_param_array(slave_addrs, int, &num_slave_addrs, 0); +MODULE_PARM_DESC(slave_addrs, "Set the default IPMB slave address for" + " the controller. Normally this is 0x20, but can be" + " overridden by this parm. This is an array indexed" + " by interface number."); + #define IPMI_MEM_ADDR_SPACE 1 #define IPMI_IO_ADDR_SPACE 2 @@ -1542,7 +1553,6 @@ static int try_init_acpi(int intf_num, struct smi_info **new_info) #endif #ifdef CONFIG_X86 - typedef struct dmi_ipmi_data { u8 type; @@ -1550,21 +1560,26 @@ typedef struct dmi_ipmi_data unsigned long base_addr; u8 irq; u8 offset; -}dmi_ipmi_data_t; + u8 slave_addr; +} dmi_ipmi_data_t; + +static dmi_ipmi_data_t dmi_data[SI_MAX_DRIVERS]; +static int dmi_data_entries; typedef struct dmi_header { u8 type; u8 length; u16 handle; -}dmi_header_t; +} dmi_header_t; -static int decode_dmi(dmi_header_t *dm, dmi_ipmi_data_t *ipmi_data) +static int decode_dmi(dmi_header_t *dm, int intf_num) { u8 *data = (u8 *)dm; unsigned long base_addr; u8 reg_spacing; u8 len = dm->length; + dmi_ipmi_data_t *ipmi_data = dmi_data+intf_num; ipmi_data->type = data[4]; @@ -1608,22 +1623,26 @@ static int decode_dmi(dmi_header_t *dm, dmi_ipmi_data_t *ipmi_data) ipmi_data->offset = 1; } - if (is_new_interface(-1, ipmi_data->addr_space,ipmi_data->base_addr)) + ipmi_data->slave_addr = data[6]; + + if (is_new_interface(-1, ipmi_data->addr_space,ipmi_data->base_addr)) { + dmi_data_entries++; return 0; + } memset(ipmi_data, 0, sizeof(dmi_ipmi_data_t)); return -1; } -static int dmi_table(u32 base, int len, int num, - dmi_ipmi_data_t *ipmi_data) +static int dmi_table(u32 base, int len, int num) 
{ u8 *buf; struct dmi_header *dm; u8 *data; int i=1; int status=-1; + int intf_num = 0; buf = ioremap(base, len); if(buf==NULL) @@ -1639,9 +1658,10 @@ static int dmi_table(u32 base, int len, int num, break; if (dm->type == 38) { - if (decode_dmi(dm, ipmi_data) == 0) { - status = 0; - break; + if (decode_dmi(dm, intf_num) == 0) { + intf_num++; + if (intf_num >= SI_MAX_DRIVERS) + break; } } @@ -1666,7 +1686,7 @@ inline static int dmi_checksum(u8 *buf) return (sum==0); } -static int dmi_iterator(dmi_ipmi_data_t *ipmi_data) +static int dmi_decode(void) { u8 buf[15]; u32 fp=0xF0000; @@ -1684,7 +1704,7 @@ static int dmi_iterator(dmi_ipmi_data_t *ipmi_data) u16 len=buf[7]<<8|buf[6]; u32 base=buf[11]<<24|buf[10]<<16|buf[9]<<8|buf[8]; - if(dmi_table(base, len, num, ipmi_data) == 0) + if(dmi_table(base, len, num) == 0) return 0; } fp+=16; @@ -1696,16 +1716,13 @@ static int dmi_iterator(dmi_ipmi_data_t *ipmi_data) static int try_init_smbios(int intf_num, struct smi_info **new_info) { struct smi_info *info; - dmi_ipmi_data_t ipmi_data; + dmi_ipmi_data_t *ipmi_data = dmi_data+intf_num; char *io_type; - int status; - - status = dmi_iterator(&ipmi_data); - if (status < 0) + if (intf_num >= dmi_data_entries) return -ENODEV; - switch(ipmi_data.type) { + switch(ipmi_data->type) { case 0x01: /* KCS */ si_type[intf_num] = "kcs"; break; @@ -1716,7 +1733,6 @@ static int try_init_smbios(int intf_num, struct smi_info **new_info) si_type[intf_num] = "bt"; break; default: - printk("ipmi_si: Unknown SMBIOS SI type.\n"); return -EIO; } @@ -1727,15 +1743,15 @@ static int try_init_smbios(int intf_num, struct smi_info **new_info) } memset(info, 0, sizeof(*info)); - if (ipmi_data.addr_space == 1) { + if (ipmi_data->addr_space == 1) { io_type = "memory"; info->io_setup = mem_setup; - addrs[intf_num] = ipmi_data.base_addr; + addrs[intf_num] = ipmi_data->base_addr; info->io.info = &(addrs[intf_num]); - } else if (ipmi_data.addr_space == 2) { + } else if (ipmi_data->addr_space == 2) { io_type = 
"I/O"; info->io_setup = port_setup; - ports[intf_num] = ipmi_data.base_addr; + ports[intf_num] = ipmi_data->base_addr; info->io.info = &(ports[intf_num]); } else { kfree(info); @@ -1743,20 +1759,23 @@ static int try_init_smbios(int intf_num, struct smi_info **new_info) return -EIO; } - regspacings[intf_num] = ipmi_data.offset; + regspacings[intf_num] = ipmi_data->offset; info->io.regspacing = regspacings[intf_num]; if (!info->io.regspacing) info->io.regspacing = DEFAULT_REGSPACING; info->io.regsize = DEFAULT_REGSPACING; info->io.regshift = regshifts[intf_num]; - irqs[intf_num] = ipmi_data.irq; + info->slave_addr = ipmi_data->slave_addr; + + irqs[intf_num] = ipmi_data->irq; *new_info = info; printk("ipmi_si: Found SMBIOS-specified state machine at %s" - " address 0x%lx\n", - io_type, (unsigned long)ipmi_data.base_addr); + " address 0x%lx, slave address 0x%x\n", + io_type, (unsigned long)ipmi_data->base_addr, + ipmi_data->slave_addr); return 0; } #endif /* CONFIG_X86 */ @@ -2121,6 +2140,7 @@ static int init_one_smi(int intf_num, struct smi_info **smi) new_smi, new_smi->ipmi_version_major, new_smi->ipmi_version_minor, + new_smi->slave_addr, &(new_smi->intf)); if (rv) { printk(KERN_ERR @@ -2222,6 +2242,10 @@ static __init int init_ipmi_si(void) printk(", BT version %s", bt_smi_handlers.version); printk("\n"); +#ifdef CONFIG_X86 + dmi_decode(); +#endif + rv = init_one_smi(0, &(smi_infos[pos])); if (rv && !ports[0] && si_trydefaults) { /* If we are trying defaults and the initial port is diff --git a/include/linux/ipmi_smi.h b/include/linux/ipmi_smi.h index 88dd37439078..e36ee157ad67 100644 --- a/include/linux/ipmi_smi.h +++ b/include/linux/ipmi_smi.h @@ -104,13 +104,22 @@ struct ipmi_smi_handlers /* Called to poll for work to do. This is so upper layers can poll for operations during things like crash dumps. */ void (*poll)(void *send_info); + + /* Tell the handler that we are using it/not using it. 
The + message handler get the modules that this handler belongs + to; this function lets the SMI claim any modules that it + uses. These may be NULL if this is not required. */ + int (*inc_usecount)(void *send_info); + void (*dec_usecount)(void *send_info); }; -/* Add a low-level interface to the IPMI driver. */ +/* Add a low-level interface to the IPMI driver. Note that if the + interface doesn't know its slave address, it should pass in zero. */ int ipmi_register_smi(struct ipmi_smi_handlers *handlers, void *send_info, unsigned char version_major, unsigned char version_minor, + unsigned char slave_addr, ipmi_smi_t *intf); /* -- cgit v1.2.3 From 90e500696af001ca0255bc838b085166521a3bb3 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 7 Mar 2005 17:59:39 -0800 Subject: [PATCH] Quotactl changes for XFS Attached patch from Nathan splits the checks done in quotactl() in XFS and VFS parts (it's mostly just moving of code back and forth). It's done mainly because XFS guys would like to implement more types of quotas and I don't want them to slow down the general VFS case. 
Signed-off-by: Nathan Scott Signed-off-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/quota.c | 85 +++++++++++++++++++++++++++++++++++------------ include/linux/dqblk_xfs.h | 6 ++++ 2 files changed, 70 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/fs/quota.c b/fs/quota.c index d59ccf2358b5..3f0333a51a23 100644 --- a/fs/quota.c +++ b/fs/quota.c @@ -16,8 +16,8 @@ #include #include -/* Check validity of quotactl */ -static int check_quotactl_valid(struct super_block *sb, int type, int cmd, qid_t id) +/* Check validity of generic quotactl commands */ +static int generic_quotactl_valid(struct super_block *sb, int type, int cmd, qid_t id) { if (type >= MAXQUOTAS) return -EINVAL; @@ -58,6 +58,48 @@ static int check_quotactl_valid(struct super_block *sb, int type, int cmd, qid_t if (sb && !sb->s_qcop->quota_sync) return -ENOSYS; break; + default: + return -EINVAL; + } + + /* Is quota turned on for commands which need it? */ + switch (cmd) { + case Q_GETFMT: + case Q_GETINFO: + case Q_QUOTAOFF: + case Q_SETINFO: + case Q_SETQUOTA: + case Q_GETQUOTA: + /* This is just informative test so we are satisfied without a lock */ + if (!sb_has_quota_enabled(sb, type)) + return -ESRCH; + } + + /* Check privileges */ + if (cmd == Q_GETQUOTA) { + if (((type == USRQUOTA && current->euid != id) || + (type == GRPQUOTA && !in_egroup_p(id))) && + !capable(CAP_SYS_ADMIN)) + return -EPERM; + } + else if (cmd != Q_GETFMT && cmd != Q_SYNC && cmd != Q_GETINFO) + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + return 0; +} + +/* Check validity of XFS Quota Manager commands */ +static int xqm_quotactl_valid(struct super_block *sb, int type, int cmd, qid_t id) +{ + if (type >= XQM_MAXQUOTAS) + return -EINVAL; + if (!sb) + return -ENODEV; + if (!sb->s_qcop) + return -ENOSYS; + + switch (cmd) { case Q_XQUOTAON: case Q_XQUOTAOFF: case Q_XQUOTARM: @@ -80,30 +122,31 @@ static int check_quotactl_valid(struct super_block *sb, int type, int cmd, 
qid_t return -EINVAL; } - /* Is quota turned on for commands which need it? */ - switch (cmd) { - case Q_GETFMT: - case Q_GETINFO: - case Q_QUOTAOFF: - case Q_SETINFO: - case Q_SETQUOTA: - case Q_GETQUOTA: - /* This is just informative test so we are satisfied without a lock */ - if (!sb_has_quota_enabled(sb, type)) - return -ESRCH; - } /* Check privileges */ - if (cmd == Q_GETQUOTA || cmd == Q_XGETQUOTA) { - if (((type == USRQUOTA && current->euid != id) || - (type == GRPQUOTA && !in_egroup_p(id))) && - !capable(CAP_SYS_ADMIN)) + if (cmd == Q_XGETQUOTA) { + if (((type == XQM_USRQUOTA && current->euid != id) || + (type == XQM_GRPQUOTA && !in_egroup_p(id))) && + !capable(CAP_SYS_ADMIN)) return -EPERM; - } - else if (cmd != Q_GETFMT && cmd != Q_SYNC && cmd != Q_GETINFO && cmd != Q_XGETQSTAT) + } else if (cmd != Q_XGETQSTAT) { if (!capable(CAP_SYS_ADMIN)) return -EPERM; + } + + return 0; +} + +static int check_quotactl_valid(struct super_block *sb, int type, int cmd, qid_t id) +{ + int error; - return security_quotactl (cmd, type, id, sb); + if (XQM_COMMAND(cmd)) + error = xqm_quotactl_valid(sb, type, cmd, id); + else + error = generic_quotactl_valid(sb, type, cmd, id); + if (!error) + error = security_quotactl(cmd, type, id, sb); + return error; } static struct super_block *get_super_to_sync(int type) diff --git a/include/linux/dqblk_xfs.h b/include/linux/dqblk_xfs.h index bf2d65765189..cb31719ee192 100644 --- a/include/linux/dqblk_xfs.h +++ b/include/linux/dqblk_xfs.h @@ -28,6 +28,12 @@ */ #define XQM_CMD(x) (('X'<<8)+(x)) /* note: forms first QCMD argument */ +#define XQM_COMMAND(x) (((x) & (0xff<<8)) == ('X'<<8)) /* test if for XFS */ + +#define XQM_USRQUOTA 0 /* system call user quota type */ +#define XQM_GRPQUOTA 1 /* system call group quota type */ +#define XQM_MAXQUOTAS 2 + #define Q_XQUOTAON XQM_CMD(1) /* enable accounting/enforcement */ #define Q_XQUOTAOFF XQM_CMD(2) /* disable accounting/enforcement */ #define Q_XGETQUOTA XQM_CMD(3) /* get disk limits and 
usage */ -- cgit v1.2.3 From 7ad226a285b751f486b657b2ab16204c2a33e925 Mon Sep 17 00:00:00 2001 From: Matt Mackall Date: Mon, 7 Mar 2005 18:03:40 -0800 Subject: [PATCH] base-small: shrink PID tables CONFIG_BASE_SMALL reduce size of pidmap table for small machines Signed-off-by: Matt Mackall Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/threads.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/threads.h b/include/linux/threads.h index 4243c55cce87..b59738ac6197 100644 --- a/include/linux/threads.h +++ b/include/linux/threads.h @@ -7,7 +7,7 @@ * The default limit for the nr of threads is now in * /proc/sys/kernel/threads-max. */ - + /* * Maximum supported processors that can run under SMP. This value is * set via configure setting. The maximum is equal to the size of the @@ -25,11 +25,12 @@ /* * This controls the default maximum pid allocated to a process */ -#define PID_MAX_DEFAULT 0x8000 +#define PID_MAX_DEFAULT (CONFIG_BASE_SMALL ? 0x1000 : 0x8000) /* * A maximum of 4 million PIDs should be enough for a while: */ -#define PID_MAX_LIMIT (sizeof(long) > 4 ? 4*1024*1024 : PID_MAX_DEFAULT) +#define PID_MAX_LIMIT (CONFIG_BASE_SMALL ? PAGE_SIZE * 8 : \ + (sizeof(long) > 4 ? 
4 * 1024 * 1024 : PID_MAX_DEFAULT)) #endif -- cgit v1.2.3 From 79d22ee86a32cb00096fdad4510dd7656faf401c Mon Sep 17 00:00:00 2001 From: Matt Mackall Date: Mon, 7 Mar 2005 18:04:39 -0800 Subject: [PATCH] base-small: shrink console buffer CONFIG_BASE_SMALL reduce console transfer buffer Signed-off-by: Matt Mackall Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vt_kern.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/vt_kern.h b/include/linux/vt_kern.h index cb18320adb16..dc392a1b58eb 100644 --- a/include/linux/vt_kern.h +++ b/include/linux/vt_kern.h @@ -84,7 +84,8 @@ void reset_vc(struct vc_data *vc); * vc_screen.c shares this temporary buffer with the console write code so that * we can easily avoid touching user space while holding the console spinlock. */ -#define CON_BUF_SIZE PAGE_SIZE + +#define CON_BUF_SIZE (CONFIG_BASE_SMALL ? 256 : PAGE_SIZE) extern char con_buf[CON_BUF_SIZE]; extern struct semaphore con_buf_sem; -- cgit v1.2.3 From 8c63b6d337534a6b5fb111dc27d0850f535118c0 Mon Sep 17 00:00:00 2001 From: Matt Mackall Date: Mon, 7 Mar 2005 18:04:55 -0800 Subject: [PATCH] lib/sort: Heapsort implementation of sort() This patch adds a generic array sorting library routine. This is meant to replace qsort, which has two problem areas for kernel use. The first issue is quadratic worst-case performance. While quicksort worst-case datasets are rarely encountered in normal scenarios, it is in fact quite easy to construct worst cases for almost all quicksort algorithms given source or access to an element comparison callback. This could allow attackers to cause sorts that would otherwise take less than a millisecond to take seconds and sorts that should take less than a second to take weeks or months. Fixing this problem requires randomizing pivot selection with a secure random number generator, which is rather expensive. 
The second is that quicksort's recursion tracking requires either nontrivial amounts of stack space or dynamic memory allocation and out of memory error handling. By comparison, heapsort has both O(n log n) average and worst-case performance and practically no extra storage requirements. This version runs within 70-90% of the average performance of optimized quicksort so it should be an acceptable replacement wherever quicksort would be used in the kernel. Note that this function has an extra parameter for passing in an optimized swapping function. This is worth 10% or more over the typical byte-by-byte exchange functions. Benchmarks: qsort: glibc variant 1189 bytes (+ 256/1024 stack) qsort_3f: my simplified variant 459 bytes (+ 256/1024 stack) heapsort: the version below 346 bytes shellsort: an optimized shellsort 196 bytes P4 1.8GHz Opteron 1.4GHz (32-bit) size algorithm cycles relative cycles relative 100: qsort: 38682 100.00% 27631 100.00% qsort_3f: 36277 106.63% 22406 123.32% heapsort: 43574 88.77% 30301 91.19% shellsort: 39087 98.97% 25139 109.91% 200: qsort: 86468 100.00% 61148 100.00% qsort_3f: 78918 109.57% 48959 124.90% heapsort: 98040 88.20% 68235 89.61% shellsort: 95688 90.36% 62279 98.18% 400: qsort: 187720 100.00% 131313 100.00% qsort_3f: 174905 107.33% 107954 121.64% heapsort: 223896 83.84% 154241 85.13% shellsort: 223037 84.17% 148990 88.14% 800: qsort: 407060 100.00% 287460 100.00% qsort_3f: 385106 105.70% 239131 120.21% heapsort: 484662 83.99% 340099 84.52% shellsort: 537110 75.79% 354755 81.03% 1600: qsort: 879596 100.00% 621331 100.00% qsort_3f: 861568 102.09% 522013 119.03% heapsort: 1079750 81.46% 746677 83.21% shellsort: 1234243 71.27% 820782 75.70% 3200: qsort: 1903902 100.00% 1342126 100.00% qsort_3f: 1908816 99.74% 1131496 118.62% heapsort: 2515493 75.69% 1630333 82.32% shellsort: 2985339 63.78% 1964794 68.31% 6400: qsort: 4046370 100.00% 2909215 100.00% qsort_3f: 4164468 97.16% 2468393 117.86% heapsort: 5150659 78.56% 3533585 82.33% 
shellsort: 6650225 60.85% 4429849 65.67% 12800: qsort: 8729730 100.00% 6185097 100.00% qsort_3f: 8776885 99.46% 5288826 116.95% heapsort: 11064224 78.90% 7603061 81.35% shellsort: 15487905 56.36% 10305163 60.02% 25600: qsort: 18357770 100.00% 13172205 100.00% qsort_3f: 18687842 98.23% 11337115 116.19% heapsort: 24121241 76.11% 16612122 79.29% shellsort: 35552814 51.64% 24106987 54.64% 51200: qsort: 38658883 100.00% 28008505 100.00% qsort_3f: 39498463 97.87% 24339675 115.07% heapsort: 50553552 76.47% 37013828 75.67% shellsort: 82602416 46.80% 56201889 49.84% 102400: qsort: 81197794 100.00% 58918933 100.00% qsort_3f: 84257930 96.37% 51986219 113.34% heapsort: 110540577 73.46% 81419675 72.36% shellsort: 191303132 42.44% 129786472 45.40% From: Zou Nan hai The new sort routine only works if there are an even number of entries in the ia64 exception fix-up tables. If the number of entries is odd the sort fails, and then random get_user/put_user calls can fail. Signed-off-by: Matt Mackall Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sort.h | 10 +++++ init/Kconfig | 10 ++--- lib/Makefile | 2 +- lib/sort.c | 119 +++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 135 insertions(+), 6 deletions(-) create mode 100644 include/linux/sort.h create mode 100644 lib/sort.c (limited to 'include/linux') diff --git a/include/linux/sort.h b/include/linux/sort.h new file mode 100644 index 000000000000..d534da2b5575 --- /dev/null +++ b/include/linux/sort.h @@ -0,0 +1,10 @@ +#ifndef _LINUX_SORT_H +#define _LINUX_SORT_H + +#include + +void sort(void *base, size_t num, size_t size, + int (*cmp)(const void *, const void *), + void (*swap)(void *, void *, int)); + +#endif diff --git a/init/Kconfig b/init/Kconfig index c10ce28651b0..97789f2c3111 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -283,11 +283,6 @@ config BASE_FULL Disabling this option reduces the size of miscellaneous core kernel data structures. 
-config BASE_SMALL - int - default 0 if BASE_FULL - default 1 if !BASE_FULL - config FUTEX bool "Enable futex support" if EMBEDDED default y @@ -370,6 +365,11 @@ config TINY_SHMEM default !SHMEM bool +config BASE_SMALL + int + default 0 if BASE_FULL + default 1 if !BASE_FULL + menu "Loadable module support" config MODULES diff --git a/lib/Makefile b/lib/Makefile index 8cb331ab0c3f..41d58bc1b54b 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -5,7 +5,7 @@ lib-y := errno.o ctype.o string.o vsprintf.o cmdline.o \ bust_spinlocks.o rbtree.o radix-tree.o dump_stack.o \ kobject.o kref.o idr.o div64.o parser.o int_sqrt.o \ - bitmap.o extable.o kobject_uevent.o prio_tree.o + bitmap.o extable.o kobject_uevent.o prio_tree.o sort.o ifeq ($(CONFIG_DEBUG_KOBJECT),y) CFLAGS_kobject.o += -DDEBUG diff --git a/lib/sort.c b/lib/sort.c new file mode 100644 index 000000000000..ea3caedeabdb --- /dev/null +++ b/lib/sort.c @@ -0,0 +1,119 @@ +/* + * A fast, small, non-recursive O(nlog n) sort for the Linux kernel + * + * Jan 23 2005 Matt Mackall + */ + +#include +#include + +void u32_swap(void *a, void *b, int size) +{ + u32 t = *(u32 *)a; + *(u32 *)a = *(u32 *)b; + *(u32 *)b = t; +} + +void generic_swap(void *a, void *b, int size) +{ + char t; + + do { + t = *(char *)a; + *(char *)a++ = *(char *)b; + *(char *)b++ = t; + } while (--size > 0); +} + +/* + * sort - sort an array of elements + * @base: pointer to data to sort + * @num: number of elements + * @size: size of each element + * @cmp: pointer to comparison function + * @swap: pointer to swap function or NULL + * + * This function does a heapsort on the given array. You may provide a + * swap function optimized to your element type. + * + * Sorting time is O(n log n) both on average and worst-case. While + * qsort is about 20% faster on average, it suffers from exploitable + * O(n*n) worst-case behavior and extra memory requirements that make + * it less suitable for kernel use. 
+ */ + +void sort(void *base, size_t num, size_t size, + int (*cmp)(const void *, const void *), + void (*swap)(void *, void *, int size)) +{ + /* pre-scale counters for performance */ + int i = (num/2) * size, n = num * size, c, r; + + if (!swap) + swap = (size == 4 ? u32_swap : generic_swap); + + /* heapify */ + for ( ; i >= 0; i -= size) { + for (r = i; r * 2 < n; r = c) { + c = r * 2; + if (c < n - size && cmp(base + c, base + c + size) < 0) + c += size; + if (cmp(base + r, base + c) >= 0) + break; + swap(base + r, base + c, size); + } + } + + /* sort */ + for (i = n - size; i >= 0; i -= size) { + swap(base, base + i, size); + for (r = 0; r * 2 < i; r = c) { + c = r * 2; + if (c < i - size && cmp(base + c, base + c + size) < 0) + c += size; + if (cmp(base + r, base + c) >= 0) + break; + swap(base + r, base + c, size); + } + } +} + +EXPORT_SYMBOL(sort); + +#if 0 +/* a simple boot-time regression test */ + +int cmpint(const void *a, const void *b) +{ + return *(int *)a - *(int *)b; +} + +static int sort_test(void) +{ + int *a, i, r = 0; + + a = kmalloc(1000 * sizeof(int), GFP_KERNEL); + BUG_ON(!a); + + printk("testing sort()\n"); + + for (i = 0; i < 1000; i++) { + r = (r * 725861) % 6599; + a[i] = r; + } + + sort(a, 1000, sizeof(int), cmpint, NULL); + + for (i = 0; i < 999; i++) + if (a[i] > a[i+1]) { + printk("sort() failed!\n"); + break; + } + + kfree(a); + + return 0; +} + +module_init(sort_test); +#endif -- cgit v1.2.3 From 18684e6e939e136dbbcf0eabcc75d0e9f5e306fa Mon Sep 17 00:00:00 2001 From: Matt Mackall Date: Mon, 7 Mar 2005 18:12:18 -0800 Subject: [PATCH] random: Create new rol32/ror32 bitops Add rol32 and ror32 bitops to bitops.h Signed-off-by: Matt Mackall Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/char/random.c | 5 ----- include/linux/bitops.h | 22 ++++++++++++++++++++++ 2 files changed, 22 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/char/random.c b/drivers/char/random.c index 
cd2b261611bf..59c8c4152296 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -374,11 +374,6 @@ static struct poolinfo { static DECLARE_WAIT_QUEUE_HEAD(random_read_wait); static DECLARE_WAIT_QUEUE_HEAD(random_write_wait); -static inline __u32 rol32(__u32 word, int shift) -{ - return (word << shift) | (word >> (32 - shift)); -} - #if 0 static int debug = 0; module_param(debug, bool, 0644); diff --git a/include/linux/bitops.h b/include/linux/bitops.h index 48f87b979ca9..7d1f8b67c6bf 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -134,4 +134,26 @@ static inline unsigned long hweight_long(unsigned long w) return sizeof(w) == 4 ? generic_hweight32(w) : generic_hweight64(w); } +/* + * rol32 - rotate a 32-bit value left + * + * @word: value to rotate + * @shift: bits to roll + */ +static inline __u32 rol32(__u32 word, int shift) +{ + return (word << shift) | (word >> (32 - shift)); +} + +/* + * ror32 - rotate a 32-bit value right + * + * @word: value to rotate + * @shift: bits to roll + */ +static inline __u32 ror32(__u32 word, int shift) +{ + return (word >> shift) | (word << (32 - shift)); +} + #endif -- cgit v1.2.3 From 1b14bcee300b9d5d54e9093108ad581486dedd28 Mon Sep 17 00:00:00 2001 From: Matt Mackall Date: Mon, 7 Mar 2005 18:13:20 -0800 Subject: [PATCH] random: Move SHA code to lib/ Move random SHA code to lib/. 
Signed-off-by: Matt Mackall Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/char/random.c | 121 ++------------------------------------------- include/linux/cryptohash.h | 10 ++++ lib/Makefile | 2 +- lib/sha1.c | 102 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 118 insertions(+), 117 deletions(-) create mode 100644 include/linux/cryptohash.h create mode 100644 lib/sha1.c (limited to 'include/linux') diff --git a/drivers/char/random.c b/drivers/char/random.c index 95ac584b07d1..d9994a942721 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -218,9 +218,6 @@ * Any flaws in the design are solely my responsibility, and should * not be attributed to the Phil, Colin, or any of authors of PGP. * - * The code for SHA transform was taken from Peter Gutmann's - * implementation, which has been placed in the public domain. - * * Further background information on this topic may be obtained from * RFC 1750, "Randomness Recommendations for Security", by Donald * Eastlake, Steve Crocker, and Jeff Schiller. @@ -242,6 +239,7 @@ #include #include #include +#include #include #include @@ -671,116 +669,7 @@ void add_disk_randomness(struct gendisk *disk) EXPORT_SYMBOL(add_disk_randomness); -#define HASH_BUFFER_SIZE 5 #define EXTRACT_SIZE 10 -#define HASH_EXTRA_SIZE 80 - -/* - * SHA transform algorithm, taken from code written by Peter Gutmann, - * and placed in the public domain. - */ - -/* The SHA f()-functions. */ - -#define f1(x,y,z) (z ^ (x & (y ^ z))) /* Rounds 0-19: x ? 
y : z */ -#define f2(x,y,z) (x ^ y ^ z) /* Rounds 20-39: XOR */ -#define f3(x,y,z) ((x & y) + (z & (x ^ y))) /* Rounds 40-59: majority */ -#define f4(x,y,z) (x ^ y ^ z) /* Rounds 60-79: XOR */ - -/* The SHA Mysterious Constants */ - -#define K1 0x5A827999L /* Rounds 0-19: sqrt(2) * 2^30 */ -#define K2 0x6ED9EBA1L /* Rounds 20-39: sqrt(3) * 2^30 */ -#define K3 0x8F1BBCDCL /* Rounds 40-59: sqrt(5) * 2^30 */ -#define K4 0xCA62C1D6L /* Rounds 60-79: sqrt(10) * 2^30 */ - -/* - * sha_transform: single block SHA1 transform - * - * @digest: 160 bit digest to update - * @data: 512 bytes of data to hash - * @W: 80 words of workspace - * - * This function generates a SHA1 digest for a single. Be warned, it - * does not handle padding and message digest, do not confuse it with - * the full FIPS 180-1 digest algorithm for variable length messages. - */ -static void sha_transform(__u32 digest[5], const char *data, __u32 W[80]) -{ - __u32 A, B, C, D, E; - __u32 TEMP; - int i; - - memset(W, 0, sizeof(W)); - for (i = 0; i < 16; i++) - W[i] = be32_to_cpu(((const __u32 *)data)[i]); - /* - * Do the preliminary expansion of 16 to 80 words. Doing it - * out-of-line line this is faster than doing it in-line on - * register-starved machines like the x86, and not really any - * slower on real processors. - */ - for (i = 0; i < 64; i++) { - TEMP = W[i] ^ W[i+2] ^ W[i+8] ^ W[i+13]; - W[i+16] = rol32(TEMP, 1); - } - - /* Set up first buffer and local data buffer */ - A = digest[ 0 ]; - B = digest[ 1 ]; - C = digest[ 2 ]; - D = digest[ 3 ]; - E = digest[ 4 ]; - - /* Heavy mangling, in 4 sub-rounds of 20 iterations each. 
*/ - for (i = 0; i < 80; i++) { - if (i < 40) { - if (i < 20) - TEMP = f1(B, C, D) + K1; - else - TEMP = f2(B, C, D) + K2; - } else { - if (i < 60) - TEMP = f3(B, C, D) + K3; - else - TEMP = f4(B, C, D) + K4; - } - TEMP += rol32(A, 5) + E + W[i]; - E = D; D = C; C = rol32(B, 30); B = A; A = TEMP; - } - - /* Build message digest */ - digest[0] += A; - digest[1] += B; - digest[2] += C; - digest[3] += D; - digest[4] += E; - - /* W is wiped by the caller */ -} - -#undef f1 -#undef f2 -#undef f3 -#undef f4 -#undef K1 -#undef K2 -#undef K3 -#undef K4 - -/* - * sha_init: initialize the vectors for a SHA1 digest - * - * @buf: vector to initialize - */ -static void sha_init(__u32 *buf) -{ - buf[0] = 0x67452301; - buf[1] = 0xefcdab89; - buf[2] = 0x98badcfe; - buf[3] = 0x10325476; - buf[4] = 0xc3d2e1f0; -} /********************************************************************* * @@ -870,7 +759,7 @@ static size_t account(struct entropy_store *r, size_t nbytes, int min, static void extract_buf(struct entropy_store *r, __u8 *out) { int i, x; - __u32 data[16], buf[85]; + __u32 data[16], buf[5 + SHA_WORKSPACE_WORDS]; sha_init(buf); /* @@ -1754,12 +1643,12 @@ EXPORT_SYMBOL(secure_tcpv6_port_ephemeral); #define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1) static int syncookie_init; -static __u32 syncookie_secret[2][16-3+HASH_BUFFER_SIZE]; +static __u32 syncookie_secret[2][16-3+SHA_DIGEST_WORDS]; __u32 secure_tcp_syn_cookie(__u32 saddr, __u32 daddr, __u16 sport, __u16 dport, __u32 sseq, __u32 count, __u32 data) { - __u32 tmp[16 + HASH_BUFFER_SIZE + HASH_EXTRA_SIZE]; + __u32 tmp[16 + 5 + SHA_WORKSPACE_WORDS]; __u32 seq; /* @@ -1811,7 +1700,7 @@ __u32 secure_tcp_syn_cookie(__u32 saddr, __u32 daddr, __u16 sport, __u32 check_tcp_syn_cookie(__u32 cookie, __u32 saddr, __u32 daddr, __u16 sport, __u16 dport, __u32 sseq, __u32 count, __u32 maxdiff) { - __u32 tmp[16 + HASH_BUFFER_SIZE + HASH_EXTRA_SIZE]; + __u32 tmp[16 + 5 + SHA_WORKSPACE_WORDS]; __u32 diff; if (syncookie_init == 0) diff --git 
a/include/linux/cryptohash.h b/include/linux/cryptohash.h new file mode 100644 index 000000000000..50e6fa516ca6 --- /dev/null +++ b/include/linux/cryptohash.h @@ -0,0 +1,10 @@ +#ifndef __CRYPTOHASH_H +#define __CRYPTOHASH_H + +#define SHA_DIGEST_WORDS 5 +#define SHA_WORKSPACE_WORDS 80 + +void sha_init(__u32 *buf); +void sha_transform(__u32 *digest, const char *data, __u32 *W); + +#endif diff --git a/lib/Makefile b/lib/Makefile index a69f5092a864..0fb5cd38b779 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -5,7 +5,7 @@ lib-y := errno.o ctype.o string.o vsprintf.o cmdline.o \ bust_spinlocks.o rbtree.o radix-tree.o dump_stack.o \ kobject.o kref.o idr.o div64.o parser.o int_sqrt.o \ - bitmap.o extable.o kobject_uevent.o prio_tree.o + bitmap.o extable.o kobject_uevent.o prio_tree.o sha1.o obj-y += sort.o diff --git a/lib/sha1.c b/lib/sha1.c new file mode 100644 index 000000000000..88df510599a9 --- /dev/null +++ b/lib/sha1.c @@ -0,0 +1,102 @@ +/* + * SHA transform algorithm, taken from code written by Peter Gutmann, + * and placed in the public domain. + */ + +#include +#include +#include + +/* The SHA f()-functions. */ + +#define f1(x,y,z) (z ^ (x & (y ^ z))) /* Rounds 0-19: x ? y : z */ +#define f2(x,y,z) (x ^ y ^ z) /* Rounds 20-39: XOR */ +#define f3(x,y,z) ((x & y) + (z & (x ^ y))) /* Rounds 40-59: majority */ +#define f4(x,y,z) (x ^ y ^ z) /* Rounds 60-79: XOR */ + +/* The SHA Mysterious Constants */ + +#define K1 0x5A827999L /* Rounds 0-19: sqrt(2) * 2^30 */ +#define K2 0x6ED9EBA1L /* Rounds 20-39: sqrt(3) * 2^30 */ +#define K3 0x8F1BBCDCL /* Rounds 40-59: sqrt(5) * 2^30 */ +#define K4 0xCA62C1D6L /* Rounds 60-79: sqrt(10) * 2^30 */ + +/* + * sha_transform: single block SHA1 transform + * + * @digest: 160 bit digest to update + * @data: 512 bits of data to hash + * @W: 80 words of workspace + * + * This function generates a SHA1 digest for a single. 
Be warned, it + * does not handle padding and message digest, do not confuse it with + * the full FIPS 180-1 digest algorithm for variable length messages. + */ +void sha_transform(__u32 *digest, const char *data, __u32 *W) +{ + __u32 A, B, C, D, E; + __u32 TEMP; + int i; + + memset(W, 0, sizeof(W)); + for (i = 0; i < 16; i++) + W[i] = be32_to_cpu(((const __u32 *)data)[i]); + /* + * Do the preliminary expansion of 16 to 80 words. Doing it + * out-of-line line this is faster than doing it in-line on + * register-starved machines like the x86, and not really any + * slower on real processors. + */ + for (i = 0; i < 64; i++) { + TEMP = W[i] ^ W[i+2] ^ W[i+8] ^ W[i+13]; + W[i+16] = rol32(TEMP, 1); + } + + /* Set up first buffer and local data buffer */ + A = digest[ 0 ]; + B = digest[ 1 ]; + C = digest[ 2 ]; + D = digest[ 3 ]; + E = digest[ 4 ]; + + /* Heavy mangling, in 4 sub-rounds of 20 iterations each. */ + for (i = 0; i < 80; i++) { + if (i < 40) { + if (i < 20) + TEMP = f1(B, C, D) + K1; + else + TEMP = f2(B, C, D) + K2; + } else { + if (i < 60) + TEMP = f3(B, C, D) + K3; + else + TEMP = f4(B, C, D) + K4; + } + TEMP += rol32(A, 5) + E + W[i]; + E = D; D = C; C = rol32(B, 30); B = A; A = TEMP; + } + + /* Build message digest */ + digest[0] += A; + digest[1] += B; + digest[2] += C; + digest[3] += D; + digest[4] += E; + + /* W is wiped by the caller */ +} + +/* + * sha_init: initialize the vectors for a SHA1 digest + * + * @buf: vector to initialize + */ +void sha_init(__u32 *buf) +{ + buf[0] = 0x67452301; + buf[1] = 0xefcdab89; + buf[2] = 0x98badcfe; + buf[3] = 0x10325476; + buf[4] = 0xc3d2e1f0; +} + -- cgit v1.2.3 From 3ced79bdd08d2826eed9342bda625c8198638e2f Mon Sep 17 00:00:00 2001 From: Matt Mackall Date: Mon, 7 Mar 2005 18:14:07 -0800 Subject: [PATCH] random: Move halfmd4 to lib Move half-MD4 hash to /lib where we can share it with htree. 
Signed-off-by: Matt Mackall Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/char/random.c | 47 +++-------------------------------- include/linux/cryptohash.h | 2 ++ lib/Makefile | 3 ++- lib/halfmd4.c | 62 ++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 69 insertions(+), 45 deletions(-) create mode 100644 lib/halfmd4.c (limited to 'include/linux') diff --git a/drivers/char/random.c b/drivers/char/random.c index d9994a942721..9dcaf2b9b3b2 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -1324,47 +1324,6 @@ ctl_table random_table[] = { #define K2 013240474631UL #define K3 015666365641UL -/* - * Basic cut-down MD4 transform. Returns only 32 bits of result. - */ -static __u32 halfMD4Transform (__u32 const buf[4], __u32 const in[8]) -{ - __u32 a = buf[0], b = buf[1], c = buf[2], d = buf[3]; - - /* Round 1 */ - ROUND(F, a, b, c, d, in[0] + K1, 3); - ROUND(F, d, a, b, c, in[1] + K1, 7); - ROUND(F, c, d, a, b, in[2] + K1, 11); - ROUND(F, b, c, d, a, in[3] + K1, 19); - ROUND(F, a, b, c, d, in[4] + K1, 3); - ROUND(F, d, a, b, c, in[5] + K1, 7); - ROUND(F, c, d, a, b, in[6] + K1, 11); - ROUND(F, b, c, d, a, in[7] + K1, 19); - - /* Round 2 */ - ROUND(G, a, b, c, d, in[1] + K2, 3); - ROUND(G, d, a, b, c, in[3] + K2, 5); - ROUND(G, c, d, a, b, in[5] + K2, 9); - ROUND(G, b, c, d, a, in[7] + K2, 13); - ROUND(G, a, b, c, d, in[0] + K2, 3); - ROUND(G, d, a, b, c, in[2] + K2, 5); - ROUND(G, c, d, a, b, in[4] + K2, 9); - ROUND(G, b, c, d, a, in[6] + K2, 13); - - /* Round 3 */ - ROUND(H, a, b, c, d, in[3] + K3, 3); - ROUND(H, d, a, b, c, in[7] + K3, 9); - ROUND(H, c, d, a, b, in[2] + K3, 11); - ROUND(H, b, c, d, a, in[6] + K3, 15); - ROUND(H, a, b, c, d, in[1] + K3, 3); - ROUND(H, d, a, b, c, in[5] + K3, 9); - ROUND(H, c, d, a, b, in[0] + K3, 11); - ROUND(H, b, c, d, a, in[4] + K3, 15); - - return buf[1] + b; /* "most hashed" word */ - /* Alternative: return sum of all words? 
*/ -} - #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) static __u32 twothirdsMD4Transform (__u32 const buf[4], __u32 const in[12]) @@ -1550,7 +1509,7 @@ __u32 secure_ip_id(__u32 daddr) hash[2] = keyptr->secret[10]; hash[3] = keyptr->secret[11]; - return halfMD4Transform(hash, keyptr->secret); + return half_md4_transform(hash, keyptr->secret); } #ifdef CONFIG_INET @@ -1574,7 +1533,7 @@ __u32 secure_tcp_sequence_number(__u32 saddr, __u32 daddr, hash[2]=(sport << 16) + dport; hash[3]=keyptr->secret[11]; - seq = halfMD4Transform(hash, keyptr->secret) & HASH_MASK; + seq = half_md4_transform(hash, keyptr->secret) & HASH_MASK; seq += keyptr->count; /* * As close as possible to RFC 793, which @@ -1612,7 +1571,7 @@ u32 secure_tcp_port_ephemeral(__u32 saddr, __u32 daddr, __u16 dport) hash[2] = dport ^ keyptr->secret[10]; hash[3] = keyptr->secret[11]; - return halfMD4Transform(hash, keyptr->secret); + return half_md4_transform(hash, keyptr->secret); } #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) diff --git a/include/linux/cryptohash.h b/include/linux/cryptohash.h index 50e6fa516ca6..7d9e3576dfed 100644 --- a/include/linux/cryptohash.h +++ b/include/linux/cryptohash.h @@ -7,4 +7,6 @@ void sha_init(__u32 *buf); void sha_transform(__u32 *digest, const char *data, __u32 *W); +__u32 half_md4_transform(__u32 const buf[4], __u32 const in[8]); + #endif diff --git a/lib/Makefile b/lib/Makefile index 0fb5cd38b779..7b2ac49f9c46 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -5,7 +5,8 @@ lib-y := errno.o ctype.o string.o vsprintf.o cmdline.o \ bust_spinlocks.o rbtree.o radix-tree.o dump_stack.o \ kobject.o kref.o idr.o div64.o parser.o int_sqrt.o \ - bitmap.o extable.o kobject_uevent.o prio_tree.o sha1.o + bitmap.o extable.o kobject_uevent.o prio_tree.o sha1.o \ + halfmd4.o obj-y += sort.o diff --git a/lib/halfmd4.c b/lib/halfmd4.c new file mode 100644 index 000000000000..31009bcc9d92 --- /dev/null +++ b/lib/halfmd4.c @@ -0,0 +1,62 @@ +#include +#include +#include + 
+/* F, G and H are basic MD4 functions: selection, majority, parity */ +#define F(x, y, z) ((z) ^ ((x) & ((y) ^ (z)))) +#define G(x, y, z) (((x) & (y)) + (((x) ^ (y)) & (z))) +#define H(x, y, z) ((x) ^ (y) ^ (z)) + +/* + * The generic round function. The application is so specific that + * we don't bother protecting all the arguments with parens, as is generally + * good macro practice, in favor of extra legibility. + * Rotation is separate from addition to prevent recomputation + */ +#define ROUND(f, a, b, c, d, x, s) \ + (a += f(b, c, d) + x, a = (a << s) | (a >> (32 - s))) +#define K1 0 +#define K2 013240474631UL +#define K3 015666365641UL + +/* + * Basic cut-down MD4 transform. Returns only 32 bits of result. + */ +__u32 half_md4_transform(__u32 const buf[4], __u32 const in[8]) +{ + __u32 a = buf[0], b = buf[1], c = buf[2], d = buf[3]; + + /* Round 1 */ + ROUND(F, a, b, c, d, in[0] + K1, 3); + ROUND(F, d, a, b, c, in[1] + K1, 7); + ROUND(F, c, d, a, b, in[2] + K1, 11); + ROUND(F, b, c, d, a, in[3] + K1, 19); + ROUND(F, a, b, c, d, in[4] + K1, 3); + ROUND(F, d, a, b, c, in[5] + K1, 7); + ROUND(F, c, d, a, b, in[6] + K1, 11); + ROUND(F, b, c, d, a, in[7] + K1, 19); + + /* Round 2 */ + ROUND(G, a, b, c, d, in[1] + K2, 3); + ROUND(G, d, a, b, c, in[3] + K2, 5); + ROUND(G, c, d, a, b, in[5] + K2, 9); + ROUND(G, b, c, d, a, in[7] + K2, 13); + ROUND(G, a, b, c, d, in[0] + K2, 3); + ROUND(G, d, a, b, c, in[2] + K2, 5); + ROUND(G, c, d, a, b, in[4] + K2, 9); + ROUND(G, b, c, d, a, in[6] + K2, 13); + + /* Round 3 */ + ROUND(H, a, b, c, d, in[3] + K3, 3); + ROUND(H, d, a, b, c, in[7] + K3, 9); + ROUND(H, c, d, a, b, in[2] + K3, 11); + ROUND(H, b, c, d, a, in[6] + K3, 15); + ROUND(H, a, b, c, d, in[1] + K3, 3); + ROUND(H, d, a, b, c, in[5] + K3, 9); + ROUND(H, c, d, a, b, in[0] + K3, 11); + ROUND(H, b, c, d, a, in[4] + K3, 15); + + return buf[1] + b; /* "most hashed" word */ + /* Alternative: return sum of all words? 
*/ +} +EXPORT_SYMBOL(half_md4_transform); -- cgit v1.2.3 From 567200b8fba50573de44bc6f4e71572bb8c5e987 Mon Sep 17 00:00:00 2001 From: Matt Mackall Date: Mon, 7 Mar 2005 18:14:22 -0800 Subject: [PATCH] random: Kill duplicate halfmd4 in ext3 htree Replace duplicate halfMD4 code with call to lib/ Signed-off-by: Matt Mackall Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext3/hash.c | 70 ++-------------------------------------------- include/linux/cryptohash.h | 2 +- lib/halfmd4.c | 10 +++++-- 3 files changed, 10 insertions(+), 72 deletions(-) (limited to 'include/linux') diff --git a/fs/ext3/hash.c b/fs/ext3/hash.c index f3780279f5c5..5a2d1235ead0 100644 --- a/fs/ext3/hash.c +++ b/fs/ext3/hash.c @@ -13,6 +13,7 @@ #include #include #include +#include #define DELTA 0x9E3779B9 @@ -33,73 +34,6 @@ static void TEA_transform(__u32 buf[4], __u32 const in[]) buf[1] += b1; } -/* F, G and H are basic MD4 functions: selection, majority, parity */ -#define F(x, y, z) ((z) ^ ((x) & ((y) ^ (z)))) -#define G(x, y, z) (((x) & (y)) + (((x) ^ (y)) & (z))) -#define H(x, y, z) ((x) ^ (y) ^ (z)) - -/* - * The generic round function. The application is so specific that - * we don't bother protecting all the arguments with parens, as is generally - * good macro practice, in favor of extra legibility. - * Rotation is separate from addition to prevent recomputation - */ -#define ROUND(f, a, b, c, d, x, s) \ - (a += f(b, c, d) + x, a = (a << s) | (a >> (32-s))) -#define K1 0 -#define K2 013240474631UL -#define K3 015666365641UL - -/* - * Basic cut-down MD4 transform. Returns only 32 bits of result. 
- */ -static void halfMD4Transform (__u32 buf[4], __u32 const in[]) -{ - __u32 a = buf[0], b = buf[1], c = buf[2], d = buf[3]; - - /* Round 1 */ - ROUND(F, a, b, c, d, in[0] + K1, 3); - ROUND(F, d, a, b, c, in[1] + K1, 7); - ROUND(F, c, d, a, b, in[2] + K1, 11); - ROUND(F, b, c, d, a, in[3] + K1, 19); - ROUND(F, a, b, c, d, in[4] + K1, 3); - ROUND(F, d, a, b, c, in[5] + K1, 7); - ROUND(F, c, d, a, b, in[6] + K1, 11); - ROUND(F, b, c, d, a, in[7] + K1, 19); - - /* Round 2 */ - ROUND(G, a, b, c, d, in[1] + K2, 3); - ROUND(G, d, a, b, c, in[3] + K2, 5); - ROUND(G, c, d, a, b, in[5] + K2, 9); - ROUND(G, b, c, d, a, in[7] + K2, 13); - ROUND(G, a, b, c, d, in[0] + K2, 3); - ROUND(G, d, a, b, c, in[2] + K2, 5); - ROUND(G, c, d, a, b, in[4] + K2, 9); - ROUND(G, b, c, d, a, in[6] + K2, 13); - - /* Round 3 */ - ROUND(H, a, b, c, d, in[3] + K3, 3); - ROUND(H, d, a, b, c, in[7] + K3, 9); - ROUND(H, c, d, a, b, in[2] + K3, 11); - ROUND(H, b, c, d, a, in[6] + K3, 15); - ROUND(H, a, b, c, d, in[1] + K3, 3); - ROUND(H, d, a, b, c, in[5] + K3, 9); - ROUND(H, c, d, a, b, in[0] + K3, 11); - ROUND(H, b, c, d, a, in[4] + K3, 15); - - buf[0] += a; - buf[1] += b; - buf[2] += c; - buf[3] += d; -} - -#undef ROUND -#undef F -#undef G -#undef H -#undef K1 -#undef K2 -#undef K3 /* The old legacy hash */ static __u32 dx_hack_hash (const char *name, int len) @@ -187,7 +121,7 @@ int ext3fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo) p = name; while (len > 0) { str2hashbuf(p, len, in, 8); - halfMD4Transform(buf, in); + half_md4_transform(buf, in); len -= 32; p += 32; } diff --git a/include/linux/cryptohash.h b/include/linux/cryptohash.h index 7d9e3576dfed..c118b2ad9807 100644 --- a/include/linux/cryptohash.h +++ b/include/linux/cryptohash.h @@ -7,6 +7,6 @@ void sha_init(__u32 *buf); void sha_transform(__u32 *digest, const char *data, __u32 *W); -__u32 half_md4_transform(__u32 const buf[4], __u32 const in[8]); +__u32 half_md4_transform(__u32 buf[4], __u32 const in[8]); #endif 
diff --git a/lib/halfmd4.c b/lib/halfmd4.c index 31009bcc9d92..e11db26f8ae5 100644 --- a/lib/halfmd4.c +++ b/lib/halfmd4.c @@ -22,7 +22,7 @@ /* * Basic cut-down MD4 transform. Returns only 32 bits of result. */ -__u32 half_md4_transform(__u32 const buf[4], __u32 const in[8]) +__u32 half_md4_transform(__u32 buf[4], __u32 const in[8]) { __u32 a = buf[0], b = buf[1], c = buf[2], d = buf[3]; @@ -56,7 +56,11 @@ __u32 half_md4_transform(__u32 const buf[4], __u32 const in[8]) ROUND(H, c, d, a, b, in[0] + K3, 11); ROUND(H, b, c, d, a, in[4] + K3, 15); - return buf[1] + b; /* "most hashed" word */ - /* Alternative: return sum of all words? */ + buf[0] += a; + buf[1] += b; + buf[2] += c; + buf[3] += d; + + return buf[1]; /* "most hashed" word */ } EXPORT_SYMBOL(half_md4_transform); -- cgit v1.2.3 From 8331dc5b536aa9ddda6cf46760aa513f6c4f92bc Mon Sep 17 00:00:00 2001 From: Matt Mackall Date: Mon, 7 Mar 2005 18:14:53 -0800 Subject: [PATCH] random: Move syncookies to net/ Move syncookie code off to networking land. Signed-off-by: Matt Mackall Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/char/random.c | 81 -------------------------------------------------- include/linux/random.h | 8 ----- net/ipv4/syncookies.c | 77 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 77 insertions(+), 89 deletions(-) (limited to 'include/linux') diff --git a/drivers/char/random.c b/drivers/char/random.c index dde7c62938b0..ad9b52c2ae3c 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -366,10 +366,6 @@ static struct poolinfo { * hash; hash collisions will occur no more often than chance. 
*/ -#ifdef CONFIG_SYN_COOKIES -static __u32 syncookie_secret[2][16-3+SHA_WORKSPACE_WORDS]; -#endif - /* * Static global variables */ @@ -901,9 +897,6 @@ static int __init rand_initialize(void) init_std_data(&input_pool); init_std_data(&blocking_pool); init_std_data(&nonblocking_pool); -#ifdef CONFIG_SYN_COOKIES - get_random_bytes(syncookie_secret, sizeof(syncookie_secret)); -#endif return 0; } module_init(rand_initialize); @@ -1596,80 +1589,6 @@ u32 secure_tcpv6_port_ephemeral(const __u32 *saddr, const __u32 *daddr, __u16 dp EXPORT_SYMBOL(secure_tcpv6_port_ephemeral); #endif -#ifdef CONFIG_SYN_COOKIES -/* - * Secure SYN cookie computation. This is the algorithm worked out by - * Dan Bernstein and Eric Schenk. - * - * For linux I implement the 1 minute counter by looking at the jiffies clock. - * The count is passed in as a parameter, so this code doesn't much care. - */ - -#define COOKIEBITS 24 /* Upper bits store count */ -#define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1) - -static u32 cookie_hash(u32 saddr, u32 daddr, u32 sport, u32 dport, - u32 count, int c) -{ - __u32 tmp[16 + 5 + SHA_WORKSPACE_WORDS]; - - memcpy(tmp + 3, syncookie_secret[c], sizeof(syncookie_secret[c])); - tmp[0] = saddr; - tmp[1] = daddr; - tmp[2] = (sport << 16) + dport; - tmp[3] = count; - sha_transform(tmp + 16, tmp); - - return tmp[17]; -} - -__u32 secure_tcp_syn_cookie(__u32 saddr, __u32 daddr, __u16 sport, - __u16 dport, __u32 sseq, __u32 count, __u32 data) -{ - /* - * Compute the secure sequence number. - * The output should be: - * HASH(sec1,saddr,sport,daddr,dport,sec1) + sseq + (count * 2^24) - * + (HASH(sec2,saddr,sport,daddr,dport,count,sec2) % 2^24). - * Where sseq is their sequence number and count increases every - * minute by 1. - * As an extra hack, we add a small "data" value that encodes the - * MSS into the second hash value. 
- */ - - return (cookie_hash(saddr, daddr, sport, dport, 0, 0) + - sseq + (count << COOKIEBITS) + - ((cookie_hash(saddr, daddr, sport, dport, count, 1) + data) - & COOKIEMASK)); -} - -/* - * This retrieves the small "data" value from the syncookie. - * If the syncookie is bad, the data returned will be out of - * range. This must be checked by the caller. - * - * The count value used to generate the cookie must be within - * "maxdiff" if the current (passed-in) "count". The return value - * is (__u32)-1 if this test fails. - */ -__u32 check_tcp_syn_cookie(__u32 cookie, __u32 saddr, __u32 daddr, __u16 sport, - __u16 dport, __u32 sseq, __u32 count, __u32 maxdiff) -{ - __u32 diff; - - /* Strip away the layers from the cookie */ - cookie -= cookie_hash(saddr, daddr, sport, dport, 0, 0) + sseq; - - /* Cookie is now reduced to (count * 2^24) ^ (hash % 2^24) */ - diff = (count - (cookie >> COOKIEBITS)) & ((__u32)-1 >> COOKIEBITS); - if (diff >= maxdiff) - return (__u32)-1; - - return (cookie - - cookie_hash(saddr, daddr, sport, dport, count - diff, 1)) - & COOKIEMASK; /* Leaving the data behind */ -} -#endif #endif /* CONFIG_INET */ diff --git a/include/linux/random.h b/include/linux/random.h index cccb11199d88..cc6703449916 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -57,14 +57,6 @@ extern u32 secure_tcpv6_port_ephemeral(const __u32 *saddr, const __u32 *daddr, __u16 dport); extern __u32 secure_tcp_sequence_number(__u32 saddr, __u32 daddr, __u16 sport, __u16 dport); -extern __u32 secure_tcp_syn_cookie(__u32 saddr, __u32 daddr, - __u16 sport, __u16 dport, - __u32 sseq, __u32 count, - __u32 data); -extern __u32 check_tcp_syn_cookie(__u32 cookie, __u32 saddr, - __u32 daddr, __u16 sport, - __u16 dport, __u32 sseq, - __u32 count, __u32 maxdiff); extern __u32 secure_tcpv6_sequence_number(__u32 *saddr, __u32 *daddr, __u16 sport, __u16 dport); diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 5d6d2138ac91..4b87a4f11f46 100644 --- 
a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -17,11 +17,88 @@ #include #include #include +#include #include #include extern int sysctl_tcp_syncookies; +static __u32 syncookie_secret[2][16-3+SHA_DIGEST_WORDS]; + +static __init int init_syncookies(void) +{ + get_random_bytes(syncookie_secret, sizeof(syncookie_secret)); + return 0; +} +module_init(init_syncookies); + +#define COOKIEBITS 24 /* Upper bits store count */ +#define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1) + +static u32 cookie_hash(u32 saddr, u32 daddr, u32 sport, u32 dport, + u32 count, int c) +{ + __u32 tmp[16 + 5 + SHA_WORKSPACE_WORDS]; + + memcpy(tmp + 3, syncookie_secret[c], sizeof(syncookie_secret[c])); + tmp[0] = saddr; + tmp[1] = daddr; + tmp[2] = (sport << 16) + dport; + tmp[3] = count; + sha_transform(tmp + 16, (__u8 *)tmp, tmp + 16 + 5); + + return tmp[17]; +} + +static __u32 secure_tcp_syn_cookie(__u32 saddr, __u32 daddr, __u16 sport, + __u16 dport, __u32 sseq, __u32 count, + __u32 data) +{ + /* + * Compute the secure sequence number. + * The output should be: + * HASH(sec1,saddr,sport,daddr,dport,sec1) + sseq + (count * 2^24) + * + (HASH(sec2,saddr,sport,daddr,dport,count,sec2) % 2^24). + * Where sseq is their sequence number and count increases every + * minute by 1. + * As an extra hack, we add a small "data" value that encodes the + * MSS into the second hash value. + */ + + return (cookie_hash(saddr, daddr, sport, dport, 0, 0) + + sseq + (count << COOKIEBITS) + + ((cookie_hash(saddr, daddr, sport, dport, count, 1) + data) + & COOKIEMASK)); +} + +/* + * This retrieves the small "data" value from the syncookie. + * If the syncookie is bad, the data returned will be out of + * range. This must be checked by the caller. + * + * The count value used to generate the cookie must be within + * "maxdiff" if the current (passed-in) "count". The return value + * is (__u32)-1 if this test fails. 
+ */ +static __u32 check_tcp_syn_cookie(__u32 cookie, __u32 saddr, __u32 daddr, + __u16 sport, __u16 dport, __u32 sseq, + __u32 count, __u32 maxdiff) +{ + __u32 diff; + + /* Strip away the layers from the cookie */ + cookie -= cookie_hash(saddr, daddr, sport, dport, 0, 0) + sseq; + + /* Cookie is now reduced to (count * 2^24) ^ (hash % 2^24) */ + diff = (count - (cookie >> COOKIEBITS)) & ((__u32) - 1 >> COOKIEBITS); + if (diff >= maxdiff) + return (__u32)-1; + + return (cookie - + cookie_hash(saddr, daddr, sport, dport, count - diff, 1)) + & COOKIEMASK; /* Leaving the data behind */ +} + /* * This table has to be sorted and terminated with (__u16)-1. * XXX generate a better table. -- cgit v1.2.3 From d6b7a781c51c91dd054e5c437885205592faac21 Mon Sep 17 00:00:00 2001 From: Prasanna Meda Date: Mon, 7 Mar 2005 18:15:24 -0800 Subject: [PATCH] Speed up /proc/pid/maps This patch uses find_vma() to improve the read response of /proc/pid/maps. It attempts to make the liner scan instead of quadratic walk and utilise rb tree. Reading the file was doing sequential scan from the begining to file position all the time, and taking a quite long time. The improvements came from f_version/m_version and resulting in mmap_cache match. Even if mmap_cache does not match, rb tree walk should be faster than sequential walk. First attempt was to put the state across read system calls into private data. Later got inspiration from wli's pid patch using f_version in readdir of /proc. Other advantage is, f_version will be cleared automatically by lseek. The test program creates 32K maps and splits them into two(limited by max_map_count sysctl) using mprotect(0). After the patch, the read time improves from many seconds to milliseconds, and does not grow superlinearly with number of read calls. Help taken from Peter Swain in idea and testing. 
After the patch: Reading /proc/self/maps:65528 time: 0 secs and 780728 usecs buf:4096 bytes:3811362 Reading /proc/self/maps:65528 time: 1 secs and 117573 usecs buf:1024 bytes:3866627 Reading /proc/self/maps:65528 time: 0 secs and 473459 usecs buf: 256 bytes:3866627 Reading /proc/self/maps:65528 time: 0 secs and 901288 usecs buf: 64 bytes:3866627 Reading /proc/self/maps:65528 time: 1 secs and 480185 usecs buf: 16 bytes:3866627 Reading /proc/self/maps:65528 time: 1 secs and 636268 usecs buf: 4 bytes:3866627 Reading /proc/self/maps:65528 time: 4 secs and 118327 usecs buf: 1 bytes:3866627 Before the patch: Reading /proc/self/maps:65528 time: 4 secs and 359556 usecs buf:4096 bytes:3866647 Reading /proc/self/maps:65528 time:16 secs and 218584 usecs buf:1024 bytes:3866688 Reading /proc/self/maps:65528 time:67 secs and 870200 usecs buf: 256 bytes:3866688 Reading /proc/self/maps:65528 time:255 secs and 186934 usecs buf: 64 bytes:3866688 Small reads never completed. Signed-off-by: Prasanna Meda Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 65 +++++++++++++++++++++++++++++++++++++----------- fs/seq_file.c | 25 +++++++++++++++++++ include/linux/seq_file.h | 1 + 3 files changed, 77 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 56dbfbc9b768..49de0bd2e623 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -87,6 +87,7 @@ static void pad_len_spaces(struct seq_file *m, int len) static int show_map(struct seq_file *m, void *v) { + struct task_struct *task = m->private; struct vm_area_struct *map = v; struct mm_struct *mm = map->vm_mm; struct file *file = map->vm_file; @@ -138,30 +139,66 @@ static int show_map(struct seq_file *m, void *v) } } seq_putc(m, '\n'); + if (m->count < m->size) /* map is copied successfully */ + m->version = (map != get_gate_vma(task))? 
map->vm_start: 0; return 0; } static void *m_start(struct seq_file *m, loff_t *pos) { struct task_struct *task = m->private; - struct mm_struct *mm = get_task_mm(task); - struct vm_area_struct * map; + unsigned long last_addr = m->version; + struct mm_struct *mm; + struct vm_area_struct *map, *tail_map; loff_t l = *pos; + /* + * We remember last_addr rather than next_addr to hit with + * mmap_cache most of the time. We have zero last_addr at + * the begining and also after lseek. We will have -1 last_addr + * after the end of the maps. + */ + + if (last_addr == -1UL) + return NULL; + + mm = get_task_mm(task); if (!mm) return NULL; + tail_map = get_gate_vma(task); down_read(&mm->mmap_sem); - map = mm->mmap; - while (l-- && map) + + /* Start with last addr hint */ + if (last_addr && (map = find_vma(mm, last_addr))) { map = map->vm_next; - if (!map) { - up_read(&mm->mmap_sem); - mmput(mm); - if (l == -1) - map = get_gate_vma(task); + goto out; + } + + /* + * Check the map index is within the range and do + * sequential scan until m_index. + */ + map = NULL; + if ((unsigned long)l < mm->map_count) { + map = mm->mmap; + while (l-- && map) + map = map->vm_next; + goto out; } - return map; + + if (l != mm->map_count) + tail_map = NULL; /* After gate map */ + +out: + if (map) + return map; + + /* End of maps has reached */ + m->version = (tail_map != NULL)? 0: -1UL; + up_read(&mm->mmap_sem); + mmput(mm); + return tail_map; } static void m_stop(struct seq_file *m, void *v) @@ -179,13 +216,13 @@ static void *m_next(struct seq_file *m, void *v, loff_t *pos) { struct task_struct *task = m->private; struct vm_area_struct *map = v; + struct vm_area_struct *tail_map = get_gate_vma(task); + (*pos)++; - if (map->vm_next) + if (map && (map != tail_map) && map->vm_next) return map->vm_next; m_stop(m, v); - if (map != get_gate_vma(task)) - return get_gate_vma(task); - return NULL; + return (map != tail_map)? 
tail_map: NULL; } struct seq_operations proc_pid_maps_op = { diff --git a/fs/seq_file.c b/fs/seq_file.c index 5a73e085fb4e..650c43ba86c4 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -36,6 +36,13 @@ int seq_open(struct file *file, struct seq_operations *op) p->op = op; file->private_data = p; + /* + * Wrappers around seq_open(e.g. swaps_open) need to be + * aware of this. If they set f_version themselves, they + * should call seq_open first and then set f_version. + */ + file->f_version = 0; + /* SEQ files support lseek, but not pread/pwrite */ file->f_mode &= ~(FMODE_PREAD | FMODE_PWRITE); return 0; @@ -58,6 +65,18 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) int err = 0; down(&m->sem); + /* + * seq_file->op->..m_start/m_stop/m_next may do special actions + * or optimisations based on the file->f_version, so we want to + * pass the file->f_version to those methods. + * + * seq_file->version is just copy of f_version, and seq_file + * methods can treat it simply as file version. + * It is copied in first and copied out after all operations. + * It is convenient to have it as part of structure to avoid the + * need of passing another argument to all the seq_file methods. 
+ */ + m->version = file->f_version; /* grab buffer if we didn't have one */ if (!m->buf) { m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL); @@ -98,6 +117,7 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) if (!m->buf) goto Enomem; m->count = 0; + m->version = 0; } m->op->stop(m, p); m->count = 0; @@ -136,6 +156,7 @@ Done: copied = err; else *ppos += copied; + file->f_version = m->version; up(&m->sem); return copied; Enomem: @@ -153,6 +174,7 @@ static int traverse(struct seq_file *m, loff_t offset) int error = 0; void *p; + m->version = 0; m->index = 0; m->count = m->from = 0; if (!offset) @@ -207,6 +229,7 @@ loff_t seq_lseek(struct file *file, loff_t offset, int origin) long long retval = -EINVAL; down(&m->sem); + m->version = file->f_version; switch (origin) { case 1: offset += file->f_pos; @@ -220,6 +243,7 @@ loff_t seq_lseek(struct file *file, loff_t offset, int origin) if (retval) { /* with extreme prejudice... */ file->f_pos = 0; + m->version = 0; m->index = 0; m->count = 0; } else { @@ -228,6 +252,7 @@ loff_t seq_lseek(struct file *file, loff_t offset, int origin) } } up(&m->sem); + file->f_version = m->version; return retval; } EXPORT_SYMBOL(seq_lseek); diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h index 28141af6a19a..850a974ee505 100644 --- a/include/linux/seq_file.h +++ b/include/linux/seq_file.h @@ -18,6 +18,7 @@ struct seq_file { size_t from; size_t count; loff_t index; + loff_t version; struct semaphore sem; struct seq_operations *op; void *private; -- cgit v1.2.3 From 8b9cfca63e45badb418550c61397be4c33022752 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Mon, 7 Mar 2005 18:15:40 -0800 Subject: [PATCH] posix-timers: tidy up clock interfaces and consolidate dispatch logic This patch cleans up the posix-timers interfaces for defining clocks, and the calls to them. 
It fixes some sloppy types, adds a clockid_t parameter to the calls that lacked it, and adds a function pointer that can be used for clock_getres. It further cleans up the posix-timers.c code using the k_clock function pointers or default functions when no hooks are supplied, consolidating repeated code into shared inline functions or macros. This paves the way for adding the CPU clock hooks. The mmtimer.c changes are untested, but obviously can't be wrong. There aren't any other struct k_clock definitions in the tree, but any others would need to be updated for the function signature changes. Signed-off-by: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/char/mmtimer.c | 4 +- include/linux/posix-timers.h | 14 +-- kernel/posix-timers.c | 255 ++++++++++++++++++++++++++++--------------- 3 files changed, 173 insertions(+), 100 deletions(-) (limited to 'include/linux') diff --git a/drivers/char/mmtimer.c b/drivers/char/mmtimer.c index 3539b4acfc94..04c450a293f2 100644 --- a/drivers/char/mmtimer.c +++ b/drivers/char/mmtimer.c @@ -376,7 +376,7 @@ static int sgi_clock_period; static struct timespec sgi_clock_offset; static int sgi_clock_period; -static int sgi_clock_get(struct timespec *tp) +static int sgi_clock_get(clockid_t clockid, struct timespec *tp) { u64 nsec; @@ -387,7 +387,7 @@ static int sgi_clock_get(struct timespec *tp) return 0; }; -static int sgi_clock_set(struct timespec *tp) +static int sgi_clock_set(clockid_t clockid, struct timespec *tp) { u64 nsec; diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index 006f3e9af475..61f4b12d82c1 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -30,12 +30,12 @@ struct k_clock_abs { }; struct k_clock { int res; /* in nano seconds */ + int (*clock_getres) (clockid_t which_clock, struct timespec *tp); struct k_clock_abs *abs_struct; - int (*clock_set) (struct timespec * tp); - int (*clock_get) (struct timespec * tp); + int 
(*clock_set) (clockid_t which_clock, struct timespec * tp); + int (*clock_get) (clockid_t which_clock, struct timespec * tp); int (*timer_create) (struct k_itimer *timer); - int (*nsleep) (int which_clock, int flags, - struct timespec * t); + int (*nsleep) (clockid_t which_clock, int flags, struct timespec *); int (*timer_set) (struct k_itimer * timr, int flags, struct itimerspec * new_setting, struct itimerspec * old_setting); @@ -44,12 +44,12 @@ struct k_clock { struct itimerspec * cur_setting); }; -void register_posix_clock(int clock_id, struct k_clock *new_clock); +void register_posix_clock(clockid_t clock_id, struct k_clock *new_clock); /* Error handlers for timer_create, nanosleep and settime */ int do_posix_clock_notimer_create(struct k_itimer *timer); -int do_posix_clock_nonanosleep(int which_clock, int flags, struct timespec * t); -int do_posix_clock_nosettime(struct timespec *tp); +int do_posix_clock_nonanosleep(clockid_t, int flags, struct timespec *); +int do_posix_clock_nosettime(clockid_t, struct timespec *tp); /* function to call to trigger timer event */ int posix_timer_event(struct k_itimer *timr, int si_private); diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 9e79eca513ca..d04a2f17e395 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -173,22 +173,12 @@ static struct k_clock posix_clocks[MAX_CLOCKS]; static struct k_clock_abs abs_list = {.list = LIST_HEAD_INIT(abs_list.list), .lock = SPIN_LOCK_UNLOCKED}; -#define if_clock_do(clock_fun,alt_fun,parms) \ - (!clock_fun) ? 
alt_fun parms : clock_fun parms - -#define p_timer_get(clock,a,b) \ - if_clock_do((clock)->timer_get,do_timer_gettime, (a,b)) - -#define p_nsleep(clock,a,b,c) \ - if_clock_do((clock)->nsleep, do_nsleep, (a,b,c)) - -#define p_timer_del(clock,a) \ - if_clock_do((clock)->timer_del, do_timer_delete, (a)) - -static int do_posix_gettime(struct k_clock *clock, struct timespec *tp); +static void posix_timer_fn(unsigned long); static u64 do_posix_clock_monotonic_gettime_parts( struct timespec *tp, struct timespec *mo); int do_posix_clock_monotonic_gettime(struct timespec *tp); +static int do_posix_clock_monotonic_get(clockid_t, struct timespec *tp); + static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags); static inline void unlock_timer(struct k_itimer *timr, unsigned long flags) @@ -196,6 +186,109 @@ static inline void unlock_timer(struct k_itimer *timr, unsigned long flags) spin_unlock_irqrestore(&timr->it_lock, flags); } +/* + * Define this to initialize every k_clock function table so all its + * function pointers are non-null, and always do indirect calls through the + * table. Leave it undefined to instead leave null function pointers and + * decide at the call sites between a direct call (maybe inlined) to the + * default function and an indirect call through the table when it's filled + * in. Which style is preferable is whichever performs better in the + * common case of using the default functions. + * +#define CLOCK_DISPATCH_DIRECT + */ + +#ifdef CLOCK_DISPATCH_DIRECT +#define CLOCK_DISPATCH(clock, call, arglist) \ + ((*posix_clocks[clock].call) arglist) +#define DEFHOOK(name) if (clock->name == NULL) clock->name = common_##name +#define COMMONDEFN static +#else +#define CLOCK_DISPATCH(clock, call, arglist) \ + (posix_clocks[clock].call != NULL \ + ? (*posix_clocks[clock].call) arglist : common_##call arglist) +#define DEFHOOK(name) (void) 0 /* Nothing here. 
*/ +#define COMMONDEFN static inline +#endif + +/* + * Default clock hook functions when the struct k_clock passed + * to register_posix_clock leaves a function pointer null. + * + * The function common_CALL is the default implementation for + * the function pointer CALL in struct k_clock. + */ + +COMMONDEFN int common_clock_getres(clockid_t which_clock, struct timespec *tp) +{ + tp->tv_sec = 0; + tp->tv_nsec = posix_clocks[which_clock].res; + return 0; +} + +COMMONDEFN int common_clock_get(clockid_t which_clock, struct timespec *tp) +{ + getnstimeofday(tp); + return 0; +} + +COMMONDEFN int common_clock_set(clockid_t which_clock, struct timespec *tp) +{ + return do_sys_settimeofday(tp, NULL); +} + +COMMONDEFN int common_timer_create(struct k_itimer *new_timer) +{ + init_timer(&new_timer->it_timer); + new_timer->it_timer.expires = 0; + new_timer->it_timer.data = (unsigned long) new_timer; + new_timer->it_timer.function = posix_timer_fn; + set_timer_inactive(new_timer); + return 0; +} + +/* + * These ones are defined below. + */ +static int common_nsleep(clockid_t, int flags, struct timespec *t); +static void common_timer_get(struct k_itimer *, struct itimerspec *); +static int common_timer_set(struct k_itimer *, int, + struct itimerspec *, struct itimerspec *); +static int common_timer_del(struct k_itimer *timer); + +/* + * Install default functions for hooks not filled in. + */ +static inline void common_default_hooks(struct k_clock *clock) +{ + DEFHOOK(clock_getres); + DEFHOOK(clock_get); + DEFHOOK(clock_set); + DEFHOOK(timer_create); + DEFHOOK(timer_set); + DEFHOOK(timer_get); + DEFHOOK(timer_del); + DEFHOOK(nsleep); +} +#undef DEFHOOK + +/* + * Return nonzero iff we know a priori this clockid_t value is bogus. 
+ */ +static inline int invalid_clockid(clockid_t which_clock) +{ + if ((unsigned) which_clock >= MAX_CLOCKS) + return 1; + if (posix_clocks[which_clock].clock_getres != NULL) + return 0; +#ifndef CLOCK_DISPATCH_DIRECT + if (posix_clocks[which_clock].res != 0) + return 0; +#endif + return 1; +} + + /* * Initialize everything, well, just everything in Posix clocks/timers ;) */ @@ -206,7 +299,7 @@ static __init int init_posix_timers(void) }; struct k_clock clock_monotonic = {.res = CLOCK_REALTIME_RES, .abs_struct = NULL, - .clock_get = do_posix_clock_monotonic_gettime, + .clock_get = do_posix_clock_monotonic_get, .clock_set = do_posix_clock_nosettime }; @@ -481,14 +574,16 @@ static inline struct task_struct * good_sigevent(sigevent_t * event) return rtn; } -void register_posix_clock(int clock_id, struct k_clock *new_clock) +void register_posix_clock(clockid_t clock_id, struct k_clock *new_clock) { if ((unsigned) clock_id >= MAX_CLOCKS) { printk("POSIX clock register failed for clock_id %d\n", clock_id); return; } + posix_clocks[clock_id] = *new_clock; + common_default_hooks(&posix_clocks[clock_id]); } static struct k_itimer * alloc_posix_timer(void) @@ -538,8 +633,7 @@ sys_timer_create(clockid_t which_clock, sigevent_t event; int it_id_set = IT_ID_NOT_SET; - if ((unsigned) which_clock >= MAX_CLOCKS || - !posix_clocks[which_clock].res) + if (invalid_clockid(which_clock)) return -EINVAL; new_timer = alloc_posix_timer(); @@ -573,17 +667,9 @@ sys_timer_create(clockid_t which_clock, new_timer->it_clock = which_clock; new_timer->it_incr = 0; new_timer->it_overrun = -1; - if (posix_clocks[which_clock].timer_create) { - error = posix_clocks[which_clock].timer_create(new_timer); - if (error) - goto out; - } else { - init_timer(&new_timer->it_timer); - new_timer->it_timer.expires = 0; - new_timer->it_timer.data = (unsigned long) new_timer; - new_timer->it_timer.function = posix_timer_fn; - set_timer_inactive(new_timer); - } + error = CLOCK_DISPATCH(which_clock, timer_create, 
(new_timer)); + if (error) + goto out; /* * return the timer_id now. The next step is hard to @@ -734,7 +820,7 @@ static struct k_itimer * lock_timer(timer_t timer_id, unsigned long *flags) * report. */ static void -do_timer_gettime(struct k_itimer *timr, struct itimerspec *cur_setting) +common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting) { unsigned long expires; struct now_struct now; @@ -783,7 +869,7 @@ sys_timer_gettime(timer_t timer_id, struct itimerspec __user *setting) if (!timr) return -EINVAL; - p_timer_get(&posix_clocks[timr->it_clock], timr, &cur_setting); + CLOCK_DISPATCH(timr->it_clock, timer_get, (timr, &cur_setting)); unlock_timer(timr, flags); @@ -854,7 +940,7 @@ static int adjust_abs_time(struct k_clock *clock, struct timespec *tp, /* * Not one of the basic clocks */ - do_posix_gettime(clock, &now); + clock->clock_get(clock - posix_clocks, &now); jiffies_64_f = get_jiffies_64(); } /* @@ -906,15 +992,15 @@ static int adjust_abs_time(struct k_clock *clock, struct timespec *tp, /* Set a POSIX.1b interval timer. */ /* timr->it_lock is taken. */ -static inline int -do_timer_settime(struct k_itimer *timr, int flags, +COMMONDEFN int +common_timer_set(struct k_itimer *timr, int flags, struct itimerspec *new_setting, struct itimerspec *old_setting) { struct k_clock *clock = &posix_clocks[timr->it_clock]; u64 expire_64; if (old_setting) - do_timer_gettime(timr, old_setting); + common_timer_get(timr, old_setting); /* disable the timer */ timr->it_incr = 0; @@ -1003,12 +1089,9 @@ retry: if (!timr) return -EINVAL; - if (!posix_clocks[timr->it_clock].timer_set) - error = do_timer_settime(timr, flags, &new_spec, rtn); - else - error = posix_clocks[timr->it_clock].timer_set(timr, - flags, - &new_spec, rtn); + error = CLOCK_DISPATCH(timr->it_clock, timer_set, + (timr, flags, &new_spec, rtn)); + unlock_timer(timr, flag); if (error == TIMER_RETRY) { rtn = NULL; // We already got the old time... 
@@ -1022,7 +1105,7 @@ retry: return error; } -static inline int do_timer_delete(struct k_itimer *timer) +COMMONDEFN int common_timer_del(struct k_itimer *timer) { timer->it_incr = 0; #ifdef CONFIG_SMP @@ -1044,6 +1127,11 @@ static inline int do_timer_delete(struct k_itimer *timer) return 0; } +static inline int timer_delete_hook(struct k_itimer *timer) +{ + return CLOCK_DISPATCH(timer->it_clock, timer_del, (timer)); +} + /* Delete a POSIX.1b interval timer. */ asmlinkage long sys_timer_delete(timer_t timer_id) @@ -1060,14 +1148,14 @@ retry_delete: return -EINVAL; #ifdef CONFIG_SMP - error = p_timer_del(&posix_clocks[timer->it_clock], timer); + error = timer_delete_hook(timer); if (error == TIMER_RETRY) { unlock_timer(timer, flags); goto retry_delete; } #else - p_timer_del(&posix_clocks[timer->it_clock], timer); + timer_delete_hook(timer); #endif spin_lock(¤t->sighand->siglock); list_del(&timer->list); @@ -1099,14 +1187,14 @@ retry_delete: spin_lock_irqsave(&timer->it_lock, flags); #ifdef CONFIG_SMP - error = p_timer_del(&posix_clocks[timer->it_clock], timer); + error = timer_delete_hook(timer); if (error == TIMER_RETRY) { unlock_timer(timer, flags); goto retry_delete; } #else - p_timer_del(&posix_clocks[timer->it_clock], timer); + timer_delete_hook(timer); #endif list_del(&timer->list); /* @@ -1143,14 +1231,6 @@ void exit_itimers(struct signal_struct *sig) * spin_lock_irq() held and from clock calls with no locking. They must * use the save flags versions of locks. */ -static int do_posix_gettime(struct k_clock *clock, struct timespec *tp) -{ - if (clock->clock_get) - return clock->clock_get(tp); - - getnstimeofday(tp); - return 0; -} /* * We do ticks here to avoid the irq lock ( they take sooo long). 
@@ -1177,7 +1257,7 @@ static u64 do_posix_clock_monotonic_gettime_parts( return jiff; } -int do_posix_clock_monotonic_gettime(struct timespec *tp) +static int do_posix_clock_monotonic_get(clockid_t clock, struct timespec *tp) { struct timespec wall_to_mono; @@ -1193,7 +1273,13 @@ int do_posix_clock_monotonic_gettime(struct timespec *tp) return 0; } -int do_posix_clock_nosettime(struct timespec *tp) +int do_posix_clock_monotonic_gettime(struct timespec *tp) +{ + return do_posix_clock_monotonic_get(CLOCK_MONOTONIC, tp); +} + + +int do_posix_clock_nosettime(clockid_t clockid, struct timespec *tp) { return -EINVAL; } @@ -1203,7 +1289,7 @@ int do_posix_clock_notimer_create(struct k_itimer *timer) return -EINVAL; } -int do_posix_clock_nonanosleep(int which_clock, int flags, struct timespec *t) +int do_posix_clock_nonanosleep(clockid_t clock, int flags, struct timespec *t) { #ifndef ENOTSUP return -EOPNOTSUPP; /* aka ENOTSUP in userland for POSIX */ @@ -1217,24 +1303,12 @@ sys_clock_settime(clockid_t which_clock, const struct timespec __user *tp) { struct timespec new_tp; - if ((unsigned) which_clock >= MAX_CLOCKS || - !posix_clocks[which_clock].res) + if (invalid_clockid(which_clock)) return -EINVAL; if (copy_from_user(&new_tp, tp, sizeof (*tp))) return -EFAULT; - if (posix_clocks[which_clock].clock_set) - return posix_clocks[which_clock].clock_set(&new_tp); - return do_sys_settimeofday(&new_tp, NULL); -} - -static int do_clock_gettime(clockid_t which_clock, struct timespec *tp) -{ - if ((unsigned) which_clock >= MAX_CLOCKS || - !posix_clocks[which_clock].res) - return -EINVAL; - - return do_posix_gettime(&posix_clocks[which_clock], tp); + return CLOCK_DISPATCH(which_clock, clock_set, (which_clock, &new_tp)); } asmlinkage long @@ -1243,7 +1317,10 @@ sys_clock_gettime(clockid_t which_clock, struct timespec __user *tp) struct timespec kernel_tp; int error; - error = do_clock_gettime(which_clock, &kernel_tp); + if (invalid_clockid(which_clock)) + return -EINVAL; + error = 
CLOCK_DISPATCH(which_clock, clock_get, + (which_clock, &kernel_tp)); if (!error && copy_to_user(tp, &kernel_tp, sizeof (kernel_tp))) error = -EFAULT; @@ -1255,18 +1332,19 @@ asmlinkage long sys_clock_getres(clockid_t which_clock, struct timespec __user *tp) { struct timespec rtn_tp; + int error; - if ((unsigned) which_clock >= MAX_CLOCKS || - !posix_clocks[which_clock].res) + if (invalid_clockid(which_clock)) return -EINVAL; - rtn_tp.tv_sec = 0; - rtn_tp.tv_nsec = posix_clocks[which_clock].res; - if (tp && copy_to_user(tp, &rtn_tp, sizeof (rtn_tp))) - return -EFAULT; + error = CLOCK_DISPATCH(which_clock, clock_getres, + (which_clock, &rtn_tp)); - return 0; + if (!error && tp && copy_to_user(tp, &rtn_tp, sizeof (rtn_tp))) { + error = -EFAULT; + } + return error; } static void nanosleep_wake_up(unsigned long __data) @@ -1379,9 +1457,6 @@ void clock_was_set(void) long clock_nanosleep_restart(struct restart_block *restart_block); -extern long do_clock_nanosleep(clockid_t which_clock, int flags, - struct timespec *t); - asmlinkage long sys_clock_nanosleep(clockid_t which_clock, int flags, const struct timespec __user *rqtp, @@ -1392,8 +1467,7 @@ sys_clock_nanosleep(clockid_t which_clock, int flags, &(current_thread_info()->restart_block); int ret; - if ((unsigned) which_clock >= MAX_CLOCKS || - !posix_clocks[which_clock].res) + if (invalid_clockid(which_clock)) return -EINVAL; if (copy_from_user(&t, rqtp, sizeof (struct timespec))) @@ -1402,12 +1476,10 @@ sys_clock_nanosleep(clockid_t which_clock, int flags, if ((unsigned) t.tv_nsec >= NSEC_PER_SEC || t.tv_sec < 0) return -EINVAL; - if (posix_clocks[which_clock].nsleep) - ret = posix_clocks[which_clock].nsleep(which_clock, flags, &t); - else - ret = do_clock_nanosleep(which_clock, flags, &t); + ret = CLOCK_DISPATCH(which_clock, nsleep, (which_clock, flags, &t)); + /* - * Do this here as do_clock_nanosleep does not have the real address + * Do this here as common_nsleep does not have the real address */ 
restart_block->arg1 = (unsigned long)rmtp; @@ -1417,8 +1489,9 @@ sys_clock_nanosleep(clockid_t which_clock, int flags, return ret; } -long -do_clock_nanosleep(clockid_t which_clock, int flags, struct timespec *tsave) + +static int common_nsleep(clockid_t which_clock, + int flags, struct timespec *tsave) { struct timespec t, dum; struct timer_list new_timer; @@ -1525,7 +1598,7 @@ long clock_nanosleep_restart(struct restart_block *restart_block) { struct timespec t; - int ret = do_clock_nanosleep(restart_block->arg0, 0, &t); + int ret = common_nsleep(restart_block->arg0, 0, &t); if ((ret == -ERESTART_RESTARTBLOCK) && restart_block->arg1 && copy_to_user((struct timespec __user *)(restart_block->arg1), &t, -- cgit v1.2.3 From a78331f2168ef1e67b53a0f8218c70a19f0b2a4c Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Mon, 7 Mar 2005 18:15:56 -0800 Subject: [PATCH] posix-timers: high-resolution CPU clocks for POSIX clock_* syscalls This patch provides support for thread and process CPU time clocks in the POSIX clock interface. Both the existing utime and utime+stime information (already available via getrusage et al) can be used, as well as a new (potentially) more precise and accurate clock (which cannot distinguish user from system time). The clock used is that provided by the `sched_clock' function already used internally by the scheduler. This gives a way for platforms to provide the highest-resolution CPU time tracking that is available cheaply, and some already do so (such as x86 using TSC). Because this clock is already sampled internally by the scheduler, this new tracking adds only the tiniest new overhead to accomplish the bookkeeping. Some notes: This allows per-thread clocks to be accessed only by other threads in the same process. 
The only POSIX calls that access these are defined only for in-process use, and having this check is necessary for the userland implementations of the POSIX clock functions to robustly refuse stale clockid_t's in the face of potential PID reuse. This makes no constraint on who can see whose per-process clocks. This information is already available for the VIRT and PROF (i.e. utime and stime) information via /proc. I am open to suggestions on if/how security constraints on who can see whose clocks should be imposed. The SCHED clock information is now available only via clock_* syscalls. This means that per-thread information is not available outside the process. Perhaps /proc should show sched_time as well? This would let ps et al show this more-accurate information. When this code is merged, it will be supported in glibc. I have written the support and added some test programs for glibc, which are what I mainly used to test the new kernel code. You can get those here: http://people.redhat.com/roland/glibc/kernel-cpuclocks.patch Signed-off-by: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/posix-timers.h | 32 ++++- include/linux/sched.h | 10 ++ kernel/Makefile | 2 +- kernel/fork.c | 2 + kernel/posix-cpu-timers.c | 288 +++++++++++++++++++++++++++++++++++++++++++ kernel/posix-timers.c | 10 +- kernel/sched.c | 33 ++++- kernel/signal.c | 1 + 8 files changed, 372 insertions(+), 6 deletions(-) create mode 100644 kernel/posix-cpu-timers.c (limited to 'include/linux') diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index 61f4b12d82c1..a0140c8f72b7 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -4,6 +4,23 @@ #include #include +#define CPUCLOCK_PID(clock) ((pid_t) ~((clock) >> 3)) +#define CPUCLOCK_PERTHREAD(clock) \ + (((clock) & (clockid_t) CPUCLOCK_PERTHREAD_MASK) != 0) +#define CPUCLOCK_PID_MASK 7 +#define CPUCLOCK_PERTHREAD_MASK 4 +#define CPUCLOCK_WHICH(clock) ((clock) 
& (clockid_t) CPUCLOCK_CLOCK_MASK) +#define CPUCLOCK_CLOCK_MASK 3 +#define CPUCLOCK_PROF 0 +#define CPUCLOCK_VIRT 1 +#define CPUCLOCK_SCHED 2 +#define CPUCLOCK_MAX 3 + +#define MAKE_PROCESS_CPUCLOCK(pid, clock) \ + ((~(clockid_t) (pid) << 3) | (clockid_t) (clock)) +#define MAKE_THREAD_CPUCLOCK(tid, clock) \ + MAKE_PROCESS_CPUCLOCK((tid), (clock) | CPUCLOCK_PERTHREAD_MASK) + /* POSIX.1b interval timer structure. */ struct k_itimer { struct list_head list; /* free/ allocate list */ @@ -72,5 +89,18 @@ struct now_struct { (timr)->it_overrun += orun; \ } \ }while (0) -#endif +int posix_cpu_clock_getres(clockid_t which_clock, struct timespec *); +int posix_cpu_clock_get(clockid_t which_clock, struct timespec *); +int posix_cpu_clock_set(clockid_t which_clock, const struct timespec *tp); +int posix_cpu_timer_create(struct k_itimer *); +int posix_cpu_nsleep(clockid_t, int, struct timespec *); +#define posix_cpu_timer_create do_posix_clock_notimer_create +#define posix_cpu_nsleep do_posix_clock_nonanosleep +int posix_cpu_timer_set(struct k_itimer *, int, + struct itimerspec *, struct itimerspec *); +int posix_cpu_timer_del(struct k_itimer *); +void posix_cpu_timer_get(struct k_itimer *, struct itimerspec *); + + +#endif diff --git a/include/linux/sched.h b/include/linux/sched.h index fb151e634c9e..5fff51dc7c32 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -320,6 +320,14 @@ struct signal_struct { unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw; unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt; + /* + * Cumulative ns of scheduled CPU time for dead threads in the + * group, not including a zombie group leader. (This only differs + * from jiffies_to_ns(utime + stime) if sched_clock uses something + * other than jiffies.) 
+ */ + unsigned long long sched_time; + /* * We don't bother to synchronize most readers of this at all, * because there is no reader checking a limit that actually needs @@ -541,6 +549,7 @@ struct task_struct { unsigned long sleep_avg; unsigned long long timestamp, last_ran; + unsigned long long sched_time; /* sched_clock time spent running */ int activated; unsigned long policy; @@ -776,6 +785,7 @@ static inline int set_cpus_allowed(task_t *p, cpumask_t new_mask) #endif extern unsigned long long sched_clock(void); +extern unsigned long long current_sched_time(const task_t *current_task); /* sched_exec is called by processes performing an exec */ #ifdef CONFIG_SMP diff --git a/kernel/Makefile b/kernel/Makefile index d680ace0fdda..0ac3efc9d071 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -7,7 +7,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ sysctl.o capability.o ptrace.o timer.o user.o \ signal.o sys.o kmod.o workqueue.o pid.o \ rcupdate.o intermodule.o extable.o params.o posix-timers.o \ - kthread.o wait.o kfifo.o sys_ni.o + kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o obj-$(CONFIG_FUTEX) += futex.o obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o diff --git a/kernel/fork.c b/kernel/fork.c index f6b929e69f5b..bc0633f9730a 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -749,6 +749,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero; sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; + sig->sched_time = 0; task_lock(current->group_leader); memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim); @@ -877,6 +878,7 @@ static task_t *copy_process(unsigned long clone_flags, p->utime = cputime_zero; p->stime = cputime_zero; + p->sched_time = 0; p->rchar = 0; /* I/O counter: bytes read */ p->wchar = 0; /* I/O counter: bytes written */ p->syscr = 0; /* I/O counter: 
read syscalls */ diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c new file mode 100644 index 000000000000..fdc54f75aa15 --- /dev/null +++ b/kernel/posix-cpu-timers.c @@ -0,0 +1,288 @@ +/* + * Implement CPU time clocks for the POSIX clock interface. + */ + +#include +#include +#include +#include + +union cpu_time_count { + cputime_t cpu; + unsigned long long sched; +}; + +static int check_clock(clockid_t which_clock) +{ + int error = 0; + struct task_struct *p; + const pid_t pid = CPUCLOCK_PID(which_clock); + + if (CPUCLOCK_WHICH(which_clock) >= CPUCLOCK_MAX) + return -EINVAL; + + if (pid == 0) + return 0; + + read_lock(&tasklist_lock); + p = find_task_by_pid(pid); + if (!p || (CPUCLOCK_PERTHREAD(which_clock) ? + p->tgid != current->tgid : p->tgid != pid)) { + error = -EINVAL; + } + read_unlock(&tasklist_lock); + + return error; +} + +static void sample_to_timespec(clockid_t which_clock, + union cpu_time_count cpu, + struct timespec *tp) +{ + if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { + tp->tv_sec = div_long_long_rem(cpu.sched, + NSEC_PER_SEC, &tp->tv_nsec); + } else { + cputime_to_timespec(cpu.cpu, tp); + } +} + +static inline cputime_t prof_ticks(struct task_struct *p) +{ + return cputime_add(p->utime, p->stime); +} +static inline cputime_t virt_ticks(struct task_struct *p) +{ + return p->utime; +} +static inline unsigned long long sched_ns(struct task_struct *p) +{ + return (p == current) ? current_sched_time(p) : p->sched_time; +} + +int posix_cpu_clock_getres(clockid_t which_clock, struct timespec *tp) +{ + int error = check_clock(which_clock); + if (!error) { + tp->tv_sec = 0; + tp->tv_nsec = ((NSEC_PER_SEC + HZ - 1) / HZ); + if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { + /* + * If sched_clock is using a cycle counter, we + * don't have any idea of its true resolution + * exported, but it is much more than 1s/HZ. 
+ */ + tp->tv_nsec = 1; + } + } + return error; +} + +int posix_cpu_clock_set(clockid_t which_clock, const struct timespec *tp) +{ + /* + * You can never reset a CPU clock, but we check for other errors + * in the call before failing with EPERM. + */ + int error = check_clock(which_clock); + if (error == 0) { + error = -EPERM; + } + return error; +} + + +/* + * Sample a per-thread clock for the given task. + */ +static int cpu_clock_sample(clockid_t which_clock, struct task_struct *p, + union cpu_time_count *cpu) +{ + switch (CPUCLOCK_WHICH(which_clock)) { + default: + return -EINVAL; + case CPUCLOCK_PROF: + cpu->cpu = prof_ticks(p); + break; + case CPUCLOCK_VIRT: + cpu->cpu = virt_ticks(p); + break; + case CPUCLOCK_SCHED: + cpu->sched = sched_ns(p); + break; + } + return 0; +} + +/* + * Sample a process (thread group) clock for the given group_leader task. + * Must be called with tasklist_lock held for reading. + */ +static int cpu_clock_sample_group(clockid_t which_clock, + struct task_struct *p, + union cpu_time_count *cpu) +{ + struct task_struct *t = p; + unsigned long flags; + switch (CPUCLOCK_WHICH(which_clock)) { + default: + return -EINVAL; + case CPUCLOCK_PROF: + spin_lock_irqsave(&p->sighand->siglock, flags); + cpu->cpu = cputime_add(p->signal->utime, p->signal->stime); + do { + cpu->cpu = cputime_add(cpu->cpu, prof_ticks(t)); + t = next_thread(t); + } while (t != p); + spin_unlock_irqrestore(&p->sighand->siglock, flags); + break; + case CPUCLOCK_VIRT: + spin_lock_irqsave(&p->sighand->siglock, flags); + cpu->cpu = p->signal->utime; + do { + cpu->cpu = cputime_add(cpu->cpu, virt_ticks(t)); + t = next_thread(t); + } while (t != p); + spin_unlock_irqrestore(&p->sighand->siglock, flags); + break; + case CPUCLOCK_SCHED: + spin_lock_irqsave(&p->sighand->siglock, flags); + cpu->sched = p->signal->sched_time; + /* Add in each other live thread. 
*/ + while ((t = next_thread(t)) != p) { + cpu->sched += t->sched_time; + } + if (p->tgid == current->tgid) { + /* + * We're sampling ourselves, so include the + * cycles not yet banked. We still omit + * other threads running on other CPUs, + * so the total can always be behind as + * much as max(nthreads-1,ncpus) * (NSEC_PER_SEC/HZ). + */ + cpu->sched += current_sched_time(current); + } else { + cpu->sched += p->sched_time; + } + spin_unlock_irqrestore(&p->sighand->siglock, flags); + break; + } + return 0; +} + + +int posix_cpu_clock_get(clockid_t which_clock, struct timespec *tp) +{ + const pid_t pid = CPUCLOCK_PID(which_clock); + int error = -EINVAL; + union cpu_time_count rtn; + + if (pid == 0) { + /* + * Special case constant value for our own clocks. + * We don't have to do any lookup to find ourselves. + */ + if (CPUCLOCK_PERTHREAD(which_clock)) { + /* + * Sampling just ourselves we can do with no locking. + */ + error = cpu_clock_sample(which_clock, + current, &rtn); + } else { + read_lock(&tasklist_lock); + error = cpu_clock_sample_group(which_clock, + current, &rtn); + read_unlock(&tasklist_lock); + } + } else { + /* + * Find the given PID, and validate that the caller + * should be able to see it. + */ + struct task_struct *p; + read_lock(&tasklist_lock); + p = find_task_by_pid(pid); + if (p) { + if (CPUCLOCK_PERTHREAD(which_clock)) { + if (p->tgid == current->tgid) { + error = cpu_clock_sample(which_clock, + p, &rtn); + } + } else if (p->tgid == pid && p->signal) { + error = cpu_clock_sample_group(which_clock, + p, &rtn); + } + } + read_unlock(&tasklist_lock); + } + + if (error) + return error; + sample_to_timespec(which_clock, rtn, tp); + return 0; +} + +/* + * These can't be called, since timer_create never works. 
+ */ +int posix_cpu_timer_set(struct k_itimer *timer, int flags, + struct itimerspec *old, struct itimerspec *new) +{ + BUG(); + return -EINVAL; +} +int posix_cpu_timer_del(struct k_itimer *timer) +{ + BUG(); + return -EINVAL; +} +void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *spec) +{ + BUG(); +} + + +#define PROCESS_CLOCK MAKE_PROCESS_CPUCLOCK(0, CPUCLOCK_SCHED) +#define THREAD_CLOCK MAKE_THREAD_CPUCLOCK(0, CPUCLOCK_SCHED) + +static int process_cpu_clock_getres(clockid_t which_clock, struct timespec *tp) +{ + return posix_cpu_clock_getres(PROCESS_CLOCK, tp); +} +static int process_cpu_clock_get(clockid_t which_clock, struct timespec *tp) +{ + return posix_cpu_clock_get(PROCESS_CLOCK, tp); +} +static int thread_cpu_clock_getres(clockid_t which_clock, struct timespec *tp) +{ + return posix_cpu_clock_getres(THREAD_CLOCK, tp); +} +static int thread_cpu_clock_get(clockid_t which_clock, struct timespec *tp) +{ + return posix_cpu_clock_get(THREAD_CLOCK, tp); +} + + +static __init int init_posix_cpu_timers(void) +{ + struct k_clock process = { + .clock_getres = process_cpu_clock_getres, + .clock_get = process_cpu_clock_get, + .clock_set = do_posix_clock_nosettime, + .timer_create = do_posix_clock_notimer_create, + .nsleep = do_posix_clock_nonanosleep, + }; + struct k_clock thread = { + .clock_getres = thread_cpu_clock_getres, + .clock_get = thread_cpu_clock_get, + .clock_set = do_posix_clock_nosettime, + .timer_create = do_posix_clock_notimer_create, + .nsleep = do_posix_clock_nonanosleep, + }; + + register_posix_clock(CLOCK_PROCESS_CPUTIME_ID, &process); + register_posix_clock(CLOCK_THREAD_CPUTIME_ID, &thread); + + return 0; +} +__initcall(init_posix_cpu_timers); diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index d04a2f17e395..09b2d6b4634f 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -200,13 +200,15 @@ static inline void unlock_timer(struct k_itimer *timr, unsigned long flags) #ifdef CLOCK_DISPATCH_DIRECT #define 
CLOCK_DISPATCH(clock, call, arglist) \ - ((*posix_clocks[clock].call) arglist) + ((clock) < 0 ? posix_cpu_##call arglist : \ + (*posix_clocks[clock].call) arglist) #define DEFHOOK(name) if (clock->name == NULL) clock->name = common_##name #define COMMONDEFN static #else #define CLOCK_DISPATCH(clock, call, arglist) \ - (posix_clocks[clock].call != NULL \ - ? (*posix_clocks[clock].call) arglist : common_##call arglist) + ((clock) < 0 ? posix_cpu_##call arglist : \ + (posix_clocks[clock].call != NULL \ + ? (*posix_clocks[clock].call) arglist : common_##call arglist)) #define DEFHOOK(name) (void) 0 /* Nothing here. */ #define COMMONDEFN static inline #endif @@ -277,6 +279,8 @@ static inline void common_default_hooks(struct k_clock *clock) */ static inline int invalid_clockid(clockid_t which_clock) { + if (which_clock < 0) /* CPU clock, posix_cpu_* will check it */ + return 0; if ((unsigned) which_clock >= MAX_CLOCKS) return 1; if (posix_clocks[which_clock].clock_getres != NULL) diff --git a/kernel/sched.c b/kernel/sched.c index 0888acbe3f66..8176366cfd8f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2241,6 +2241,32 @@ DEFINE_PER_CPU(struct kernel_stat, kstat); EXPORT_PER_CPU_SYMBOL(kstat); +/* + * This is called on clock ticks and on context switches. + * Bank in p->sched_time the ns elapsed since the last tick or switch. + */ +static inline void update_cpu_clock(task_t *p, runqueue_t *rq, + unsigned long long now) +{ + unsigned long long last = max(p->timestamp, rq->timestamp_last_tick); + p->sched_time += now - last; +} + +/* + * Return current->sched_time plus any more ns on the sched_clock + * that have not yet been banked. 
+ */ +unsigned long long current_sched_time(const task_t *tsk) +{ + unsigned long long ns; + unsigned long flags; + local_irq_save(flags); + ns = max(tsk->timestamp, task_rq(tsk)->timestamp_last_tick); + ns = tsk->sched_time + (sched_clock() - ns); + local_irq_restore(flags); + return ns; +} + /* * We place interactive tasks back into the active array, if possible. * @@ -2419,8 +2445,11 @@ void scheduler_tick(void) int cpu = smp_processor_id(); runqueue_t *rq = this_rq(); task_t *p = current; + unsigned long long now = sched_clock(); + + update_cpu_clock(p, rq, now); - rq->timestamp_last_tick = sched_clock(); + rq->timestamp_last_tick = now; if (p == rq->idle) { if (wake_priority_sleeper(rq)) @@ -2804,6 +2833,8 @@ switch_tasks: clear_tsk_need_resched(prev); rcu_qsctr_inc(task_cpu(prev)); + update_cpu_clock(prev, rq, now); + prev->sleep_avg -= run_time; if ((long)prev->sleep_avg <= 0) prev->sleep_avg = 0; diff --git a/kernel/signal.c b/kernel/signal.c index 9e87ab3f8f21..3f1df438d23c 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -381,6 +381,7 @@ void __exit_signal(struct task_struct *tsk) sig->maj_flt += tsk->maj_flt; sig->nvcsw += tsk->nvcsw; sig->nivcsw += tsk->nivcsw; + sig->sched_time += tsk->sched_time; spin_unlock(&sighand->siglock); sig = NULL; /* Marker for below. */ } -- cgit v1.2.3 From 2c3871a8f5244025fe9d846f76994251319e23e4 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Mon, 7 Mar 2005 18:16:42 -0800 Subject: [PATCH] posix-timers: CPU clock support for POSIX timers POSIX requires that when you claim _POSIX_CPUTIME and _POSIX_THREAD_CPUTIME, not only the clock_* calls but also timer_* calls must support the thread and process CPU time clocks. This patch provides that support, building on my recent additions to support these clocks in the POSIX clock_* interfaces. This patch will not work without those changes, as well as the patch fixing the timer lock-siglock deadlock problem. 
The apparent pervasive changes to posix-timers.c are simply that some fields of struct k_itimer have changed name and moved into a union. This was appropriate since the data structures required for the existing real-time timer support and for the new thread/process CPU-time timers are quite different. The glibc patches to support CPU time clocks using the new kernel support are in http://people.redhat.com/roland/glibc/kernel-cpuclocks.patch, and that includes tests for the timer support (if you build glibc with NPTL). From: Christoph Lameter Your patch breaks the mmtimer driver because it used k_itimer values for its own purposes. Here is a fix by defining an additional structure in k_itimer (same approach for mmtimer as the cpu timers): From: Roland McGrath Fix bug identified by Alexander Nyberg > The problem arises from code touching the union in alloc_posix_timer() > which makes firing go non-zero. When firing is checked in > posix_cpu_timer_set() it will be positive causing an infinite loop. > > So either the below fix or preferably move the INIT_LIST_HEAD(x) from > alloc_posix_timer() to somewhere later where it doesn't disturb the other > union members. Thanks for finding this problem. The latter is what I think is the right solution. This patch does that, and also removes some superfluous rezeroing. 
Signed-off-by: Roland McGrath Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/char/mmtimer.c | 40 +- include/linux/init_task.h | 9 + include/linux/posix-timers.h | 51 +- include/linux/sched.h | 7 + kernel/exit.c | 3 + kernel/fork.c | 20 + kernel/posix-cpu-timers.c | 1115 +++++++++++++++++++++++++++++++++++++++++- kernel/posix-timers.c | 111 +++-- kernel/signal.c | 3 + kernel/timer.c | 2 + 10 files changed, 1256 insertions(+), 105 deletions(-) (limited to 'include/linux') diff --git a/drivers/char/mmtimer.c b/drivers/char/mmtimer.c index 04c450a293f2..67a604a944ba 100644 --- a/drivers/char/mmtimer.c +++ b/drivers/char/mmtimer.c @@ -418,19 +418,19 @@ static int inline reschedule_periodic_timer(mmtimer_t *x) int n; struct k_itimer *t = x->timer; - t->it_timer.magic = x->i; + t->it.mmtimer.clock = x->i; t->it_overrun--; n = 0; do { - t->it_timer.expires += t->it_incr << n; + t->it.mmtimer.expires += t->it.mmtimer.incr << n; t->it_overrun += 1 << n; n++; if (n > 20) return 1; - } while (mmtimer_setup(x->i, t->it_timer.expires)); + } while (mmtimer_setup(x->i, t->it.mmtimer.expires)); return 0; } @@ -466,7 +466,7 @@ mmtimer_interrupt(int irq, void *dev_id, struct pt_regs *regs) spin_lock(&base[i].lock); if (base[i].cpu == smp_processor_id()) { if (base[i].timer) - expires = base[i].timer->it_timer.expires; + expires = base[i].timer->it.mmtimer.expires; /* expires test won't work with shared irqs */ if ((mmtimer_int_pending(i) > 0) || (expires && (expires < rtc_time()))) { @@ -503,7 +503,7 @@ void mmtimer_tasklet(unsigned long data) { t->it_overrun++; } - if(t->it_incr) { + if(t->it.mmtimer.incr) { /* Periodic timer */ if (reschedule_periodic_timer(x)) { printk(KERN_WARNING "mmtimer: unable to reschedule\n"); @@ -511,7 +511,7 @@ void mmtimer_tasklet(unsigned long data) { } } else { /* Ensure we don't false trigger in mmtimer_interrupt */ - t->it_timer.expires = 0; + t->it.mmtimer.expires = 0; } t->it_overrun_last = 
t->it_overrun; out: @@ -522,7 +522,7 @@ out: static int sgi_timer_create(struct k_itimer *timer) { /* Insure that a newly created timer is off */ - timer->it_timer.magic = TIMER_OFF; + timer->it.mmtimer.clock = TIMER_OFF; return 0; } @@ -533,8 +533,8 @@ static int sgi_timer_create(struct k_itimer *timer) */ static int sgi_timer_del(struct k_itimer *timr) { - int i = timr->it_timer.magic; - cnodeid_t nodeid = timr->it_timer.data; + int i = timr->it.mmtimer.clock; + cnodeid_t nodeid = timr->it.mmtimer.node; mmtimer_t *t = timers + nodeid * NUM_COMPARATORS +i; unsigned long irqflags; @@ -542,8 +542,8 @@ static int sgi_timer_del(struct k_itimer *timr) spin_lock_irqsave(&t->lock, irqflags); mmtimer_disable_int(cnodeid_to_nasid(nodeid),i); t->timer = NULL; - timr->it_timer.magic = TIMER_OFF; - timr->it_timer.expires = 0; + timr->it.mmtimer.clock = TIMER_OFF; + timr->it.mmtimer.expires = 0; spin_unlock_irqrestore(&t->lock, irqflags); } return 0; @@ -556,7 +556,7 @@ static int sgi_timer_del(struct k_itimer *timr) static void sgi_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting) { - if (timr->it_timer.magic == TIMER_OFF) { + if (timr->it.mmtimer.clock == TIMER_OFF) { cur_setting->it_interval.tv_nsec = 0; cur_setting->it_interval.tv_sec = 0; cur_setting->it_value.tv_nsec = 0; @@ -564,8 +564,8 @@ static void sgi_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting) return; } - ns_to_timespec(cur_setting->it_interval, timr->it_incr * sgi_clock_period); - ns_to_timespec(cur_setting->it_value, (timr->it_timer.expires - rtc_time())* sgi_clock_period); + ns_to_timespec(cur_setting->it_interval, timr->it.mmtimer.incr * sgi_clock_period); + ns_to_timespec(cur_setting->it_value, (timr->it.mmtimer.expires - rtc_time())* sgi_clock_period); return; } @@ -638,19 +638,19 @@ retry: base[i].timer = timr; base[i].cpu = smp_processor_id(); - timr->it_timer.magic = i; - timr->it_timer.data = nodeid; - timr->it_incr = period; - timr->it_timer.expires = when; + 
timr->it.mmtimer.clock = i; + timr->it.mmtimer.node = nodeid; + timr->it.mmtimer.incr = period; + timr->it.mmtimer.expires = when; if (period == 0) { if (mmtimer_setup(i, when)) { mmtimer_disable_int(-1, i); posix_timer_event(timr, 0); - timr->it_timer.expires = 0; + timr->it.mmtimer.expires = 0; } } else { - timr->it_timer.expires -= period; + timr->it.mmtimer.expires -= period; if (reschedule_periodic_timer(base+i)) err = -EINVAL; } diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 6498e9da9ce6..aa1eb2d45ed8 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -51,6 +51,7 @@ .list = LIST_HEAD_INIT(sig.shared_pending.list), \ .signal = {{0}}}, \ .posix_timers = LIST_HEAD_INIT(sig.posix_timers), \ + .cpu_timers = INIT_CPU_TIMERS(sig.cpu_timers), \ .rlim = INIT_RLIMITS, \ } @@ -112,8 +113,16 @@ extern struct group_info init_groups; .proc_lock = SPIN_LOCK_UNLOCKED, \ .switch_lock = SPIN_LOCK_UNLOCKED, \ .journal_info = NULL, \ + .cpu_timers = INIT_CPU_TIMERS(tsk.cpu_timers), \ } +#define INIT_CPU_TIMERS(cpu_timers) \ +{ \ + LIST_HEAD_INIT(cpu_timers[0]), \ + LIST_HEAD_INIT(cpu_timers[1]), \ + LIST_HEAD_INIT(cpu_timers[2]), \ +} + #endif diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index a0140c8f72b7..2820fd4ab58b 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -3,8 +3,21 @@ #include #include +#include -#define CPUCLOCK_PID(clock) ((pid_t) ~((clock) >> 3)) +union cpu_time_count { + cputime_t cpu; + unsigned long long sched; +}; + +struct cpu_timer_list { + struct list_head entry; + union cpu_time_count expires, incr; + struct task_struct *task; + int firing; +}; + +#define CPUCLOCK_PID(clock) ((pid_t) ~((clock) >> 3)) #define CPUCLOCK_PERTHREAD(clock) \ (((clock) & (clockid_t) CPUCLOCK_PERTHREAD_MASK) != 0) #define CPUCLOCK_PID_MASK 7 @@ -30,15 +43,27 @@ struct k_itimer { int it_overrun; /* overrun on pending signal */ int it_overrun_last; /* overrun on last 
delivered signal */ int it_requeue_pending; /* waiting to requeue this timer */ +#define REQUEUE_PENDING 1 int it_sigev_notify; /* notify word of sigevent struct */ int it_sigev_signo; /* signo word of sigevent struct */ sigval_t it_sigev_value; /* value word of sigevent struct */ - unsigned long it_incr; /* interval specified in jiffies */ struct task_struct *it_process; /* process to send signal to */ - struct timer_list it_timer; struct sigqueue *sigq; /* signal queue entry. */ - struct list_head abs_timer_entry; /* clock abs_timer_list */ - struct timespec wall_to_prev; /* wall_to_monotonic used when set */ + union { + struct { + struct timer_list timer; + struct list_head abs_timer_entry; /* clock abs_timer_list */ + struct timespec wall_to_prev; /* wall_to_monotonic used when set */ + unsigned long incr; /* interval in jiffies */ + } real; + struct cpu_timer_list cpu; + struct { + unsigned int clock; + unsigned int node; + unsigned long incr; + unsigned long expires; + } mmtimer; + } it; }; struct k_clock_abs { @@ -57,6 +82,7 @@ struct k_clock { struct itimerspec * new_setting, struct itimerspec * old_setting); int (*timer_del) (struct k_itimer * timr); +#define TIMER_RETRY 1 void (*timer_get) (struct k_itimer * timr, struct itimerspec * cur_setting); }; @@ -82,10 +108,11 @@ struct now_struct { #define posix_bump_timer(timr, now) \ do { \ long delta, orun; \ - delta = now.jiffies - (timr)->it_timer.expires; \ + delta = now.jiffies - (timr)->it.real.timer.expires; \ if (delta >= 0) { \ - orun = 1 + (delta / (timr)->it_incr); \ - (timr)->it_timer.expires += orun * (timr)->it_incr; \ + orun = 1 + (delta / (timr)->it.real.incr); \ + (timr)->it.real.timer.expires += \ + orun * (timr)->it.real.incr; \ (timr)->it_overrun += orun; \ } \ }while (0) @@ -95,12 +122,16 @@ int posix_cpu_clock_get(clockid_t which_clock, struct timespec *); int posix_cpu_clock_set(clockid_t which_clock, const struct timespec *tp); int posix_cpu_timer_create(struct k_itimer *); int 
posix_cpu_nsleep(clockid_t, int, struct timespec *); -#define posix_cpu_timer_create do_posix_clock_notimer_create -#define posix_cpu_nsleep do_posix_clock_nonanosleep int posix_cpu_timer_set(struct k_itimer *, int, struct itimerspec *, struct itimerspec *); int posix_cpu_timer_del(struct k_itimer *); void posix_cpu_timer_get(struct k_itimer *, struct itimerspec *); +void posix_cpu_timer_schedule(struct k_itimer *); + +void run_posix_cpu_timers(struct task_struct *); +void posix_cpu_timers_exit(struct task_struct *); +void posix_cpu_timers_exit_group(struct task_struct *); + #endif diff --git a/include/linux/sched.h b/include/linux/sched.h index 5fff51dc7c32..36a6174597f7 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -338,6 +338,8 @@ struct signal_struct { * have no need to disable irqs. */ struct rlimit rlim[RLIM_NLIMITS]; + + struct list_head cpu_timers[3]; }; /* @@ -612,6 +614,11 @@ struct task_struct { struct timespec start_time; /* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */ unsigned long min_flt, maj_flt; + + cputime_t it_prof_expires, it_virt_expires; + unsigned long long it_sched_expires; + struct list_head cpu_timers[3]; + /* process credentials */ uid_t uid,euid,suid,fsuid; gid_t gid,egid,sgid,fsgid; diff --git a/kernel/exit.c b/kernel/exit.c index 4173fa7536dc..7605308b580d 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -759,6 +759,9 @@ static void exit_notify(struct task_struct *tsk) */ tsk->it_virt_value = cputime_zero; tsk->it_prof_value = cputime_zero; + tsk->it_virt_expires = cputime_zero; + tsk->it_prof_expires = cputime_zero; + tsk->it_sched_expires = 0; write_unlock_irq(&tasklist_lock); diff --git a/kernel/fork.c b/kernel/fork.c index bc0633f9730a..718eaf0bb1cd 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -750,6 +750,9 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; 
sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; sig->sched_time = 0; + INIT_LIST_HEAD(&sig->cpu_timers[0]); + INIT_LIST_HEAD(&sig->cpu_timers[1]); + INIT_LIST_HEAD(&sig->cpu_timers[2]); task_lock(current->group_leader); memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim); @@ -885,6 +888,13 @@ static task_t *copy_process(unsigned long clone_flags, p->syscw = 0; /* I/O counter: write syscalls */ acct_clear_integrals(p); + p->it_virt_expires = cputime_zero; + p->it_prof_expires = cputime_zero; + p->it_sched_expires = 0; + INIT_LIST_HEAD(&p->cpu_timers[0]); + INIT_LIST_HEAD(&p->cpu_timers[1]); + INIT_LIST_HEAD(&p->cpu_timers[2]); + p->lock_depth = -1; /* -1 = no lock */ do_posix_clock_monotonic_gettime(&p->start_time); p->security = NULL; @@ -1017,6 +1027,16 @@ static task_t *copy_process(unsigned long clone_flags, set_tsk_thread_flag(p, TIF_SIGPENDING); } + if (!list_empty(¤t->signal->cpu_timers[0]) || + !list_empty(¤t->signal->cpu_timers[1]) || + !list_empty(¤t->signal->cpu_timers[2])) { + /* + * Have child wake up on its first tick to check + * for process CPU timers. 
+ */ + p->it_prof_expires = jiffies_to_cputime(1); + } + spin_unlock(¤t->sighand->siglock); } diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index fdc54f75aa15..2adadcedc80f 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -7,11 +7,6 @@ #include #include -union cpu_time_count { - cputime_t cpu; - unsigned long long sched; -}; - static int check_clock(clockid_t which_clock) { int error = 0; @@ -35,6 +30,19 @@ static int check_clock(clockid_t which_clock) return error; } +static inline union cpu_time_count +timespec_to_sample(clockid_t which_clock, const struct timespec *tp) +{ + union cpu_time_count ret; + ret.sched = 0; /* high half always zero when .cpu used */ + if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { + ret.sched = tp->tv_sec * NSEC_PER_SEC + tp->tv_nsec; + } else { + ret.cpu = timespec_to_jiffies(tp); + } + return ret; +} + static void sample_to_timespec(clockid_t which_clock, union cpu_time_count cpu, struct timespec *tp) @@ -47,6 +55,71 @@ static void sample_to_timespec(clockid_t which_clock, } } +static inline int cpu_time_before(clockid_t which_clock, + union cpu_time_count now, + union cpu_time_count then) +{ + if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { + return now.sched < then.sched; + } else { + return cputime_lt(now.cpu, then.cpu); + } +} +static inline void cpu_time_add(clockid_t which_clock, + union cpu_time_count *acc, + union cpu_time_count val) +{ + if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { + acc->sched += val.sched; + } else { + acc->cpu = cputime_add(acc->cpu, val.cpu); + } +} +static inline union cpu_time_count cpu_time_sub(clockid_t which_clock, + union cpu_time_count a, + union cpu_time_count b) +{ + if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { + a.sched -= b.sched; + } else { + a.cpu = cputime_sub(a.cpu, b.cpu); + } + return a; +} + +/* + * Update expiry time from increment, and increase overrun count, + * given the current clock sample. 
+ */ +static inline void bump_cpu_timer(struct k_itimer *timer, + union cpu_time_count now) +{ + if (timer->it.cpu.incr.sched == 0) + return; + + if (CPUCLOCK_WHICH(timer->it_clock) == CPUCLOCK_SCHED) { + long long delta; + delta = now.sched - timer->it.cpu.expires.sched; + if (delta >= 0) { + do_div(delta, timer->it.cpu.incr.sched); + delta++; + timer->it.cpu.expires.sched += + delta * timer->it.cpu.incr.sched; + timer->it_overrun += (int) delta; + } + } else if (cputime_le(now.cpu, timer->it.cpu.expires.cpu)) { + cputime_t delta = cputime_sub(now.cpu, + timer->it.cpu.expires.cpu); + if (cputime_ge(delta, cputime_zero)) { + long orun = 1 + (delta / timer->it.cpu.incr.cpu); + timer->it.cpu.expires.cpu = + cputime_add(timer->it.cpu.expires.cpu, + orun * timer->it.cpu.incr.cpu); + timer->it_overrun += orun; + } + } +} + static inline cputime_t prof_ticks(struct task_struct *p) { return cputime_add(p->utime, p->stime); @@ -222,23 +295,1008 @@ int posix_cpu_clock_get(clockid_t which_clock, struct timespec *tp) return 0; } + /* - * These can't be called, since timer_create never works. + * Validate the clockid_t for a new CPU-clock timer, and initialize the timer. + * This is called from sys_timer_create with the new timer already locked. 
*/ -int posix_cpu_timer_set(struct k_itimer *timer, int flags, - struct itimerspec *old, struct itimerspec *new) +int posix_cpu_timer_create(struct k_itimer *new_timer) { - BUG(); - return -EINVAL; + int ret = 0; + const pid_t pid = CPUCLOCK_PID(new_timer->it_clock); + struct task_struct *p; + + if (CPUCLOCK_WHICH(new_timer->it_clock) >= CPUCLOCK_MAX) + return -EINVAL; + + INIT_LIST_HEAD(&new_timer->it.cpu.entry); + new_timer->it.cpu.incr.sched = 0; + new_timer->it.cpu.expires.sched = 0; + + read_lock(&tasklist_lock); + if (CPUCLOCK_PERTHREAD(new_timer->it_clock)) { + if (pid == 0) { + p = current; + } else { + p = find_task_by_pid(pid); + if (p && p->tgid != current->tgid) + p = NULL; + } + } else { + if (pid == 0) { + p = current->group_leader; + } else { + p = find_task_by_pid(pid); + if (p && p->tgid != pid) + p = NULL; + } + } + new_timer->it.cpu.task = p; + if (p) { + get_task_struct(p); + } else { + ret = -EINVAL; + } + read_unlock(&tasklist_lock); + + return ret; } + +/* + * Clean up a CPU-clock timer that is about to be destroyed. + * This is called from timer deletion with the timer already locked. + * If we return TIMER_RETRY, it's necessary to release the timer's lock + * and try again. (This happens when the timer is in the middle of firing.) + */ int posix_cpu_timer_del(struct k_itimer *timer) { - BUG(); - return -EINVAL; + struct task_struct *p = timer->it.cpu.task; + + if (timer->it.cpu.firing) + return TIMER_RETRY; + + if (unlikely(p == NULL)) + return 0; + + if (!list_empty(&timer->it.cpu.entry)) { + read_lock(&tasklist_lock); + if (unlikely(p->signal == NULL)) { + /* + * We raced with the reaping of the task. + * The deletion should have cleared us off the list. + */ + BUG_ON(!list_empty(&timer->it.cpu.entry)); + } else { + /* + * Take us off the task's timer list. 
+ */ + spin_lock(&p->sighand->siglock); + list_del(&timer->it.cpu.entry); + spin_unlock(&p->sighand->siglock); + } + read_unlock(&tasklist_lock); + } + put_task_struct(p); + + return 0; +} + +/* + * Clean out CPU timers still ticking when a thread exited. The task + * pointer is cleared, and the expiry time is replaced with the residual + * time for later timer_gettime calls to return. + * This must be called with the siglock held. + */ +static void cleanup_timers(struct list_head *head, + cputime_t utime, cputime_t stime, + unsigned long long sched_time) +{ + struct cpu_timer_list *timer, *next; + cputime_t ptime = cputime_add(utime, stime); + + list_for_each_entry_safe(timer, next, head, entry) { + timer->task = NULL; + list_del_init(&timer->entry); + if (cputime_lt(timer->expires.cpu, ptime)) { + timer->expires.cpu = cputime_zero; + } else { + timer->expires.cpu = cputime_sub(timer->expires.cpu, + ptime); + } + } + + ++head; + list_for_each_entry_safe(timer, next, head, entry) { + timer->task = NULL; + list_del_init(&timer->entry); + if (cputime_lt(timer->expires.cpu, utime)) { + timer->expires.cpu = cputime_zero; + } else { + timer->expires.cpu = cputime_sub(timer->expires.cpu, + utime); + } + } + + ++head; + list_for_each_entry_safe(timer, next, head, entry) { + timer->task = NULL; + list_del_init(&timer->entry); + if (timer->expires.sched < sched_time) { + timer->expires.sched = 0; + } else { + timer->expires.sched -= sched_time; + } + } +} + +/* + * These are both called with the siglock held, when the current thread + * is being reaped. When the final (leader) thread in the group is reaped, + * posix_cpu_timers_exit_group will be called after posix_cpu_timers_exit. 
+ */ +void posix_cpu_timers_exit(struct task_struct *tsk) +{ + cleanup_timers(tsk->cpu_timers, + tsk->utime, tsk->stime, tsk->sched_time); + +} +void posix_cpu_timers_exit_group(struct task_struct *tsk) +{ + cleanup_timers(tsk->signal->cpu_timers, + cputime_add(tsk->utime, tsk->signal->utime), + cputime_add(tsk->stime, tsk->signal->stime), + tsk->sched_time + tsk->signal->sched_time); +} + + +/* + * Set the expiry times of all the threads in the process so one of them + * will go off before the process cumulative expiry total is reached. + */ +static void +process_timer_rebalance(struct k_itimer *timer, union cpu_time_count val) +{ + cputime_t ticks, left; + unsigned long long ns, nsleft; + struct task_struct *const p = timer->it.cpu.task, *t = p; + unsigned int nthreads = atomic_read(&p->signal->live); + + switch (CPUCLOCK_WHICH(timer->it_clock)) { + default: + BUG(); + break; + case CPUCLOCK_PROF: + left = cputime_sub(timer->it.cpu.expires.cpu, val.cpu) + / nthreads; + do { + if (!unlikely(t->exit_state)) { + ticks = cputime_add(prof_ticks(t), left); + if (cputime_eq(t->it_prof_expires, + cputime_zero) || + cputime_gt(t->it_prof_expires, ticks)) { + t->it_prof_expires = ticks; + } + } + t = next_thread(t); + } while (t != p); + break; + case CPUCLOCK_VIRT: + left = cputime_sub(timer->it.cpu.expires.cpu, val.cpu) + / nthreads; + do { + if (!unlikely(t->exit_state)) { + ticks = cputime_add(virt_ticks(t), left); + if (cputime_eq(t->it_virt_expires, + cputime_zero) || + cputime_gt(t->it_virt_expires, ticks)) { + t->it_virt_expires = ticks; + } + } + t = next_thread(t); + } while (t != p); + break; + case CPUCLOCK_SCHED: + nsleft = timer->it.cpu.expires.sched - val.sched; + do_div(nsleft, nthreads); + do { + if (!unlikely(t->exit_state)) { + ns = t->sched_time + nsleft; + if (t->it_sched_expires == 0 || + t->it_sched_expires > ns) { + t->it_sched_expires = ns; + } + } + t = next_thread(t); + } while (t != p); + break; + } +} + +static void clear_dead_task(struct 
k_itimer *timer, union cpu_time_count now) +{ + /* + * That's all for this thread or process. + * We leave our residual in expires to be reported. + */ + put_task_struct(timer->it.cpu.task); + timer->it.cpu.task = NULL; + timer->it.cpu.expires = cpu_time_sub(timer->it_clock, + timer->it.cpu.expires, + now); +} + +/* + * Insert the timer on the appropriate list before any timers that + * expire later. This must be called with the tasklist_lock held + * for reading, and interrupts disabled. + */ +static void arm_timer(struct k_itimer *timer, union cpu_time_count now) +{ + struct task_struct *p = timer->it.cpu.task; + struct list_head *head, *listpos; + struct cpu_timer_list *const nt = &timer->it.cpu; + struct cpu_timer_list *next; + + head = (CPUCLOCK_PERTHREAD(timer->it_clock) ? + p->cpu_timers : p->signal->cpu_timers); + head += CPUCLOCK_WHICH(timer->it_clock); + + BUG_ON(!irqs_disabled()); + spin_lock(&p->sighand->siglock); + + listpos = head; + if (CPUCLOCK_WHICH(timer->it_clock) == CPUCLOCK_SCHED) { + list_for_each_entry(next, head, entry) { + if (next->expires.sched > nt->expires.sched) { + listpos = &next->entry; + break; + } + } + } else { + list_for_each_entry(next, head, entry) { + if (cputime_gt(next->expires.cpu, nt->expires.cpu)) { + listpos = &next->entry; + break; + } + } + } + list_add(&nt->entry, listpos); + + if (listpos == head) { + /* + * We are the new earliest-expiring timer. + * If we are a thread timer, there can always + * be a process timer telling us to stop earlier. 
+ */ + + if (CPUCLOCK_PERTHREAD(timer->it_clock)) { + switch (CPUCLOCK_WHICH(timer->it_clock)) { + default: + BUG(); +#define UPDATE_CLOCK(WHICH, c, n) \ + case CPUCLOCK_##WHICH: \ + if (p->it_##c##_expires == 0 || \ + p->it_##c##_expires > nt->expires.n) { \ + p->it_##c##_expires = nt->expires.n; \ + } \ + break + UPDATE_CLOCK(PROF, prof, cpu); + UPDATE_CLOCK(VIRT, virt, cpu); + UPDATE_CLOCK(SCHED, sched, sched); +#undef UPDATE_CLOCK + } + } else { + /* + * For a process timer, we must balance + * all the live threads' expirations. + */ + process_timer_rebalance(timer, now); + } + } + + spin_unlock(&p->sighand->siglock); +} + +/* + * The timer is locked, fire it and arrange for its reload. + */ +static void cpu_timer_fire(struct k_itimer *timer) +{ + if (unlikely(timer->sigq == NULL)) { + /* + * This a special case for clock_nanosleep, + * not a normal timer from sys_timer_create. + */ + wake_up_process(timer->it_process); + timer->it.cpu.expires.sched = 0; + } else if (timer->it.cpu.incr.sched == 0) { + /* + * One-shot timer. Clear it as soon as it's fired. + */ + posix_timer_event(timer, 0); + timer->it.cpu.expires.sched = 0; + } else if (posix_timer_event(timer, ++timer->it_requeue_pending)) { + /* + * The signal did not get queued because the signal + * was ignored, so we won't get any callback to + * reload the timer. But we need to keep it + * ticking in case the signal is deliverable next time. + */ + posix_cpu_timer_schedule(timer); + } +} + +/* + * Guts of sys_timer_settime for CPU timers. + * This is called with the timer locked and interrupts disabled. + * If we return TIMER_RETRY, it's necessary to release the timer's lock + * and try again. (This happens when the timer is in the middle of firing.) 
+ */ +int posix_cpu_timer_set(struct k_itimer *timer, int flags, + struct itimerspec *new, struct itimerspec *old) +{ + struct task_struct *p = timer->it.cpu.task; + union cpu_time_count old_expires, new_expires, val; + int ret; + + if (unlikely(p == NULL)) { + /* + * Timer refers to a dead task's clock. + */ + return -ESRCH; + } + + new_expires = timespec_to_sample(timer->it_clock, &new->it_value); + + read_lock(&tasklist_lock); + /* + * We need the tasklist_lock to protect against reaping that + * clears p->signal. If p has just been reaped, we can no + * longer get any information about it at all. + */ + if (unlikely(p->signal == NULL)) { + read_unlock(&tasklist_lock); + put_task_struct(p); + timer->it.cpu.task = NULL; + return -ESRCH; + } + + /* + * Disarm any old timer after extracting its expiry time. + */ + BUG_ON(!irqs_disabled()); + spin_lock(&p->sighand->siglock); + old_expires = timer->it.cpu.expires; + list_del_init(&timer->it.cpu.entry); + spin_unlock(&p->sighand->siglock); + + /* + * We need to sample the current value to convert the new + * value from to relative and absolute, and to convert the + * old value from absolute to relative. To set a process + * timer, we need a sample to balance the thread expiry + * times (in arm_timer). With an absolute time, we must + * check if it's already passed. In short, we need a sample. + */ + if (CPUCLOCK_PERTHREAD(timer->it_clock)) { + cpu_clock_sample(timer->it_clock, p, &val); + } else { + cpu_clock_sample_group(timer->it_clock, p, &val); + } + + if (old) { + if (old_expires.sched == 0) { + old->it_value.tv_sec = 0; + old->it_value.tv_nsec = 0; + } else { + /* + * Update the timer in case it has + * overrun already. If it has, + * we'll report it as having overrun + * and with the next reloaded timer + * already ticking, though we are + * swallowing that pending + * notification here to install the + * new setting. 
+ */ + bump_cpu_timer(timer, val); + if (cpu_time_before(timer->it_clock, val, + timer->it.cpu.expires)) { + old_expires = cpu_time_sub( + timer->it_clock, + timer->it.cpu.expires, val); + sample_to_timespec(timer->it_clock, + old_expires, + &old->it_value); + } else { + old->it_value.tv_nsec = 1; + old->it_value.tv_sec = 0; + } + } + } + + if (unlikely(timer->it.cpu.firing)) { + /* + * We are colliding with the timer actually firing. + * Punt after filling in the timer's old value, and + * disable this firing since we are already reporting + * it as an overrun (thanks to bump_cpu_timer above). + */ + read_unlock(&tasklist_lock); + timer->it.cpu.firing = -1; + ret = TIMER_RETRY; + goto out; + } + + if (new_expires.sched != 0 && !(flags & TIMER_ABSTIME)) { + cpu_time_add(timer->it_clock, &new_expires, val); + } + + /* + * Install the new expiry time (or zero). + * For a timer with no notification action, we don't actually + * arm the timer (we'll just fake it for timer_gettime). + */ + timer->it.cpu.expires = new_expires; + if (new_expires.sched != 0 && + (timer->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE && + cpu_time_before(timer->it_clock, val, new_expires)) { + arm_timer(timer, val); + } + + read_unlock(&tasklist_lock); + + /* + * Install the new reload setting, and + * set up the signal and overrun bookkeeping. + */ + timer->it.cpu.incr = timespec_to_sample(timer->it_clock, + &new->it_interval); + + /* + * This acts as a modification timestamp for the timer, + * so any automatic reload attempt will punt on seeing + * that we have reset the timer manually. 
+ */ + timer->it_requeue_pending = (timer->it_requeue_pending + 2) & + ~REQUEUE_PENDING; + timer->it_overrun_last = 0; + timer->it_overrun = -1; + + if (new_expires.sched != 0 && + (timer->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE && + !cpu_time_before(timer->it_clock, val, new_expires)) { + /* + * The designated time already passed, so we notify + * immediately, even if the thread never runs to + * accumulate more time on this clock. + */ + cpu_timer_fire(timer); + } + + ret = 0; + out: + if (old) { + sample_to_timespec(timer->it_clock, + timer->it.cpu.incr, &old->it_interval); + } + return ret; +} + +void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) +{ + union cpu_time_count now; + struct task_struct *p = timer->it.cpu.task; + int clear_dead; + + /* + * Easy part: convert the reload time. + */ + sample_to_timespec(timer->it_clock, + timer->it.cpu.incr, &itp->it_interval); + + if (timer->it.cpu.expires.sched == 0) { /* Timer not armed at all. */ + itp->it_value.tv_sec = itp->it_value.tv_nsec = 0; + return; + } + + if (unlikely(p == NULL)) { + /* + * This task already died and the timer will never fire. + * In this case, expires is actually the dead value. + */ + dead: + sample_to_timespec(timer->it_clock, timer->it.cpu.expires, + &itp->it_value); + return; + } + + /* + * Sample the clock to take the difference with the expiry time. + */ + if (CPUCLOCK_PERTHREAD(timer->it_clock)) { + cpu_clock_sample(timer->it_clock, p, &now); + clear_dead = p->exit_state; + } else { + read_lock(&tasklist_lock); + if (unlikely(p->signal == NULL)) { + /* + * The process has been reaped. + * We can't even collect a sample any more. + * Call the timer disarmed, nothing else to do. 
+ */ + put_task_struct(p); + timer->it.cpu.task = NULL; + timer->it.cpu.expires.sched = 0; + read_unlock(&tasklist_lock); + goto dead; + } else { + cpu_clock_sample_group(timer->it_clock, p, &now); + clear_dead = (unlikely(p->exit_state) && + thread_group_empty(p)); + } + read_unlock(&tasklist_lock); + } + + if ((timer->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) { + if (timer->it.cpu.incr.sched == 0 && + cpu_time_before(timer->it_clock, + timer->it.cpu.expires, now)) { + /* + * Do-nothing timer expired and has no reload, + * so it's as if it was never set. + */ + timer->it.cpu.expires.sched = 0; + itp->it_value.tv_sec = itp->it_value.tv_nsec = 0; + return; + } + /* + * Account for any expirations and reloads that should + * have happened. + */ + bump_cpu_timer(timer, now); + } + + if (unlikely(clear_dead)) { + /* + * We've noticed that the thread is dead, but + * not yet reaped. Take this opportunity to + * drop our task ref. + */ + clear_dead_task(timer, now); + goto dead; + } + + if (cpu_time_before(timer->it_clock, now, timer->it.cpu.expires)) { + sample_to_timespec(timer->it_clock, + cpu_time_sub(timer->it_clock, + timer->it.cpu.expires, now), + &itp->it_value); + } else { + /* + * The timer should have expired already, but the firing + * hasn't taken place yet. Say it's just about to expire. + */ + itp->it_value.tv_nsec = 1; + itp->it_value.tv_sec = 0; + } +} + +/* + * Check for any per-thread CPU timers that have fired and move them off + * the tsk->cpu_timers[N] list onto the firing list. Here we update the + * tsk->it_*_expires values to reflect the remaining thread CPU timers. 
+ */ +static void check_thread_timers(struct task_struct *tsk, + struct list_head *firing) +{ + struct list_head *timers = tsk->cpu_timers; + + tsk->it_prof_expires = 0; + while (!list_empty(timers)) { + struct cpu_timer_list *t = list_entry(timers->next, + struct cpu_timer_list, + entry); + if (cputime_lt(prof_ticks(tsk), t->expires.cpu)) { + tsk->it_prof_expires = t->expires.cpu; + break; + } + t->firing = 1; + list_move_tail(&t->entry, firing); + } + + ++timers; + tsk->it_virt_expires = 0; + while (!list_empty(timers)) { + struct cpu_timer_list *t = list_entry(timers->next, + struct cpu_timer_list, + entry); + if (cputime_lt(virt_ticks(tsk), t->expires.cpu)) { + tsk->it_virt_expires = t->expires.cpu; + break; + } + t->firing = 1; + list_move_tail(&t->entry, firing); + } + + ++timers; + tsk->it_sched_expires = 0; + while (!list_empty(timers)) { + struct cpu_timer_list *t = list_entry(timers->next, + struct cpu_timer_list, + entry); + if (tsk->sched_time < t->expires.sched) { + tsk->it_sched_expires = t->expires.sched; + break; + } + t->firing = 1; + list_move_tail(&t->entry, firing); + } +} + +/* + * Check for any per-thread CPU timers that have fired and move them + * off the tsk->*_timers list onto the firing list. Per-thread timers + * have already been taken off. + */ +static void check_process_timers(struct task_struct *tsk, + struct list_head *firing) +{ + struct signal_struct *const sig = tsk->signal; + cputime_t utime, stime, ptime, virt_expires, prof_expires; + unsigned long long sched_time, sched_expires; + struct task_struct *t; + struct list_head *timers = sig->cpu_timers; + + /* + * Don't sample the current process CPU clocks if there are no timers. + */ + if (list_empty(&timers[CPUCLOCK_PROF]) && + list_empty(&timers[CPUCLOCK_VIRT]) && + list_empty(&timers[CPUCLOCK_SCHED])) + return; + + /* + * Collect the current process totals. 
+ */ + utime = sig->utime; + stime = sig->stime; + sched_time = sig->sched_time; + t = tsk; + do { + utime = cputime_add(utime, t->utime); + stime = cputime_add(stime, t->stime); + sched_time += t->sched_time; + t = next_thread(t); + } while (t != tsk); + ptime = cputime_add(utime, stime); + + prof_expires = cputime_zero; + while (!list_empty(timers)) { + struct cpu_timer_list *t = list_entry(timers->next, + struct cpu_timer_list, + entry); + if (cputime_lt(ptime, t->expires.cpu)) { + prof_expires = t->expires.cpu; + break; + } + t->firing = 1; + list_move_tail(&t->entry, firing); + } + + ++timers; + virt_expires = cputime_zero; + while (!list_empty(timers)) { + struct cpu_timer_list *t = list_entry(timers->next, + struct cpu_timer_list, + entry); + if (cputime_lt(utime, t->expires.cpu)) { + virt_expires = t->expires.cpu; + break; + } + t->firing = 1; + list_move_tail(&t->entry, firing); + } + + ++timers; + sched_expires = cputime_zero; + while (!list_empty(timers)) { + struct cpu_timer_list *t = list_entry(timers->next, + struct cpu_timer_list, + entry); + if (sched_time < t->expires.sched) { + sched_expires = t->expires.sched; + break; + } + t->firing = 1; + list_move_tail(&t->entry, firing); + } + + if (!cputime_eq(prof_expires, cputime_zero) || + !cputime_eq(virt_expires, cputime_zero) || + sched_expires != 0) { + /* + * Rebalance the threads' expiry times for the remaining + * process CPU timers. 
+ */ + + cputime_t prof_left, virt_left, ticks; + unsigned long long sched_left, sched; + const unsigned int nthreads = atomic_read(&sig->live); + + prof_left = cputime_sub(prof_expires, + cputime_add(utime, stime)) / nthreads; + virt_left = cputime_sub(virt_expires, utime) / nthreads; + if (sched_expires) { + sched_left = sched_expires - sched_time; + do_div(sched_left, nthreads); + } else { + sched_left = 0; + } + t = tsk; + do { + ticks = cputime_add(cputime_add(t->utime, t->stime), + prof_left); + if (!cputime_eq(prof_expires, cputime_zero) && + (cputime_eq(t->it_prof_expires, cputime_zero) || + cputime_gt(t->it_prof_expires, ticks))) { + t->it_prof_expires = ticks; + } + + ticks = cputime_add(t->utime, virt_left); + if (!cputime_eq(virt_expires, cputime_zero) && + (cputime_eq(t->it_virt_expires, cputime_zero) || + cputime_gt(t->it_virt_expires, ticks))) { + t->it_virt_expires = ticks; + } + + sched = t->sched_time + sched_left; + if (sched_expires && (t->it_sched_expires == 0 || + t->it_sched_expires > sched)) { + t->it_sched_expires = sched; + } + + do { + t = next_thread(t); + } while (unlikely(t->exit_state)); + } while (t != tsk); + } } -void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *spec) + +/* + * This is called from the signal code (via do_schedule_next_timer) + * when the last timer signal was delivered and we have to reload the timer. + */ +void posix_cpu_timer_schedule(struct k_itimer *timer) { - BUG(); + struct task_struct *p = timer->it.cpu.task; + union cpu_time_count now; + + if (unlikely(p == NULL)) + /* + * The task was cleaned up already, no future firings. + */ + return; + + /* + * Fetch the current sample and update the timer's expiry time. + */ + if (CPUCLOCK_PERTHREAD(timer->it_clock)) { + cpu_clock_sample(timer->it_clock, p, &now); + bump_cpu_timer(timer, now); + if (unlikely(p->exit_state)) { + clear_dead_task(timer, now); + return; + } + read_lock(&tasklist_lock); /* arm_timer needs it. 
*/ + } else { + read_lock(&tasklist_lock); + if (unlikely(p->signal == NULL)) { + /* + * The process has been reaped. + * We can't even collect a sample any more. + */ + put_task_struct(p); + timer->it.cpu.task = p = NULL; + timer->it.cpu.expires.sched = 0; + read_unlock(&tasklist_lock); + return; + } else if (unlikely(p->exit_state) && thread_group_empty(p)) { + /* + * We've noticed that the thread is dead, but + * not yet reaped. Take this opportunity to + * drop our task ref. + */ + clear_dead_task(timer, now); + read_unlock(&tasklist_lock); + return; + } + cpu_clock_sample_group(timer->it_clock, p, &now); + bump_cpu_timer(timer, now); + /* Leave the tasklist_lock locked for the call below. */ + } + + /* + * Now re-arm for the new expiry time. + */ + arm_timer(timer, now); + + read_unlock(&tasklist_lock); +} + +/* + * This is called from the timer interrupt handler. The irq handler has + * already updated our counts. We need to check if any timers fire now. + * Interrupts are disabled. + */ +void run_posix_cpu_timers(struct task_struct *tsk) +{ + LIST_HEAD(firing); + struct k_itimer *timer, *next; + + BUG_ON(!irqs_disabled()); + +#define UNEXPIRED(clock) \ + (tsk->it_##clock##_expires == 0 || \ + cputime_lt(clock##_ticks(tsk), tsk->it_##clock##_expires)) + + if (UNEXPIRED(prof) && UNEXPIRED(virt) && + (tsk->it_sched_expires == 0 || + tsk->sched_time < tsk->it_sched_expires)) + return; + +#undef UNEXPIRED + + BUG_ON(tsk->exit_state); + + /* + * Double-check with locks held. + */ + read_lock(&tasklist_lock); + spin_lock(&tsk->sighand->siglock); + + /* + * Here we take off tsk->cpu_timers[N] and tsk->signal->cpu_timers[N] + * all the timers that are firing, and put them on the firing list. + */ + check_thread_timers(tsk, &firing); + check_process_timers(tsk, &firing); + + /* + * We must release these locks before taking any timer's lock. + * There is a potential race with timer deletion here, as the + * siglock now protects our private firing list. 
We have set + * the firing flag in each timer, so that a deletion attempt + * that gets the timer lock before we do will give it up and + * spin until we've taken care of that timer below. + */ + spin_unlock(&tsk->sighand->siglock); + read_unlock(&tasklist_lock); + + /* + * Now that all the timers on our list have the firing flag, + * noone will touch their list entries but us. We'll take + * each timer's lock before clearing its firing flag, so no + * timer call will interfere. + */ + list_for_each_entry_safe(timer, next, &firing, it.cpu.entry) { + int firing; + spin_lock(&timer->it_lock); + list_del_init(&timer->it.cpu.entry); + firing = timer->it.cpu.firing; + timer->it.cpu.firing = 0; + /* + * The firing flag is -1 if we collided with a reset + * of the timer, which already reported this + * almost-firing as an overrun. So don't generate an event. + */ + if (likely(firing >= 0)) { + cpu_timer_fire(timer); + } + spin_unlock(&timer->it_lock); + } +} + +static long posix_cpu_clock_nanosleep_restart(struct restart_block *); + +int posix_cpu_nsleep(clockid_t which_clock, int flags, + struct timespec *rqtp) +{ + struct restart_block *restart_block = + ¤t_thread_info()->restart_block; + struct k_itimer timer; + int error; + + /* + * Diagnose required errors first. + */ + if (CPUCLOCK_PERTHREAD(which_clock) && + (CPUCLOCK_PID(which_clock) == 0 || + CPUCLOCK_PID(which_clock) == current->pid)) + return -EINVAL; + + /* + * Set up a temporary timer and then wait for it to go off. 
+ */ + memset(&timer, 0, sizeof timer); + spin_lock_init(&timer.it_lock); + timer.it_clock = which_clock; + timer.it_overrun = -1; + error = posix_cpu_timer_create(&timer); + timer.it_process = current; + if (!error) { + struct timespec __user *rmtp; + static struct itimerspec zero_it; + struct itimerspec it = { .it_value = *rqtp, + .it_interval = {} }; + + spin_lock_irq(&timer.it_lock); + error = posix_cpu_timer_set(&timer, flags, &it, NULL); + if (error) { + spin_unlock_irq(&timer.it_lock); + return error; + } + + while (!signal_pending(current)) { + if (timer.it.cpu.expires.sched == 0) { + /* + * Our timer fired and was reset. + */ + spin_unlock_irq(&timer.it_lock); + return 0; + } + + /* + * Block until cpu_timer_fire (or a signal) wakes us. + */ + __set_current_state(TASK_INTERRUPTIBLE); + spin_unlock_irq(&timer.it_lock); + schedule(); + spin_lock_irq(&timer.it_lock); + } + + /* + * We were interrupted by a signal. + */ + sample_to_timespec(which_clock, timer.it.cpu.expires, rqtp); + posix_cpu_timer_set(&timer, 0, &zero_it, &it); + spin_unlock_irq(&timer.it_lock); + + if ((it.it_value.tv_sec | it.it_value.tv_nsec) == 0) { + /* + * It actually did fire already. + */ + return 0; + } + + /* + * Report back to the user the time still remaining. 
+ */ + rmtp = (struct timespec __user *) restart_block->arg1; + if (rmtp != NULL && !(flags & TIMER_ABSTIME) && + copy_to_user(rmtp, &it.it_value, sizeof *rmtp)) + return -EFAULT; + + restart_block->fn = posix_cpu_clock_nanosleep_restart; + /* Caller already set restart_block->arg1 */ + restart_block->arg0 = which_clock; + restart_block->arg2 = rqtp->tv_sec; + restart_block->arg3 = rqtp->tv_nsec; + + error = -ERESTART_RESTARTBLOCK; + } + + return error; +} + +static long +posix_cpu_clock_nanosleep_restart(struct restart_block *restart_block) +{ + clockid_t which_clock = restart_block->arg0; + struct timespec t = { .tv_sec = restart_block->arg2, + .tv_nsec = restart_block->arg3 }; + restart_block->fn = do_no_restart_syscall; + return posix_cpu_nsleep(which_clock, TIMER_ABSTIME, &t); } @@ -253,6 +1311,16 @@ static int process_cpu_clock_get(clockid_t which_clock, struct timespec *tp) { return posix_cpu_clock_get(PROCESS_CLOCK, tp); } +static int process_cpu_timer_create(struct k_itimer *timer) +{ + timer->it_clock = PROCESS_CLOCK; + return posix_cpu_timer_create(timer); +} +static int process_cpu_nsleep(clockid_t which_clock, int flags, + struct timespec *rqtp) +{ + return posix_cpu_nsleep(PROCESS_CLOCK, flags, rqtp); +} static int thread_cpu_clock_getres(clockid_t which_clock, struct timespec *tp) { return posix_cpu_clock_getres(THREAD_CLOCK, tp); @@ -261,7 +1329,16 @@ static int thread_cpu_clock_get(clockid_t which_clock, struct timespec *tp) { return posix_cpu_clock_get(THREAD_CLOCK, tp); } - +static int thread_cpu_timer_create(struct k_itimer *timer) +{ + timer->it_clock = THREAD_CLOCK; + return posix_cpu_timer_create(timer); +} +static int thread_cpu_nsleep(clockid_t which_clock, int flags, + struct timespec *rqtp) +{ + return -EINVAL; +} static __init int init_posix_cpu_timers(void) { @@ -269,15 +1346,15 @@ static __init int init_posix_cpu_timers(void) .clock_getres = process_cpu_clock_getres, .clock_get = process_cpu_clock_get, .clock_set = 
do_posix_clock_nosettime, - .timer_create = do_posix_clock_notimer_create, - .nsleep = do_posix_clock_nonanosleep, + .timer_create = process_cpu_timer_create, + .nsleep = process_cpu_nsleep, }; struct k_clock thread = { .clock_getres = thread_cpu_clock_getres, .clock_get = thread_cpu_clock_get, .clock_set = do_posix_clock_nosettime, - .timer_create = do_posix_clock_notimer_create, - .nsleep = do_posix_clock_nonanosleep, + .timer_create = thread_cpu_timer_create, + .nsleep = thread_cpu_nsleep, }; register_posix_clock(CLOCK_PROCESS_CPUTIME_ID, &process); diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 59a38d25adc0..5f0fbcf511ba 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -92,14 +92,13 @@ static DEFINE_SPINLOCK(idr_lock); * inactive. It could be in the "fire" routine getting a new expire time. */ #define TIMER_INACTIVE 1 -#define TIMER_RETRY 1 #ifdef CONFIG_SMP # define timer_active(tmr) \ - ((tmr)->it_timer.entry.prev != (void *)TIMER_INACTIVE) + ((tmr)->it.real.timer.entry.prev != (void *)TIMER_INACTIVE) # define set_timer_inactive(tmr) \ do { \ - (tmr)->it_timer.entry.prev = (void *)TIMER_INACTIVE; \ + (tmr)->it.real.timer.entry.prev = (void *)TIMER_INACTIVE; \ } while (0) #else # define timer_active(tmr) BARFY // error to use outside of SMP @@ -115,7 +114,6 @@ static DEFINE_SPINLOCK(idr_lock); #endif -#define REQUEUE_PENDING 1 /* * The timer ID is turned into a timer address by idr_find(). 
* Verifying a valid ID consists of: @@ -223,10 +221,10 @@ static inline int common_clock_set(clockid_t which_clock, struct timespec *tp) static inline int common_timer_create(struct k_itimer *new_timer) { - init_timer(&new_timer->it_timer); - new_timer->it_timer.expires = 0; - new_timer->it_timer.data = (unsigned long) new_timer; - new_timer->it_timer.function = posix_timer_fn; + INIT_LIST_HEAD(&new_timer->it.real.abs_timer_entry); + init_timer(&new_timer->it.real.timer); + new_timer->it.real.timer.data = (unsigned long) new_timer; + new_timer->it.real.timer.function = posix_timer_fn; set_timer_inactive(new_timer); return 0; } @@ -326,9 +324,9 @@ static long add_clockset_delta(struct k_itimer *timr, set_normalized_timespec(&delta, new_wall_to->tv_sec - - timr->wall_to_prev.tv_sec, + timr->it.real.wall_to_prev.tv_sec, new_wall_to->tv_nsec - - timr->wall_to_prev.tv_nsec); + timr->it.real.wall_to_prev.tv_nsec); if (likely(!(delta.tv_sec | delta.tv_nsec))) return 0; if (delta.tv_sec < 0) { @@ -339,16 +337,16 @@ static long add_clockset_delta(struct k_itimer *timr, sign++; } tstojiffie(&delta, posix_clocks[timr->it_clock].res, &exp); - timr->wall_to_prev = *new_wall_to; - timr->it_timer.expires += (sign ? -exp : exp); + timr->it.real.wall_to_prev = *new_wall_to; + timr->it.real.timer.expires += (sign ? -exp : exp); return 1; } static void remove_from_abslist(struct k_itimer *timr) { - if (!list_empty(&timr->abs_timer_entry)) { + if (!list_empty(&timr->it.real.abs_timer_entry)) { spin_lock(&abs_list.lock); - list_del_init(&timr->abs_timer_entry); + list_del_init(&timr->it.real.abs_timer_entry); spin_unlock(&abs_list.lock); } } @@ -362,7 +360,7 @@ static void schedule_next_timer(struct k_itimer *timr) /* * Set up the timer for the next interval (if there is one). * Note: this code uses the abs_timer_lock to protect - * wall_to_prev and must hold it until exp is set, not exactly + * it.real.wall_to_prev and must hold it until exp is set, not exactly * obvious... 
* This function is used for CLOCK_REALTIME* and @@ -372,7 +370,7 @@ static void schedule_next_timer(struct k_itimer *timr) * "other" CLOCKs "next timer" code (which, I suppose should * also be added to the k_clock structure). */ - if (!timr->it_incr) + if (!timr->it.real.incr) return; do { @@ -381,7 +379,7 @@ static void schedule_next_timer(struct k_itimer *timr) posix_get_now(&now); } while (read_seqretry(&xtime_lock, seq)); - if (!list_empty(&timr->abs_timer_entry)) { + if (!list_empty(&timr->it.real.abs_timer_entry)) { spin_lock(&abs_list.lock); add_clockset_delta(timr, &new_wall_to); @@ -394,7 +392,7 @@ static void schedule_next_timer(struct k_itimer *timr) timr->it_overrun_last = timr->it_overrun; timr->it_overrun = -1; ++timr->it_requeue_pending; - add_timer(&timr->it_timer); + add_timer(&timr->it.real.timer); } /* @@ -418,7 +416,10 @@ void do_schedule_next_timer(struct siginfo *info) if (!timr || timr->it_requeue_pending != info->si_sys_private) goto exit; - schedule_next_timer(timr); + if (timr->it_clock < 0) /* CPU clock */ + posix_cpu_timer_schedule(timr); + else + schedule_next_timer(timr); info->si_overrun = timr->it_overrun_last; exit: if (timr) @@ -478,7 +479,7 @@ static void posix_timer_fn(unsigned long __data) spin_lock_irqsave(&timr->it_lock, flags); set_timer_inactive(timr); - if (!list_empty(&timr->abs_timer_entry)) { + if (!list_empty(&timr->it.real.abs_timer_entry)) { spin_lock(&abs_list.lock); do { seq = read_seqbegin(&xtime_lock); @@ -486,9 +487,9 @@ static void posix_timer_fn(unsigned long __data) } while (read_seqretry(&xtime_lock, seq)); set_normalized_timespec(&delta, new_wall_to.tv_sec - - timr->wall_to_prev.tv_sec, + timr->it.real.wall_to_prev.tv_sec, new_wall_to.tv_nsec - - timr->wall_to_prev.tv_nsec); + timr->it.real.wall_to_prev.tv_nsec); if (likely((delta.tv_sec | delta.tv_nsec ) == 0)) { /* do nothing, timer is on time */ } else if (delta.tv_sec < 0) { @@ -498,9 +499,9 @@ static void posix_timer_fn(unsigned long __data) 
tstojiffie(&delta, posix_clocks[timr->it_clock].res, &exp); - timr->wall_to_prev = new_wall_to; - timr->it_timer.expires += exp; - add_timer(&timr->it_timer); + timr->it.real.wall_to_prev = new_wall_to; + timr->it.real.timer.expires += exp; + add_timer(&timr->it.real.timer); do_notify = 0; } spin_unlock(&abs_list.lock); @@ -509,7 +510,7 @@ static void posix_timer_fn(unsigned long __data) if (do_notify) { int si_private=0; - if (timr->it_incr) + if (timr->it.real.incr) si_private = ++timr->it_requeue_pending; else { remove_from_abslist(timr); @@ -562,7 +563,6 @@ static struct k_itimer * alloc_posix_timer(void) if (!tmr) return tmr; memset(tmr, 0, sizeof (struct k_itimer)); - INIT_LIST_HEAD(&tmr->abs_timer_entry); if (unlikely(!(tmr->sigq = sigqueue_alloc()))) { kmem_cache_free(posix_timers_cache, tmr); tmr = NULL; @@ -634,7 +634,6 @@ sys_timer_create(clockid_t which_clock, it_id_set = IT_ID_SET; new_timer->it_id = (timer_t) new_timer_id; new_timer->it_clock = which_clock; - new_timer->it_incr = 0; new_timer->it_overrun = -1; error = CLOCK_DISPATCH(which_clock, timer_create, (new_timer)); if (error) @@ -795,30 +794,30 @@ common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting) struct now_struct now; do - expires = timr->it_timer.expires; - while ((volatile long) (timr->it_timer.expires) != expires); + expires = timr->it.real.timer.expires; + while ((volatile long) (timr->it.real.timer.expires) != expires); posix_get_now(&now); if (expires && ((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) && - !timr->it_incr && - posix_time_before(&timr->it_timer, &now)) - timr->it_timer.expires = expires = 0; + !timr->it.real.incr && + posix_time_before(&timr->it.real.timer, &now)) + timr->it.real.timer.expires = expires = 0; if (expires) { if (timr->it_requeue_pending & REQUEUE_PENDING || (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) { posix_bump_timer(timr, now); - expires = timr->it_timer.expires; + expires = timr->it.real.timer.expires; } 
else - if (!timer_pending(&timr->it_timer)) + if (!timer_pending(&timr->it.real.timer)) expires = 0; if (expires) expires -= now.jiffies; } jiffies_to_timespec(expires, &cur_setting->it_value); - jiffies_to_timespec(timr->it_incr, &cur_setting->it_interval); + jiffies_to_timespec(timr->it.real.incr, &cur_setting->it_interval); if (cur_setting->it_value.tv_sec < 0) { cur_setting->it_value.tv_nsec = 1; @@ -972,13 +971,13 @@ common_timer_set(struct k_itimer *timr, int flags, common_timer_get(timr, old_setting); /* disable the timer */ - timr->it_incr = 0; + timr->it.real.incr = 0; /* * careful here. If smp we could be in the "fire" routine which will * be spinning as we hold the lock. But this is ONLY an SMP issue. */ #ifdef CONFIG_SMP - if (timer_active(timr) && !del_timer(&timr->it_timer)) + if (timer_active(timr) && !del_timer(&timr->it.real.timer)) /* * It can only be active if on an other cpu. Since * we have cleared the interval stuff above, it should @@ -991,7 +990,7 @@ common_timer_set(struct k_itimer *timr, int flags, set_timer_inactive(timr); #else - del_timer(&timr->it_timer); + del_timer(&timr->it.real.timer); #endif remove_from_abslist(timr); @@ -1003,29 +1002,29 @@ common_timer_set(struct k_itimer *timr, int flags, *switch off the timer when it_value is zero */ if (!new_setting->it_value.tv_sec && !new_setting->it_value.tv_nsec) { - timr->it_timer.expires = 0; + timr->it.real.timer.expires = 0; return 0; } if (adjust_abs_time(clock, &new_setting->it_value, flags & TIMER_ABSTIME, - &expire_64, &(timr->wall_to_prev))) { + &expire_64, &(timr->it.real.wall_to_prev))) { return -EINVAL; } - timr->it_timer.expires = (unsigned long)expire_64; + timr->it.real.timer.expires = (unsigned long)expire_64; tstojiffie(&new_setting->it_interval, clock->res, &expire_64); - timr->it_incr = (unsigned long)expire_64; + timr->it.real.incr = (unsigned long)expire_64; /* * We do not even queue SIGEV_NONE timers! But we do put them * in the abs list so we can do that right. 
*/ if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE)) - add_timer(&timr->it_timer); + add_timer(&timr->it.real.timer); if (flags & TIMER_ABSTIME && clock->abs_struct) { spin_lock(&clock->abs_struct->lock); - list_add_tail(&(timr->abs_timer_entry), + list_add_tail(&(timr->it.real.abs_timer_entry), &(clock->abs_struct->list)); spin_unlock(&clock->abs_struct->lock); } @@ -1076,9 +1075,9 @@ retry: static inline int common_timer_del(struct k_itimer *timer) { - timer->it_incr = 0; + timer->it.real.incr = 0; #ifdef CONFIG_SMP - if (timer_active(timer) && !del_timer(&timer->it_timer)) + if (timer_active(timer) && !del_timer(&timer->it.real.timer)) /* * It can only be active if on an other cpu. Since * we have cleared the interval stuff above, it should @@ -1089,7 +1088,7 @@ static inline int common_timer_del(struct k_itimer *timer) */ return TIMER_RETRY; #else - del_timer(&timer->it_timer); + del_timer(&timer->it.real.timer); #endif remove_from_abslist(timer); @@ -1411,13 +1410,13 @@ void clock_was_set(void) break; } timr = list_entry(cws_list.next, struct k_itimer, - abs_timer_entry); + it.real.abs_timer_entry); - list_del_init(&timr->abs_timer_entry); + list_del_init(&timr->it.real.abs_timer_entry); if (add_clockset_delta(timr, &new_wall_to) && - del_timer(&timr->it_timer)) /* timer run yet? */ - add_timer(&timr->it_timer); - list_add(&timr->abs_timer_entry, &abs_list.list); + del_timer(&timr->it.real.timer)) /* timer run yet? */ + add_timer(&timr->it.real.timer); + list_add(&timr->it.real.abs_timer_entry, &abs_list.list); spin_unlock_irq(&abs_list.lock); } while (1); @@ -1445,13 +1444,13 @@ sys_clock_nanosleep(clockid_t which_clock, int flags, if ((unsigned) t.tv_nsec >= NSEC_PER_SEC || t.tv_sec < 0) return -EINVAL; - ret = CLOCK_DISPATCH(which_clock, nsleep, (which_clock, flags, &t)); - /* - * Do this here as common_nsleep does not have the real address + * Do this here as nsleep function does not have the real address. 
*/ restart_block->arg1 = (unsigned long)rmtp; + ret = CLOCK_DISPATCH(which_clock, nsleep, (which_clock, flags, &t)); + if ((ret == -ERESTART_RESTARTBLOCK) && rmtp && copy_to_user(rmtp, &t, sizeof (t))) return -EFAULT; diff --git a/kernel/signal.c b/kernel/signal.c index 912bb622578f..3418e67aabc7 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -347,7 +348,9 @@ void __exit_signal(struct task_struct *tsk) if (!atomic_read(&sig->count)) BUG(); spin_lock(&sighand->siglock); + posix_cpu_timers_exit(tsk); if (atomic_dec_and_test(&sig->count)) { + posix_cpu_timers_exit_group(tsk); if (tsk == sig->curr_target) sig->curr_target = next_thread(tsk); tsk->signal = NULL; diff --git a/kernel/timer.c b/kernel/timer.c index e57a223fac51..167f5b9e65bd 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include @@ -825,6 +826,7 @@ void update_process_times(int user_tick) if (rcu_pending(cpu)) rcu_check_callbacks(cpu, user_tick); scheduler_tick(); + run_posix_cpu_timers(p); } /* -- cgit v1.2.3 From c1dcd6c2d9b7478baf876725bd356f1b19eeaa65 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Mon, 7 Mar 2005 18:17:13 -0800 Subject: [PATCH] make ITIMER_REAL per-process POSIX requires that setitimer, getitimer, and alarm work on a per-process basis. Currently, Linux implements these for individual threads. This patch fixes these semantics for the ITIMER_REAL timer (which generates SIGALRM), making it shared by all threads in a process (thread group). 
Signed-off-by: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/array.c | 4 +- include/linux/init_task.h | 3 -- include/linux/sched.h | 6 ++- kernel/exit.c | 5 +- kernel/fork.c | 9 ++-- kernel/itimer.c | 115 +++++++++++++++++++++++++++++----------------- 6 files changed, 87 insertions(+), 55 deletions(-) (limited to 'include/linux') diff --git a/fs/proc/array.c b/fs/proc/array.c index eb5c084ede4a..a16bf85063a4 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -317,6 +317,7 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole) unsigned long min_flt = 0, maj_flt = 0; cputime_t cutime, cstime, utime, stime; unsigned long rsslim = 0; + unsigned long it_real_value = 0; struct task_struct *t; char tcomm[sizeof(task->comm)]; @@ -372,6 +373,7 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole) utime = cputime_add(utime, task->signal->utime); stime = cputime_add(stime, task->signal->stime); } + it_real_value = task->signal->it_real_value; } ppid = pid_alive(task) ? task->group_leader->real_parent->tgid : 0; read_unlock(&tasklist_lock); @@ -420,7 +422,7 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole) priority, nice, num_threads, - jiffies_to_clock_t(task->it_real_value), + jiffies_to_clock_t(it_real_value), start_time, vsize, mm ? 
mm->rss : 0, /* you might want to shift this left 3 */ diff --git a/include/linux/init_task.h b/include/linux/init_task.h index aa1eb2d45ed8..a6a8c1a38d5e 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -90,9 +90,6 @@ extern struct group_info init_groups; .children = LIST_HEAD_INIT(tsk.children), \ .sibling = LIST_HEAD_INIT(tsk.sibling), \ .group_leader = &tsk, \ - .real_timer = { \ - .function = it_real_fn \ - }, \ .group_info = &init_groups, \ .cap_effective = CAP_INIT_EFF_SET, \ .cap_inheritable = CAP_INIT_INH_SET, \ diff --git a/include/linux/sched.h b/include/linux/sched.h index 36a6174597f7..5fe77e2927af 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -301,6 +301,10 @@ struct signal_struct { /* POSIX.1b Interval Timers */ struct list_head posix_timers; + /* ITIMER_REAL timer for the process */ + struct timer_list real_timer; + unsigned long it_real_value, it_real_incr; + /* job control IDs */ pid_t pgrp; pid_t tty_old_pgrp; @@ -605,10 +609,8 @@ struct task_struct { int __user *clear_child_tid; /* CLONE_CHILD_CLEARTID */ unsigned long rt_priority; - unsigned long it_real_value, it_real_incr; cputime_t it_virt_value, it_virt_incr; cputime_t it_prof_value, it_prof_incr; - struct timer_list real_timer; cputime_t utime, stime; unsigned long nvcsw, nivcsw; /* context switch counts */ struct timespec start_time; diff --git a/kernel/exit.c b/kernel/exit.c index fbe293b10a47..db204cd02d8b 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -795,7 +795,6 @@ fastcall NORET_TYPE void do_exit(long code) } tsk->flags |= PF_EXITING; - del_timer_sync(&tsk->real_timer); /* * Make sure we don't try to process any timer firings @@ -813,8 +812,10 @@ fastcall NORET_TYPE void do_exit(long code) acct_update_integrals(tsk); update_mem_hiwater(tsk); group_dead = atomic_dec_and_test(&tsk->signal->live); - if (group_dead) + if (group_dead) { + del_timer_sync(&tsk->signal->real_timer); acct_process(code); + } exit_mm(tsk); exit_sem(tsk); diff 
--git a/kernel/fork.c b/kernel/fork.c index 718eaf0bb1cd..a1d1939f596c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -740,6 +740,11 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts init_sigpending(&sig->shared_pending); INIT_LIST_HEAD(&sig->posix_timers); + sig->it_real_value = sig->it_real_incr = 0; + sig->real_timer.function = it_real_fn; + sig->real_timer.data = (unsigned long) tsk; + init_timer(&sig->real_timer); + sig->tty = current->signal->tty; sig->pgrp = process_group(current); sig->session = current->signal->session; @@ -870,14 +875,10 @@ static task_t *copy_process(unsigned long clone_flags, clear_tsk_thread_flag(p, TIF_SIGPENDING); init_sigpending(&p->pending); - p->it_real_value = 0; - p->it_real_incr = 0; p->it_virt_value = cputime_zero; p->it_virt_incr = cputime_zero; p->it_prof_value = cputime_zero; p->it_prof_incr = cputime_zero; - init_timer(&p->real_timer); - p->real_timer.data = (unsigned long) p; p->utime = cputime_zero; p->stime = cputime_zero; diff --git a/kernel/itimer.c b/kernel/itimer.c index e1743c563206..c9cc2a4cb40a 100644 --- a/kernel/itimer.c +++ b/kernel/itimer.c @@ -14,25 +14,31 @@ #include +static unsigned long it_real_value(struct signal_struct *sig) +{ + unsigned long val = 0; + if (timer_pending(&sig->real_timer)) { + val = sig->real_timer.expires - jiffies; + + /* look out for negative/zero itimer.. */ + if ((long) val <= 0) + val = 1; + } + return val; +} + int do_getitimer(int which, struct itimerval *value) { - register unsigned long val; + unsigned long interval, val; switch (which) { case ITIMER_REAL: - val = 0; - /* - * FIXME! This needs to be atomic, in case the kernel timer happens! - */ - if (timer_pending(&current->real_timer)) { - val = current->real_timer.expires - jiffies; - - /* look out for negative/zero itimer.. 
*/ - if ((long) val <= 0) - val = 1; - } + spin_lock_irq(&current->sighand->siglock); + interval = current->signal->it_real_incr; + val = it_real_value(current->signal); + spin_unlock_irq(&current->sighand->siglock); jiffies_to_timeval(val, &value->it_value); - jiffies_to_timeval(current->it_real_incr, &value->it_interval); + jiffies_to_timeval(interval, &value->it_interval); break; case ITIMER_VIRTUAL: cputime_to_timeval(current->it_virt_value, &value->it_value); @@ -48,7 +54,6 @@ int do_getitimer(int which, struct itimerval *value) return 0; } -/* SMP: Only we modify our itimer values. */ asmlinkage long sys_getitimer(int which, struct itimerval __user *value) { int error = -EFAULT; @@ -63,60 +68,87 @@ asmlinkage long sys_getitimer(int which, struct itimerval __user *value) return error; } +/* + * Called with P->sighand->siglock held and P->signal->real_timer inactive. + * If interval is nonzero, arm the timer for interval ticks from now. + */ +static inline void it_real_arm(struct task_struct *p, unsigned long interval) +{ + p->signal->it_real_value = interval; /* XXX unnecessary field?? */ + if (interval == 0) + return; + if (interval > (unsigned long) LONG_MAX) + interval = LONG_MAX; + p->signal->real_timer.expires = jiffies + interval; + add_timer(&p->signal->real_timer); +} + void it_real_fn(unsigned long __data) { struct task_struct * p = (struct task_struct *) __data; - unsigned long interval; send_group_sig_info(SIGALRM, SEND_SIG_PRIV, p); - interval = p->it_real_incr; - if (interval) { - if (interval > (unsigned long) LONG_MAX) - interval = LONG_MAX; - p->real_timer.expires = jiffies + interval; - add_timer(&p->real_timer); - } + + /* + * Now restart the timer if necessary. We don't need any locking + * here because do_setitimer makes sure we have finished running + * before it touches anything. 
+ */ + it_real_arm(p, p->signal->it_real_incr); } int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue) { - unsigned long expire; + struct task_struct *tsk = current; + unsigned long val, interval; cputime_t cputime; - int k; - if (ovalue && (k = do_getitimer(which, ovalue)) < 0) - return k; switch (which) { case ITIMER_REAL: - del_timer_sync(&current->real_timer); - expire = timeval_to_jiffies(&value->it_value); - current->it_real_value = expire; - current->it_real_incr = + spin_lock_irq(&tsk->sighand->siglock); + interval = tsk->signal->it_real_incr; + val = it_real_value(tsk->signal); + if (val) + del_timer_sync(&tsk->signal->real_timer); + tsk->signal->it_real_incr = timeval_to_jiffies(&value->it_interval); - if (!expire) - break; - if (expire > (unsigned long) LONG_MAX) - expire = LONG_MAX; - current->real_timer.expires = jiffies + expire; - add_timer(&current->real_timer); + it_real_arm(tsk, timeval_to_jiffies(&value->it_value)); + spin_unlock_irq(&tsk->sighand->siglock); + if (ovalue) { + jiffies_to_timeval(val, &ovalue->it_value); + jiffies_to_timeval(interval, + &ovalue->it_interval); + } break; case ITIMER_VIRTUAL: + if (ovalue) { + cputime_to_timeval(tsk->it_virt_value, + &ovalue->it_value); + cputime_to_timeval(tsk->it_virt_incr, + &ovalue->it_interval); + } cputime = timeval_to_cputime(&value->it_value); if (cputime_gt(cputime, cputime_zero)) cputime = cputime_add(cputime, jiffies_to_cputime(1)); - current->it_virt_value = cputime; + tsk->it_virt_value = cputime; cputime = timeval_to_cputime(&value->it_interval); - current->it_virt_incr = cputime; + tsk->it_virt_incr = cputime; break; case ITIMER_PROF: + if (ovalue) { + cputime_to_timeval(tsk->it_prof_value, + &ovalue->it_value); + cputime_to_timeval(tsk->it_prof_incr, + &ovalue->it_interval); + } cputime = timeval_to_cputime(&value->it_value); if (cputime_gt(cputime, cputime_zero)) cputime = cputime_add(cputime, jiffies_to_cputime(1)); - current->it_prof_value = cputime; + 
tsk->it_prof_value = cputime; cputime = timeval_to_cputime(&value->it_interval); - current->it_prof_incr = cputime; + tsk->it_prof_incr = cputime; break; default: return -EINVAL; @@ -124,9 +156,6 @@ int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue) return 0; } -/* SMP: Again, only we play with our itimers, and signals are SMP safe - * now so that is not an issue at all anymore. - */ asmlinkage long sys_setitimer(int which, struct itimerval __user *value, struct itimerval __user *ovalue) -- cgit v1.2.3 From d80d30ff8b9122aa51135e942e35566904f32ee5 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Mon, 7 Mar 2005 18:17:29 -0800 Subject: [PATCH] make ITIMER_PROF, ITIMER_VIRTUAL per-process POSIX requires that setitimer, getitimer, and alarm work on a per-process basis. Currently, Linux implements these for individual threads. This patch fixes these semantics for the ITIMER_PROF timer (which generates SIGPROF) and the ITIMER_VIRTUAL timer (which generates SIGVTALRM), making them shared by all threads in a process (thread group). This patch should be applied after the one that fixes ITIMER_REAL. The essential machinery for these timers is tied into the new posix-timers code for process CPU clocks and timers. This patch requires the cputimers patch and its dependencies. 
Signed-off-by: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/posix-timers.h | 2 + include/linux/sched.h | 6 +- include/linux/signal.h | 1 + kernel/exit.c | 3 - kernel/fork.c | 16 ++-- kernel/itimer.c | 176 +++++++++++++++++++++++++++++-------------- kernel/posix-cpu-timers.c | 157 +++++++++++++++++++++++++++++++++----- kernel/sched.c | 47 +----------- kernel/signal.c | 2 +- 9 files changed, 277 insertions(+), 133 deletions(-) (limited to 'include/linux') diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index 2820fd4ab58b..f942e2bad8e3 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -133,5 +133,7 @@ void run_posix_cpu_timers(struct task_struct *); void posix_cpu_timers_exit(struct task_struct *); void posix_cpu_timers_exit_group(struct task_struct *); +void set_process_cpu_timer(struct task_struct *, unsigned int, + cputime_t *, cputime_t *); #endif diff --git a/include/linux/sched.h b/include/linux/sched.h index 5fe77e2927af..cf90d1ed1a1f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -305,6 +305,10 @@ struct signal_struct { struct timer_list real_timer; unsigned long it_real_value, it_real_incr; + /* ITIMER_PROF and ITIMER_VIRTUAL timers for the process */ + cputime_t it_prof_expires, it_virt_expires; + cputime_t it_prof_incr, it_virt_incr; + /* job control IDs */ pid_t pgrp; pid_t tty_old_pgrp; @@ -609,8 +613,6 @@ struct task_struct { int __user *clear_child_tid; /* CLONE_CHILD_CLEARTID */ unsigned long rt_priority; - cputime_t it_virt_value, it_virt_incr; - cputime_t it_prof_value, it_prof_incr; cputime_t utime, stime; unsigned long nvcsw, nivcsw; /* context switch counts */ struct timespec start_time; diff --git a/include/linux/signal.h b/include/linux/signal.h index e5f3d83ab215..3d8bf1afdb51 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -212,6 +212,7 @@ static inline void init_sigpending(struct sigpending *sig) } 
extern int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p); +extern int __group_send_sig_info(int, struct siginfo *, struct task_struct *); extern long do_sigpending(void __user *, unsigned long); extern int sigprocmask(int, sigset_t *, sigset_t *); diff --git a/kernel/exit.c b/kernel/exit.c index db204cd02d8b..ae320758b2f5 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -753,9 +753,6 @@ static void exit_notify(struct task_struct *tsk) state = EXIT_DEAD; tsk->exit_state = state; - tsk->it_virt_value = cputime_zero; - tsk->it_prof_value = cputime_zero; - write_unlock_irq(&tasklist_lock); list_for_each_safe(_p, _n, &ptrace_dead) { diff --git a/kernel/fork.c b/kernel/fork.c index a1d1939f596c..37d6b4769965 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -745,6 +745,11 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts sig->real_timer.data = (unsigned long) tsk; init_timer(&sig->real_timer); + sig->it_virt_expires = cputime_zero; + sig->it_virt_incr = cputime_zero; + sig->it_prof_expires = cputime_zero; + sig->it_prof_incr = cputime_zero; + sig->tty = current->signal->tty; sig->pgrp = process_group(current); sig->session = current->signal->session; @@ -875,11 +880,6 @@ static task_t *copy_process(unsigned long clone_flags, clear_tsk_thread_flag(p, TIF_SIGPENDING); init_sigpending(&p->pending); - p->it_virt_value = cputime_zero; - p->it_virt_incr = cputime_zero; - p->it_prof_value = cputime_zero; - p->it_prof_incr = cputime_zero; - p->utime = cputime_zero; p->stime = cputime_zero; p->sched_time = 0; @@ -1028,7 +1028,11 @@ static task_t *copy_process(unsigned long clone_flags, set_tsk_thread_flag(p, TIF_SIGPENDING); } - if (!list_empty(&current->signal->cpu_timers[0]) || + if (!cputime_eq(current->signal->it_virt_expires, + cputime_zero) || + !cputime_eq(current->signal->it_prof_expires, + cputime_zero) || + !list_empty(&current->signal->cpu_timers[0]) || !list_empty(&current->signal->cpu_timers[1]) || 
!list_empty(&current->signal->cpu_timers[2])) { /* diff --git a/kernel/itimer.c b/kernel/itimer.c index c9cc2a4cb40a..e9a40e947e07 100644 --- a/kernel/itimer.c +++ b/kernel/itimer.c @@ -11,6 +11,7 @@ #include #include #include +#include #include @@ -29,24 +30,67 @@ static unsigned long it_real_value(struct signal_struct *sig) int do_getitimer(int which, struct itimerval *value) { + struct task_struct *tsk = current; unsigned long interval, val; + cputime_t cinterval, cval; switch (which) { case ITIMER_REAL: - spin_lock_irq(&current->sighand->siglock); - interval = current->signal->it_real_incr; - val = it_real_value(current->signal); - spin_unlock_irq(&current->sighand->siglock); + spin_lock_irq(&tsk->sighand->siglock); + interval = tsk->signal->it_real_incr; + val = it_real_value(tsk->signal); + spin_unlock_irq(&tsk->sighand->siglock); jiffies_to_timeval(val, &value->it_value); jiffies_to_timeval(interval, &value->it_interval); break; case ITIMER_VIRTUAL: - cputime_to_timeval(current->it_virt_value, &value->it_value); - cputime_to_timeval(current->it_virt_incr, &value->it_interval); + read_lock(&tasklist_lock); + spin_lock_irq(&tsk->sighand->siglock); + cval = tsk->signal->it_virt_expires; + cinterval = tsk->signal->it_virt_incr; + if (!cputime_eq(cval, cputime_zero)) { + struct task_struct *t = tsk; + cputime_t utime = tsk->signal->utime; + do { + utime = cputime_add(utime, t->utime); + t = next_thread(t); + } while (t != tsk); + if (cputime_le(cval, utime)) { /* about to fire */ + cval = jiffies_to_cputime(1); + } else { + cval = cputime_sub(cval, utime); + } + } + spin_unlock_irq(&tsk->sighand->siglock); + read_unlock(&tasklist_lock); + cputime_to_timeval(cval, &value->it_value); + cputime_to_timeval(cinterval, &value->it_interval); break; case ITIMER_PROF: - cputime_to_timeval(current->it_prof_value, &value->it_value); - cputime_to_timeval(current->it_prof_incr, &value->it_interval); + read_lock(&tasklist_lock); + spin_lock_irq(&tsk->sighand->siglock); + cval = 
tsk->signal->it_prof_expires; + cinterval = tsk->signal->it_prof_incr; + if (!cputime_eq(cval, cputime_zero)) { + struct task_struct *t = tsk; + cputime_t ptime = cputime_add(tsk->signal->utime, + tsk->signal->stime); + do { + ptime = cputime_add(ptime, + cputime_add(t->utime, + t->stime)); + t = next_thread(t); + } while (t != tsk); + if (cputime_le(cval, ptime)) { /* about to fire */ + cval = jiffies_to_cputime(1); + } else { + cval = cputime_sub(cval, ptime); + } + } + spin_unlock_irq(&tsk->sighand->siglock); + read_unlock(&tasklist_lock); + cputime_to_timeval(cval, &value->it_value); + cputime_to_timeval(cinterval, &value->it_interval); break; default: return(-EINVAL); @@ -101,57 +145,75 @@ int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue) { struct task_struct *tsk = current; unsigned long val, interval; - cputime_t cputime; + cputime_t cval, cinterval, nval, ninterval; switch (which) { - case ITIMER_REAL: - spin_lock_irq(&tsk->sighand->siglock); - interval = tsk->signal->it_real_incr; - val = it_real_value(tsk->signal); - if (val) - del_timer_sync(&tsk->signal->real_timer); - tsk->signal->it_real_incr = - timeval_to_jiffies(&value->it_interval); - it_real_arm(tsk, timeval_to_jiffies(&value->it_value)); - spin_unlock_irq(&tsk->sighand->siglock); - if (ovalue) { - jiffies_to_timeval(val, &ovalue->it_value); - jiffies_to_timeval(interval, - &ovalue->it_interval); - } - break; - case ITIMER_VIRTUAL: - if (ovalue) { - cputime_to_timeval(tsk->it_virt_value, - &ovalue->it_value); - cputime_to_timeval(tsk->it_virt_incr, - &ovalue->it_interval); - } - cputime = timeval_to_cputime(&value->it_value); - if (cputime_gt(cputime, cputime_zero)) - cputime = cputime_add(cputime, - jiffies_to_cputime(1)); - tsk->it_virt_value = cputime; - cputime = timeval_to_cputime(&value->it_interval); - tsk->it_virt_incr = cputime; - break; - case ITIMER_PROF: - if (ovalue) { - cputime_to_timeval(tsk->it_prof_value, - &ovalue->it_value); - 
cputime_to_timeval(tsk->it_prof_incr, - &ovalue->it_interval); - } - cputime = timeval_to_cputime(&value->it_value); - if (cputime_gt(cputime, cputime_zero)) - cputime = cputime_add(cputime, - jiffies_to_cputime(1)); - tsk->it_prof_value = cputime; - cputime = timeval_to_cputime(&value->it_interval); - tsk->it_prof_incr = cputime; - break; - default: - return -EINVAL; + case ITIMER_REAL: + spin_lock_irq(&tsk->sighand->siglock); + interval = tsk->signal->it_real_incr; + val = it_real_value(tsk->signal); + if (val) + del_timer_sync(&tsk->signal->real_timer); + tsk->signal->it_real_incr = + timeval_to_jiffies(&value->it_interval); + it_real_arm(tsk, timeval_to_jiffies(&value->it_value)); + spin_unlock_irq(&tsk->sighand->siglock); + if (ovalue) { + jiffies_to_timeval(val, &ovalue->it_value); + jiffies_to_timeval(interval, + &ovalue->it_interval); + } + break; + case ITIMER_VIRTUAL: + nval = timeval_to_cputime(&value->it_value); + ninterval = timeval_to_cputime(&value->it_interval); + read_lock(&tasklist_lock); + spin_lock_irq(&tsk->sighand->siglock); + cval = tsk->signal->it_virt_expires; + cinterval = tsk->signal->it_virt_incr; + if (!cputime_eq(cval, cputime_zero) || + !cputime_eq(nval, cputime_zero)) { + if (cputime_gt(nval, cputime_zero)) + nval = cputime_add(nval, + jiffies_to_cputime(1)); + set_process_cpu_timer(tsk, CPUCLOCK_VIRT, + &nval, &cval); + } + tsk->signal->it_virt_expires = nval; + tsk->signal->it_virt_incr = ninterval; + spin_unlock_irq(&tsk->sighand->siglock); + read_unlock(&tasklist_lock); + if (ovalue) { + cputime_to_timeval(cval, &ovalue->it_value); + cputime_to_timeval(cinterval, &ovalue->it_interval); + } + break; + case ITIMER_PROF: + nval = timeval_to_cputime(&value->it_value); + ninterval = timeval_to_cputime(&value->it_interval); + read_lock(&tasklist_lock); + spin_lock_irq(&tsk->sighand->siglock); + cval = tsk->signal->it_prof_expires; + cinterval = tsk->signal->it_prof_incr; + if (!cputime_eq(cval, cputime_zero) || + !cputime_eq(nval, 
cputime_zero)) { + if (cputime_gt(nval, cputime_zero)) + nval = cputime_add(nval, + jiffies_to_cputime(1)); + set_process_cpu_timer(tsk, CPUCLOCK_PROF, + &nval, &cval); + } + tsk->signal->it_prof_expires = nval; + tsk->signal->it_prof_incr = ninterval; + spin_unlock_irq(&tsk->sighand->siglock); + read_unlock(&tasklist_lock); + if (ovalue) { + cputime_to_timeval(cval, &ovalue->it_value); + cputime_to_timeval(cinterval, &ovalue->it_interval); + } + break; + default: + return -EINVAL; } return 0; } diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 2adadcedc80f..612754b099e6 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -190,36 +190,31 @@ static int cpu_clock_sample(clockid_t which_clock, struct task_struct *p, /* * Sample a process (thread group) clock for the given group_leader task. * Must be called with tasklist_lock held for reading. + * Must be called with tasklist_lock held for reading, and p->sighand->siglock. */ -static int cpu_clock_sample_group(clockid_t which_clock, - struct task_struct *p, - union cpu_time_count *cpu) +static int cpu_clock_sample_group_locked(unsigned int clock_idx, + struct task_struct *p, + union cpu_time_count *cpu) { struct task_struct *t = p; - unsigned long flags; - switch (CPUCLOCK_WHICH(which_clock)) { + switch (clock_idx) { default: return -EINVAL; case CPUCLOCK_PROF: - spin_lock_irqsave(&p->sighand->siglock, flags); cpu->cpu = cputime_add(p->signal->utime, p->signal->stime); do { cpu->cpu = cputime_add(cpu->cpu, prof_ticks(t)); t = next_thread(t); } while (t != p); - spin_unlock_irqrestore(&p->sighand->siglock, flags); break; case CPUCLOCK_VIRT: - spin_lock_irqsave(&p->sighand->siglock, flags); cpu->cpu = p->signal->utime; do { cpu->cpu = cputime_add(cpu->cpu, virt_ticks(t)); t = next_thread(t); } while (t != p); - spin_unlock_irqrestore(&p->sighand->siglock, flags); break; case CPUCLOCK_SCHED: - spin_lock_irqsave(&p->sighand->siglock, flags); cpu->sched = p->signal->sched_time; 
/* Add in each other live thread. */ while ((t = next_thread(t)) != p) { @@ -237,12 +232,28 @@ static int cpu_clock_sample_group(clockid_t which_clock, } else { cpu->sched += p->sched_time; } - spin_unlock_irqrestore(&p->sighand->siglock, flags); break; } return 0; } +/* + * Sample a process (thread group) clock for the given group_leader task. + * Must be called with tasklist_lock held for reading. + */ +static int cpu_clock_sample_group(clockid_t which_clock, + struct task_struct *p, + union cpu_time_count *cpu) +{ + int ret; + unsigned long flags; + spin_lock_irqsave(&p->sighand->siglock, flags); + ret = cpu_clock_sample_group_locked(CPUCLOCK_WHICH(which_clock), p, + cpu); + spin_unlock_irqrestore(&p->sighand->siglock, flags); + return ret; +} + int posix_cpu_clock_get(clockid_t which_clock, struct timespec *tp) { @@ -453,20 +464,22 @@ void posix_cpu_timers_exit_group(struct task_struct *tsk) * Set the expiry times of all the threads in the process so one of them * will go off before the process cumulative expiry total is reached. 
*/ -static void -process_timer_rebalance(struct k_itimer *timer, union cpu_time_count val) +static void process_timer_rebalance(struct task_struct *p, + unsigned int clock_idx, + union cpu_time_count expires, + union cpu_time_count val) { cputime_t ticks, left; unsigned long long ns, nsleft; - struct task_struct *const p = timer->it.cpu.task, *t = p; + struct task_struct *t = p; unsigned int nthreads = atomic_read(&p->signal->live); - switch (CPUCLOCK_WHICH(timer->it_clock)) { + switch (clock_idx) { default: BUG(); break; case CPUCLOCK_PROF: - left = cputime_sub(timer->it.cpu.expires.cpu, val.cpu) + left = cputime_sub(expires.cpu, val.cpu) / nthreads; do { if (!unlikely(t->exit_state)) { @@ -481,7 +494,7 @@ process_timer_rebalance(struct k_itimer *timer, union cpu_time_count val) } while (t != p); break; case CPUCLOCK_VIRT: - left = cputime_sub(timer->it.cpu.expires.cpu, val.cpu) + left = cputime_sub(expires.cpu, val.cpu) / nthreads; do { if (!unlikely(t->exit_state)) { @@ -496,7 +509,7 @@ process_timer_rebalance(struct k_itimer *timer, union cpu_time_count val) } while (t != p); break; case CPUCLOCK_SCHED: - nsleft = timer->it.cpu.expires.sched - val.sched; + nsleft = expires.sched - val.sched; do_div(nsleft, nthreads); do { if (!unlikely(t->exit_state)) { @@ -590,7 +603,31 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now) * For a process timer, we must balance * all the live threads' expirations. 
*/ - process_timer_rebalance(timer, now); + switch (CPUCLOCK_WHICH(timer->it_clock)) { + default: + BUG(); + case CPUCLOCK_VIRT: + if (!cputime_eq(p->signal->it_virt_expires, + cputime_zero) && + cputime_lt(p->signal->it_virt_expires, + timer->it.cpu.expires.cpu)) + break; + goto rebalance; + case CPUCLOCK_PROF: + if (!cputime_eq(p->signal->it_prof_expires, + cputime_zero) && + cputime_lt(p->signal->it_prof_expires, + timer->it.cpu.expires.cpu)) + break; + goto rebalance; + case CPUCLOCK_SCHED: + rebalance: + process_timer_rebalance( + timer->it.cpu.task, + CPUCLOCK_WHICH(timer->it_clock), + timer->it.cpu.expires, now); + break; + } } } @@ -952,7 +989,9 @@ static void check_process_timers(struct task_struct *tsk, * Don't sample the current process CPU clocks if there are no timers. */ if (list_empty(&timers[CPUCLOCK_PROF]) && + cputime_eq(sig->it_prof_expires, cputime_zero) && list_empty(&timers[CPUCLOCK_VIRT]) && + cputime_eq(sig->it_virt_expires, cputime_zero) && list_empty(&timers[CPUCLOCK_SCHED])) return; @@ -1012,6 +1051,42 @@ static void check_process_timers(struct task_struct *tsk, list_move_tail(&t->entry, firing); } + /* + * Check for the special case process timers. + */ + if (!cputime_eq(sig->it_prof_expires, cputime_zero)) { + if (cputime_ge(ptime, sig->it_prof_expires)) { + /* ITIMER_PROF fires and reloads. */ + sig->it_prof_expires = sig->it_prof_incr; + if (!cputime_eq(sig->it_prof_expires, cputime_zero)) { + sig->it_prof_expires = cputime_add( + sig->it_prof_expires, ptime); + } + __group_send_sig_info(SIGPROF, SEND_SIG_PRIV, tsk); + } + if (!cputime_eq(sig->it_prof_expires, cputime_zero) && + (cputime_eq(prof_expires, cputime_zero) || + cputime_lt(sig->it_prof_expires, prof_expires))) { + prof_expires = sig->it_prof_expires; + } + } + if (!cputime_eq(sig->it_virt_expires, cputime_zero)) { + if (cputime_ge(utime, sig->it_virt_expires)) { + /* ITIMER_VIRTUAL fires and reloads. 
*/ + sig->it_virt_expires = sig->it_virt_incr; + if (!cputime_eq(sig->it_virt_expires, cputime_zero)) { + sig->it_virt_expires = cputime_add( + sig->it_virt_expires, utime); + } + __group_send_sig_info(SIGVTALRM, SEND_SIG_PRIV, tsk); + } + if (!cputime_eq(sig->it_virt_expires, cputime_zero) && + (cputime_eq(virt_expires, cputime_zero) || + cputime_lt(sig->it_virt_expires, virt_expires))) { + virt_expires = sig->it_virt_expires; + } + } + if (!cputime_eq(prof_expires, cputime_zero) || !cputime_eq(virt_expires, cputime_zero) || sched_expires != 0) { @@ -1197,6 +1272,50 @@ void run_posix_cpu_timers(struct task_struct *tsk) } } +/* + * Set one of the process-wide special case CPU timers. + * The tasklist_lock and tsk->sighand->siglock must be held by the caller. + */ +void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, + cputime_t *newval, cputime_t *oldval) +{ + union cpu_time_count now; + struct list_head *head; + + BUG_ON(clock_idx == CPUCLOCK_SCHED); + cpu_clock_sample_group_locked(clock_idx, tsk, &now); + + if (oldval && !cputime_eq(*oldval, cputime_zero)) { + if (cputime_le(*oldval, now.cpu)) { /* Just about to fire. */ + *oldval = jiffies_to_cputime(1); + } else { + *oldval = cputime_sub(*oldval, now.cpu); + } + } + + if (cputime_eq(*newval, cputime_zero)) + return; + *newval = cputime_add(*newval, now.cpu); + + /* + * Check whether there are any process timers already set to fire + * before this one. If so, we don't have anything more to do. + */ + head = &tsk->signal->cpu_timers[clock_idx]; + if (list_empty(head) || + cputime_ge(list_entry(head->next, + struct cpu_timer_list, entry)->expires.cpu, + *newval)) { + /* + * Rejigger each thread's expiry time so that one will + * notice before we hit the process-cumulative expiry time. 
+ */ + union cpu_time_count expires = { .sched = 0 }; + expires.cpu = *newval; + process_timer_rebalance(tsk, clock_idx, expires, now); + } +} + static long posix_cpu_clock_nanosleep_restart(struct restart_block *); int posix_cpu_nsleep(clockid_t which_clock, int flags, diff --git a/kernel/sched.c b/kernel/sched.c index 8176366cfd8f..a0fd3d36923c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2283,46 +2283,6 @@ unsigned long long current_sched_time(const task_t *tsk) STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \ ((rq)->curr->static_prio > (rq)->best_expired_prio)) -/* - * Do the virtual cpu time signal calculations. - * @p: the process that the cpu time gets accounted to - * @cputime: the cpu time spent in user space since the last update - */ -static inline void account_it_virt(struct task_struct * p, cputime_t cputime) -{ - cputime_t it_virt = p->it_virt_value; - - if (cputime_gt(it_virt, cputime_zero) && - cputime_gt(cputime, cputime_zero)) { - if (cputime_ge(cputime, it_virt)) { - it_virt = cputime_add(it_virt, p->it_virt_incr); - send_sig(SIGVTALRM, p, 1); - } - it_virt = cputime_sub(it_virt, cputime); - p->it_virt_value = it_virt; - } -} - -/* - * Do the virtual profiling signal calculations. - * @p: the process that the cpu time gets accounted to - * @cputime: the cpu time spent in user and kernel space since the last update - */ -static void account_it_prof(struct task_struct *p, cputime_t cputime) -{ - cputime_t it_prof = p->it_prof_value; - - if (cputime_gt(it_prof, cputime_zero) && - cputime_gt(cputime, cputime_zero)) { - if (cputime_ge(cputime, it_prof)) { - it_prof = cputime_add(it_prof, p->it_prof_incr); - send_sig(SIGPROF, p, 1); - } - it_prof = cputime_sub(it_prof, cputime); - p->it_prof_value = it_prof; - } -} - /* * Check if the process went over its cputime resource limit after * some cpu time got added to utime/stime. 
@@ -2360,10 +2320,8 @@ void account_user_time(struct task_struct *p, cputime_t cputime) p->utime = cputime_add(p->utime, cputime); - /* Check for signals (SIGVTALRM, SIGPROF, SIGXCPU & SIGKILL). */ + /* Check for signals (SIGXCPU & SIGKILL). */ check_rlimit(p, cputime); - account_it_virt(p, cputime); - account_it_prof(p, cputime); /* Add user time to cpustat. */ tmp = cputime_to_cputime64(cputime); @@ -2388,10 +2346,9 @@ void account_system_time(struct task_struct *p, int hardirq_offset, p->stime = cputime_add(p->stime, cputime); - /* Check for signals (SIGPROF, SIGXCPU & SIGKILL). */ + /* Check for signals (SIGXCPU & SIGKILL). */ if (likely(p->signal && p->exit_state < EXIT_ZOMBIE)) { check_rlimit(p, cputime); - account_it_prof(p, cputime); } /* Add system time to cpustat. */ diff --git a/kernel/signal.c b/kernel/signal.c index 3418e67aabc7..80cd734e3570 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1048,7 +1048,7 @@ __group_complete_signal(int sig, struct task_struct *p) return; } -static int +int __group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) { int ret = 0; -- cgit v1.2.3 From bb5b29911b8c7ad7041c4e1a15b0e9b19c7f97da Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Mon, 7 Mar 2005 18:18:29 -0800 Subject: [PATCH] set RLIMIT_SIGPENDING limit based on RLIMIT_NPROC While looking into the issues Jeremy had with the RLIMIT_SIGPENDING limit, it occurred to me that the normal setting of this limit is bizarrely low. The initial hard limit setting (MAX_SIGPENDING) was taken from the old max_queued_signals parameter, which was for the entire system in aggregate. But even as a per-user limit, the 1024 value is incongruously low for this. On my machine, RLIMIT_NPROC allows me 8192 processes, but only 1024 queued signals, i.e. fewer even than one pending signal in each process. (To me, this really puts in doubt the sensibility of using a per-user limit for this rather than a per-process one, i.e. 
counted in sighand_struct or signal_struct, which could have a much smaller reasonable value. I don't recall the rationale for making this new limit per-user in the first place.) This patch sets the default RLIMIT_SIGPENDING limit at boot time, using the calculation that decides the default RLIMIT_NPROC limit. This uses the same value for those two limits, which I think is still pretty conservative on the RLIMIT_SIGPENDING value. Signed-off-by: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/resource.h | 2 +- include/linux/signal.h | 2 -- kernel/fork.c | 2 ++ 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/asm-generic/resource.h b/include/asm-generic/resource.h index c0c910498273..b1fcda9eac23 100644 --- a/include/asm-generic/resource.h +++ b/include/asm-generic/resource.h @@ -79,7 +79,7 @@ [RLIMIT_MEMLOCK] = { MLOCK_LIMIT, MLOCK_LIMIT }, \ [RLIMIT_AS] = { RLIM_INFINITY, RLIM_INFINITY }, \ [RLIMIT_LOCKS] = { RLIM_INFINITY, RLIM_INFINITY }, \ - [RLIMIT_SIGPENDING] = { MAX_SIGPENDING, MAX_SIGPENDING }, \ + [RLIMIT_SIGPENDING] = { 0, 0 }, \ [RLIMIT_MSGQUEUE] = { MQ_BYTES_MAX, MQ_BYTES_MAX }, \ } diff --git a/include/linux/signal.h b/include/linux/signal.h index 3d8bf1afdb51..99c97ad026c8 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -8,8 +8,6 @@ #ifdef __KERNEL__ -#define MAX_SIGPENDING 1024 - /* * Real Time signals may be queued. */ diff --git a/kernel/fork.c b/kernel/fork.c index 3a3b19a98c4e..11e456652543 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -129,6 +129,8 @@ void __init fork_init(unsigned long mempages) init_task.signal->rlim[RLIMIT_NPROC].rlim_cur = max_threads/2; init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2; + init_task.signal->rlim[RLIMIT_SIGPENDING] = + init_task.signal->rlim[RLIMIT_NPROC]; } static struct task_struct *dup_task_struct(struct task_struct *orig) -- cgit v1.2.3