From 0f23a3a8d04a5568062600a55749108a27cd511d Mon Sep 17 00:00:00 2001
From: Andrew Morton
Date: Sun, 20 Apr 2003 00:25:47 -0700
Subject: [PATCH] 3c574_cs fixes

- It was doing spin_lock_irqsave()/spin_unlock()

- Can't free the skb inside local_irq_save(): kfree_skb ends up running
  local_bh_enable(), which enables interrupts.
---
 drivers/net/pcmcia/3c574_cs.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/drivers/net/pcmcia/3c574_cs.c b/drivers/net/pcmcia/3c574_cs.c
index 9a0727a4b120..ae13b70c367a 100644
--- a/drivers/net/pcmcia/3c574_cs.c
+++ b/drivers/net/pcmcia/3c574_cs.c
@@ -940,11 +940,9 @@ static int el3_start_xmit(struct sk_buff *skb, struct net_device *dev)
 		outw(SetTxThreshold + (1536>>2), ioaddr + EL3_CMD);
 	}
 
-	dev_kfree_skb (skb);
 	pop_tx_status(dev);
-
-	spin_unlock(&lp->window_lock);
-
+	spin_unlock_irqrestore(&lp->window_lock, flags);
+	dev_kfree_skb(skb);
 	return 0;
 }
--
cgit v1.2.3


From cb9704057b5bd0b69a4d42cf63cdfb2535b5bbae Mon Sep 17 00:00:00 2001
From: Andrew Morton
Date: Sun, 20 Apr 2003 00:25:54 -0700
Subject: [PATCH] Fix nec98 partition parser link error

Fix this:

fs/partitions/nec98.c:169: undefined reference to `parse_bsd'
---
 fs/partitions/check.h | 5 +++++
 fs/partitions/msdos.c | 4 ++--
 fs/partitions/nec98.c | 7 -------
 3 files changed, 7 insertions(+), 9 deletions(-)

diff --git a/fs/partitions/check.h b/fs/partitions/check.h
index 0be95725e097..882980c55720 100644
--- a/fs/partitions/check.h
+++ b/fs/partitions/check.h
@@ -29,3 +29,8 @@ put_partition(struct parsed_partitions *p, int n, sector_t from, sector_t size)
 }
 
 extern int warn_no_part;
+
+extern void parse_bsd(struct parsed_partitions *state,
+		struct block_device *bdev, u32 offset, u32 size,
+		int origin, char *flavour, int max_partitions);
+
diff --git a/fs/partitions/msdos.c b/fs/partitions/msdos.c
index 147485d515c8..56dea78c0312 100644
--- a/fs/partitions/msdos.c
+++ b/fs/partitions/msdos.c
@@ -214,12 +214,12 @@ parse_solaris_x86(struct parsed_partitions *state, struct block_device *bdev,
 #endif
 }
 
-#ifdef CONFIG_BSD_DISKLABEL
+#if defined(CONFIG_BSD_DISKLABEL) || defined(CONFIG_NEC98_PARTITION)
 /*
  * Create devices for BSD partitions listed in a disklabel, under a
  * dos-like partition. See parse_extended() for more information.
  */
-static void
+void
 parse_bsd(struct parsed_partitions *state, struct block_device *bdev,
 		u32 offset, u32 size, int origin, char *flavour,
 		int max_partitions)
diff --git a/fs/partitions/nec98.c b/fs/partitions/nec98.c
index b3bd8faf9bda..cbd55789f4b2 100644
--- a/fs/partitions/nec98.c
+++ b/fs/partitions/nec98.c
@@ -66,13 +66,6 @@ is_valid_nec98_partition_table(const struct nec98_partition *ptable,
 	return valid;
 }
 
-#ifdef CONFIG_BSD_DISKLABEL
-extern void parse_bsd(struct parsed_partitions *state,
-		struct block_device *bdev,
-		u32 offset, u32 size, int origin, char *flavour,
-		int max_partitions);
-#endif
-
 int nec98_partition(struct parsed_partitions *state, struct block_device *bdev)
 {
 	unsigned int nr;
--
cgit v1.2.3


From 70d6700010a67ff55656cd630bf08b5c2a1756e3 Mon Sep 17 00:00:00 2001
From: Andrew Morton
Date: Sun, 20 Apr 2003 00:26:00 -0700
Subject: [PATCH] dmfe: don't free skb with local interrupts disabled

dev_kfree_skb() can end up calling local_bh_enable(), which goes BUG if
local interrupts are disabled.  Apparently it can deadlock.

So move the skb freeing outside the lock in the dmfe driver.  It will
decrease the lock hold time as well.
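The shape shared by this fix and the 3c574_cs one above, as a minimal
sketch (the lock and field names are illustrative, not any one
driver's):

	static int xxx_start_xmit(struct sk_buff *skb, struct net_device *dev)
	{
		struct xxx_private *lp = dev->priv;
		unsigned long flags;

		spin_lock_irqsave(&lp->lock, flags);
		/* ... hand the packet to the hardware ... */
		spin_unlock_irqrestore(&lp->lock, flags);

		/*
		 * Only free the skb once interrupts are back on:
		 * dev_kfree_skb() may run local_bh_enable(), which goes
		 * BUG (and re-enables interrupts) when IRQs are off.
		 */
		dev_kfree_skb(skb);
		return 0;
	}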
---
 drivers/net/tulip/dmfe.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/net/tulip/dmfe.c b/drivers/net/tulip/dmfe.c
index ee017a02ecbe..b2d33c9ac274 100644
--- a/drivers/net/tulip/dmfe.c
+++ b/drivers/net/tulip/dmfe.c
@@ -668,13 +668,13 @@ static int dmfe_start_xmit(struct sk_buff *skb, struct DEVICE *dev)
 	if ( db->tx_queue_cnt < TX_FREE_DESC_CNT )
 		netif_wake_queue(dev);
 
-	/* free this SKB */
-	dev_kfree_skb(skb);
-
 	/* Restore CR7 to enable interrupt */
 	spin_unlock_irqrestore(&db->lock, flags);
 	outl(db->cr7_data, dev->base_addr + DCR7);
 
+	/* free this SKB */
+	dev_kfree_skb(skb);
+
 	return 0;
 }
--
cgit v1.2.3


From de8e3749bfc54556c3b9a40534fc2d3309ec84e6 Mon Sep 17 00:00:00 2001
From: Andrew Morton
Date: Sun, 20 Apr 2003 00:26:07 -0700
Subject: [PATCH] dentry_stat accounting fix

From: Maneesh Soni

This patch corrects the dentry_stat.nr_unused calculation.  In
select_parent() and shrink_dcache_anon() we were not doing any
adjustments to the nr_unused count after manipulating the dentry_unused
list.  Now the nr_unused count is decremented if the dentry is on the
dentry_unused list and is removed from there.  Further, in the same
routines, we have to adjust the nr_unused count again if the dentry is
moved to the end of the d_lru list for pruning.
---
 fs/dcache.c | 22 ++++++++++++++++------
 1 file changed, 16 insertions(+), 6 deletions(-)

diff --git a/fs/dcache.c b/fs/dcache.c
index efc51c38ea25..59480fedd61a 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -538,13 +538,18 @@ resume:
 		struct list_head *tmp = next;
 		struct dentry *dentry = list_entry(tmp, struct dentry, d_child);
 		next = tmp->next;
-		list_del_init(&dentry->d_lru);
-		/* don't add non zero d_count dentries
-		 * back to d_lru list
+		if (!list_empty(&dentry->d_lru)) {
+			dentry_stat.nr_unused--;
+			list_del_init(&dentry->d_lru);
+		}
+		/*
+		 * move only zero ref count dentries to the end
+		 * of the unused list for prune_dcache
 		 */
 		if (!atomic_read(&dentry->d_count)) {
 			list_add(&dentry->d_lru, dentry_unused.prev);
+			dentry_stat.nr_unused++;
 			found++;
 		}
 		/*
@@ -609,13 +614,18 @@ void shrink_dcache_anon(struct hlist_head *head)
 	spin_lock(&dcache_lock);
 	hlist_for_each(lp, head) {
 		struct dentry *this = hlist_entry(lp, struct dentry, d_hash);
-		list_del(&this->d_lru);
+		if (!list_empty(&this->d_lru)) {
+			dentry_stat.nr_unused--;
+			list_del(&this->d_lru);
+		}
 
-		/* don't add non zero d_count dentries
-		 * back to d_lru list
+		/*
+		 * move only zero ref count dentries to the end
+		 * of the unused list for prune_dcache
 		 */
 		if (!atomic_read(&this->d_count)) {
 			list_add_tail(&this->d_lru, &dentry_unused);
+			dentry_stat.nr_unused++;
 			found++;
 		}
 	}
--
cgit v1.2.3


From 0e3efbd1ebf9b6f6b16cd5b68465f02dbe72dff0 Mon Sep 17 00:00:00 2001
From: Andrew Morton
Date: Sun, 20 Apr 2003 00:26:14 -0700
Subject: [PATCH] Fix and clean up DCACHE_REFERENCED usage

From: Maneesh Soni

This patch changes the way the DCACHE_REFERENCED flag is used.  It got
messed up in the dcache_rcu iterations.  I hope this will be ok now.

The flag was meant to be an advisory flag, used by prune_dcache() so as
not to free dentries which have recently entered the d_lru list.  On
the first pass in prune_dcache() the dentries marked DCACHE_REFERENCED
are left with the flag reset, and they are freed in the next pass.

So, now we mark the dentry as DCACHE_REFERENCED when it first enters
the d_lru list in dput() and reset the flag in prune_dcache().  If the
flag remains reset in the next call to prune_dcache(), the dentry is
then freed.
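In sketch form (condensed from the fs/dcache.c hunks below), the
two-stage aging works like this:

	/* dput(): first entry onto the LRU marks the dentry referenced */
	if (list_empty(&dentry->d_lru)) {
		dentry->d_vfs_flags |= DCACHE_REFERENCED;
		list_add(&dentry->d_lru, &dentry_unused);
		dentry_stat.nr_unused++;
	}

	/* prune_dcache(): pass one clears the flag and requeues; on
	 * pass two, if the flag is still clear, the dentry really goes */
	if (dentry->d_vfs_flags & DCACHE_REFERENCED) {
		dentry->d_vfs_flags &= ~DCACHE_REFERENCED;
		list_add(&dentry->d_lru, &dentry_unused);
		dentry_stat.nr_unused++;
		continue;
	}
	prune_one_dentry(dentry);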
Also, I don't think any file system has to use this flag, as it is
taken care of by the dcache layer.  The patch removes such code from a
few file systems.  Moreover, these filesystems were doing the wrong
thing anyway, as they were changing the flag outside dcache_lock.

Changes:

o dput() marks the dentry DCACHE_REFERENCED when it is added to the
  dentry_unused list

o no need to set the flag in dget, dget_locked, d_lookup as these guys
  anyway increment the ref count.

o check the ref count in prune_dcache and use the DCACHE_REFERENCED
  flag just for two-stage aging.

o remove code for setting DCACHE_REFERENCED from reiserfs, fat, xfs
  and exportfs.
---
 fs/dcache.c              |  20 ++++++++------------
 fs/exportfs/expfs.c      |   3 ---
 fs/fat/inode.c           |   1 -
 fs/reiserfs/inode.c      |   1 -
 fs/xfs/linux/xfs_super.c |   1 -
 include/linux/dcache.h   |   1 -
 6 files changed, 8 insertions(+), 19 deletions(-)

diff --git a/fs/dcache.c b/fs/dcache.c
index 59480fedd61a..9eec20e0ab20 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -155,12 +155,11 @@ repeat:
 	if (d_unhashed(dentry))
 		goto kill_it;
 	if (list_empty(&dentry->d_lru)) {
-		dentry->d_vfs_flags &= ~DCACHE_REFERENCED;
+		dentry->d_vfs_flags |= DCACHE_REFERENCED;
 		list_add(&dentry->d_lru, &dentry_unused);
 		dentry_stat.nr_unused++;
 	}
 	spin_unlock(&dentry->d_lock);
-	dentry->d_vfs_flags |= DCACHE_REFERENCED;
 	spin_unlock(&dcache_lock);
 	return;
 
@@ -250,7 +249,6 @@ int d_invalidate(struct dentry * dentry)
 static inline struct dentry * __dget_locked(struct dentry *dentry)
 {
 	atomic_inc(&dentry->d_count);
-	dentry->d_vfs_flags |= DCACHE_REFERENCED;
 	if (atomic_read(&dentry->d_count) == 1) {
 		dentry_stat.nr_unused--;
 		list_del_init(&dentry->d_lru);
@@ -379,17 +377,16 @@ static void prune_dcache(int count)
 		dentry = list_entry(tmp, struct dentry, d_lru);
 
 		spin_lock(&dentry->d_lock);
+		/* leave inuse dentries */
+		if (atomic_read(&dentry->d_count)) {
+			spin_unlock(&dentry->d_lock);
+			continue;
+		}
 		/* If the dentry was recently referenced, don't free it.
 		 */
 		if (dentry->d_vfs_flags & DCACHE_REFERENCED) {
 			dentry->d_vfs_flags &= ~DCACHE_REFERENCED;
-
-			/* don't add non zero d_count dentries
-			 * back to d_lru list
-			 */
-			if (!atomic_read(&dentry->d_count)) {
-				list_add(&dentry->d_lru, &dentry_unused);
-				dentry_stat.nr_unused++;
-			}
+			list_add(&dentry->d_lru, &dentry_unused);
+			dentry_stat.nr_unused++;
 			spin_unlock(&dentry->d_lock);
 			continue;
 		}
@@ -1027,7 +1024,6 @@ struct dentry * __d_lookup(struct dentry * parent, struct qstr * name)
 		if (likely(move_count == dentry->d_move_count)) {
 			if (!d_unhashed(dentry)) {
 				atomic_inc(&dentry->d_count);
-				dentry->d_vfs_flags |= DCACHE_REFERENCED;
 				found = dentry;
 			}
 		}
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index 7264433b25fd..aae953bb9572 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -91,7 +91,6 @@ find_exported_dentry(struct super_block *sb, void *obj, void *parent,
 
 		if (dentry != result && acceptable(context, dentry)) {
 			dput(result);
-			dentry->d_vfs_flags |= DCACHE_REFERENCED;
 			return dentry;
 		}
 		spin_lock(&dcache_lock);
@@ -271,7 +270,6 @@ find_exported_dentry(struct super_block *sb, void *obj, void *parent,
 
 		if (dentry != result && acceptable(context, dentry)) {
 			dput(result);
-			dentry->d_vfs_flags |= DCACHE_REFERENCED;
 			return dentry;
 		}
 		spin_lock(&dcache_lock);
@@ -434,7 +432,6 @@ static struct dentry *export_iget(struct super_block *sb, unsigned long ino, __u
 		iput(inode);
 		return ERR_PTR(-ENOMEM);
 	}
-	result->d_vfs_flags |= DCACHE_REFERENCED;
 	return result;
 }
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 67548b28113d..866edb62fad5 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -608,7 +608,6 @@ struct dentry *fat_get_dentry(struct super_block *sb, void *inump)
 		return ERR_PTR(-ENOMEM);
 	}
 	result->d_op = sb->s_root->d_op;
-	result->d_vfs_flags |= DCACHE_REFERENCED;
 	return result;
 }
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 0536716aa84e..fb7f27f1f532 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -1260,7 +1260,6 @@ struct dentry *reiserfs_get_dentry(struct super_block *sb, void *vobjp)
 		iput(inode);
 		return ERR_PTR(-ENOMEM);
 	}
-	result->d_vfs_flags |= DCACHE_REFERENCED;
 	return result;
 }
diff --git a/fs/xfs/linux/xfs_super.c b/fs/xfs/linux/xfs_super.c
index 662a43c90a1e..73ef4ec19e95 100644
--- a/fs/xfs/linux/xfs_super.c
+++ b/fs/xfs/linux/xfs_super.c
@@ -741,7 +741,6 @@ linvfs_get_dentry(
 		iput(inode);
 		return ERR_PTR(-ENOMEM);
 	}
-	result->d_vfs_flags |= DCACHE_REFERENCED;
 	return result;
 }
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index db979c3cf890..78fafd500123 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -270,7 +270,6 @@ static inline struct dentry *dget(struct dentry *dentry)
 		if (!atomic_read(&dentry->d_count))
 			BUG();
 		atomic_inc(&dentry->d_count);
-		dentry->d_vfs_flags |= DCACHE_REFERENCED;
 	}
 	return dentry;
 }
--
cgit v1.2.3


From 2f98681fa42f8971f09ca96ce6329fa3a3ff0740 Mon Sep 17 00:00:00 2001
From: Andrew Morton
Date: Sun, 20 Apr 2003 00:26:21 -0700
Subject: [PATCH] Fix POSIX timers to give CLOCK_MONOTONIC full resolution

The POSIX CLOCK_MONOTONIC currently has only 1/HZ resolution.  Further,
it is tied to jiffies (i.e. it is a restatement of jiffies) rather than
"xtime" or the gettimeofday() clock.

This patch changes CLOCK_MONOTONIC to be a restatement of
gettimeofday() plus an offset, to remove any clock-setting activity
from CLOCK_MONOTONIC.  An offset is kept that represents the difference
between CLOCK_MONOTONIC and gettimeofday().
This offset is updated when ever the gettimeofday() clock is set to back the clock setting change out of CLOCK_MONOTONIC (which by the standard, can not be set). With this change CLOCK_REALTIME (a direct restatement of gettimeofday()), CLOCK_MONOTONIC and gettimeofday() will all tick at the same time and with the same rate. And all will be affected by NTP adjustments (save those which actually set the time). --- arch/i386/kernel/time.c | 23 +++++- include/linux/time.h | 4 + kernel/posix-timers.c | 190 ++++++++++++++++++++++++++++-------------------- kernel/timer.c | 12 ++- 4 files changed, 146 insertions(+), 83 deletions(-) diff --git a/arch/i386/kernel/time.c b/arch/i386/kernel/time.c index ab0f09bfe3d1..f5e9bf60dbb0 100644 --- a/arch/i386/kernel/time.c +++ b/arch/i386/kernel/time.c @@ -124,15 +124,28 @@ void do_settimeofday(struct timeval *tv) * made, and then undo it! */ tv->tv_usec -= timer->get_offset(); - tv->tv_usec -= (jiffies - wall_jiffies) * (1000000 / HZ); + tv->tv_usec -= (jiffies - wall_jiffies) * (USEC_PER_SEC / HZ); while (tv->tv_usec < 0) { - tv->tv_usec += 1000000; + tv->tv_usec += USEC_PER_SEC; tv->tv_sec--; } + tv->tv_usec *= NSEC_PER_USEC; + + wall_to_monotonic.tv_sec += xtime.tv_sec - tv->tv_sec; + wall_to_monotonic.tv_nsec += xtime.tv_nsec - tv->tv_usec; + + if (wall_to_monotonic.tv_nsec > NSEC_PER_SEC) { + wall_to_monotonic.tv_nsec -= NSEC_PER_SEC; + wall_to_monotonic.tv_sec++; + } + if (wall_to_monotonic.tv_nsec < 0) { + wall_to_monotonic.tv_nsec += NSEC_PER_SEC; + wall_to_monotonic.tv_sec--; + } xtime.tv_sec = tv->tv_sec; - xtime.tv_nsec = (tv->tv_usec * 1000); + xtime.tv_nsec = tv->tv_usec; time_adjust = 0; /* stop active adjtime() */ time_status |= STA_UNSYNC; time_maxerror = NTP_PHASE_LIMIT; @@ -322,7 +335,9 @@ void __init time_init(void) { xtime.tv_sec = get_cmos_time(); - xtime.tv_nsec = 0; + wall_to_monotonic.tv_sec = -xtime.tv_sec + INITIAL_JIFFIES / HZ; + xtime.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ); + wall_to_monotonic.tv_nsec = 0; timer = select_timer(); diff --git a/include/linux/time.h b/include/linux/time.h index 4d7238025fe9..6fad88e1a37d 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -140,6 +140,7 @@ mktime (unsigned int year, unsigned int mon, } extern struct timespec xtime; +extern struct timespec wall_to_monotonic; extern seqlock_t xtime_lock; static inline unsigned long get_seconds(void) @@ -200,6 +201,9 @@ struct itimerval { #define CLOCK_MONOTONIC_HR 5 #define MAX_CLOCKS 6 +#define CLOCKS_MASK (CLOCK_REALTIME | CLOCK_MONOTONIC | \ + CLOCK_REALTIME_HR | CLOCK_MONOTONIC_HR) +#define CLOCKS_MONO (CLOCK_MONOTONIC & CLOCK_MONOTONIC_HR) /* * The various flags for setting POSIX.1b interval timers. diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 54fe15f5c0b3..a0a972ec7742 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -48,7 +48,7 @@ * The idr_get_new *may* call slab for more memory so it must not be * called under a spin lock. Likewise idr_remore may release memory * (but it may be ok to do this under a lock...). - * idr_find is just a memory look up and is quite fast. A zero return + * idr_find is just a memory look up and is quite fast. A -1 return * indicates that the requested id does not exist. */ @@ -82,6 +82,7 @@ static spinlock_t idr_lock = SPIN_LOCK_UNLOCKED; * For some reason mips/mips64 define the SIGEV constants plus 128. * Here we define a mask to get rid of the common bits. The * optimizer should make this costless to all but mips. 
+ * Note that no common bits (the non-mips case) will give 0xffffffff. */ #define MIPS_SIGEV ~(SIGEV_NONE & \ SIGEV_SIGNAL & \ @@ -93,7 +94,7 @@ static spinlock_t idr_lock = SPIN_LOCK_UNLOCKED; * The timer ID is turned into a timer address by idr_find(). * Verifying a valid ID consists of: * - * a) checking that idr_find() returns other than zero. + * a) checking that idr_find() returns other than -1. * b) checking that the timer id matches the one in the timer itself. * c) that the timer owner is in the callers thread group. */ @@ -162,6 +163,8 @@ static struct k_clock posix_clocks[MAX_CLOCKS]; void register_posix_clock(int clock_id, struct k_clock *new_clock); static int do_posix_gettime(struct k_clock *clock, struct timespec *tp); +static u64 do_posix_clock_monotonic_gettime_parts( + struct timespec *tp, struct timespec *mo); int do_posix_clock_monotonic_gettime(struct timespec *tp); int do_posix_clock_monotonic_settime(struct timespec *tp); static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags); @@ -192,7 +195,7 @@ __initcall(init_posix_timers); static void tstojiffie(struct timespec *tp, int res, u64 *jiff) { - unsigned long sec = tp->tv_sec; + long sec = tp->tv_sec; long nsec = tp->tv_nsec + res - 1; if (nsec > NSEC_PER_SEC) { @@ -210,7 +213,7 @@ static void tstojiffie(struct timespec *tp, int res, u64 *jiff) * below. Here it is enough to just discard the high order * bits. */ - *jiff = (u64)sec * HZ; + *jiff = (s64)sec * HZ; /* * Do the res thing. (Don't forget the add in the declaration of nsec) */ @@ -221,17 +224,6 @@ static void tstojiffie(struct timespec *tp, int res, u64 *jiff) *jiff += nsec / (NSEC_PER_SEC / HZ); } -static void tstotimer(struct itimerspec *time, struct k_itimer *timer) -{ - u64 result; - int res = posix_clocks[timer->it_clock].res; - - tstojiffie(&time->it_value, res, &result); - timer->it_timer.expires = (unsigned long)result; - tstojiffie(&time->it_interval, res, &result); - timer->it_incr = (unsigned long)result; -} - static void schedule_next_timer(struct k_itimer *timr) { struct now_struct now; @@ -690,57 +682,81 @@ sys_timer_getoverrun(timer_t timer_id) * If it is relative time, we need to add the current (CLOCK_MONOTONIC) * time to it to get the proper time for the timer. */ -static int adjust_abs_time(struct k_clock *clock, struct timespec *tp, int abs) +static int adjust_abs_time(struct k_clock *clock, struct timespec *tp, + int abs, u64 *exp) { struct timespec now; - struct timespec oc; - do_posix_clock_monotonic_gettime(&now); - - if (!abs || (posix_clocks[CLOCK_MONOTONIC].clock_get != - clock->clock_get)) { - if (abs) - do_posix_gettime(clock, &oc); - else - oc.tv_nsec = oc.tv_sec = 0; - - tp->tv_sec += now.tv_sec - oc.tv_sec; - tp->tv_nsec += now.tv_nsec - oc.tv_nsec; + struct timespec oc = *tp; + struct timespec wall_to_mono; + u64 jiffies_64_f; + int rtn =0; + if (abs) { + /* + * The mask pick up the 4 basic clocks + */ + if (!(clock - &posix_clocks[0]) & ~CLOCKS_MASK) { + jiffies_64_f = do_posix_clock_monotonic_gettime_parts( + &now, &wall_to_mono); + /* + * If we are doing a MONOTONIC clock + */ + if((clock - &posix_clocks[0]) & CLOCKS_MONO){ + now.tv_sec += wall_to_mono.tv_sec; + now.tv_nsec += wall_to_mono.tv_nsec; + } + } else { + /* + * Not one of the basic clocks + */ + do_posix_gettime(clock, &now); + jiffies_64_f = get_jiffies_64(); + } + /* + * Take away now to get delta + */ + oc.tv_sec -= now.tv_sec; + oc.tv_nsec -= now.tv_nsec; /* * Normalize... 
*/ - if ((tp->tv_nsec - NSEC_PER_SEC) >= 0) { - tp->tv_nsec -= NSEC_PER_SEC; - tp->tv_sec++; + while ((oc.tv_nsec - NSEC_PER_SEC) >= 0) { + oc.tv_nsec -= NSEC_PER_SEC; + oc.tv_sec++; } - if ((tp->tv_nsec) < 0) { - tp->tv_nsec += NSEC_PER_SEC; - tp->tv_sec--; + while ((oc.tv_nsec) < 0) { + oc.tv_nsec += NSEC_PER_SEC; + oc.tv_sec--; } + }else{ + jiffies_64_f = get_jiffies_64(); } /* - * Check if the requested time is prior to now (if so set now) or - * is more than the timer code can handle (if so we error out). - * The (unsigned) catches the case of prior to "now" with the same - * test. Only on failure do we sort out what happened, and then - * we use the (unsigned) to error out negative seconds. + * Check if the requested time is prior to now (if so set now) */ - if ((unsigned) (tp->tv_sec - now.tv_sec) > (MAX_JIFFY_OFFSET / HZ)) { - if ((unsigned) tp->tv_sec < now.tv_sec) { - tp->tv_sec = now.tv_sec; - tp->tv_nsec = now.tv_nsec; - } else + if (oc.tv_sec < 0) + oc.tv_sec = oc.tv_nsec = 0; + tstojiffie(&oc, clock->res, exp); + + /* + * Check if the requested time is more than the timer code + * can handle (if so we error out but return the value too). + */ + if (*exp > ((u64)MAX_JIFFY_OFFSET)) /* * This is a considered response, not exactly in * line with the standard (in fact it is silent on - * possible overflows). We assume such a large + * possible overflows). We assume such a large * value is ALMOST always a programming error and * try not to compound it by setting a really dumb * value. */ - return -EINVAL; - } - return 0; + rtn = -EINVAL; + /* + * return the actual jiffies expire time, full 64 bits + */ + *exp += jiffies_64_f; + return rtn; } /* Set a POSIX.1b interval timer. */ @@ -750,6 +766,7 @@ do_timer_settime(struct k_itimer *timr, int flags, struct itimerspec *new_setting, struct itimerspec *old_setting) { struct k_clock *clock = &posix_clocks[timr->it_clock]; + u64 expire_64; if (old_setting) do_timer_gettime(timr, old_setting); @@ -788,14 +805,15 @@ do_timer_settime(struct k_itimer *timr, int flags, return 0; } - if ((flags & TIMER_ABSTIME) && - (clock->clock_get != do_posix_clock_monotonic_gettime)) - // FIXME: what is this? - ; if (adjust_abs_time(clock, - &new_setting->it_value, flags & TIMER_ABSTIME)) + &new_setting->it_value, flags & TIMER_ABSTIME, + &expire_64)) { return -EINVAL; - tstotimer(new_setting, timr); + } + timr->it_timer.expires = (unsigned long)expire_64; + tstojiffie(&new_setting->it_interval, clock->res, &expire_64); + timr->it_incr = (unsigned long)expire_64; + /* * For some reason the timer does not fire immediately if expires is @@ -964,30 +982,46 @@ static int do_posix_gettime(struct k_clock *clock, struct timespec *tp) * Note also that the while loop assures that the sub_jiff_offset * will be less than a jiffie, thus no need to normalize the result. * Well, not really, if called with ints off :( - * - * HELP, this code should make an attempt at resolution beyond the - * jiffie. Trouble is this is "arch" dependent... 
*/ -int do_posix_clock_monotonic_gettime(struct timespec *tp) +static u64 do_posix_clock_monotonic_gettime_parts( + struct timespec *tp, struct timespec *mo) { - long sub_sec; - u64 jiffies_64_f; - -#if (BITS_PER_LONG > 32) - jiffies_64_f = jiffies_64; -#else + u64 jiff; + struct timeval tpv; unsigned int seq; do { seq = read_seqbegin(&xtime_lock); - jiffies_64_f = jiffies_64; + do_gettimeofday(&tpv); + *mo = wall_to_monotonic; + jiff = jiffies_64; - } while (read_seqretry(&xtime_lock, seq)); -#endif - tp->tv_sec = div_long_long_rem(jiffies_64_f, HZ, &sub_sec); - tp->tv_nsec = sub_sec * (NSEC_PER_SEC / HZ); + } while(read_seqretry(&xtime_lock, seq)); + /* + * Love to get this before it is converted to usec. + * It would save a div AND a mpy. + */ + tp->tv_sec = tpv.tv_sec; + tp->tv_nsec = tpv.tv_usec * NSEC_PER_USEC; + + return jiff; +} + +int do_posix_clock_monotonic_gettime(struct timespec *tp) +{ + struct timespec wall_to_mono; + + do_posix_clock_monotonic_gettime_parts(tp, &wall_to_mono); + + tp->tv_sec += wall_to_mono.tv_sec; + tp->tv_nsec += wall_to_mono.tv_nsec; + + if ((tp->tv_nsec - NSEC_PER_SEC) > 0) { + tp->tv_nsec -= NSEC_PER_SEC; + tp->tv_sec++; + } return 0; } @@ -1138,7 +1172,7 @@ do_clock_nanosleep(clockid_t which_clock, int flags, struct timespec *tsave) struct timespec t; struct timer_list new_timer; DECLARE_WAITQUEUE(abs_wqueue, current); - u64 rq_time = 0; + u64 rq_time = (u64)0; s64 left; int abs; struct restart_block *restart_block = @@ -1163,7 +1197,7 @@ do_clock_nanosleep(clockid_t which_clock, int flags, struct timespec *tsave) if (!rq_time) return -EINTR; left = rq_time - get_jiffies_64(); - if (left <= 0LL) + if (left <= (s64)0) return 0; /* Already passed */ } @@ -1174,14 +1208,14 @@ do_clock_nanosleep(clockid_t which_clock, int flags, struct timespec *tsave) do { t = *tsave; if (abs || !rq_time) { - adjust_abs_time(&posix_clocks[which_clock], &t, abs); - tstojiffie(&t, posix_clocks[which_clock].res, &rq_time); + adjust_abs_time(&posix_clocks[which_clock], &t, abs, + &rq_time); } left = rq_time - get_jiffies_64(); - if (left >= MAX_JIFFY_OFFSET) - left = MAX_JIFFY_OFFSET; - if (left < 0) + if (left >= (s64)MAX_JIFFY_OFFSET) + left = (s64)MAX_JIFFY_OFFSET; + if (left < (s64)0) break; new_timer.expires = jiffies + left; @@ -1192,12 +1226,12 @@ do_clock_nanosleep(clockid_t which_clock, int flags, struct timespec *tsave) del_timer_sync(&new_timer); left = rq_time - get_jiffies_64(); - } while (left > 0 && !test_thread_flag(TIF_SIGPENDING)); + } while (left > (s64)0 && !test_thread_flag(TIF_SIGPENDING)); if (abs_wqueue.task_list.next) finish_wait(&nanosleep_abs_wqueue, &abs_wqueue); - if (left > 0) { + if (left > (s64)0) { unsigned long rmd; /* diff --git a/kernel/timer.c b/kernel/timer.c index 4aaf025ee8ba..caa37716f860 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -441,8 +441,16 @@ repeat: unsigned long tick_usec = TICK_USEC; /* ACTHZ period (usec) */ unsigned long tick_nsec = TICK_NSEC(TICK_USEC); /* USER_HZ period (nsec) */ -/* The current time */ +/* + * The current time + * wall_to_monotonic is what we need to add to xtime (or xtime corrected + * for sub jiffie times) to get to monotonic time. Monotonic is pegged at zero + * at zero at system boot time, so wall_to_monotonic will be negative, + * however, we will ALWAYS keep the tv_nsec part positive so we can use + * the usual normalization. + */ struct timespec xtime __attribute__ ((aligned (16))); +struct timespec wall_to_monotonic __attribute__ ((aligned (16))); /* Don't completely fail for HZ > 500. 
*/ int tickadj = 500/HZ ? : 1; /* microsecs */ @@ -508,6 +516,7 @@ static void second_overflow(void) case TIME_INS: if (xtime.tv_sec % 86400 == 0) { xtime.tv_sec--; + wall_to_monotonic.tv_sec++; time_state = TIME_OOP; clock_was_set(); printk(KERN_NOTICE "Clock: inserting leap second 23:59:60 UTC\n"); @@ -517,6 +526,7 @@ static void second_overflow(void) case TIME_DEL: if ((xtime.tv_sec + 1) % 86400 == 0) { xtime.tv_sec++; + wall_to_monotonic.tv_sec--; time_state = TIME_WAIT; clock_was_set(); printk(KERN_NOTICE "Clock: deleting leap second 23:59:59 UTC\n"); -- cgit v1.2.3 From 0ebcfd99e0a50c42248c5f4ee78d82d8cb65ab4a Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 20 Apr 2003 00:26:28 -0700 Subject: [PATCH] Fix jiffies_to_time[spec | val] and converse to use From: george anzinger In the current system (2.5.67) time_spec to jiffies, time_val to jiffies and the converse (jiffies to time_val and jiffies to time_spec) all use 1/HZ as the measure of a jiffie. Because of the inability of the PIT to actually generate an accurate 1/HZ interrupt, the wall clock is updated with a more accurate value (999848 nanoseconds per jiffie for HZ = 1000). This causes a 1/HZ interpretation of jiffies based timing to run faster than the wall clock, thus causing sleeps and timers to expire short of the requested time. Try, for example: time sleep 60 This patch changes the conversion routines to use the same value as the wall clock update code to do the conversions. The actual math is almost all done at compile time. The run time conversions require little if any more execution time. This patch must be applied after the patch I posted earlier today which fixed the CLOCK_MONOTONIC resolution issue. --- include/asm-i386/div64.h | 18 +++++++++++++ include/linux/time.h | 70 ++++++++++++++++++++++++++++++++++++++---------- include/linux/timex.h | 2 +- kernel/posix-timers.c | 43 +++++++++++++---------------- 4 files changed, 94 insertions(+), 39 deletions(-) diff --git a/include/asm-i386/div64.h b/include/asm-i386/div64.h index ef915df700e4..bc8718a0b1ce 100644 --- a/include/asm-i386/div64.h +++ b/include/asm-i386/div64.h @@ -14,4 +14,22 @@ __mod; \ }) +/* + * (long)X = ((long long)divs) / (long)div + * (long)rem = ((long long)divs) % (long)div + * + * Warning, this will do an exception if X overflows. + */ +#define div_long_long_rem(a,b,c) div_ll_X_l_rem(a,b,c) + +extern inline long +div_ll_X_l_rem(long long divs, long div, long *rem) +{ + long dum2; + __asm__("divl %2":"=a"(dum2), "=d"(*rem) + : "rm"(div), "A"(divs)); + + return dum2; + +} #endif diff --git a/include/linux/time.h b/include/linux/time.h index 6fad88e1a37d..fdab2abc43be 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -26,6 +26,16 @@ struct timezone { #include #include +#include +#include +#ifndef div_long_long_rem + +#define div_long_long_rem(dividend,divisor,remainder) ({ \ + u64 result = dividend; \ + *remainder = do_div(result,divisor); \ + result; }) + +#endif /* * Have the 32 bit jiffies value wrap 5 minutes after boot @@ -59,25 +69,52 @@ struct timezone { #ifndef NSEC_PER_USEC #define NSEC_PER_USEC (1000L) #endif +/* + * We want to do realistic conversions of time so we need to use the same + * values the update wall clock code uses as the jiffie size. This value + * is: TICK_NSEC(TICK_USEC) (both of which are defined in timex.h). This + * is a constant and is in nanoseconds. We will used scaled math and + * with a scales defined here as SEC_JIFFIE_SC, USEC_JIFFIE_SC and + * NSEC_JIFFIE_SC. 
Note that these defines contain nothing but + * constants and so are computed at compile time. SHIFT_HZ (computed in + * timex.h) adjusts the scaling for different HZ values. + */ +#define SEC_JIFFIE_SC (30 - SHIFT_HZ) +#define NSEC_JIFFIE_SC (SEC_JIFFIE_SC + 30) +#define USEC_JIFFIE_SC (SEC_JIFFIE_SC + 20) +#define SEC_CONVERSION ((unsigned long)(((u64)NSEC_PER_SEC << SEC_JIFFIE_SC) /\ + (u64)TICK_NSEC(TICK_USEC))) +#define NSEC_CONVERSION ((unsigned long)(((u64)1 << NSEC_JIFFIE_SC) / \ + (u64)TICK_NSEC(TICK_USEC))) +#define USEC_CONVERSION \ + ((unsigned long)(((u64)NSEC_PER_USEC << USEC_JIFFIE_SC)/ \ + (u64)TICK_NSEC(TICK_USEC))) +#define MAX_SEC_IN_JIFFIES \ + (u32)((u64)((u64)MAX_JIFFY_OFFSET * TICK_NSEC(TICK_USEC)) / NSEC_PER_SEC) static __inline__ unsigned long timespec_to_jiffies(struct timespec *value) { unsigned long sec = value->tv_sec; - long nsec = value->tv_nsec; + long nsec = value->tv_nsec + TICK_NSEC(TICK_USEC) - 1; - if (sec >= (MAX_JIFFY_OFFSET / HZ)) + if (sec >= MAX_SEC_IN_JIFFIES) return MAX_JIFFY_OFFSET; - nsec += 1000000000L / HZ - 1; - nsec /= 1000000000L / HZ; - return HZ * sec + nsec; + return (((u64)sec * SEC_CONVERSION) + + (((u64)nsec * NSEC_CONVERSION) >> + (NSEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC; + } static __inline__ void jiffies_to_timespec(unsigned long jiffies, struct timespec *value) { - value->tv_nsec = (jiffies % HZ) * (1000000000L / HZ); - value->tv_sec = jiffies / HZ; + /* + * Convert jiffies to nanoseconds and seperate with + * one divide. + */ + u64 nsec = (u64)jiffies * TICK_NSEC(TICK_USEC); + value->tv_sec = div_long_long_rem(nsec, NSEC_PER_SEC, &value->tv_nsec); } /* Same for "timeval" */ @@ -85,20 +122,25 @@ static __inline__ unsigned long timeval_to_jiffies(struct timeval *value) { unsigned long sec = value->tv_sec; - long usec = value->tv_usec; + long usec = value->tv_usec + USEC_PER_SEC / HZ - 1; - if (sec >= (MAX_JIFFY_OFFSET / HZ)) + if (sec >= MAX_SEC_IN_JIFFIES) return MAX_JIFFY_OFFSET; - usec += 1000000L / HZ - 1; - usec /= 1000000L / HZ; - return HZ * sec + usec; + return (((u64)sec * SEC_CONVERSION) + + (((u64)usec * USEC_CONVERSION) >> + (USEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC; } static __inline__ void jiffies_to_timeval(unsigned long jiffies, struct timeval *value) { - value->tv_usec = (jiffies % HZ) * (1000000L / HZ); - value->tv_sec = jiffies / HZ; + /* + * Convert jiffies to nanoseconds and seperate with + * one divide. + */ + u64 nsec = (u64)jiffies * TICK_NSEC(TICK_USEC); + value->tv_sec = div_long_long_rem(nsec, NSEC_PER_SEC, &value->tv_usec); + value->tv_usec /= NSEC_PER_USEC; } static __inline__ int timespec_equal(struct timespec *a, struct timespec *b) diff --git a/include/linux/timex.h b/include/linux/timex.h index 5b2b0ac18ae7..6c00606c6e33 100644 --- a/include/linux/timex.h +++ b/include/linux/timex.h @@ -51,7 +51,6 @@ #ifndef _LINUX_TIMEX_H #define _LINUX_TIMEX_H -#include #include /* @@ -177,6 +176,7 @@ /* a value TUSEC for TICK_USEC (can be set bij adjtimex) */ #define TICK_NSEC(TUSEC) (SH_DIV (TUSEC * USER_HZ * 1000, ACTHZ, 8)) +#include /* * syscall interface - used (mainly by NTP daemon) * to discipline kernel clock oscillator diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index a0a972ec7742..bca12ba294e4 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -33,7 +33,12 @@ result; }) #endif +#define CLOCK_REALTIME_RES TICK_NSEC(TICK_USEC) // In nano seconds. 
+static inline u64 mpy_l_X_l_ll(unsigned long mpy1,unsigned long mpy2) +{ + return (u64)mpy1 * mpy2; +} /* * Management arrays for POSIX timers. Timers are kept in slab memory * Timer ids are allocated by an external routine that keeps track of the @@ -175,8 +180,8 @@ static inline void unlock_timer(struct k_itimer *timr, unsigned long flags); */ static __init int init_posix_timers(void) { - struct k_clock clock_realtime = {.res = NSEC_PER_SEC / HZ }; - struct k_clock clock_monotonic = {.res = NSEC_PER_SEC / HZ, + struct k_clock clock_realtime = {.res = CLOCK_REALTIME_RES }; + struct k_clock clock_monotonic = {.res = CLOCK_REALTIME_RES, .clock_get = do_posix_clock_monotonic_gettime, .clock_set = do_posix_clock_monotonic_settime }; @@ -204,24 +209,14 @@ static void tstojiffie(struct timespec *tp, int res, u64 *jiff) } /* - * A note on jiffy overflow: It is possible for the system to - * have been up long enough for the jiffies quanity to overflow. - * In order for correct timer evaluations we require that the - * specified time be somewhere between now and now + (max - * unsigned int/2). Times beyond this will be truncated back to - * this value. This is done in the absolute adjustment code, - * below. Here it is enough to just discard the high order - * bits. - */ - *jiff = (s64)sec * HZ; - /* - * Do the res thing. (Don't forget the add in the declaration of nsec) - */ - nsec -= nsec % res; - /* - * Split to jiffie and sub jiffie - */ - *jiff += nsec / (NSEC_PER_SEC / HZ); + * The scaling constants are defined in + * The difference between there and here is that we do the + * res rounding and compute a 64-bit result (well so does that + * but it then throws away the high bits). + */ + *jiff = (mpy_l_X_l_ll(sec, SEC_CONVERSION) + + (mpy_l_X_l_ll(nsec, NSEC_CONVERSION) >> + (NSEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC; } static void schedule_next_timer(struct k_itimer *timr) @@ -1232,7 +1227,6 @@ do_clock_nanosleep(clockid_t which_clock, int flags, struct timespec *tsave) finish_wait(&nanosleep_abs_wqueue, &abs_wqueue); if (left > (s64)0) { - unsigned long rmd; /* * Always restart abs calls from scratch to pick up any @@ -1241,9 +1235,10 @@ do_clock_nanosleep(clockid_t which_clock, int flags, struct timespec *tsave) if (abs) return -ERESTARTNOHAND; - tsave->tv_sec = div_long_long_rem(left, HZ, &rmd); - tsave->tv_nsec = rmd * (NSEC_PER_SEC / HZ); - + left *= TICK_NSEC(TICK_USEC); + tsave->tv_sec = div_long_long_rem(left, + NSEC_PER_SEC, + &tsave->tv_nsec); restart_block->fn = clock_nanosleep_restart; restart_block->arg0 = which_clock; restart_block->arg1 = (unsigned long)tsave; -- cgit v1.2.3 From e2ac56f6344ddf58cbabbb82e478c440857a770d Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 20 Apr 2003 00:26:34 -0700 Subject: [PATCH] get_offset_pit and do_timer_overflow vs IRQ locking From: john stultz , Alexander Atanasov We want to make sure we update jiffies_p and count_p atomically. So I'm inserting the spin_unlock_irqrestore() after we update count_p, rather then just before. --- arch/i386/kernel/timers/timer_pit.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/arch/i386/kernel/timers/timer_pit.c b/arch/i386/kernel/timers/timer_pit.c index cdcc95b74aaf..7d521228c28a 100644 --- a/arch/i386/kernel/timers/timer_pit.c +++ b/arch/i386/kernel/timers/timer_pit.c @@ -54,7 +54,7 @@ static void delay_pit(unsigned long loops) } -/* This function must be called with interrupts disabled +/* This function must be called with xtime_lock held. 
* It was inspired by Steve McCanne's microtime-i386 for BSD. -- jrs * * However, the pc-audio speaker driver changes the divisor so that @@ -93,7 +93,7 @@ static unsigned long get_offset_pit(void) static unsigned long jiffies_p = 0; /* - * cache volatile jiffies temporarily; we have IRQs turned off. + * cache volatile jiffies temporarily; we have xtime_lock. */ unsigned long jiffies_t; @@ -119,8 +119,6 @@ static unsigned long get_offset_pit(void) count = LATCH - 1; } - spin_unlock_irqrestore(&i8253_lock, flags); - /* * avoiding timer inconsistencies (they are rare, but they happen)... * there are two kinds of problems that must be avoided here: @@ -130,7 +128,6 @@ static unsigned long get_offset_pit(void) * (see c't 95/10 page 335 for Neptun bug.) */ - if( jiffies_t == jiffies_p ) { if( count > count_p ) { /* the nutcase */ @@ -141,6 +138,8 @@ static unsigned long get_offset_pit(void) count_p = count; + spin_unlock_irqrestore(&i8253_lock, flags); + count = ((LATCH-1) - count) * TICK_SIZE; count = (count + LATCH/2) / LATCH; -- cgit v1.2.3 From d9a4b6c50867f83bb0d9c2d3cc08990075204eac Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 20 Apr 2003 00:26:42 -0700 Subject: [PATCH] detect_lost_tick locking fixes From: john stultz This patch fixes a race in the timer_interrupt code caused by detect_lost_tick(). Since we're doing lost-tick compensation outside timer->mark_offset, time can pass between time-source reads which can cause gettimeofday inconsistencies. Additionally detect_lost_tick() was broken for the PIT case, since the whole point of detect_lost_tick() is to interpolate between two time sources to find inconsistencies. Additionally this could cause xtime_lock seq_lock reader starvation which has been causing machine hangs for SMP boxes that use the PIT as a time source. This patch fixes the described race by removing detect_lost_tick() and instead implementing the lost tick detection code inside mark_offset(). Some of the divs and mods being added here might concern folks, but by not calling timer->get_offset() in detect_lost_tick() we eliminate much of the same math. I did some simple cycle counting and the new code comes out on average equivalent or faster. --- arch/i386/kernel/time.c | 36 --------------------------------- arch/i386/kernel/timers/timer_cyclone.c | 18 +++++++++++++++++ arch/i386/kernel/timers/timer_tsc.c | 26 ++++++++++++++++++++++++ 3 files changed, 44 insertions(+), 36 deletions(-) diff --git a/arch/i386/kernel/time.c b/arch/i386/kernel/time.c index f5e9bf60dbb0..21f7f88a5dae 100644 --- a/arch/i386/kernel/time.c +++ b/arch/i386/kernel/time.c @@ -240,41 +240,6 @@ static inline void do_timer_interrupt(int irq, void *dev_id, struct pt_regs *reg #endif } -/* - * Lost tick detection and compensation - */ -static inline void detect_lost_tick(void) -{ - /* read time since last interrupt */ - unsigned long delta = timer->get_offset(); - static unsigned long dbg_print; - - /* check if delta is greater then two ticks */ - if(delta >= 2*(1000000/HZ)){ - - /* - * only print debug info first 5 times - */ - /* - * AKPM: disable this for now; it's nice, but irritating. - */ - if (0 && dbg_print < 5) { - printk(KERN_WARNING "\nWarning! 
Detected %lu " - "micro-second gap between interrupts.\n", - delta); - printk(KERN_WARNING " Compensating for %lu lost " - "ticks.\n", - delta/(1000000/HZ)-1); - dump_stack(); - dbg_print++; - } - /* calculate number of missed ticks */ - delta = delta/(1000000/HZ)-1; - jiffies += delta; - } - -} - /* * This is the same as the above, except we _also_ save the current * Time Stamp Counter value at the time of the timer interrupt, so that @@ -291,7 +256,6 @@ void timer_interrupt(int irq, void *dev_id, struct pt_regs *regs) */ write_seqlock(&xtime_lock); - detect_lost_tick(); timer->mark_offset(); do_timer_interrupt(irq, NULL, regs); diff --git a/arch/i386/kernel/timers/timer_cyclone.c b/arch/i386/kernel/timers/timer_cyclone.c index cbe74fd46491..7cdf4d7bca6f 100644 --- a/arch/i386/kernel/timers/timer_cyclone.c +++ b/arch/i386/kernel/timers/timer_cyclone.c @@ -18,6 +18,7 @@ #include extern spinlock_t i8253_lock; +extern unsigned long jiffies; extern unsigned long calibrate_tsc(void); /* Number of usecs that the last interrupt was delayed */ @@ -46,6 +47,8 @@ static rwlock_t monotonic_lock = RW_LOCK_UNLOCKED; static void mark_offset_cyclone(void) { + unsigned long lost, delay; + unsigned long delta = last_cyclone_low; int count; unsigned long long this_offset, last_offset; @@ -62,6 +65,15 @@ static void mark_offset_cyclone(void) count |= inb(0x40) << 8; spin_unlock(&i8253_lock); + /* lost tick compensation */ + delta = last_cyclone_low - delta; + delta /=(CYCLONE_TIMER_FREQ/1000000); + delta += delay_at_last_interrupt; + lost = delta/(1000000/HZ); + delay = delta%(1000000/HZ); + if(lost >= 2) + jiffies += lost-1; + /* update the monotonic base value */ this_offset = ((unsigned long long)last_cyclone_high<<32)|last_cyclone_low; monotonic_base += (this_offset - last_offset) & CYCLONE_TIMER_MASK; @@ -70,6 +82,12 @@ static void mark_offset_cyclone(void) /* calculate delay_at_last_interrupt */ count = ((LATCH-1) - count) * TICK_SIZE; delay_at_last_interrupt = (count + LATCH/2) / LATCH; + + /* catch corner case where tick rollover + * occured between cyclone and pit reads + */ + if(abs(delay - delay_at_last_interrupt) > 900) + jiffies++; } static unsigned long get_offset_cyclone(void) diff --git a/arch/i386/kernel/timers/timer_tsc.c b/arch/i386/kernel/timers/timer_tsc.c index fad90c000cc6..e7c126d30e42 100644 --- a/arch/i386/kernel/timers/timer_tsc.c +++ b/arch/i386/kernel/timers/timer_tsc.c @@ -21,6 +21,7 @@ int tsc_disable __initdata = 0; extern spinlock_t i8253_lock; +extern unsigned long jiffies; static int use_tsc; /* Number of usecs that the last interrupt was delayed */ @@ -117,6 +118,8 @@ static unsigned long long monotonic_clock_tsc(void) static void mark_offset_tsc(void) { + unsigned long lost,delay; + unsigned long delta = last_tsc_low; int count; int countmp; static int count1 = 0; @@ -161,6 +164,23 @@ static void mark_offset_tsc(void) } } + /* lost tick compensation */ + delta = last_tsc_low - delta; + { + register unsigned long eax, edx; + eax = delta; + __asm__("mull %2" + :"=a" (eax), "=d" (edx) + :"rm" (fast_gettimeoffset_quotient), + "0" (eax)); + delta = edx; + } + delta += delay_at_last_interrupt; + lost = delta/(1000000/HZ); + delay = delta%(1000000/HZ); + if(lost >= 2) + jiffies += lost-1; + /* update the monotonic base value */ this_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low; monotonic_base += cycles_2_ns(this_offset - last_offset); @@ -169,6 +189,12 @@ static void mark_offset_tsc(void) /* calculate delay_at_last_interrupt */ count = ((LATCH-1) - count) * 
TICK_SIZE; delay_at_last_interrupt = (count + LATCH/2) / LATCH; + + /* catch corner case where tick rollover + * occured between tsc and pit reads + */ + if(abs(delay - delay_at_last_interrupt) > 900) + jiffies++; } static void delay_tsc(unsigned long loops) -- cgit v1.2.3 From 72689e6707c297b721969f8e6ae6059aa92f92ed Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 20 Apr 2003 00:26:49 -0700 Subject: [PATCH] Minor fix for driver/serial/core.c From: Jean Tourrilhes The following command will do nothing at all on 2.5.X : setserial /dev/ttyS0 uart none --- drivers/serial/core.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/serial/core.c b/drivers/serial/core.c index b461093a13cc..c6207f0737b7 100644 --- a/drivers/serial/core.c +++ b/drivers/serial/core.c @@ -782,8 +782,12 @@ uart_set_info(struct uart_state *state, struct serial_struct *newinfo) /* * Claim and map the new regions */ - if (port->type != PORT_UNKNOWN) + if (port->type != PORT_UNKNOWN) { retval = port->ops->request_port(port); + } else { + /* Always success - Jean II */ + retval = 0; + } /* * If we fail to request resources for the -- cgit v1.2.3 From 5da505b18a17e7007fb7300d3557adbdbeab8193 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 20 Apr 2003 00:26:55 -0700 Subject: [PATCH] keyboard.c Fix SAK in raw mode From: Chris Heath Trivial fix to get the SAK key working in raw and medium raw modes. Patch is against kernel 2.5.67. --- drivers/char/keyboard.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/char/keyboard.c b/drivers/char/keyboard.c index ed453a23e4de..ef1fd1b6b0df 100644 --- a/drivers/char/keyboard.c +++ b/drivers/char/keyboard.c @@ -601,7 +601,7 @@ static void k_spec(struct vc_data *vc, unsigned char value, char up_flag, struct return; if ((kbd->kbdmode == VC_RAW || kbd->kbdmode == VC_MEDIUMRAW) && - value != K_SAK) + value != KVAL(K_SAK)) return; /* SAK is allowed even in raw mode */ fn_handler[value](vc, regs); } -- cgit v1.2.3 From 609b0188b7b8d23aad00c5e995461a56f47dc5df Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 20 Apr 2003 00:27:02 -0700 Subject: [PATCH] Make PCI scanning order the same as 2.4 From: Chuck Ebbert <76306.1226@compuserve.com> 2.4 builds its global PCI device list in breadth-first order. 2.5 is doing the scan that way but defers the construction of the global list until later and then does it depth-first. This causes devices to found in different order by drivers. The below fixed that problem for me. Russell King has acked this change. --- drivers/pci/bus.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/pci/bus.c b/drivers/pci/bus.c index b37429ad5e65..bee04f53a84b 100644 --- a/drivers/pci/bus.c +++ b/drivers/pci/bus.c @@ -75,7 +75,8 @@ pci_bus_alloc_resource(struct pci_bus *bus, struct resource *res, * Add newly discovered PCI devices (which are on the bus->devices * list) to the global PCI device list, add the sysfs and procfs * entries. Where a bridge is found, add the discovered bus to - * the parents list of child buses, and recurse. + * the parents list of child buses, and recurse (breadth-first + * to be compatible with 2.4) * * Call hotplug for each new devices. 
*/ @@ -98,6 +99,12 @@ void __devinit pci_bus_add_devices(struct pci_bus *bus) #endif pci_create_sysfs_dev_files(dev); + } + + list_for_each_entry(dev, &bus->devices, bus_list) { + + BUG_ON(list_empty(&dev->global_list)); + /* * If there is an unattached subordinate bus, attach * it and then scan for unattached PCI devices. -- cgit v1.2.3 From 26fbf90f0fe0a7992597e61397747d454b8875e1 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 20 Apr 2003 00:27:09 -0700 Subject: [PATCH] Turn on NUMA rebalancing From: "Martin J. Bligh" I'd forgotten that I'd set this to only fire every 20s in the past, because it would rebalance too agressively. That seems to be fixed now, so we should turn it back on. --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched.c b/kernel/sched.c index 43b08b5ec658..ae2dbdf33d7d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1091,7 +1091,7 @@ out: #define IDLE_REBALANCE_TICK (HZ/1000 ?: 1) #define BUSY_REBALANCE_TICK (HZ/5 ?: 1) #define IDLE_NODE_REBALANCE_TICK (IDLE_REBALANCE_TICK * 5) -#define BUSY_NODE_REBALANCE_TICK (BUSY_REBALANCE_TICK * 100) +#define BUSY_NODE_REBALANCE_TICK (BUSY_REBALANCE_TICK * 2) #ifdef CONFIG_NUMA static void balance_node(runqueue_t *this_rq, int idle, int this_cpu) -- cgit v1.2.3 From 5549174d1698a7406bea0f3d4eb395e483f5fb7b Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 20 Apr 2003 00:27:14 -0700 Subject: [PATCH] Move __set_page_dirty_buffers to fs/buffer.c From: William Lee Irwin III Move __set_page_dirty_buffers() to fs/buffer.c, as per the FIXME. --- fs/buffer.c | 79 +++++++++++++++++++++++++++++++++++++++++++++++++++ mm/page-writeback.c | 82 ----------------------------------------------------- 2 files changed, 79 insertions(+), 82 deletions(-) diff --git a/fs/buffer.c b/fs/buffer.c index 006892c691cd..c40bf36afc0b 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -775,6 +775,85 @@ void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode) } EXPORT_SYMBOL(mark_buffer_dirty_inode); +/* + * Add a page to the dirty page list. + * + * It is a sad fact of life that this function is called from several places + * deeply under spinlocking. It may not sleep. + * + * If the page has buffers, the uptodate buffers are set dirty, to preserve + * dirty-state coherency between the page and the buffers. It the page does + * not have buffers then when they are later attached they will all be set + * dirty. + * + * The buffers are dirtied before the page is dirtied. There's a small race + * window in which a writepage caller may see the page cleanness but not the + * buffer dirtiness. That's fine. If this code were to set the page dirty + * before the buffers, a concurrent writepage caller could clear the page dirty + * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean + * page on the dirty page list. + * + * There is also a small window where the page is dirty, and not on dirty_pages. + * Also a possibility that by the time the page is added to dirty_pages, it has + * been set clean. The page lists are somewhat approximate in this regard. + * It's better to have clean pages accidentally attached to dirty_pages than to + * leave dirty pages attached to clean_pages. + * + * We use private_lock to lock against try_to_free_buffers while using the + * page's buffer list. Also use this to protect against clean buffers being + * added to the page after it was set dirty. + * + * FIXME: may need to call ->reservepage here as well. 
That's rather up to the + * address_space though. + * + * For now, we treat swapper_space specially. It doesn't use the normal + * block a_ops. + */ +int __set_page_dirty_buffers(struct page *page) +{ + struct address_space * const mapping = page->mapping; + int ret = 0; + + if (mapping == NULL) { + SetPageDirty(page); + goto out; + } + + if (!PageUptodate(page)) + buffer_error(); + + spin_lock(&mapping->private_lock); + if (page_has_buffers(page)) { + struct buffer_head *head = page_buffers(page); + struct buffer_head *bh = head; + + do { + if (buffer_uptodate(bh)) + set_buffer_dirty(bh); + else + buffer_error(); + bh = bh->b_this_page; + } while (bh != head); + } + spin_unlock(&mapping->private_lock); + + if (!TestSetPageDirty(page)) { + spin_lock(&mapping->page_lock); + if (page->mapping) { /* Race with truncate? */ + if (!mapping->backing_dev_info->memory_backed) + inc_page_state(nr_dirty); + list_del(&page->list); + list_add(&page->list, &mapping->dirty_pages); + } + spin_unlock(&mapping->page_lock); + __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); + } + +out: + return ret; +} +EXPORT_SYMBOL(__set_page_dirty_buffers); + /* * Write out and wait upon a list of buffers. * diff --git a/mm/page-writeback.c b/mm/page-writeback.c index aaa70d02b859..c33c6a207426 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -461,88 +461,6 @@ int write_one_page(struct page *page, int wait) } EXPORT_SYMBOL(write_one_page); -/* - * Add a page to the dirty page list. - * - * It is a sad fact of life that this function is called from several places - * deeply under spinlocking. It may not sleep. - * - * If the page has buffers, the uptodate buffers are set dirty, to preserve - * dirty-state coherency between the page and the buffers. It the page does - * not have buffers then when they are later attached they will all be set - * dirty. - * - * The buffers are dirtied before the page is dirtied. There's a small race - * window in which a writepage caller may see the page cleanness but not the - * buffer dirtiness. That's fine. If this code were to set the page dirty - * before the buffers, a concurrent writepage caller could clear the page dirty - * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean - * page on the dirty page list. - * - * There is also a small window where the page is dirty, and not on dirty_pages. - * Also a possibility that by the time the page is added to dirty_pages, it has - * been set clean. The page lists are somewhat approximate in this regard. - * It's better to have clean pages accidentally attached to dirty_pages than to - * leave dirty pages attached to clean_pages. - * - * We use private_lock to lock against try_to_free_buffers while using the - * page's buffer list. Also use this to protect against clean buffers being - * added to the page after it was set dirty. - * - * FIXME: may need to call ->reservepage here as well. That's rather up to the - * address_space though. - * - * For now, we treat swapper_space specially. It doesn't use the normal - * block a_ops. 
- * - * FIXME: this should move over to fs/buffer.c - buffer_heads have no business in mm/ - */ -#include -int __set_page_dirty_buffers(struct page *page) -{ - struct address_space * const mapping = page->mapping; - int ret = 0; - - if (mapping == NULL) { - SetPageDirty(page); - goto out; - } - - if (!PageUptodate(page)) - buffer_error(); - - spin_lock(&mapping->private_lock); - if (page_has_buffers(page)) { - struct buffer_head *head = page_buffers(page); - struct buffer_head *bh = head; - - do { - if (buffer_uptodate(bh)) - set_buffer_dirty(bh); - else - buffer_error(); - bh = bh->b_this_page; - } while (bh != head); - } - spin_unlock(&mapping->private_lock); - - if (!TestSetPageDirty(page)) { - spin_lock(&mapping->page_lock); - if (page->mapping) { /* Race with truncate? */ - if (!mapping->backing_dev_info->memory_backed) - inc_page_state(nr_dirty); - list_del(&page->list); - list_add(&page->list, &mapping->dirty_pages); - } - spin_unlock(&mapping->page_lock); - __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); - } - -out: - return ret; -} -EXPORT_SYMBOL(__set_page_dirty_buffers); - /* * For address_spaces which do not use buffers. Just set the page's dirty bit * and move it to the dirty_pages list. Also perform space reservation if -- cgit v1.2.3 From cda55f337ee4197fed8154008512aaa56d09faa4 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 20 Apr 2003 00:27:23 -0700 Subject: [PATCH] Clean up various buffer-head dependencies From: William Lee Irwin III Remove page_has_buffers() from various functions, document the dependencies on buffer_head.h from other files besides filemap.c, and s/this file/core VM/ in filemap.c --- mm/filemap.c | 5 ++--- mm/swap.c | 2 +- mm/swap_state.c | 5 ++--- mm/swapfile.c | 5 ++--- mm/vmscan.c | 3 ++- 5 files changed, 9 insertions(+), 11 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 884a9f50c6fe..40008f8f3626 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -31,12 +31,11 @@ * This is needed for the following functions: * - try_to_release_page * - block_invalidatepage - * - page_has_buffers * - generic_osync_inode * - * FIXME: remove all knowledge of the buffer layer from this file + * FIXME: remove all knowledge of the buffer layer from the core VM */ -#include +#include /* for generic_osync_inode */ #include #include diff --git a/mm/swap.c b/mm/swap.c index eb71588c1f1a..f6442275cda5 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -21,7 +21,7 @@ #include #include #include -#include +#include /* for try_to_release_page() */ #include /* How many pages do we try to swap or page in/out together? 
*/ diff --git a/mm/swap_state.c b/mm/swap_state.c index b479ebafa2bd..29198f06fcae 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -13,7 +13,6 @@ #include #include #include -#include /* block_sync_page() */ #include @@ -187,7 +186,7 @@ void delete_from_swap_cache(struct page *page) BUG_ON(!PageLocked(page)); BUG_ON(PageWriteback(page)); - BUG_ON(page_has_buffers(page)); + BUG_ON(PagePrivate(page)); entry.val = page->index; @@ -236,7 +235,7 @@ int move_from_swap_cache(struct page *page, unsigned long index, BUG_ON(!PageLocked(page)); BUG_ON(PageWriteback(page)); - BUG_ON(page_has_buffers(page)); + BUG_ON(PagePrivate(page)); entry.val = page->index; diff --git a/mm/swapfile.c b/mm/swapfile.c index 2271d23d7e7b..d646f7cb5340 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #include #include @@ -300,7 +299,7 @@ int remove_exclusive_swap_page(struct page *page) struct swap_info_struct * p; swp_entry_t entry; - BUG_ON(page_has_buffers(page)); + BUG_ON(PagePrivate(page)); BUG_ON(!PageLocked(page)); if (!PageSwapCache(page)) @@ -355,7 +354,7 @@ void free_swap_and_cache(swp_entry_t entry) if (page) { int one_user; - BUG_ON(page_has_buffers(page)); + BUG_ON(PagePrivate(page)); page_cache_get(page); one_user = (page_count(page) == 2); /* Only cache user (+us), or swap space full? Free it! */ diff --git a/mm/vmscan.c b/mm/vmscan.c index 3d204f882d04..291ce1136c6e 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -22,7 +22,8 @@ #include #include #include -#include /* for try_to_release_page() */ +#include /* for try_to_release_page(), + buffer_heads_over_limit */ #include #include #include -- cgit v1.2.3 From a3efc1fa0cb06bcea5805d384190bba12f4d5fd9 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 20 Apr 2003 00:27:30 -0700 Subject: [PATCH] follow_hugetlb_page fix From: William Lee Irwin III follow_hugetlb_page() drops out of the loop prematurely and fails to take the appropriate refcounts if its starting address was not hugepage-aligned. It looked a bit unclean too, so I rewrote it. This fixes a bug, and more importantly, makes the thing readable by something other than a compiler (e.g. programmers). 
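A concrete example of the bug, as a sketch with i386's huge pages
(HPAGE_SIZE/PAGE_SIZE == 1024 for 4 MB huge pages and 4 KB base pages):
the rewritten loop indexes into the compound page and takes a reference
on every sub-page it returns:

	unsigned long vpfn = vaddr / PAGE_SIZE;
	/* e.g. a start address one base page into a huge page gives
	 * vpfn % 1024 == 1, i.e. the second 4 KB constituent page */
	page = &pte_page(*pte)[vpfn % (HPAGE_SIZE / PAGE_SIZE)];
	get_page(page);

The old loop keyed its continuation test off hugepage alignment of the
starting address, so an unaligned start dropped out after one page, and
no reference was taken at all.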
--- arch/i386/mm/hugetlbpage.c | 52 ++++++++++++++++++++++++++-------------------- 1 file changed, 30 insertions(+), 22 deletions(-) diff --git a/arch/i386/mm/hugetlbpage.c b/arch/i386/mm/hugetlbpage.c index 6b37833292aa..9023cb2a605c 100644 --- a/arch/i386/mm/hugetlbpage.c +++ b/arch/i386/mm/hugetlbpage.c @@ -129,37 +129,45 @@ nomem: int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, struct page **pages, struct vm_area_struct **vmas, - unsigned long *st, int *length, int i) + unsigned long *position, int *length, int i) { - pte_t *ptep, pte; - unsigned long start = *st; - unsigned long pstart; - int len = *length; - struct page *page; + unsigned long vpfn, vaddr = *position; + int remainder = *length; + + WARN_ON(!is_vm_hugetlb_page(vma)); - do { - pstart = start; - ptep = huge_pte_offset(mm, start); - pte = *ptep; + vpfn = vaddr/PAGE_SIZE; + while (vaddr < vma->vm_end && remainder) { -back1: - page = pte_page(pte); if (pages) { - page += ((start & ~HPAGE_MASK) >> PAGE_SHIFT); + pte_t *pte; + struct page *page; + + pte = huge_pte_offset(mm, vaddr); + + /* hugetlb should be locked, and hence, prefaulted */ + WARN_ON(!pte || pte_none(*pte)); + + page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)]; + + WARN_ON(!PageCompound(page)); + get_page(page); pages[i] = page; } + if (vmas) vmas[i] = vma; - i++; - len--; - start += PAGE_SIZE; - if (((start & HPAGE_MASK) == pstart) && len && - (start < vma->vm_end)) - goto back1; - } while (len && start < vma->vm_end); - *length = len; - *st = start; + + vaddr += PAGE_SIZE; + ++vpfn; + --remainder; + ++i; + } + + *length = remainder; + *position = vaddr; + return i; } -- cgit v1.2.3 From 03b8371038efee455f5c6efe9cac1b59350f9d19 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 20 Apr 2003 00:27:37 -0700 Subject: [PATCH] hugetlb math overflow fix From: William Lee Irwin III And this one fixes an overflow when there is more than 4GB of hugetlb. --- arch/i386/mm/hugetlbpage.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/i386/mm/hugetlbpage.c b/arch/i386/mm/hugetlbpage.c index 9023cb2a605c..c7259ef89bd2 100644 --- a/arch/i386/mm/hugetlbpage.c +++ b/arch/i386/mm/hugetlbpage.c @@ -482,9 +482,7 @@ int hugetlb_report_meminfo(char *buf) int is_hugepage_mem_enough(size_t size) { - if (size > (htlbpagemem << HPAGE_SHIFT)) - return 0; - return 1; + return (size + ~HPAGE_MASK)/HPAGE_SIZE <= htlbpagemem; } /* -- cgit v1.2.3 From 061fa91f507b01da74fbfb71620dc48cde631fa3 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 20 Apr 2003 00:27:44 -0700 Subject: [PATCH] ATI Mach64 build fix From: Geert Uytterhoeven Atyfb: Add missing parts of reversal of Mobility changes, allowing ATI Mach64 GX support to compile again. 
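
A rough sketch of why stale signatures stop the build (the ops-table shape
here is inferred from the hunks below, not copied from the driver):

	/* Every var_to_pll implementation must match the function-pointer
	 * type in the per-chip ops table; a leftover (u32 bpp, u32 width)
	 * variant no longer converts to this type, so the file fails to
	 * compile until all implementations agree. */
	struct aty_pll_ops {
		int (*var_to_pll)(const struct fb_info *info, u32 vclk_per,
				  u8 bpp, union aty_pll *pll);
		/* ... */
	};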
---
 drivers/video/aty/mach64_gx.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/video/aty/mach64_gx.c b/drivers/video/aty/mach64_gx.c
index a27b9bcd8859..85168a32eea8 100644
--- a/drivers/video/aty/mach64_gx.c
+++ b/drivers/video/aty/mach64_gx.c
@@ -119,7 +119,7 @@ static int aty_set_dac_514(const struct fb_info *info,
 }

 static int aty_var_to_pll_514(const struct fb_info *info, u32 vclk_per,
-			      u32 bpp, u32 width, union aty_pll *pll)
+			      u8 bpp, union aty_pll *pll)
 {
 	/*
 	 *  FIXME: use real calculations instead of using fixed values from the old
@@ -338,7 +338,7 @@ const struct aty_dac_ops aty_dac_att21c498 = {
  */

 static int aty_var_to_pll_18818(const struct fb_info *info, u32 vclk_per,
-				u32 bpp, u32 width, union aty_pll *pll)
+				u8 bpp, union aty_pll *pll)
 {
 	u32 MHz100;		/* in 0.01 MHz */
 	u32 program_bits;
@@ -494,7 +494,7 @@ const struct aty_pll_ops aty_pll_ati18818_1 = {
  */

 static int aty_var_to_pll_1703(const struct fb_info *info, u32 vclk_per,
-			       u32 bpp, u32 width, union aty_pll *pll)
+			       u8 bpp, union aty_pll *pll)
 {
 	u32 mhz100;		/* in 0.01 MHz */
 	u32 program_bits;
@@ -610,7 +610,7 @@ const struct aty_pll_ops aty_pll_stg1703 = {
  */

 static int aty_var_to_pll_8398(const struct fb_info *info, u32 vclk_per,
-			       u32 bpp, u32 width, union aty_pll *pll)
+			       u8 bpp, union aty_pll *pll)
 {
 	u32 tempA, tempB, fOut, longMHz100, diff, preDiff;

@@ -734,7 +734,7 @@ const struct aty_pll_ops aty_pll_ch8398 = {
  */

 static int aty_var_to_pll_408(const struct fb_info *info, u32 vclk_per,
-			      u32 bpp, u32 width, union aty_pll *pll)
+			      u8 bpp, union aty_pll *pll)
 {
 	u32 mhz100;		/* in 0.01 MHz */
 	u32 program_bits;
-- cgit v1.2.3

From d637ceb017026cfc44d6cef9e1541185757cd78c Mon Sep 17 00:00:00 2001
From: Andrew Morton
Date: Sun, 20 Apr 2003 00:27:50 -0700
Subject: [PATCH] quotactl(): sync all quotas

From: Jan Kara

I'm resending a patch which implements a quotactl(2) call for syncing all
devices.  Particularly, it allows the caller not to specify the device for
syncing, and in that case quotas on all the devices are written.  The patch
is rather trivial (mostly moving the code).
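
For illustration, a userspace caller could then sync quotas on every
filesystem in one call (a sketch; error handling omitted):

	#include <sys/quota.h>
	#include <stddef.h>

	/* A NULL device name with Q_SYNC now syncs quotas everywhere;
	 * naming a device still syncs just that filesystem. */
	int sync_all_quotas(void)
	{
		return quotactl(QCMD(Q_SYNC, USRQUOTA), NULL, 0, NULL);
	}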
--- fs/dquot.c | 44 -------------------------------- fs/quota.c | 86 +++++++++++++++++++++++++++++++++++++++++++++++++------------- 2 files changed, 68 insertions(+), 62 deletions(-) diff --git a/fs/dquot.c b/fs/dquot.c index 737b9f1d54ab..f3c0f63265c3 100644 --- a/fs/dquot.c +++ b/fs/dquot.c @@ -345,50 +345,6 @@ restart: return 0; } -static struct super_block *get_super_to_sync(int type) -{ - struct list_head *head; - int cnt, dirty; - -restart: - spin_lock(&sb_lock); - list_for_each(head, &super_blocks) { - struct super_block *sb = list_entry(head, struct super_block, s_list); - - for (cnt = 0, dirty = 0; cnt < MAXQUOTAS; cnt++) - if ((type == cnt || type == -1) && sb_has_quota_enabled(sb, cnt) - && info_any_dquot_dirty(&sb_dqopt(sb)->info[cnt])) - dirty = 1; - if (!dirty) - continue; - sb->s_count++; - spin_unlock(&sb_lock); - down_read(&sb->s_umount); - if (!sb->s_root) { - drop_super(sb); - goto restart; - } - return sb; - } - spin_unlock(&sb_lock); - return NULL; -} - -void sync_dquots(struct super_block *sb, int type) -{ - if (sb) { - if (sb->s_qcop->quota_sync) - sb->s_qcop->quota_sync(sb, type); - } - else { - while ((sb = get_super_to_sync(type))) { - if (sb->s_qcop->quota_sync) - sb->s_qcop->quota_sync(sb, type); - drop_super(sb); - } - } -} - /* Free unused dquots from cache */ static void prune_dqcache(int count) { diff --git a/fs/quota.c b/fs/quota.c index 3daa61901363..ce929f581b53 100644 --- a/fs/quota.c +++ b/fs/quota.c @@ -19,8 +19,10 @@ static int check_quotactl_valid(struct super_block *sb, int type, int cmd, qid_t { if (type >= MAXQUOTAS) return -EINVAL; + if (!sb && cmd != Q_SYNC) + return -ENODEV; /* Is operation supported? */ - if (!sb->s_qcop) + if (sb && !sb->s_qcop) return -ENOSYS; switch (cmd) { @@ -51,7 +53,7 @@ static int check_quotactl_valid(struct super_block *sb, int type, int cmd, qid_t return -ENOSYS; break; case Q_SYNC: - if (!sb->s_qcop->quota_sync) + if (sb && !sb->s_qcop->quota_sync) return -ENOSYS; break; case Q_XQUOTAON: @@ -102,6 +104,50 @@ static int check_quotactl_valid(struct super_block *sb, int type, int cmd, qid_t return security_quotactl (cmd, type, id, sb); } +static struct super_block *get_super_to_sync(int type) +{ + struct list_head *head; + int cnt, dirty; + +restart: + spin_lock(&sb_lock); + list_for_each(head, &super_blocks) { + struct super_block *sb = list_entry(head, struct super_block, s_list); + + for (cnt = 0, dirty = 0; cnt < MAXQUOTAS; cnt++) + if ((type == cnt || type == -1) && sb_has_quota_enabled(sb, cnt) + && info_any_dquot_dirty(&sb_dqopt(sb)->info[cnt])) + dirty = 1; + if (!dirty) + continue; + sb->s_count++; + spin_unlock(&sb_lock); + down_read(&sb->s_umount); + if (!sb->s_root) { + drop_super(sb); + goto restart; + } + return sb; + } + spin_unlock(&sb_lock); + return NULL; +} + +void sync_dquots(struct super_block *sb, int type) +{ + if (sb) { + if (sb->s_qcop->quota_sync) + sb->s_qcop->quota_sync(sb, type); + } + else { + while ((sb = get_super_to_sync(type))) { + if (sb->s_qcop->quota_sync) + sb->s_qcop->quota_sync(sb, type); + drop_super(sb); + } + } +} + /* Copy parameters and call proper function */ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, caddr_t addr) { @@ -167,7 +213,8 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, cadd return sb->s_qcop->set_dqblk(sb, type, id, &idq); } case Q_SYNC: - return sb->s_qcop->quota_sync(sb, type); + sync_dquots(sb, type); + return 0; case Q_XQUOTAON: case Q_XQUOTAOFF: @@ -222,27 +269,30 @@ asmlinkage long 
sys_quotactl(unsigned int cmd, const char *special, qid_t id, ca struct super_block *sb = NULL; struct block_device *bdev; char *tmp; - int ret = -ENODEV; + int ret; cmds = cmd >> SUBCMDSHIFT; type = cmd & SUBCMDMASK; - tmp = getname(special); - if (IS_ERR(tmp)) - return PTR_ERR(tmp); - bdev = lookup_bdev(tmp); - putname(tmp); - if (IS_ERR(bdev)) - return PTR_ERR(bdev); - sb = get_super(bdev); - bdput(bdev); + if (cmds != Q_SYNC || special) { + tmp = getname(special); + if (IS_ERR(tmp)) + return PTR_ERR(tmp); + bdev = lookup_bdev(tmp); + putname(tmp); + if (IS_ERR(bdev)) + return PTR_ERR(bdev); + sb = get_super(bdev); + bdput(bdev); + if (!sb) + return -ENODEV; + } - if (sb) { - ret = check_quotactl_valid(sb, type, cmds, id); - if (ret >= 0) - ret = do_quotactl(sb, type, cmds, id, addr); + ret = check_quotactl_valid(sb, type, cmds, id); + if (ret >= 0) + ret = do_quotactl(sb, type, cmds, id, addr); + if (sb) drop_super(sb); - } return ret; } -- cgit v1.2.3 From bb4552505ff39c9f3dd978c143fa5d258cfd3bd6 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 20 Apr 2003 00:27:57 -0700 Subject: [PATCH] AIO mmap fix From: Badari Pulavarty Here is a small bug fix for AIO. get_user_pages() takes number of pages to map as argument. (not in bytes) --- fs/aio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/aio.c b/fs/aio.c index a70c898244a6..c5627b387b2e 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -148,7 +148,7 @@ static int aio_setup_ring(struct kioctx *ctx) dprintk("mmap address: 0x%08lx\n", info->mmap_base); info->nr_pages = get_user_pages(current, ctx->mm, - info->mmap_base, info->mmap_size, + info->mmap_base, nr_pages, 1, 0, info->ring_pages, NULL); up_write(&ctx->mm->mmap_sem); -- cgit v1.2.3 From efbb77b282f9173b4c183aa8ae30772bcb5e580f Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 20 Apr 2003 00:28:04 -0700 Subject: [PATCH] shmdt() speedup From: William Lee Irwin III Micro-optimize sys_shmdt(). There are methods of exploiting knowledge of the vma's being searched to restrict the search space. These are: (1) shm mappings always start their lives at file offset 0, so only vma's above shmaddr need be considered. find_vma() can be used to seek to the proper position in mm->mmap in O(lg(n)) time. (2) The search is for a vma which could be a fragment of a broken-up shm mapping, which would have been created starting at shmaddr with vm_pgoff 0 and then continued no further into userspace than shmaddr + size. So after having found an initial vma, find the size of the shm segment it maps to calculate an upper bound to the virtualspace that needs to be searched. (3) mremap() would have caused the original checks to miss vma's mapping the shm segment if shmaddr were the original address at which the shm segments were attached. This does no better and no worse than the original code in that situation. (4) If the chain of references in vma->vm_file->f_dentry->d_inode->i_size is not guaranteed by refcounting and/or the shm code then this is oopsable; AFAICT an inode is always allocated. --- ipc/shm.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 52 insertions(+), 7 deletions(-) diff --git a/ipc/shm.c b/ipc/shm.c index e97264937900..c822dc7872f5 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -737,21 +737,66 @@ out: * detach and kill segment if marked destroyed. * The work is done in shm_close. 
 */
-asmlinkage long sys_shmdt (char *shmaddr)
+asmlinkage long sys_shmdt(char *shmaddr)
 {
 	struct mm_struct *mm = current->mm;
-	struct vm_area_struct *shmd, *shmdnext;
+	struct vm_area_struct *vma, *next;
+	unsigned long addr = (unsigned long)shmaddr;
+	loff_t size = 0;
 	int retval = -EINVAL;

 	down_write(&mm->mmap_sem);
-	for (shmd = mm->mmap; shmd; shmd = shmdnext) {
-		shmdnext = shmd->vm_next;
-		if ((shmd->vm_ops == &shm_vm_ops || (shmd->vm_flags & VM_HUGETLB))
-		    && shmd->vm_start - (shmd->vm_pgoff << PAGE_SHIFT) == (ulong) shmaddr) {
-			do_munmap(mm, shmd->vm_start, shmd->vm_end - shmd->vm_start);
+
+	/*
+	 * If it had been mremap()'d, the starting address would not
+	 * match the usual checks anyway. So assume all vma's are
+	 * above the starting address given.
+	 */
+	vma = find_vma(mm, addr);
+
+	while (vma) {
+		next = vma->vm_next;
+
+		/*
+		 * Check if the starting address would match, i.e. it's
+		 * a fragment created by mprotect() and/or munmap(), or it
+		 * otherwise starts at this address with no hassles.
+		 */
+		if ((vma->vm_ops == &shm_vm_ops || is_vm_hugetlb_page(vma)) &&
+		    (vma->vm_start - addr)/PAGE_SIZE == vma->vm_pgoff) {
+
+
+			size = vma->vm_file->f_dentry->d_inode->i_size;
+			do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start);
+			/*
+			 * We discovered the size of the shm segment, so
+			 * break out of here and fall through to the next
+			 * loop that uses the size information to stop
+			 * searching for matching vma's.
+			 */
 			retval = 0;
+			vma = next;
+			break;
 		}
+		vma = next;
 	}
+
+	/*
+	 * We need look no further than the maximum address a fragment
+	 * could possibly have landed at. Also cast things to loff_t to
+	 * prevent overflows and make comparisons vs. equal-width types.
+	 */
+	while (vma && (loff_t)(vma->vm_end - addr) <= size) {
+		next = vma->vm_next;
+
+		/* finding a matching vma now does not alter retval */
+		if ((vma->vm_ops == &shm_vm_ops || is_vm_hugetlb_page(vma)) &&
+		    (vma->vm_start - addr)/PAGE_SIZE == vma->vm_pgoff)
+
+			do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start);
+		vma = next;
+	}
+
 	up_write(&mm->mmap_sem);
 	return retval;
 }
-- cgit v1.2.3

From 75908778d91e92ca3c9ed587c4550866f4c903fc Mon Sep 17 00:00:00 2001
From: Andrew Morton
Date: Sun, 20 Apr 2003 00:28:12 -0700
Subject: [PATCH] implement __GFP_REPEAT, __GFP_NOFAIL, __GFP_NORETRY

This is a cleanup patch.

There are quite a lot of places in the kernel which will infinitely retry a
memory allocation.  Generally, they get it wrong.  Some do yield(), the
semantics of which have changed over time.  Some do schedule(), which can
lock up if the caller is SCHED_FIFO/RR.  Some do schedule_timeout(), etc.

And often it is unnecessary, because the page allocator will do the retry
internally anyway.  But we cannot rely on that - this behaviour may change
(-aa and -rmap kernels do not do this, for instance).

So it is good to formalise and to centralise this operation.  If an
allocation specifies __GFP_REPEAT then the page allocator must infinitely
retry the allocation.

The semantics of __GFP_REPEAT are "try harder".  The allocation _may_ fail
(the 2.4 -aa and -rmap VM's do not retry infinitely by default).

The semantics of __GFP_NOFAIL are "cannot fail".  It is a no-op in this VM,
but needs to be honoured (or fix up the callers) if the VM is changed to not
retry infinitely by default.

The semantics of __GFP_NORETRY are "try once, don't loop".  This isn't used
at present (although perhaps it should be, in swapoff).  It is mainly for
completeness.
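
To illustrate the intended usage (a sketch; the variables are placeholders,
only the flags come from the patch below):

	/* Try hard, but the caller can cope with failure: */
	page = alloc_page(GFP_KERNEL | __GFP_REPEAT);

	/* The caller has no failure path at all: */
	bh = kmem_cache_alloc(bh_cachep, GFP_NOFS | __GFP_NOFAIL);

	/* One attempt only - fail fast rather than loop: */
	page = alloc_page(GFP_KERNEL | __GFP_NORETRY);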
--- include/linux/gfp.h | 15 ++++++++++++++- include/linux/slab.h | 2 +- mm/page_alloc.c | 18 +++++++++++++++--- mm/vmscan.c | 5 ++--- 4 files changed, 32 insertions(+), 8 deletions(-) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index c475f7b41e59..ade6d9e97475 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -11,13 +11,26 @@ #define __GFP_DMA 0x01 #define __GFP_HIGHMEM 0x02 -/* Action modifiers - doesn't change the zoning */ +/* + * Action modifiers - doesn't change the zoning + * + * __GFP_REPEAT: Try hard to allocate the memory, but the allocation attempt + * _might_ fail. This depends upon the particular VM implementation. + * + * __GFP_NOFAIL: The VM implementation _must_ retry infinitely: the caller + * cannot handle allocation failures. + * + * __GFP_NORETRY: The VM implementation must not retry indefinitely. + */ #define __GFP_WAIT 0x10 /* Can wait and reschedule? */ #define __GFP_HIGH 0x20 /* Should access emergency pools? */ #define __GFP_IO 0x40 /* Can start physical IO? */ #define __GFP_FS 0x80 /* Can call down to low-level FS? */ #define __GFP_COLD 0x100 /* Cache-cold page required */ #define __GFP_NOWARN 0x200 /* Suppress page allocation failure warning */ +#define __GFP_REPEAT 0x400 /* Retry the allocation. Might fail */ +#define __GFP_NOFAIL 0x800 /* Retry for ever. Cannot fail */ +#define __GFP_NORETRY 0x1000 /* Do not retry. Might fail */ #define GFP_ATOMIC (__GFP_HIGH) #define GFP_NOIO (__GFP_WAIT) diff --git a/include/linux/slab.h b/include/linux/slab.h index bdc5256de12a..603748b9b349 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -22,7 +22,7 @@ typedef struct kmem_cache_s kmem_cache_t; #define SLAB_KERNEL GFP_KERNEL #define SLAB_DMA GFP_DMA -#define SLAB_LEVEL_MASK (__GFP_WAIT|__GFP_HIGH|__GFP_IO|__GFP_FS|__GFP_COLD|__GFP_NOWARN) +#define SLAB_LEVEL_MASK (__GFP_WAIT|__GFP_HIGH|__GFP_IO|__GFP_FS|__GFP_COLD|__GFP_NOWARN|__GFP_REPEAT|__GFP_NOFAIL|__GFP_NORETRY) #define SLAB_NO_GROW 0x00001000UL /* don't grow a cache */ /* flags to pass to kmem_cache_create(). diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c9c7accac1f7..bff7db2296ae 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -536,6 +536,7 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order, struct page *page; int i; int cold; + int do_retry; if (wait) might_sleep(); @@ -626,10 +627,21 @@ rebalance: } /* - * Don't let big-order allocations loop. Yield for kswapd, try again. + * Don't let big-order allocations loop unless the caller explicitly + * requests that. Wait for some write requests to complete then retry. + * + * In this implementation, __GFP_REPEAT means __GFP_NOFAIL, but that + * may not be true in other implementations. */ - if (order <= 3) { - yield(); + do_retry = 0; + if (!(gfp_mask & __GFP_NORETRY)) { + if ((order <= 3) || (gfp_mask & __GFP_REPEAT)) + do_retry = 1; + if (gfp_mask & __GFP_NOFAIL) + do_retry = 1; + } + if (do_retry) { + blk_congestion_wait(WRITE, HZ/50); goto rebalance; } diff --git a/mm/vmscan.c b/mm/vmscan.c index 291ce1136c6e..c1dffbdf2e41 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -805,8 +805,7 @@ shrink_caches(struct zone *classzone, int priority, int *total_scanned, * excessive rotation of the inactive list, which is _supposed_ to be an LRU, * yes? 
*/ -int -try_to_free_pages(struct zone *classzone, +int try_to_free_pages(struct zone *classzone, unsigned int gfp_mask, unsigned int order) { int priority; @@ -838,7 +837,7 @@ try_to_free_pages(struct zone *classzone, blk_congestion_wait(WRITE, HZ/10); shrink_slab(total_scanned, gfp_mask); } - if (gfp_mask & __GFP_FS) + if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) out_of_memory(); return 0; } -- cgit v1.2.3 From 8db50e8b5efcdb7665443f750aef592871761978 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 20 Apr 2003 00:28:19 -0700 Subject: [PATCH] make alloc_buffer_head take gfp_flags - alloc_buffer_head() should take the allocation mode as an arg, and not assume. - Use __GFP_NOFAIL in JBD's call to alloc_buffer_head(). - Remove all the retry code from jbd_kmalloc() - do it via page allocator controls. --- fs/buffer.c | 8 ++++---- fs/jbd/journal.c | 33 +++------------------------------ include/linux/buffer_head.h | 2 +- 3 files changed, 8 insertions(+), 35 deletions(-) diff --git a/fs/buffer.c b/fs/buffer.c index c40bf36afc0b..47e2cf01f30c 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -995,7 +995,7 @@ try_again: head = NULL; offset = PAGE_SIZE; while ((offset -= size) >= 0) { - bh = alloc_buffer_head(); + bh = alloc_buffer_head(GFP_NOFS); if (!bh) goto no_grow; @@ -2346,7 +2346,7 @@ int nobh_prepare_write(struct page *page, unsigned from, unsigned to, if (buffer_uptodate(&map_bh)) continue; /* reiserfs does this */ if (block_start < from || block_end > to) { - struct buffer_head *bh = alloc_buffer_head(); + struct buffer_head *bh = alloc_buffer_head(GFP_NOFS); if (!bh) { ret = -ENOMEM; @@ -2905,9 +2905,9 @@ static void recalc_bh_state(void) buffer_heads_over_limit = (tot > max_buffer_heads); } -struct buffer_head *alloc_buffer_head(void) +struct buffer_head *alloc_buffer_head(int gfp_flags) { - struct buffer_head *ret = kmem_cache_alloc(bh_cachep, GFP_NOFS); + struct buffer_head *ret = kmem_cache_alloc(bh_cachep, gfp_flags); if (ret) { preempt_disable(); __get_cpu_var(bh_accounting).nr++; diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index a429a2aa0ac5..96e1d0bf490f 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c @@ -457,14 +457,8 @@ int journal_write_metadata_buffer(transaction_t *transaction, /* * Right, time to make up the new buffer_head. */ - do { - new_bh = alloc_buffer_head(); - if (!new_bh) { - printk (KERN_NOTICE "%s: ENOMEM at alloc_buffer_head, " - "trying again.\n", __FUNCTION__); - yield(); - } - } while (!new_bh); + new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL); + /* keep subsequent assertions sane */ new_bh->b_state = 0; init_buffer(new_bh, NULL, NULL); @@ -1613,28 +1607,7 @@ void shrink_journal_memory(void) */ void * __jbd_kmalloc (const char *where, size_t size, int flags, int retry) { - void *p; - static unsigned long last_warning; - - while (1) { - p = kmalloc(size, flags); - if (p) - return p; - if (!retry) - return NULL; - /* Log every retry for debugging. Also log them to the - * syslog, but do rate-limiting on the non-debugging - * messages. */ - jbd_debug(1, "ENOMEM in %s, retrying.\n", where); - - if (time_after(jiffies, last_warning + 5*HZ)) { - printk(KERN_NOTICE - "ENOMEM in %s, retrying.\n", where); - last_warning = jiffies; - } - - yield(); - } + return kmalloc(size, flags | (retry ? 
__GFP_NOFAIL : 0)); } /* diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 64a70553d40c..26d71a5fb742 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -172,7 +172,7 @@ struct buffer_head * __getblk(struct block_device *, sector_t, int); void __brelse(struct buffer_head *); void __bforget(struct buffer_head *); struct buffer_head *__bread(struct block_device *, sector_t block, int size); -struct buffer_head *alloc_buffer_head(void); +struct buffer_head *alloc_buffer_head(int gfp_flags); void free_buffer_head(struct buffer_head * bh); void FASTCALL(unlock_buffer(struct buffer_head *bh)); void ll_rw_block(int, int, struct buffer_head * bh[]); -- cgit v1.2.3 From 68b5a30ff3f3bd193ab1c824b7dc9ed464988a41 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 20 Apr 2003 00:28:25 -0700 Subject: [PATCH] use __GFP_REPEAT in pte_alloc_one() Remove all the open-coded retry loops in various architectures, use __GFP_REPEAT. It could be that at some time in the future we change __GFP_REPEAT to give up after ten seconds or so, so all the checks for failed allocations are retained. --- arch/alpha/mm/init.c | 12 +----------- arch/i386/mm/pgtable.c | 32 ++++++++------------------------ arch/ppc/mm/pgtable.c | 30 +++++++++++------------------- arch/sparc/mm/sun4c.c | 2 +- arch/um/kernel/mem.c | 26 ++++++-------------------- include/asm-arm/proc-armv/pgalloc.h | 20 ++------------------ include/asm-cris/pgalloc.h | 2 +- include/asm-ia64/pgalloc.h | 4 ++-- include/asm-m68k/motorola_pgalloc.h | 4 ++-- include/asm-m68k/sun3_pgalloc.h | 4 ++-- include/asm-mips/pgalloc.h | 2 +- include/asm-mips64/pgalloc.h | 2 +- include/asm-parisc/pgalloc.h | 4 ++-- include/asm-ppc64/pgalloc.h | 14 +++----------- include/asm-s390/pgalloc.h | 17 +++++------------ include/asm-sh/pgalloc.h | 2 +- include/asm-x86_64/pgalloc.h | 4 ++-- 17 files changed, 51 insertions(+), 130 deletions(-) diff --git a/arch/alpha/mm/init.c b/arch/alpha/mm/init.c index bbf7417506c1..d310bcc70d54 100644 --- a/arch/alpha/mm/init.c +++ b/arch/alpha/mm/init.c @@ -66,19 +66,9 @@ pgd_alloc(struct mm_struct *mm) pte_t * pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) { - pte_t *pte; - long timeout = 10; - - retry: - pte = (pte_t *) __get_free_page(GFP_KERNEL); + pte_t *pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT); if (pte) clear_page(pte); - else if (--timeout >= 0) { - current->state = TASK_UNINTERRUPTIBLE; - schedule_timeout(HZ); - goto retry; - } - return pte; } diff --git a/arch/i386/mm/pgtable.c b/arch/i386/mm/pgtable.c index 054eec2afc35..9d36261de7d0 100644 --- a/arch/i386/mm/pgtable.c +++ b/arch/i386/mm/pgtable.c @@ -131,39 +131,23 @@ void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t flags) pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) { - int count = 0; - pte_t *pte; - - do { - pte = (pte_t *) __get_free_page(GFP_KERNEL); - if (pte) - clear_page(pte); - else { - current->state = TASK_UNINTERRUPTIBLE; - schedule_timeout(HZ); - } - } while (!pte && (count++ < 10)); + pte_t *pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT); + if (pte) + clear_page(pte); return pte; } struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address) { - int count = 0; struct page *pte; - - do { + #if CONFIG_HIGHPTE - pte = alloc_pages(GFP_KERNEL | __GFP_HIGHMEM, 0); + pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT, 0); #else - pte = alloc_pages(GFP_KERNEL, 0); + pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT, 0); #endif - 
if (pte) - clear_highpage(pte); - else { - current->state = TASK_UNINTERRUPTIBLE; - schedule_timeout(HZ); - } - } while (!pte && (count++ < 10)); + if (pte) + clear_highpage(pte); return pte; } diff --git a/arch/ppc/mm/pgtable.c b/arch/ppc/mm/pgtable.c index 9682525026f9..5d4aef7ab895 100644 --- a/arch/ppc/mm/pgtable.c +++ b/arch/ppc/mm/pgtable.c @@ -76,15 +76,11 @@ pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) extern void *early_get_page(void); int timeout = 0; - if (mem_init_done) { - while ((pte = (pte_t *) __get_free_page(GFP_KERNEL)) == NULL - && ++timeout < 10) { - set_current_state(TASK_UNINTERRUPTIBLE); - schedule_timeout(HZ); - } - } else - pte = (pte_t *) early_get_page(); - if (pte != NULL) + if (mem_init_done) + pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT); + else + pte = (pte_t *)early_get_page(); + if (pte) clear_page(pte); return pte; } @@ -92,20 +88,16 @@ pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address) { struct page *pte; - int timeout = 0; + #ifdef CONFIG_HIGHPTE - int flags = GFP_KERNEL | __GFP_HIGHMEM; + int flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_REPEAT; #else - int flags = GFP_KERNEL; + int flags = GFP_KERNEL | __GFP_REPEAT; #endif - while ((pte = alloc_pages(flags, 0)) == NULL) { - if (++timeout >= 10) - return NULL; - set_current_state(TASK_UNINTERRUPTIBLE); - schedule_timeout(HZ); - } - clear_highpage(pte); + pte = alloc_pages(flags, 0); + if (pte) + clear_highpage(pte); return pte; } diff --git a/arch/sparc/mm/sun4c.c b/arch/sparc/mm/sun4c.c index 9cda5ee63d98..e4da22250441 100644 --- a/arch/sparc/mm/sun4c.c +++ b/arch/sparc/mm/sun4c.c @@ -1901,7 +1901,7 @@ static pte_t *sun4c_pte_alloc_one_kernel(struct mm_struct *mm, unsigned long add if ((pte = sun4c_pte_alloc_one_fast(mm, address)) != NULL) return pte; - pte = (pte_t *)__get_free_page(GFP_KERNEL); + pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT); if (pte) memset(pte, 0, PAGE_SIZE); return pte; diff --git a/arch/um/kernel/mem.c b/arch/um/kernel/mem.c index 2e7199293856..d0c24d48071e 100644 --- a/arch/um/kernel/mem.c +++ b/arch/um/kernel/mem.c @@ -810,35 +810,21 @@ void pgd_free(pgd_t *pgd) pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) { - int count = 0; pte_t *pte; - do { - pte = (pte_t *) __get_free_page(GFP_KERNEL); - if (pte) - clear_page(pte); - else { - current->state = TASK_UNINTERRUPTIBLE; - schedule_timeout(HZ); - } - } while (!pte && (count++ < 10)); + pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT); + if (pte) + clear_page(pte); return pte; } struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address) { - int count = 0; struct page *pte; - do { - pte = alloc_pages(GFP_KERNEL, 0); - if (pte) - clear_highpage(pte); - else { - current->state = TASK_UNINTERRUPTIBLE; - schedule_timeout(HZ); - } - } while (!pte && (count++ < 10)); + pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT, 0); + if (pte) + clear_highpage(pte); return pte; } diff --git a/include/asm-arm/proc-armv/pgalloc.h b/include/asm-arm/proc-armv/pgalloc.h index 4440be79d5ac..3263c346ccba 100644 --- a/include/asm-arm/proc-armv/pgalloc.h +++ b/include/asm-arm/proc-armv/pgalloc.h @@ -27,17 +27,9 @@ static inline pte_t * pte_alloc_one_kernel(struct mm_struct *mm, unsigned long addr) { - int count = 0; pte_t *pte; - do { - pte = (pte_t *)__get_free_page(GFP_KERNEL); - if (!pte) { - current->state = TASK_UNINTERRUPTIBLE; - schedule_timeout(HZ); - } - } while (!pte && 
(count++ < 10)); - + pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT); if (pte) { clear_page(pte); clean_dcache_area(pte, sizeof(pte_t) * PTRS_PER_PTE); @@ -51,16 +43,8 @@ static inline struct page * pte_alloc_one(struct mm_struct *mm, unsigned long addr) { struct page *pte; - int count = 0; - - do { - pte = alloc_pages(GFP_KERNEL, 0); - if (!pte) { - current->state = TASK_UNINTERRUPTIBLE; - schedule_timeout(HZ); - } - } while (!pte && (count++ < 10)); + pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT, 0); if (pte) { void *page = page_address(pte); clear_page(page); diff --git a/include/asm-cris/pgalloc.h b/include/asm-cris/pgalloc.h index 80e73be0d2b0..75dde6f4a42f 100644 --- a/include/asm-cris/pgalloc.h +++ b/include/asm-cris/pgalloc.h @@ -62,7 +62,7 @@ static inline pte_t *pte_alloc_one(struct mm_struct *mm, unsigned long address) { pte_t *pte; - pte = (pte_t *) __get_free_page(GFP_KERNEL); + pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT); if (pte) clear_page(pte); return pte; diff --git a/include/asm-ia64/pgalloc.h b/include/asm-ia64/pgalloc.h index 2e6134af88bc..00847acab313 100644 --- a/include/asm-ia64/pgalloc.h +++ b/include/asm-ia64/pgalloc.h @@ -125,7 +125,7 @@ pmd_populate_kernel (struct mm_struct *mm, pmd_t *pmd_entry, pte_t *pte) static inline struct page * pte_alloc_one (struct mm_struct *mm, unsigned long addr) { - struct page *pte = alloc_pages(GFP_KERNEL, 0); + struct page *pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT, 0); if (likely(pte != NULL)) clear_page(page_address(pte)); @@ -135,7 +135,7 @@ pte_alloc_one (struct mm_struct *mm, unsigned long addr) static inline pte_t * pte_alloc_one_kernel (struct mm_struct *mm, unsigned long addr) { - pte_t *pte = (pte_t *) __get_free_page(GFP_KERNEL); + pte_t *pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT); if (likely(pte != NULL)) clear_page(pte); diff --git a/include/asm-m68k/motorola_pgalloc.h b/include/asm-m68k/motorola_pgalloc.h index 4beb7a822b38..f315615e488a 100644 --- a/include/asm-m68k/motorola_pgalloc.h +++ b/include/asm-m68k/motorola_pgalloc.h @@ -11,7 +11,7 @@ static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long ad { pte_t *pte; - pte = (pte_t *) __get_free_page(GFP_KERNEL); + pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT); if (pte) { clear_page(pte); __flush_page_to_ram(pte); @@ -30,7 +30,7 @@ static inline void pte_free_kernel(pte_t *pte) static inline struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address) { - struct page *page = alloc_pages(GFP_KERNEL, 0); + struct page *page = alloc_pages(GFP_KERNEL|__GFP_REPEAT, 0); pte_t *pte; if(!page) diff --git a/include/asm-m68k/sun3_pgalloc.h b/include/asm-m68k/sun3_pgalloc.h index 7740a2936511..3b7f6cc2d38c 100644 --- a/include/asm-m68k/sun3_pgalloc.h +++ b/include/asm-m68k/sun3_pgalloc.h @@ -39,7 +39,7 @@ static inline void __pte_free_tlb(struct mmu_gather *tlb, struct page *page) static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) { - unsigned long page = __get_free_page(GFP_KERNEL); + unsigned long page = __get_free_page(GFP_KERNEL|__GFP_REPEAT); if (!page) return NULL; @@ -51,7 +51,7 @@ static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, static inline struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address) { - struct page *page = alloc_pages(GFP_KERNEL, 0); + struct page *page = alloc_pages(GFP_KERNEL|__GFP_REPEAT, 0); if (page == NULL) return NULL; diff --git a/include/asm-mips/pgalloc.h b/include/asm-mips/pgalloc.h index 
9492a50dae76..f71b90b1c8e1 100644 --- a/include/asm-mips/pgalloc.h +++ b/include/asm-mips/pgalloc.h @@ -132,7 +132,7 @@ static inline pte_t *pte_alloc_one(struct mm_struct *mm, unsigned long address) { pte_t *pte; - pte = (pte_t *) __get_free_page(GFP_KERNEL); + pte = (pte_t *) __get_free_page(GFP_KERNEL|__GFP_REPEAT); if (pte) clear_page(pte); return pte; diff --git a/include/asm-mips64/pgalloc.h b/include/asm-mips64/pgalloc.h index 79b58408d660..a3113072d611 100644 --- a/include/asm-mips64/pgalloc.h +++ b/include/asm-mips64/pgalloc.h @@ -93,7 +93,7 @@ static inline pte_t *pte_alloc_one(struct mm_struct *mm, unsigned long address) { pte_t *pte; - pte = (pte_t *) __get_free_page(GFP_KERNEL); + pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT); if (pte) clear_page(pte); return pte; diff --git a/include/asm-parisc/pgalloc.h b/include/asm-parisc/pgalloc.h index 32dcf11d084c..14c8605f46a5 100644 --- a/include/asm-parisc/pgalloc.h +++ b/include/asm-parisc/pgalloc.h @@ -73,7 +73,7 @@ pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, pte_t *pte) static inline struct page * pte_alloc_one(struct mm_struct *mm, unsigned long address) { - struct page *page = alloc_page(GFP_KERNEL); + struct page *page = alloc_page(GFP_KERNEL|__GFP_REPEAT); if (likely(page != NULL)) clear_page(page_address(page)); return page; @@ -82,7 +82,7 @@ pte_alloc_one(struct mm_struct *mm, unsigned long address) static inline pte_t * pte_alloc_one_kernel(struct mm_struct *mm, unsigned long addr) { - pte_t *pte = (pte_t *) __get_free_page(GFP_KERNEL); + pte_t *pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT); if (likely(pte != NULL)) clear_page(pte); return pte; diff --git a/include/asm-ppc64/pgalloc.h b/include/asm-ppc64/pgalloc.h index 0c461418bb48..40361c23d924 100644 --- a/include/asm-ppc64/pgalloc.h +++ b/include/asm-ppc64/pgalloc.h @@ -62,19 +62,11 @@ pmd_free(pmd_t *pmd) static inline pte_t * pte_alloc_one_kernel(struct mm_struct *mm, unsigned long addr) { - int count = 0; pte_t *pte; - do { - pte = (pte_t *)__get_free_page(GFP_KERNEL); - if (pte) - clear_page(pte); - else { - current->state = TASK_UNINTERRUPTIBLE; - schedule_timeout(HZ); - } - } while (!pte && (count++ < 10)); - + pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT); + if (pte) + clear_page(pte); return pte; } diff --git a/include/asm-s390/pgalloc.h b/include/asm-s390/pgalloc.h index 67230ef0e0c5..e4729fb912fd 100644 --- a/include/asm-s390/pgalloc.h +++ b/include/asm-s390/pgalloc.h @@ -120,20 +120,13 @@ static inline pte_t * pte_alloc_one_kernel(struct mm_struct *mm, unsigned long vmaddr) { pte_t *pte; - int count; int i; - count = 0; - do { - pte = (pte_t *) __get_free_page(GFP_KERNEL); - if (pte != NULL) { - for (i=0; i < PTRS_PER_PTE; i++) - pte_clear(pte+i); - } else { - current->state = TASK_UNINTERRUPTIBLE; - schedule_timeout(HZ); - } - } while (!pte && (count++ < 10)); + pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT); + if (pte != NULL) { + for (i=0; i < PTRS_PER_PTE; i++) + pte_clear(pte+i); + } return pte; } diff --git a/include/asm-sh/pgalloc.h b/include/asm-sh/pgalloc.h index 9cc5a7dc98ed..a60b4c961a4f 100644 --- a/include/asm-sh/pgalloc.h +++ b/include/asm-sh/pgalloc.h @@ -35,7 +35,7 @@ static inline void pgd_free(pgd_t *pgd) static inline pte_t *pte_alloc_one(struct mm_struct *mm, unsigned long address) { - pte_t *pte = (pte_t *) __get_free_page(GFP_KERNEL); + pte_t *pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT); if (pte) clear_page(pte); return pte; diff --git a/include/asm-x86_64/pgalloc.h 
b/include/asm-x86_64/pgalloc.h
index 4cae8e6a37a0..65b817702a16 100644
--- a/include/asm-x86_64/pgalloc.h
+++ b/include/asm-x86_64/pgalloc.h
@@ -48,12 +48,12 @@ static inline void pgd_free (pgd_t *pgd)

 static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
 {
-	return (pte_t *) get_zeroed_page(GFP_KERNEL);
+	return (pte_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
 }

 static inline struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
 {
-	void *p = (void *)get_zeroed_page(GFP_KERNEL);
+	void *p = (void *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
 	if (!p)
 		return NULL;
 	return virt_to_page(p);
-- cgit v1.2.3

From 36f6aa1b67f4387ef8d50ad060207df04f007ffd Mon Sep 17 00:00:00 2001
From: Andrew Morton
Date: Sun, 20 Apr 2003 00:28:32 -0700
Subject: [PATCH] use __GFP_REPEAT in pmd_alloc_one()

Convert all pmd_alloc_one() implementations to use __GFP_REPEAT
---
 arch/sparc/mm/sun4c.c           |  2 +-
 include/asm-alpha/pgalloc.h     |  2 +-
 include/asm-ia64/pgalloc.h      |  2 +-
 include/asm-m68k/sun3_pgalloc.h |  2 +-
 include/asm-mips64/pgalloc.h    |  2 +-
 include/asm-parisc/pgalloc.h    |  2 +-
 include/asm-ppc64/pgalloc.h     | 14 +++-----------
 include/asm-sparc64/pgalloc.h   |  2 +-
 include/asm-x86_64/pgalloc.h    |  4 ++--
 9 files changed, 12 insertions(+), 20 deletions(-)

diff --git a/arch/sparc/mm/sun4c.c b/arch/sparc/mm/sun4c.c
index e4da22250441..13495f4c418f 100644
--- a/arch/sparc/mm/sun4c.c
+++ b/arch/sparc/mm/sun4c.c
@@ -2194,7 +2194,7 @@ void __init ld_mmu_sun4c(void)
 	BTFIXUPSET_CALL(pte_alloc_one_kernel, sun4c_pte_alloc_one_kernel, BTFIXUPCALL_NORM);
 	BTFIXUPSET_CALL(pte_alloc_one, sun4c_pte_alloc_one, BTFIXUPCALL_NORM);
 	BTFIXUPSET_CALL(free_pmd_fast, sun4c_free_pmd_fast, BTFIXUPCALL_NOP);
-	BTFIXUPSET_CALL(pmd_alloc_one, sun4c_pmd_alloc_one, BTFIXUPCALL_RETO0);
+	BTFIXUPSET_CALL(pmd_alloc_one, sun4c_pmd_alloc_one, BTFIXUPCALL_RETO0);

 	BTFIXUPSET_CALL(free_pgd_fast, sun4c_free_pgd_fast, BTFIXUPCALL_NORM);
 	BTFIXUPSET_CALL(get_pgd_fast, sun4c_get_pgd_fast, BTFIXUPCALL_NORM);
diff --git a/include/asm-alpha/pgalloc.h b/include/asm-alpha/pgalloc.h
index fc675efac381..b34194c3d96c 100644
--- a/include/asm-alpha/pgalloc.h
+++ b/include/asm-alpha/pgalloc.h
@@ -40,7 +40,7 @@ pgd_free(pgd_t *pgd)
 static inline pmd_t *
 pmd_alloc_one(struct mm_struct *mm, unsigned long address)
 {
-	pmd_t *ret = (pmd_t *)__get_free_page(GFP_KERNEL);
+	pmd_t *ret = (pmd_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT);
 	if (ret)
 		clear_page(ret);
 	return ret;
diff --git a/include/asm-ia64/pgalloc.h b/include/asm-ia64/pgalloc.h
index 00847acab313..4f56ceb9c42b 100644
--- a/include/asm-ia64/pgalloc.h
+++ b/include/asm-ia64/pgalloc.h
@@ -93,7 +93,7 @@ pmd_alloc_one_fast (struct mm_struct *mm, unsigned long addr)
 static inline pmd_t*
 pmd_alloc_one (struct mm_struct *mm, unsigned long addr)
 {
-	pmd_t *pmd = (pmd_t *) __get_free_page(GFP_KERNEL);
+	pmd_t *pmd = (pmd_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT);

 	if (likely(pmd != NULL))
 		clear_page(pmd);
diff --git a/include/asm-m68k/sun3_pgalloc.h b/include/asm-m68k/sun3_pgalloc.h
index 3b7f6cc2d38c..4580b60196b9 100644
--- a/include/asm-m68k/sun3_pgalloc.h
+++ b/include/asm-m68k/sun3_pgalloc.h
@@ -18,7 +18,7 @@ extern const char bad_pmd_string[];

-#define pmd_alloc_one(mm,address)       ({ BUG(); ((pmd_t *)2); })
+#define pmd_alloc_one(mm,address)       ({ BUG(); ((pmd_t *)2); })


 static inline void pte_free_kernel(pte_t * pte)
diff --git a/include/asm-mips64/pgalloc.h b/include/asm-mips64/pgalloc.h
index a3113072d611..2b777eebcc31 100644
--- a/include/asm-mips64/pgalloc.h
+++
b/include/asm-mips64/pgalloc.h @@ -141,7 +141,7 @@ static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address) { pmd_t *pmd; - pmd = (pmd_t *) __get_free_pages(GFP_KERNEL, 1); + pmd = (pmd_t *)__get_free_pages(GFP_KERNEL|__GFP_REPEAT, 1); if (pmd) pmd_init((unsigned long)pmd, (unsigned long)invalid_pte_table); return pmd; diff --git a/include/asm-parisc/pgalloc.h b/include/asm-parisc/pgalloc.h index 14c8605f46a5..bbc02cb134b7 100644 --- a/include/asm-parisc/pgalloc.h +++ b/include/asm-parisc/pgalloc.h @@ -35,7 +35,7 @@ static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmd) static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address) { - pmd_t *pmd = (pmd_t *) __get_free_page(GFP_KERNEL); + pmd_t *pmd = (pmd_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT); if (pmd) clear_page(pmd); return pmd; diff --git a/include/asm-ppc64/pgalloc.h b/include/asm-ppc64/pgalloc.h index 40361c23d924..9376b791bee7 100644 --- a/include/asm-ppc64/pgalloc.h +++ b/include/asm-ppc64/pgalloc.h @@ -31,19 +31,11 @@ pgd_free(pgd_t *pgd) static inline pmd_t * pmd_alloc_one(struct mm_struct *mm, unsigned long addr) { - int count = 0; pmd_t *pmd; - do { - pmd = (pmd_t *)__get_free_page(GFP_KERNEL); - if (pmd) - clear_page(pmd); - else { - current->state = TASK_UNINTERRUPTIBLE; - schedule_timeout(HZ); - } - } while (!pmd && (count++ < 10)); - + pmd = (pmd_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT); + if (pmd) + clear_page(pmd); return pmd; } diff --git a/include/asm-sparc64/pgalloc.h b/include/asm-sparc64/pgalloc.h index 037c09b1a737..d3c3a7060664 100644 --- a/include/asm-sparc64/pgalloc.h +++ b/include/asm-sparc64/pgalloc.h @@ -159,7 +159,7 @@ static __inline__ pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addre pmd = pmd_alloc_one_fast(mm, address); if (!pmd) { - pmd = (pmd_t *)__get_free_page(GFP_KERNEL); + pmd = (pmd_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT); if (pmd) memset(pmd, 0, PAGE_SIZE); } diff --git a/include/asm-x86_64/pgalloc.h b/include/asm-x86_64/pgalloc.h index 65b817702a16..e15895c99a4e 100644 --- a/include/asm-x86_64/pgalloc.h +++ b/include/asm-x86_64/pgalloc.h @@ -31,12 +31,12 @@ extern __inline__ void pmd_free(pmd_t *pmd) static inline pmd_t *pmd_alloc_one (struct mm_struct *mm, unsigned long addr) { - return (pmd_t *) get_zeroed_page(GFP_KERNEL); + return (pmd_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT); } static inline pgd_t *pgd_alloc (struct mm_struct *mm) { - return (pgd_t *)get_zeroed_page(GFP_KERNEL); + return (pgd_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT); } static inline void pgd_free (pgd_t *pgd) -- cgit v1.2.3 From 464f4e7893c4a88b6bdf8c241358b91d4fc79942 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 20 Apr 2003 00:28:39 -0700 Subject: [PATCH] Disallow swapoff if there is insufficient memory From: Hugh Dickins First of three small "stop swapoff" patches based on 2.5.67-mm3: stop swapoff 1/3 vm_enough_memory? Before embarking upon swapoff, check vm_enough_memory. Mainly for consistency in the overcommit_memory 2 (strict accounting) case: fail with -ENOMEM if it wouldn't let the amount removed be committed. Will always succeed in the overcommit_memory 1 case, as it should in root-shoot-foot mode. In the overcommit_memory 0 case, well, I don't care much either way, so opted for the simplest code: no special case. Which means it could now fail at the start; but that's unlikely (case 0 is over-generous) and only when it would have got stuck later on anyway. 
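
The check-then-unaccount sequence in the diff below can look odd at first
sight; a sketch of the reasoning:

	/* vm_enough_memory() charges committed_space when it succeeds, so
	 * using it as a pure "would this commit fit?" probe means undoing
	 * the charge immediately; the lasting effect comes from
	 * total_swap_pages shrinking once the swap area is removed. */
	if (vm_enough_memory(p->pages))
		vm_unacct_memory(p->pages);	/* fits: drop the probe's charge */
	else
		err = -ENOMEM;			/* strict accounting says no */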
--- mm/swapfile.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index d646f7cb5340..6cc7a47b6eba 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -7,6 +7,7 @@ #include #include +#include #include #include #include @@ -1028,12 +1029,18 @@ asmlinkage long sys_swapoff(const char __user * specialfile) } prev = type; } - err = -EINVAL; if (type < 0) { + err = -EINVAL; + swap_list_unlock(); + goto out_dput; + } + if (vm_enough_memory(p->pages)) + vm_unacct_memory(p->pages); + else { + err = -ENOMEM; swap_list_unlock(); goto out_dput; } - if (prev < 0) { swap_list.head = p->next; } else { -- cgit v1.2.3 From 6bf11a468c63a09911e5c6bb02ad980684923c81 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 20 Apr 2003 00:28:45 -0700 Subject: [PATCH] Permit interruption of swapoff From: Hugh Dickins Sometimes you start a swapoff and, seeing how long it takes, wish you had not: allow signal to interrupt and stop swapoff. --- mm/swapfile.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index 6cc7a47b6eba..144915b74734 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -590,6 +590,11 @@ static int try_to_unuse(unsigned int type) * to swapoff for a while, then reappear - but that is rare. */ while ((i = find_next_to_unuse(si, i))) { + if (signal_pending(current)) { + retval = -EINTR; + break; + } + /* * Get a page for the entry, using the existing swap * cache page if there is one. Otherwise, get a clean @@ -759,8 +764,7 @@ static int try_to_unuse(unsigned int type) /* * Make sure that we aren't completely killing - * interactive performance. Interruptible check on - * signal_pending() would be nice, but changes the spec? + * interactive performance. */ cond_resched(); } -- cgit v1.2.3 From cca095e03401e4a1c6cd1a747d8e278986b9d6a4 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 20 Apr 2003 00:28:52 -0700 Subject: [PATCH] oom-kill: preferentially kill swapoff From: Hugh Dickins The current behaviour is that once swapoff has filled memory, other tasks get OOMkilled one by one until swapoff completes, or more likely hangs. It is better that swapoff be the first choice for OOMkill. The patch changes the oom-killer so that it will kill off any currently-running swapoff instance before killing any other task. 
(Bit kludgy, couldn't think of a better way) --- include/linux/sched.h | 1 + mm/oom_kill.c | 2 ++ mm/swapfile.c | 2 ++ 3 files changed, 5 insertions(+) diff --git a/include/linux/sched.h b/include/linux/sched.h index c30c44f3cfcc..eee58c7354b4 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -465,6 +465,7 @@ do { if (atomic_dec_and_test(&(tsk)->usage)) __put_task_struct(tsk); } while(0) #define PF_FROZEN 0x00010000 /* frozen for system suspend */ #define PF_FSTRANS 0x00020000 /* inside a filesystem transaction */ #define PF_KSWAPD 0x00040000 /* I am kswapd */ +#define PF_SWAPOFF 0x00080000 /* I am in swapoff */ #if CONFIG_SMP extern void set_cpus_allowed(task_t *p, unsigned long new_mask); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 34a3aeb50799..e537462aaf58 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -129,6 +129,8 @@ static struct task_struct * select_bad_process(void) chosen = p; maxpoints = points; } + if (p->flags & PF_SWAPOFF) + return p; } while_each_thread(g, p); return chosen; diff --git a/mm/swapfile.c b/mm/swapfile.c index 144915b74734..48ffb627914d 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1058,7 +1058,9 @@ asmlinkage long sys_swapoff(const char __user * specialfile) total_swap_pages -= p->pages; p->flags &= ~SWP_WRITEOK; swap_list_unlock(); + current->flags |= PF_SWAPOFF; err = try_to_unuse(type); + current->flags &= ~PF_SWAPOFF; if (err) { /* re-insert swap space back into swap_list */ swap_list_lock(); -- cgit v1.2.3 From d7b557d1e7b8069a0ce6125a774cf9264901a957 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 20 Apr 2003 00:28:59 -0700 Subject: [PATCH] DAC960: add call to blk_queue_bounce_limit From: Dave Olien The following patch adds a call to blk_queue_bounce_limit to the DAC960 driver. Otherwise, it uses bounce buffering more than it needs to. 
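
The fix itself is small; a sketch of the pairing (identifiers as they appear
in the diff below):

	/* Tell the block layer the controller's DMA reach, matching the
	 * mask given to the PCI layer, so pages below the limit are never
	 * bounced needlessly. */
	Controller->BounceBufferLimit = DAC690_V1_PciDmaMask;
	/* ... later, while initialising the request queue: */
	blk_queue_bounce_limit(RequestQueue, Controller->BounceBufferLimit);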
--- drivers/block/DAC960.c | 3 +++ drivers/block/DAC960.h | 6 +----- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/block/DAC960.c b/drivers/block/DAC960.c index 9362b6cb01eb..8ffb84e6db80 100644 --- a/drivers/block/DAC960.c +++ b/drivers/block/DAC960.c @@ -1069,6 +1069,7 @@ static boolean DAC960_V1_EnableMemoryMailboxInterface(DAC960_Controller_T if (pci_set_dma_mask(Controller->PCIDevice, DAC690_V1_PciDmaMask)) return DAC960_Failure(Controller, "DMA mask out of range"); + Controller->BounceBufferLimit = DAC690_V1_PciDmaMask; if ((hw_type == DAC960_PD_Controller) || (hw_type == DAC960_P_Controller)) { CommandMailboxesSize = 0; @@ -1271,6 +1272,7 @@ static boolean DAC960_V2_EnableMemoryMailboxInterface(DAC960_Controller_T if (pci_set_dma_mask(Controller->PCIDevice, DAC690_V2_PciDmaMask)) return DAC960_Failure(Controller, "DMA mask out of range"); + Controller->BounceBufferLimit = DAC690_V2_PciDmaMask; /* This is a temporary dma mapping, used only in the scope of this function */ CommandMailbox = @@ -2386,6 +2388,7 @@ static boolean DAC960_RegisterBlockDevice(DAC960_Controller_T *Controller) */ RequestQueue = &Controller->RequestQueue; blk_init_queue(RequestQueue, DAC960_RequestFunction, &Controller->queue_lock); + blk_queue_bounce_limit(RequestQueue, Controller->BounceBufferLimit); RequestQueue->queuedata = Controller; blk_queue_max_hw_segments(RequestQueue, Controller->DriverScatterGatherLimit); diff --git a/drivers/block/DAC960.h b/drivers/block/DAC960.h index f38145a54a67..01b543211870 100644 --- a/drivers/block/DAC960.h +++ b/drivers/block/DAC960.h @@ -62,11 +62,6 @@ /* Define the pci dma mask supported by DAC960 V1 and V2 Firmware Controlers - - For now set the V2 mask to only 32 bits. The controller IS capable - of doing 64 bit dma. But I have yet to find out whether this needs to - be explicitely enabled in the controller, or of the controller adapts - automatically. */ #define DAC690_V1_PciDmaMask 0xffffffff @@ -2370,6 +2365,7 @@ typedef struct DAC960_Controller unsigned short ControllerScatterGatherLimit; unsigned short DriverScatterGatherLimit; unsigned int ControllerUsageCount; + u64 BounceBufferLimit; unsigned int CombinedStatusBufferLength; unsigned int InitialStatusLength; unsigned int CurrentStatusLength; -- cgit v1.2.3 From 88bdd4c3d1faed465755e2b0f5b9c34352693ac5 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 20 Apr 2003 00:29:05 -0700 Subject: [PATCH] shm_get_stat-handle-hugetlb-pages.patch From: William Lee Irwin III shm_get_stat() didn't know about hugetlbpage-backed shm. 
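
Reduced to a sketch, the accounting change is: report hugepages in
small-page units, and never count them as swapped (hugetlb pages are not
swappable):

	if (is_file_hugepages(shp->shm_file))
		*rss += (HPAGE_SIZE / PAGE_SIZE) * inode->i_mapping->nrpages;
	else
		*rss += inode->i_mapping->nrpages;	/* plus info->swapped */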
---
 ipc/shm.c | 30 +++++++++++++++++++-----------
 1 file changed, 19 insertions(+), 11 deletions(-)

diff --git a/ipc/shm.c b/ipc/shm.c
index c822dc7872f5..19c06135b184 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -361,27 +361,35 @@ static inline unsigned long copy_shminfo_to_user(void *buf, struct shminfo64 *in
 	}
 }

-static void shm_get_stat (unsigned long *rss, unsigned long *swp)
+static void shm_get_stat(unsigned long *rss, unsigned long *swp)
 {
-	struct shmem_inode_info *info;
 	int i;

 	*rss = 0;
 	*swp = 0;

-	for(i = 0; i <= shm_ids.max_id; i++) {
-		struct shmid_kernel* shp;
-		struct inode * inode;
+	for (i = 0; i <= shm_ids.max_id; i++) {
+		struct shmid_kernel *shp;
+		struct inode *inode;

 		shp = shm_get(i);
-		if(shp == NULL)
+		if(!shp)
 			continue;
+
 		inode = shp->shm_file->f_dentry->d_inode;
-		info = SHMEM_I(inode);
-		spin_lock (&info->lock);
-		*rss += inode->i_mapping->nrpages;
-		*swp += info->swapped;
-		spin_unlock (&info->lock);
+
+		if (is_file_hugepages(shp->shm_file)) {
+			struct address_space *mapping = inode->i_mapping;
+			spin_lock(&mapping->page_lock);
+			*rss += (HPAGE_SIZE/PAGE_SIZE)*mapping->nrpages;
+			spin_unlock(&mapping->page_lock);
+		} else {
+			struct shmem_inode_info *info = SHMEM_I(inode);
+			spin_lock(&info->lock);
+			*rss += inode->i_mapping->nrpages;
+			*swp += info->swapped;
+			spin_unlock(&info->lock);
+		}
 	}
 }
-- cgit v1.2.3

From 5fb58500eec7e5afab139a6b3825c55fbbeff48c Mon Sep 17 00:00:00 2001
From: Andrew Morton
Date: Sun, 20 Apr 2003 00:29:12 -0700
Subject: [PATCH] Allocate hd_structs dynamically

From: Badari Pulavarty

Here is the patch to allocate hd_struct dynamically as we find partitions.
There are 3 things I didn't like in the patch.

1) The patch allocates 15 pointers instead of 15 hd_structs (in case of
scsi).  I was really hoping to get rid of "15" and make it really dynamic
(in case we ever want to support more than 15 partitions per disk etc..).
I thought about making it a linked list, but blk_partition_remap() needs to
get to the hd_struct for a given partition every time we do IO.  So a
linked list would be bad - we really need direct access to the partition
info.

2) I had to add "partno" to hd_struct, since part_dev_read() used to
calculate the partition number from the address before.

3) kmalloc() failure in add_partition() will be silently ignored.

It saves 2048 bytes per disk.
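
One consequence worth spelling out: with part[] now an array of pointers,
every reader must tolerate a hole.  A sketch of the access pattern (the
error value varies by caller):

	struct hd_struct *p = disk->part[part - 1];
	if (!p)
		return -ENXIO;	/* slot empty until add_partition() runs */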
---
 drivers/block/cciss.c     |  9 ++++++---
 drivers/block/genhd.c     | 19 +++++++++++--------
 drivers/block/ioctl.c     | 11 ++++++++---
 drivers/block/ll_rw_blk.c |  2 +-
 fs/block_dev.c            |  4 ++--
 fs/partitions/check.c     | 17 ++++++++++++++---
 include/linux/genhd.h     |  4 ++--
 7 files changed, 44 insertions(+), 22 deletions(-)

diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index 8987b67272cd..f566e20e2094 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -599,9 +599,12 @@ static int cciss_ioctl(struct inode *inode, struct file *filep,
 		luninfo.num_opens = drv->usage_count;
 		luninfo.num_parts = 0;
 		/* count partitions 1 to 15 with sizes > 0 */
-		for(i=1; i < MAX_PART; i++)
-			if (disk->part[i].nr_sects != 0)
-				luninfo.num_parts++;
+		for(i=1; i < MAX_PART; i++) {
+			if (!disk->part[i])
+				continue;
+			if (disk->part[i]->nr_sects != 0)
+				luninfo.num_parts++;
+		}
 		if (copy_to_user((void *) arg, &luninfo,
 				sizeof(LogvolInfo_struct)))
 			return -EFAULT;
diff --git a/drivers/block/genhd.c b/drivers/block/genhd.c
index 032739646da9..af193deee80c 100644
--- a/drivers/block/genhd.c
+++ b/drivers/block/genhd.c
@@ -365,11 +365,13 @@ static int show_partition(struct seq_file *part, void *v)
 			(unsigned long long)get_capacity(sgp) >> 1,
 			disk_name(sgp, 0, buf));
 	for (n = 0; n < sgp->minors - 1; n++) {
-		if (sgp->part[n].nr_sects == 0)
+		if (!sgp->part[n])
+			continue;
+		if (sgp->part[n]->nr_sects == 0)
 			continue;
 		seq_printf(part, "%4d %4d %10llu %s\n",
 			sgp->major, n + 1 + sgp->first_minor,
-			(unsigned long long)sgp->part[n].nr_sects >> 1 ,
+			(unsigned long long)sgp->part[n]->nr_sects >> 1 ,
 			disk_name(sgp, n + 1, buf));
 	}

@@ -552,7 +554,7 @@ struct gendisk *alloc_disk(int minors)
 		return NULL;
 	}
 	if (minors > 1) {
-		int size = (minors - 1) * sizeof(struct hd_struct);
+		int size = (minors - 1) * sizeof(struct hd_struct *);
 		disk->part = kmalloc(size, GFP_KERNEL);
 		if (!disk->part) {
 			kfree(disk);
@@ -604,8 +606,8 @@ void set_device_ro(struct block_device *bdev, int flag)
 	struct gendisk *disk = bdev->bd_disk;
 	if (bdev->bd_contains != bdev) {
 		int part = bdev->bd_dev - MKDEV(disk->major, disk->first_minor);
-		struct hd_struct *p = &disk->part[part-1];
-		p->policy = flag;
+		struct hd_struct *p = disk->part[part-1];
+		if (p) p->policy = flag;
 	} else
 		disk->policy = flag;
 }
@@ -615,7 +617,7 @@ void set_disk_ro(struct gendisk *disk, int flag)
 	int i;
 	disk->policy = flag;
 	for (i = 0; i < disk->minors - 1; i++)
-		disk->part[i].policy = flag;
+		if (disk->part[i]) disk->part[i]->policy = flag;
 }

 int bdev_read_only(struct block_device *bdev)
@@ -626,8 +628,9 @@ int bdev_read_only(struct block_device *bdev)
 	disk = bdev->bd_disk;
 	if (bdev->bd_contains != bdev) {
 		int part = bdev->bd_dev - MKDEV(disk->major, disk->first_minor);
-		struct hd_struct *p = &disk->part[part-1];
-		return p->policy;
+		struct hd_struct *p = disk->part[part-1];
+		if (p) return p->policy;
+		return 0;
 	} else
 		return disk->policy;
 }
diff --git a/drivers/block/ioctl.c b/drivers/block/ioctl.c
index 538c8a04a2d3..3dbd0824319b 100644
--- a/drivers/block/ioctl.c
+++ b/drivers/block/ioctl.c
@@ -41,11 +41,14 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg *arg)
 				return -EINVAL;
 			}
 			/* partition number in use? */
-			if (disk->part[part - 1].nr_sects != 0)
+			if (disk->part[part - 1])
 				return -EBUSY;
 			/* overlap?
*/ for (i = 0; i < disk->minors - 1; i++) { - struct hd_struct *s = &disk->part[i]; + struct hd_struct *s = disk->part[i]; + + if (!s) + continue; if (!(start+length <= s->start_sect || start >= s->start_sect + s->nr_sects)) return -EBUSY; @@ -54,7 +57,9 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg *arg) add_partition(disk, part, start, length); return 0; case BLKPG_DEL_PARTITION: - if (disk->part[part - 1].nr_sects == 0) + if (!disk->part[part-1]) + return -ENXIO; + if (disk->part[part - 1]->nr_sects == 0) return -ENXIO; /* partition in use? Incomplete check for now. */ bdevp = bdget(MKDEV(disk->major, disk->first_minor) + part); diff --git a/drivers/block/ll_rw_blk.c b/drivers/block/ll_rw_blk.c index e14210308577..9e2fd26ce0ed 100644 --- a/drivers/block/ll_rw_blk.c +++ b/drivers/block/ll_rw_blk.c @@ -1841,7 +1841,7 @@ static inline void blk_partition_remap(struct bio *bio) if (bdev == bdev->bd_contains) return; - p = &disk->part[bdev->bd_dev-MKDEV(disk->major,disk->first_minor)-1]; + p = disk->part[bdev->bd_dev-MKDEV(disk->major,disk->first_minor)-1]; switch (bio->bi_rw) { case READ: p->read_sectors += bio_sectors(bio); diff --git a/fs/block_dev.c b/fs/block_dev.c index 948864b885b5..9a974170a10b 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -559,10 +559,10 @@ static int do_open(struct block_device *bdev, struct inode *inode, struct file * bdev->bd_contains = whole; down(&whole->bd_sem); whole->bd_part_count++; - p = disk->part + part - 1; + p = disk->part[part - 1]; bdev->bd_inode->i_data.backing_dev_info = whole->bd_inode->i_data.backing_dev_info; - if (!(disk->flags & GENHD_FL_UP) || !p->nr_sects) { + if (!(disk->flags & GENHD_FL_UP) || !p || !p->nr_sects) { whole->bd_part_count--; up(&whole->bd_sem); ret = -ENXIO; diff --git a/fs/partitions/check.c b/fs/partitions/check.c index 647f0357e30c..aa0646e44598 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -182,7 +182,7 @@ static struct sysfs_ops part_sysfs_ops = { static ssize_t part_dev_read(struct hd_struct * p, char *page) { struct gendisk *disk = container_of(p->kobj.parent,struct gendisk,kobj); - int part = p - disk->part + 1; + int part = p->partno; dev_t base = MKDEV(disk->major, disk->first_minor); return sprintf(page, "%04x\n", (unsigned)(base + part)); } @@ -234,7 +234,9 @@ struct kobj_type ktype_part = { void delete_partition(struct gendisk *disk, int part) { - struct hd_struct *p = disk->part + part - 1; + struct hd_struct *p = disk->part[part-1]; + if (!p) + return; if (!p->nr_sects) return; p->start_sect = 0; @@ -242,14 +244,23 @@ void delete_partition(struct gendisk *disk, int part) p->reads = p->writes = p->read_sectors = p->write_sectors = 0; devfs_remove("%s/part%d", disk->devfs_name, part); kobject_unregister(&p->kobj); + disk->part[part-1] = NULL; + kfree(p); } void add_partition(struct gendisk *disk, int part, sector_t start, sector_t len) { - struct hd_struct *p = disk->part + part - 1; + struct hd_struct *p; + p = kmalloc(sizeof(*p), GFP_KERNEL); + if (!p) + return; + + memset(p, 0, sizeof(*p)); p->start_sect = start; p->nr_sects = len; + p->partno = part; + disk->part[part-1] = p; devfs_register_partition(disk, part); snprintf(p->kobj.name,KOBJ_NAME_LEN,"%s%d",disk->kobj.name,part); p->kobj.parent = &disk->kobj; diff --git a/include/linux/genhd.h b/include/linux/genhd.h index c2432bd349e5..ac8fc9ef5bdb 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -64,7 +64,7 @@ struct hd_struct { sector_t nr_sects; struct kobject kobj; unsigned reads, 
read_sectors, writes, write_sectors; - int policy; + int policy, partno; }; #define GENHD_FL_REMOVABLE 1 @@ -89,7 +89,7 @@ struct gendisk { int minor_shift; /* number of times minor is shifted to get real minor */ char disk_name[16]; /* name of major driver */ - struct hd_struct *part; /* [indexed by minor] */ + struct hd_struct **part; /* [indexed by minor] */ struct block_device_operations *fops; struct request_queue *queue; void *private_data; -- cgit v1.2.3 From 4a6b60f25d68965452d60c2a77cccefffd8652b3 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 20 Apr 2003 00:29:19 -0700 Subject: [PATCH] fix CONFIG_NOMMU mismerges From: Christoph Hellwig we already have better stubs in nommu.c, the additional inlines in mm.h only cause compile failures. --- include/linux/mm.h | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index ede6c5ff4181..6aa89d73f65b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -594,28 +594,10 @@ extern struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned lon extern unsigned int nr_used_zone_pages(void); -#ifdef CONFIG_MMU extern struct page * vmalloc_to_page(void *addr); extern struct page * follow_page(struct mm_struct *mm, unsigned long address, int write); extern int remap_page_range(struct vm_area_struct *vma, unsigned long from, unsigned long to, unsigned long size, pgprot_t prot); -#else -static inline struct page * vmalloc_to_page(void *addr) -{ - return NULL; -} -static inline struct page * follow_page(struct mm_struct *mm, - unsigned long address, int write) -{ - return NULL; -} -static inline int remap_page_range(struct vm_area_struct *vma, - unsigned long from, unsigned long to, - unsigned long size, pgprot_t prot) -{ - return -EPERM; -} -#endif /* CONFIG_MMU */ #endif /* __KERNEL__ */ #endif /* _LINUX_MM_H */ -- cgit v1.2.3 From 2096040f5178cfc109cb8a529bd23dba77384f97 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 20 Apr 2003 00:29:25 -0700 Subject: [PATCH] Extend map_vm_area()/get_vm_area() From: Christoph Hellwig and David M-T The ia64 port can use vmap(), but needs to be able to specify the protection flags and the resulting vma's vm_flags. The patch adds the two extra args to vmap(), updates the two callers and fixes some comment spellos. 
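For illustration, a minimal sketch of a caller on the extended interface (the wrapper name and the pages/npages variables are invented for the example, not part of the patch). VM_MAP plus PAGE_KERNEL reproduces the old two-argument behaviour, which is exactly what the two updated in-tree callers pass:

	#include <linux/vmalloc.h>

	/* Map an already-allocated page array with the new vmap() signature.
	 * VM_MAP/PAGE_KERNEL matches what vmap(pages, count) used to do; a
	 * port such as ia64 can now pass its own vm_flags and pgprot instead. */
	static void *map_pages_sketch(struct page **pages, unsigned int npages)
	{
		return vmap(pages, npages, VM_MAP, PAGE_KERNEL);
	}

Tear-down is unchanged: vunmap() on the returned address.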
--- fs/xfs/pagebuf/page_buf.c | 3 ++- include/linux/vmalloc.h | 3 ++- mm/vmalloc.c | 29 ++++++++++++++++------------- sound/core/sgbuf.c | 2 +- 4 files changed, 21 insertions(+), 16 deletions(-) diff --git a/fs/xfs/pagebuf/page_buf.c b/fs/xfs/pagebuf/page_buf.c index f50803bd2570..2230380e952f 100644 --- a/fs/xfs/pagebuf/page_buf.c +++ b/fs/xfs/pagebuf/page_buf.c @@ -554,7 +554,8 @@ mapit: } else if (flags & PBF_MAPPED) { if (as_list_len > 64) purge_addresses(); - pb->pb_addr = vmap(pb->pb_pages, page_count); + pb->pb_addr = vmap(pb->pb_pages, page_count, + VM_MAP, PAGE_KERNEL); if (pb->pb_addr == NULL) return -ENOMEM; pb->pb_addr += pb->pb_offset; diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index d90763253759..570778ddeae9 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -27,7 +27,8 @@ extern void *vmalloc_32(unsigned long size); extern void *__vmalloc(unsigned long size, int gfp_mask, pgprot_t prot); extern void vfree(void *addr); -extern void *vmap(struct page **pages, unsigned int count); +extern void *vmap(struct page **pages, unsigned int count, + unsigned long flags, pgprot_t prot); extern void vunmap(void *addr); /* diff --git a/mm/vmalloc.c b/mm/vmalloc.c index a6423eebcd5d..f6ce2378b721 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -308,7 +308,7 @@ void __vunmap(void *addr, int deallocate_pages) * * @addr: memory base address * - * Free the virtually continguos memory area starting at @addr, as + * Free the virtually contiguous memory area starting at @addr, as * obtained from vmalloc(), vmalloc_32() or __vmalloc(). * * May not be called in interrupt context. @@ -324,7 +324,7 @@ void vfree(void *addr) * * @addr: memory base address * - * Free the virtually continguos memory area starting at @addr, + * Free the virtually contiguous memory area starting at @addr, * which was created from the page array passed to vmap(). * * May not be called in interrupt context. @@ -336,25 +336,28 @@ void vunmap(void *addr) } /** - * vmap - map an array of pages into virtually continguos space + * vmap - map an array of pages into virtually contiguous space * * @pages: array of page pointers * @count: number of pages to map + * @flags: vm_area->flags + * @prot: page protection for the mapping * - * Maps @count pages from @pages into continguos kernel virtual + * Maps @count pages from @pages into contiguous kernel virtual * space. */ -void *vmap(struct page **pages, unsigned int count) +void *vmap(struct page **pages, unsigned int count, + unsigned long flags, pgprot_t prot) { struct vm_struct *area; if (count > num_physpages) return NULL; - area = get_vm_area((count << PAGE_SHIFT), VM_MAP); + area = get_vm_area((count << PAGE_SHIFT), flags); if (!area) return NULL; - if (map_vm_area(area, PAGE_KERNEL, &pages)) { + if (map_vm_area(area, prot, &pages)) { vunmap(area->addr); return NULL; } @@ -363,14 +366,14 @@ void *vmap(struct page **pages, unsigned int count) } /** - * __vmalloc - allocate virtually continguos memory + * __vmalloc - allocate virtually contiguous memory * * @size: allocation size * @gfp_mask: flags for the page level allocator * @prot: protection mask for the allocated pages * * Allocate enough pages to cover @size from the page level - * allocator with @gfp_mask flags. Map them into continguos + * allocator with @gfp_mask flags. Map them into contiguous * kernel virtual space, using a pagetable protection of @prot. 
*/ void *__vmalloc(unsigned long size, int gfp_mask, pgprot_t prot) @@ -418,12 +421,12 @@ fail: } /** - * vmalloc - allocate virtually continguos memory + * vmalloc - allocate virtually contiguous memory * * @size: allocation size * * Allocate enough pages to cover @size from the page level - * allocator and map them into continguos kernel virtual space. + * allocator and map them into contiguous kernel virtual space. * * For tight cotrol over page level allocator and protection flags * use __vmalloc() instead. @@ -434,12 +437,12 @@ void *vmalloc(unsigned long size) } /** - * vmalloc_32 - allocate virtually continguos memory (32bit addressable) + * vmalloc_32 - allocate virtually contiguous memory (32bit addressable) * * @size: allocation size * * Allocate enough 32bit PA addressable pages to cover @size from the - * page level allocator and map them into continguos kernel virtual space. + * page level allocator and map them into contiguous kernel virtual space. */ void *vmalloc_32(unsigned long size) { diff --git a/sound/core/sgbuf.c b/sound/core/sgbuf.c index 84e79ebc5c80..4578d2b335bf 100644 --- a/sound/core/sgbuf.c +++ b/sound/core/sgbuf.c @@ -85,7 +85,7 @@ void *snd_malloc_sgbuf_pages(struct pci_dev *pci, size_t size, struct snd_dma_bu } sgbuf->size = size; - dmab->area = vmap(sgbuf->page_table, sgbuf->pages); + dmab->area = vmap(sgbuf->page_table, sgbuf->pages, VM_MAP, PAGE_KERNEL); if (! dmab->area) goto _failed; return dmab->area; -- cgit v1.2.3 From 5a08774a70f10258ff8efc6fb0ab806045779432 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 20 Apr 2003 00:29:32 -0700 Subject: [PATCH] don't shrink slab for highmem allocations From: William Lee Irwin III If one's goal is to free highmem pages, shrink_slab() is an ineffective method of recovering them, as slab pages are all ZONE_NORMAL or ZONE_DMA. Hence, this "FIXME: do not do for zone highmem". Presumably this is a question of policy, as highmem allocations may be satisfied by reaping slab pages and handing them back; but the FIXME says what we should do. --- mm/vmscan.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index c1dffbdf2e41..aa24e1d1c693 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -135,11 +135,9 @@ void remove_shrinker(struct shrinker *shrinker) * If the vm encounted mapped pages on the LRU it increase the pressure on * slab to avoid swapping. * - * FIXME: do not do for zone highmem - * * We do weird things to avoid (scanned*seeks*entries) overflowing 32 bits. 
*/ -static int shrink_slab(long scanned, unsigned int gfp_mask) +static int shrink_slab(long scanned, unsigned int gfp_mask) { struct shrinker *shrinker; long pages; @@ -835,7 +833,8 @@ int try_to_free_pages(struct zone *classzone, /* Take a nap, wait for some writeback to complete */ blk_congestion_wait(WRITE, HZ/10); - shrink_slab(total_scanned, gfp_mask); + if (classzone - classzone->zone_pgdat->node_zones < ZONE_HIGHMEM) + shrink_slab(total_scanned, gfp_mask); } if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) out_of_memory(); @@ -895,7 +894,8 @@ static int balance_pgdat(pg_data_t *pgdat, int nr_pages, struct page_state *ps) max_scan = SWAP_CLUSTER_MAX; to_free -= shrink_zone(zone, max_scan, GFP_KERNEL, to_reclaim, &nr_mapped, ps, priority); - shrink_slab(max_scan + nr_mapped, GFP_KERNEL); + if (i < ZONE_HIGHMEM) + shrink_slab(max_scan + nr_mapped, GFP_KERNEL); if (zone->all_unreclaimable) continue; if (zone->pages_scanned > zone->present_pages * 2) -- cgit v1.2.3 From cc43a08abe942d4ea0b7c1041a4e1eb1df1e33be Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 20 Apr 2003 00:29:39 -0700 Subject: [PATCH] prepare device mapper for larger dev_t From: Joe Thornber The only other thing that will need changing in dm to cope with 64bit dev_t concerns the bitset I'm using to keep track of allocated minor numbers. A trivial patch like this would work for now: --- drivers/md/dm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 60bd488baece..18ead55a549a 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -15,7 +15,7 @@ #include static const char *_name = DM_NAME; -#define MAX_DEVICES (1 << KDEV_MINOR_BITS) +#define MAX_DEVICES 1024 static int major = 0; static int _major = 0; -- cgit v1.2.3 From f0d10803186f593da44a2cae5ffa40ff51fe21b6 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 20 Apr 2003 00:29:47 -0700 Subject: [PATCH] smbfs: larger dev_t preparation Discard fewer bits of the device number recd with smb. This does not depend on anything else. Andries --- fs/smbfs/proc.c | 10 +++++----- fs/smbfs/proto.h | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/smbfs/proc.c b/fs/smbfs/proc.c index 703d6324f417..5ecb4353994e 100644 --- a/fs/smbfs/proc.c +++ b/fs/smbfs/proc.c @@ -2085,7 +2085,6 @@ out: void smb_decode_unix_basic(struct smb_fattr *fattr, char *p) { /* FIXME: verify nls support. all is sent as utf8? 
*/ - __u64 devmajor, devminor; fattr->f_unix = 1; fattr->f_mode = 0; @@ -2112,9 +2111,10 @@ void smb_decode_unix_basic(struct smb_fattr *fattr, char *p) fattr->f_mode |= smb_filetype_to_mode(WVAL(p, 56)); if (S_ISBLK(fattr->f_mode) || S_ISCHR(fattr->f_mode)) { - devmajor = LVAL(p, 60); - devminor = LVAL(p, 68); - fattr->f_rdev = ((devmajor & 0xFF) << 8) | (devminor & 0xFF); + __u64 major = LVAL(p, 60); + __u64 minor = LVAL(p, 68); + + fattr->f_rdev = MKDEV(major & 0xffffffff, minor & 0xffffffff); } fattr->f_mode |= LVAL(p, 84); } @@ -3008,7 +3008,7 @@ out: */ int smb_proc_setattr_unix(struct dentry *d, struct iattr *attr, - int major, int minor) + unsigned int major, unsigned int minor) { struct smb_sb_info *server = server_from_dentry(d); u64 nttime; diff --git a/fs/smbfs/proto.h b/fs/smbfs/proto.h index 07b690eb8848..ec44bad0e84e 100644 --- a/fs/smbfs/proto.h +++ b/fs/smbfs/proto.h @@ -27,7 +27,7 @@ extern void smb_init_root_dirent(struct smb_sb_info *server, struct smb_fattr *f extern void smb_decode_unix_basic(struct smb_fattr *fattr, char *p); extern int smb_proc_getattr(struct dentry *dir, struct smb_fattr *fattr); extern int smb_proc_setattr(struct dentry *dir, struct smb_fattr *fattr); -extern int smb_proc_setattr_unix(struct dentry *d, struct iattr *attr, int major, int minor); +extern int smb_proc_setattr_unix(struct dentry *d, struct iattr *attr, unsigned int major, unsigned int minor); extern int smb_proc_settime(struct dentry *dentry, struct smb_fattr *fattr); extern int smb_proc_dskattr(struct super_block *sb, struct statfs *attr); extern int smb_proc_read_link(struct smb_sb_info *server, struct dentry *d, char *buffer, int len); -- cgit v1.2.3 From 36ba76bb4499ed3d2735240369fbd371aa8dd11b Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 20 Apr 2003 00:29:55 -0700 Subject: [PATCH] Fix nfsctl for larger dev_t From: Andries.Brouwer@cwi.nl The old NFS control interface passes dev_t's in from userspace. This patch keeps it working when the size of dev_t changes. This is a deprecated interface - new nfs-utils uses an ascii representation in exportfs. Acked by Neil. 
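For context: __kernel_old_dev_t pins ex_dev to the classic 16-bit encoding (8-bit major, 8-bit minor) that old nfs-utils binaries pass in, even once the kernel-internal dev_t grows. A small userspace sketch of that packing (illustrative only, not from the patch):

	#include <stdio.h>

	/* Classic 16-bit device number: bits 15..8 major, bits 7..0 minor. */
	static unsigned int old_major(unsigned short dev) { return (dev >> 8) & 0xff; }
	static unsigned int old_minor(unsigned short dev) { return dev & 0xff; }

	int main(void)
	{
		unsigned short dev = (8 << 8) | 1;	/* 8:1, e.g. sda1 */
		printf("%u:%u\n", old_major(dev), old_minor(dev));
		return 0;
	}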
--- arch/ia64/ia32/sys_ia32.c | 1 - arch/parisc/kernel/sys_parisc32.c | 2 +- arch/ppc64/kernel/sys_ppc32.c | 1 - arch/sparc64/kernel/sys_sparc32.c | 1 - arch/x86_64/ia32/sys_ia32.c | 1 - include/linux/nfsd/syscall.h | 3 +-- 6 files changed, 2 insertions(+), 7 deletions(-) diff --git a/arch/ia64/ia32/sys_ia32.c b/arch/ia64/ia32/sys_ia32.c index 84d34a212ac2..570b03908dd5 100644 --- a/arch/ia64/ia32/sys_ia32.c +++ b/arch/ia64/ia32/sys_ia32.c @@ -3040,7 +3040,6 @@ struct nfsctl_arg32 { #define ca32_svc u.u32_svc #define ca32_client u.u32_client #define ca32_export u.u32_export -#define ca32_authd u.u32_authd #define ca32_debug u.u32_debug }; diff --git a/arch/parisc/kernel/sys_parisc32.c b/arch/parisc/kernel/sys_parisc32.c index 63a85350d4ea..23a0afcf31a6 100644 --- a/arch/parisc/kernel/sys_parisc32.c +++ b/arch/parisc/kernel/sys_parisc32.c @@ -1131,7 +1131,7 @@ asmlinkage long sys32_msgrcv(int msqid, struct nfsctl_export32 { char ex_client[NFSCLNT_IDMAX+1]; char ex_path[NFS_MAXPATHLEN+1]; - __kernel_dev_t ex_dev; + __kernel_old_dev_t ex_dev; compat_ino_t ex_ino; int ex_flags; __kernel_uid_t ex_anon_uid; diff --git a/arch/ppc64/kernel/sys_ppc32.c b/arch/ppc64/kernel/sys_ppc32.c index fba8c8254455..300a93c9c42e 100644 --- a/arch/ppc64/kernel/sys_ppc32.c +++ b/arch/ppc64/kernel/sys_ppc32.c @@ -910,7 +910,6 @@ struct nfsctl_arg32 { #define ca32_export u.u32_export #define ca32_getfd u.u32_getfd #define ca32_getfs u.u32_getfs -#define ca32_authd u.u32_authd }; union nfsctl_res32 { diff --git a/arch/sparc64/kernel/sys_sparc32.c b/arch/sparc64/kernel/sys_sparc32.c index b1194401deba..ff9c8ec19b16 100644 --- a/arch/sparc64/kernel/sys_sparc32.c +++ b/arch/sparc64/kernel/sys_sparc32.c @@ -2133,7 +2133,6 @@ struct nfsctl_arg32 { #define ca32_export u.u32_export #define ca32_getfd u.u32_getfd #define ca32_getfs u.u32_getfs -#define ca32_authd u.u32_authd }; union nfsctl_res32 { diff --git a/arch/x86_64/ia32/sys_ia32.c b/arch/x86_64/ia32/sys_ia32.c index 292936f958f0..6712c9475d0b 100644 --- a/arch/x86_64/ia32/sys_ia32.c +++ b/arch/x86_64/ia32/sys_ia32.c @@ -1708,7 +1708,6 @@ struct nfsctl_arg32 { #define ca32_export u.u32_export #define ca32_getfd u.u32_getfd #define ca32_getfs u.u32_getfs -#define ca32_authd u.u32_authd }; union nfsctl_res32 { diff --git a/include/linux/nfsd/syscall.h b/include/linux/nfsd/syscall.h index 37b8901433d6..b6fa4d1839e3 100644 --- a/include/linux/nfsd/syscall.h +++ b/include/linux/nfsd/syscall.h @@ -59,7 +59,7 @@ struct nfsctl_client { struct nfsctl_export { char ex_client[NFSCLNT_IDMAX+1]; char ex_path[NFS_MAXPATHLEN+1]; - __kernel_dev_t ex_dev; + __kernel_old_dev_t ex_dev; __kernel_ino_t ex_ino; int ex_flags; __kernel_uid_t ex_anon_uid; @@ -104,7 +104,6 @@ struct nfsctl_arg { #define ca_export u.u_export #define ca_getfd u.u_getfd #define ca_getfs u.u_getfs -#define ca_authd u.u_authd }; union nfsctl_res { -- cgit v1.2.3 From 3422161186a46504895475568faa3d8126d8503c Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 20 Apr 2003 00:30:01 -0700 Subject: [PATCH] Aggregated disk statistics From: Rick Lindsley To access all the system's disk statistics we currently need to access one sysfs file per disk. This clearly will not be acceptable with thousands of disks. 
The patch aggregates the system-wide statistics in real time and exposes them via /proc/diskstats --- drivers/block/genhd.c | 86 +++++++++++++++++++++++++++++++++++++++++++++++++++ fs/proc/proc_misc.c | 13 ++++++++ 2 files changed, 99 insertions(+) diff --git a/drivers/block/genhd.c b/drivers/block/genhd.c index af193deee80c..b2eeafc81195 100644 --- a/drivers/block/genhd.c +++ b/drivers/block/genhd.c @@ -544,6 +544,92 @@ static struct kset_hotplug_ops block_hotplug_ops = { static decl_subsys(block, &ktype_block, &block_hotplug_ops); +/* + * aggregate disk stat collector. Uses the same stats that the sysfs + * entries do, above, but makes them available through one seq_file. + * Watching a few disks may be efficient through sysfs, but watching + * all of them will be more efficient through this interface. + * + * The output looks suspiciously like /proc/partitions with a bunch of + * extra fields. + */ + +/* iterator */ +static void *diskstats_start(struct seq_file *part, loff_t *pos) +{ + loff_t k = *pos; + struct list_head *p; + + down_read(&block_subsys.rwsem); + list_for_each(p, &block_subsys.kset.list) + if (!k--) + return list_entry(p, struct gendisk, kobj.entry); + return NULL; +} + +static void *diskstats_next(struct seq_file *part, void *v, loff_t *pos) +{ + struct list_head *p = ((struct gendisk *)v)->kobj.entry.next; + ++*pos; + return p==&block_subsys.kset.list ? NULL : + list_entry(p, struct gendisk, kobj.entry); +} + +static void diskstats_stop(struct seq_file *part, void *v) +{ + up_read(&block_subsys.rwsem); +} + +static int diskstats_show(struct seq_file *s, void *v) +{ + struct gendisk *gp = v; + char buf[64]; + int n = 0; + + /* + if (&sgp->kobj.entry == block_subsys.kset.list.next) + seq_puts(s, "major minor name" + " rio rmerge rsect ruse wio wmerge " + "wsect wuse running use aveq" + "\n\n"); + */ + + disk_round_stats(gp); + seq_printf(s, "%4d %4d %s %u %u %llu %u %u %u %llu %u %u %u %u\n", + gp->major, n + gp->first_minor, disk_name(gp, n, buf), + disk_stat_read(gp, reads), disk_stat_read(gp, read_merges), + (unsigned long long)disk_stat_read(gp, read_sectors), + jiffies_to_msec(disk_stat_read(gp, read_ticks)), + disk_stat_read(gp, writes), disk_stat_read(gp, write_merges), + (unsigned long long)disk_stat_read(gp, write_sectors), + jiffies_to_msec(disk_stat_read(gp, write_ticks)), + disk_stat_read(gp, in_flight), + jiffies_to_msec(disk_stat_read(gp, io_ticks)), + jiffies_to_msec(disk_stat_read(gp, time_in_queue))); + + /* now show all non-0 size partitions of it */ + for (n = 0; n < gp->minors - 1; n++) { + struct hd_struct *hd = gp->part[n]; + + if (hd && hd->nr_sects) + seq_printf(s, "%4d %4d %s %u %u %u %u\n", + gp->major, n + gp->first_minor + 1, + disk_name(gp, n + 1, buf), + hd->reads, hd->read_sectors, + hd->writes, hd->write_sectors); + } + + return 0; +} + +struct seq_operations diskstats_op = { + start: diskstats_start, + next: diskstats_next, + stop: diskstats_stop, + show: diskstats_show +}; + + struct gendisk *alloc_disk(int minors) { struct gendisk *disk = kmalloc(sizeof(struct gendisk), GFP_KERNEL); diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c index 60dd7b44e838..89fc02080f1e 100644 --- a/fs/proc/proc_misc.c +++ b/fs/proc/proc_misc.c @@ -333,6 +333,18 @@ static struct file_operations proc_partitions_operations = { .release = seq_release, }; +extern struct seq_operations diskstats_op; +static int diskstats_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &diskstats_op); +} +static struct file_operations 
proc_diskstats_operations = { open: diskstats_open, read: seq_read, llseek: seq_lseek, release: seq_release, }; #ifdef CONFIG_MODULES extern struct seq_operations modules_op; static int modules_open(struct inode *inode, struct file *file) @@ -644,6 +656,7 @@ void __init proc_misc_init(void) create_seq_entry("slabinfo",S_IWUSR|S_IRUGO,&proc_slabinfo_operations); create_seq_entry("buddyinfo",S_IRUGO, &fragmentation_file_operations); create_seq_entry("vmstat",S_IRUGO, &proc_vmstat_file_operations); + create_seq_entry("diskstats", 0, &proc_diskstats_operations); #ifdef CONFIG_MODULES create_seq_entry("modules", 0, &proc_modules_operations); #endif -- cgit v1.2.3 From b009a1c6a6c0273f60d96eca2561bc79da3b6614 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 20 Apr 2003 00:34:45 -0700 Subject: [PATCH] fbdev build fix - fb_prepare_logo() is calling the undefined find_logo(). I think it wants fb_find_logo(). - fb_prepare_logo is not __init, therefore fb_find_logo() cannot be __init. --- drivers/video/fbmem.c | 3 ++- drivers/video/logo/logo.c | 2 +- include/linux/linux_logo.h | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/video/fbmem.c b/drivers/video/fbmem.c index 0f3182b1783a..75b47be36f22 100644 --- a/drivers/video/fbmem.c +++ b/drivers/video/fbmem.c @@ -25,6 +25,7 @@ #include #include #include +#include <linux/linux_logo.h> #include #ifdef CONFIG_KMOD #include #endif @@ -655,7 +656,7 @@ int fb_prepare_logo(struct fb_info *info) } /* Return if no suitable logo was found */ - fb_logo.logo = find_logo(info->var.bits_per_pixel); + fb_logo.logo = fb_find_logo(info->var.bits_per_pixel); if (!fb_logo.logo || fb_logo.logo->height > info->var.yres) { fb_logo.logo = NULL; diff --git a/drivers/video/logo/logo.c b/drivers/video/logo/logo.c index daf9c360a2aa..3039664df313 100644 --- a/drivers/video/logo/logo.c +++ b/drivers/video/logo/logo.c @@ -33,7 +33,7 @@ extern const struct linux_logo logo_superh_vga16; extern const struct linux_logo logo_superh_clut224; -const struct linux_logo * __init find_logo(int depth) +const struct linux_logo *fb_find_logo(int depth) { const struct linux_logo *logo = 0; diff --git a/include/linux/linux_logo.h b/include/linux/linux_logo.h index ea05e16c2b16..9c01bde5bf1b 100644 --- a/include/linux/linux_logo.h +++ b/include/linux/linux_logo.h @@ -32,6 +32,6 @@ struct linux_logo { const unsigned char *data; }; -extern const struct linux_logo * __init find_logo(int depth); +extern const struct linux_logo *fb_find_logo(int depth); #endif /* _LINUX_LINUX_LOGO_H */ -- cgit v1.2.3
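A closing note on the second point in the fbdev fix above: __init functions are placed in .init.text, which the kernel frees once booting completes, so nothing reachable from a runtime path such as fb_prepare_logo() may be __init. A contrived sketch of the hazard (hypothetical names, illustrative only):

	#include <linux/init.h>

	static int __init boot_only_setup(void)
	{
		return 42;	/* this code is discarded after free_initmem() */
	}

	int runtime_path(void)
	{
		/* BUG: once .init.text is freed, this call jumps into reused
		 * memory - which is why fb_find_logo() must lose __init here. */
		return boot_only_setup();
	}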